<a href="https://colab.research.google.com/github/harshachourey2/TripFare-Taxi-Fare-Prediction/blob/main/02_Feature_Engineering_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#"""Feature Engineering Model"""

1. Load cleaned taxi dataset from Notebook-1
2. Advanced Feature Engineering
3. Encode categorical variables
4. Feature Selection (Correlation)
5. Train-Test Split
6. Build multiple Regression Models
7. Compare models using R2, MAE, MSE, RMSE


##Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [2]:
df = pd.read_csv("clean_taxi_data.csv")


In [3]:
df.head()


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_hour,pickup_day,is_weekend,is_night
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,-73.976746,40.765152,1,N,-74.004265,40.746128,...,0.5,0.5,2.05,0.0,0.3,12.35,0,Tuesday,0,1
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,-73.983482,40.767925,1,N,-74.005943,40.733166,...,0.5,0.5,3.05,0.0,0.3,15.35,0,Tuesday,0,1
2,2,2016-03-01 00:00:00,2016-03-01 00:31:06,2,-73.782021,40.64481,1,N,-73.974541,40.67577,...,0.5,0.5,8.0,0.0,0.3,63.8,0,Tuesday,0,1
3,2,2016-03-01 00:00:00,2016-03-01 00:00:00,3,-73.863419,40.769814,1,N,-73.96965,40.757767,...,0.0,0.5,3.78,5.54,0.3,41.62,0,Tuesday,0,1
4,2,2016-03-01 00:00:00,2016-03-01 00:00:00,5,-74.017197,40.705383,1,N,-73.978073,40.755787,...,1.0,0.5,5.06,0.0,0.3,30.36,0,Tuesday,0,1


In [4]:
df.shape


(101108, 22)

In [5]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

df['trip_duration_min'] = (
    (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
    .dt.total_seconds() / 60
)


In [6]:
df = df[df['trip_duration_min'] > 0]


##Feature Engineering â€“ Trip Duration

In [7]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

df['trip_duration_min'] = (
    (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
    .dt.total_seconds() / 60
)


In [8]:
df = df[df['trip_duration_min'] > 0]


In [9]:
drop_cols = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime'
]

df = df.drop(columns=drop_cols, errors='ignore')


##Encoding Categorical Columns

In [10]:
cat_cols = df.select_dtypes(include='object').columns
cat_cols


Index(['store_and_fwd_flag', 'pickup_day'], dtype='object')

In [11]:
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col])


##Feature Selection using Correlation

In [12]:
corr = df.corr()['total_amount'].sort_values(ascending=False)
corr


Unnamed: 0,total_amount
total_amount,1.0
fare_amount,0.982434
tip_amount,0.687325
tolls_amount,0.662598
RatecodeID,0.346048
trip_duration_min,0.149047
is_night,0.049105
pickup_day,0.041296
store_and_fwd_flag,0.020484
dropoff_longitude,0.007412


##Define Features (X) and Target (y)

In [13]:
X = df.drop('total_amount', axis=1)
y = df['total_amount']


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#ðŸ¤– MODEL BUILDING (Regression)

##Create Evaluation Function

In [15]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return r2, mae, mse, rmse


In [16]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}


In [17]:
results = []

for name, model in models.items():
    r2, mae, mse, rmse = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append([name, r2, mae, mse, rmse])


#Model Comparison Table

In [18]:
results_df = pd.DataFrame(
    results,
    columns=['Model', 'R2 Score', 'MAE', 'MSE', 'RMSE']
)

results_df.sort_values(by='R2 Score', ascending=False)


Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Linear Regression,1.0,1e-05,1.491134e-09,3.9e-05
1,Ridge Regression,1.0,0.000244,7.230265e-06,0.002689
2,Lasso Regression,0.999966,0.044829,0.004461652,0.066796
3,Random Forest,0.999683,0.020794,0.04160387,0.20397
4,Gradient Boosting,0.999358,0.162854,0.08417909,0.290136


In [19]:
best_model_name = results_df.sort_values(
    by='R2 Score', ascending=False
).iloc[0]['Model']

best_model_name


'Linear Regression'

In [20]:
best_model = models[best_model_name]
best_model.fit(X_train, y_train)


In [21]:
if best_model_name in ["Random Forest", "Gradient Boosting"]:
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    feature_importance


In [22]:
df.to_csv("model_ready_data.csv", index=False)


In [23]:
import pickle

pickle.dump(best_model, open("best_model_temp.pkl", "wb"))
