In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Settings
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
%matplotlib inline

### Packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data = pd.read_csv("../data/model_data.csv", index_col= False)
data.head(5)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min),distance_rest_del_loc,hour,day,weekday_name,TypeOfMeal
0,30.0,4.7,conditions Sandstorms,Jam,0.0,Meal,motorcycle,1.0,No,Metropolitian,34.0,10.336614,19,20,Sunday,Dinner
1,30.0,4.7,conditions Sandstorms,Jam,0.0,Buffet,motorcycle,1.0,No,Metropolitian,46.0,5.849717,20,30,Saturday,Dinner
2,30.0,4.7,conditions Sunny,Low,0.0,Drinks,motorcycle,1.0,No,Metropolitian,19.0,0.974953,9,28,Monday,Breakfast
3,30.0,4.7,conditions Cloudy,Medium,1.0,Drinks,motorcycle,1.0,No,Metropolitian,42.0,7.799623,18,20,Sunday,Dinner
4,30.0,4.7,conditions Windy,Jam,0.0,Snack,motorcycle,1.0,No,Metropolitian,44.0,10.724479,19,29,Tuesday,Dinner


In [3]:
data.dtypes

Delivery_person_Age        float64
Delivery_person_Ratings    float64
Weatherconditions           object
Road_traffic_density        object
Vehicle_condition          float64
Type_of_order               object
Type_of_vehicle             object
multiple_deliveries        float64
Festival                    object
City                        object
Time_taken(min)            float64
distance_rest_del_loc      float64
hour                         int64
day                          int64
weekday_name                object
TypeOfMeal                  object
dtype: object

In [4]:
### Features and Predictors(Target)
X = data.drop(['Time_taken(min)'], axis =1)
y = data['Time_taken(min)']

In [5]:
### Splitting the data into Train validation and test
X_train, X_test,y_train ,  y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_Val,  X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.33, random_state = 42)
print(X_train.shape, y_train.shape)
print(X_Val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(30703, 15) (30703,)
(8816, 15) (8816,)
(4343, 15) (4343,)


In [6]:
### Feature Generation
### Convert the categorical features into numerical features

def convert_categorical_numerical(df, categorical_columns):

    encoder = OneHotEncoder(sparse_output = False)
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])
    one_hot_data = pd.DataFrame(one_hot_encoded, 
                                columns = encoder.get_feature_names_out(categorical_columns)) 
    df_rest_index = df.reset_index(drop = True, inplace = False)   
    df_encoded = pd.concat([df_rest_index, one_hot_data], axis =1)
    df_encoded = df_encoded.drop(columns = categorical_columns, axis =1)
    return df_encoded



In [7]:
def feature_scaling_numeric_columns(df, numerical_columns):

    print(numerical_columns)
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [10]:
### Applying various ML models
def model_training(X_train, X_test, y_train, y_test, ml_model):
    # Train the model
    ml_model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = ml_model.predict(X_train)
    y_pred_test = ml_model.predict(X_test)

    # Evaluate the model
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = mean_squared_error(y_test, y_pred_test, squared= False) 

    print(f"Model: {ml_model.__class__.__name__}")
    print(f"Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}, R2 Score: {test_r2:.4f}")

    print('-' * 50)

    return ml_model, test_mse, test_r2, test_rmse

def data_preprocessing(data):
    numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns

    data_encoded = convert_categorical_numerical(data, categorical_features)
    data_scaled = feature_scaling_numeric_columns(data_encoded, numerical_features)
    return data_scaled


X_train_pre = data_preprocessing(X_train)
X_Val_pre = data_preprocessing(X_Val)



models = [
    LinearRegression(),
    RandomForestRegressor(n_estimators=100, random_state=42),
    XGBRegressor(n_estimators=100, random_state=42),
    LGBMRegressor(n_estimators=100, random_state=42)
]

# Train and evaluate each model
trained_models = {}
for model in models:
    trained_model, test_mse, r2,rmse = model_training(X_train_pre, X_Val_pre, y_train, y_val, model)
    trained_models[model.__class__.__name__] = {'model': trained_model, 'test_mse': test_mse, 'r2_score': r2, 'rmse': rmse}


Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'distance_rest_del_loc', 'hour', 'day'],
      dtype='object')
Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'distance_rest_del_loc', 'hour', 'day'],
      dtype='object')
Model: LinearRegression
Train MSE: 35.1338, Test MSE: 35.3619, R2 Score: 0.5958
--------------------------------------------------




Model: RandomForestRegressor
Train MSE: 2.0676, Test MSE: 15.1101, R2 Score: 0.8273
--------------------------------------------------




Model: XGBRegressor
Train MSE: 10.3564, Test MSE: 28.1362, R2 Score: 0.6784
--------------------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001539 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 30703, number of used features: 39
[LightGBM] [Info] Start training from score 26.333485
Model: LGBMRegressor
Train MSE: 13.1252, Test MSE: 25.7084, R2 Score: 0.7061
--------------------------------------------------


