# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statistics
from geopy.distance import geodesic

from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Load & Understand Data

In [2]:

# Replace the placeholder with your public Google Drive link
google_drive_link = 'https://drive.google.com/file/d/18JRfeCRuRNr7kfWKsfOmB_n3Th_i8_7Z/view?usp=sharing'

# Download the file using gdown
# You might need to adjust the output path and filename
output_path = 'downloaded_file_name'
!gdown --id {google_drive_link.split("/")[-2]} -O {output_path}

df_train = pd.read_csv(output_path)
display(df_train.head())

Downloading...
From: https://drive.google.com/uc?id=18JRfeCRuRNr7kfWKsfOmB_n3Th_i8_7Z
To: /content/downloaded_file_name
100% 7.77M/7.77M [00:00<00:00, 39.4MB/s]


Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [3]:
print("Train Dataset :", df_train.shape)

Train Dataset : (45593, 20)


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          45593 non-null  object 
 3   Delivery_person_Ratings      45593 non-null  object 
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  45593 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weatherconditions            45593 non-null  object 
 12  Road_traffic_density         45593 non-null  object 
 13  Vehicle_conditio

In [5]:
#Check statistical values for fields with numerical datatype
df_train.describe()

Unnamed: 0,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Vehicle_condition
count,45593.0,45593.0,45593.0,45593.0,45593.0
mean,17.017729,70.231332,17.465186,70.845702,1.023359
std,8.185109,22.883647,7.335122,21.118812,0.839065
min,-30.905562,-88.366217,0.01,0.01,0.0
25%,12.933284,73.17,12.988453,73.28,0.0
50%,18.546947,75.898497,18.633934,76.002574,1.0
75%,22.728163,78.044095,22.785049,78.107044,2.0
max,30.914057,88.433452,31.054057,88.563452,3.0


# Data Cleaning

In [6]:
#Update Column Names
def update_column_name(df):
    #Renaming Weatherconditions column
    df.rename(columns={'Weatherconditions': 'Weather_conditions'},inplace=True)

update_column_name(df_train)
print(df_train.columns)

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Time_taken(min)'],
      dtype='object')


In [7]:
#Extract relevant values from column
def extract_column_value(df):
    #Extract time and convert to int
    # df['Time_taken(min)'] = df['Time_taken(min)'].apply(lambda x: int(x.split(' ')[1].strip())) # This line is removed
    #Extract Weather conditions
    df['Weather_conditions'] = df['Weather_conditions'].apply(lambda x: x.split(' ')[1].strip() if ' ' in x else x)
    #Extract city code from Delivery person ID
    df['City_code']=df['Delivery_person_ID'].str.split("RES", expand=True)[0]

extract_column_value(df_train)
df_train[['Time_taken(min)','Weather_conditions','City_code']].head()

Unnamed: 0,Time_taken(min),Weather_conditions,City_code
0,(min) 24,Sunny,INDO
1,(min) 33,Stormy,BANG
2,(min) 26,Sandstorms,BANG
3,(min) 21,Sunny,COIMB
4,(min) 30,Cloudy,CHEN


In [8]:
#Drop Columns which won't be use for building model
def drop_columns(df):
    cols_to_drop = ['ID','Delivery_person_ID']
    existing_cols = [col for col in cols_to_drop if col in df.columns]
    if existing_cols:
      df.drop(existing_cols,axis=1,inplace=True)

print("Before No. of columns: ",df_train.shape[1])
drop_columns(df_train)
print("After No. of columns: ",df_train.shape[1])

Before No. of columns:  21
After No. of columns:  19


In [9]:
#Update datatypes
def update_datatype(df):
    df['Delivery_person_Age'] = df['Delivery_person_Age'].astype('float64')
    df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].astype('float64')
    df['multiple_deliveries'] = df['multiple_deliveries'].astype('float64')
    df['Order_Date']=pd.to_datetime(df['Order_Date'],format="%d-%m-%Y")

update_datatype(df_train)

In [10]:
#Convert String 'NaN' to np.nan
def convert_nan(df):
    df.replace('NaN', float(np.nan), regex=True,inplace=True)

convert_nan(df_train)

In [11]:
#Check null values
df_train.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
Delivery_person_Ratings,1908
Delivery_person_Age,1854
Time_Orderd,1731
City,1200
multiple_deliveries,993
Weather_conditions,616
Road_traffic_density,601
Festival,228
Restaurant_latitude,0
Order_Date,0


In [12]:
#Handle null values
def handle_null_values(df):
    df['Delivery_person_Age'].fillna(np.random.choice(df['Delivery_person_Age']), inplace=True)
    df['Weather_conditions'].fillna(np.random.choice(df['Weather_conditions']), inplace=True)
    df['City'].fillna(df['City'].mode()[0], inplace=True)
    df['Festival'].fillna(df['Festival'].mode()[0], inplace=True)
    df['multiple_deliveries'].fillna(df['multiple_deliveries'].mode()[0], inplace=True)
    df['Road_traffic_density'].fillna(df['Road_traffic_density'].mode()[0], inplace=True)
    df['Delivery_person_Ratings'].fillna(df['Delivery_person_Ratings'].median(), inplace=True)

handle_null_values(df_train)
df_train.isnull().sum()

Unnamed: 0,0
Delivery_person_Age,0
Delivery_person_Ratings,0
Restaurant_latitude,0
Restaurant_longitude,0
Delivery_location_latitude,0
Delivery_location_longitude,0
Order_Date,0
Time_Orderd,1731
Time_Order_picked,0
Weather_conditions,0


# Data Preprocessing

## Label Encoding

In [13]:
def label_encoding(df):
    categorical_columns = df.select_dtypes(include='object').columns
    label_encoder = LabelEncoder()
    df[categorical_columns] = df[categorical_columns].apply(lambda col: label_encoder.fit_transform(col))

label_encoding(df_train)
df_train.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min),City_code
0,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,38,46,4,0,2,3,2,0.0,0,2,14,10
1,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,129,143,3,1,2,3,3,1.0,0,0,23,3
2,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,5,10,2,2,0,1,2,1.0,0,2,16,3
3,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,110,123,4,3,0,0,2,1.0,0,0,11,6
4,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,60,70,0,0,1,3,3,1.0,0,0,20,5


# Train Test Split

In [14]:
#Split features & label
X = df_train.drop(['Time_taken(min)', 'Order_Date'], axis=1)  # Features, dropping Order_Date
y = df_train['Time_taken(min)']  # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(36474, 17)
(36474,)
(9119, 17)
(9119,)


# Cross Validation

In [15]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    xgb.XGBRegressor(),
]

param_grid = [
    {},
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [10]},
    {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.44202616350851304

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.6374315747045728

RandomForestRegressor:
Best parameters: {'n_estimators': 10}
Best R2 score: 0.7104327489240265

XGBRegressor:
Best parameters: {'max_depth': 9, 'n_estimators': 30}
Best R2 score: 0.7640814423561096



# LightGBM (Light Gradient Boosting Machine)

In [16]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

lgb_model = lgb.LGBMRegressor(random_state=42)

# Train the model on the training data
# Ensure x_train and y_train are ready (no datetime columns, NaNs handled)
lgb_model.fit(X_train, y_train)

# Make predictions on the training and testing data
y_train_pred_lgb = lgb_model.predict(X_train)
y_test_pred_lgb = lgb_model.predict(X_test)

# Evaluate the model on the training set
train_mae_lgb = mean_absolute_error(y_train, y_train_pred_lgb)
train_mse_lgb = mean_squared_error(y_train, y_train_pred_lgb)
train_rmse_lgb = np.sqrt(train_mse_lgb)
train_r2_lgb = r2_score(y_train, y_train_pred_lgb)

print("--- LightGBM - Training Set Evaluation ---")
print(f"Mean Absolute Error (MAE): {train_mae_lgb:.2f}")
print(f"Mean Squared Error (MSE): {train_mse_lgb:.2f}")
print(f"Root Mean Squared Error (RMSE): {train_rmse_lgb:.2f}")
print(f"R-squared (R2): {train_r2_lgb:.2f}")

print("\n--- LightGBM - Testing Set Evaluation ---")
# Evaluate the model on the testing set
test_mae_lgb = mean_absolute_error(y_test, y_test_pred_lgb)
test_mse_lgb = mean_squared_error(y_test, y_test_pred_lgb)
test_rmse_lgb = np.sqrt(test_mse_lgb)
test_r2_lgb = r2_score(y_test, y_test_pred_lgb)

print(f"Mean Absolute Error (MAE): {test_mae_lgb:.2f}")
print(f"Mean Squared Error (MSE): {test_mse_lgb:.2f}")
print(f"Root Mean Squared Error (RMSE): {test_rmse_lgb:.2f}")
print(f"R-squared (R2): {test_r2_lgb:}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1493
[LightGBM] [Info] Number of data points in the train set: 36474, number of used features: 17
[LightGBM] [Info] Start training from score 16.301612
--- LightGBM - Training Set Evaluation ---
Mean Absolute Error (MAE): 3.48
Mean Squared Error (MSE): 19.32
Root Mean Squared Error (RMSE): 4.40
R-squared (R2): 0.78

--- LightGBM - Testing Set Evaluation ---
Mean Absolute Error (MAE): 3.61
Mean Squared Error (MSE): 20.79
Root Mean Squared Error (RMSE): 4.56
R-squared (R2): 0.7628297261931414


In [17]:
import pickle
xgb_model = xgb.XGBRegressor()
param_grid_xgb = {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]}
grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=5, scoring='r2')
grid_search_xgb.fit(X_train, y_train)

# Save the GridSearchCV object (which includes the best model)
with open('grid_search_xgb.pkl', 'wb') as f:
    pickle.dump(grid_search_xgb, f)

print("Saved the XGBoost GridSearchCV object to grid_search_xgb.pkl")

Saved the XGBoost GridSearchCV object to grid_search_xgb.pkl
