In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [3]:
# Load the dataset
df = pd.read_csv("./Food_Delivery_Times.csv")

In [4]:
print("Initial Info")
df.info()

Initial Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 970 non-null    object 
 3   Traffic_Level           970 non-null    object 
 4   Time_of_Day             970 non-null    object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  970 non-null    float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


In [5]:
# df['experience_per_km'] = df['Courier_Experience_yrs'] / (df['Distance_km'] + 1e-3)
# df['prep_time_ratio'] = df['Preparation_Time_min'] / (df['Distance_km'] + 1e-3)
# df['is_peak_hour'] = df['Time_of_Day'].isin(['Morning', 'Evening']).astype(int)

In [6]:
# Drop the Order_ID column
df = df.drop(columns=["Order_ID"])

# Separate features and target
X = df.drop("Delivery_Time_min", axis=1)
y = df["Delivery_Time_min"]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Distance_km             1000 non-null   float64
 1   Weather                 970 non-null    object 
 2   Traffic_Level           970 non-null    object 
 3   Time_of_Day             970 non-null    object 
 4   Vehicle_Type            1000 non-null   object 
 5   Preparation_Time_min    1000 non-null   int64  
 6   Courier_Experience_yrs  970 non-null    float64
 7   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB


In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()

In [10]:
categorical_cols

['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type']

In [11]:
numerical_cols

['Distance_km', 'Preparation_Time_min', 'Courier_Experience_yrs']

In [12]:
# Preprocessing for numerical data
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", RobustScaler()),
    ("feature_selection", SelectKBest(score_func=f_regression, k=3))
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

In [13]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [14]:
# Define models to evaluate
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": xgb.XGBRegressor(random_state=42, verbosity=0)
}

In [15]:
# Evaluate each model and store metrics
metrics_list = []

for name, modelR in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", modelR)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')

    metrics_list.append({
        "Model": name,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "CV_R2_Mean": cv_scores.mean(),
        "CV_R2_Std": cv_scores.std()
    })

metrics_df = pd.DataFrame(metrics_list)

In [16]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2,CV_R2_Mean,CV_R2_Std
0,RandomForest,6.84705,9.582953,0.795119,0.696034,0.052932
1,GradientBoosting,6.598749,9.389548,0.803306,0.71295,0.066977
2,XGBoost,8.047232,10.949979,0.732497,0.650938,0.027185


In [17]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2,CV_R2_Mean,CV_R2_Std
0,RandomForest,6.84705,9.582953,0.795119,0.696034,0.052932
1,GradientBoosting,6.598749,9.389548,0.803306,0.71295,0.066977
2,XGBoost,8.047232,10.949979,0.732497,0.650938,0.027185


In [18]:
# Identify the best model based on CV_R2_Mean
best_model_name = metrics_df.sort_values(by="CV_R2_Mean", ascending=False).iloc[0]["Model"]
best_model = models[best_model_name]

In [19]:
best_model

In [20]:
# Define hyperparameter grid for the best model
param_grid = {}
if best_model_name == "RandomForest":
    param_grid = {
        "regressor__n_estimators": [50,100, 200],
        "regressor__max_depth": [5, 10, 20]
    }
elif best_model_name == "GradientBoosting":
    param_grid = {
    "regressor__n_estimators": [50, 100, 150, 200, 300, 500],
    "regressor__learning_rate": [0.001, 0.01, 0.03, 0.05, 0.1, 0.2],
    "regressor__max_depth": [3, 4, 5, 6, 8, 10],
    "regressor__min_samples_split": [2, 5, 10],
    "regressor__min_samples_leaf": [1, 2, 4],
    "regressor__subsample": [0.6, 0.8, 1.0],
    "regressor__max_features": ['sqrt', 'log2', None]
}

elif best_model_name == "XGBoost":
    param_grid = {
        "regressor__n_estimators": [100, 200, 300, 500],
        "regressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
        "regressor__max_depth": [3, 5, 7, 10],
        "regressor__subsample": [0.6, 0.8, 1.0],
        "regressor__colsample_bytree": [0.6, 0.8, 1.0],
        "regressor__gamma": [0, 0.1, 0.3, 0.5],
        "regressor__reg_alpha": [0, 0.1, 1],
        "regressor__reg_lambda": [1, 1.5, 2]
    }


In [21]:
# Perform GridSearchCV
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", best_model)
])

In [None]:
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1,verbose=2)
# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

Fitting 5 folds for each of 17496 candidates, totalling 87480 fits


KeyboardInterrupt: 

In [30]:
from sklearn.model_selection import RandomizedSearchCV

# Run the randomized search with verbose output
grid_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,  # param_grid is fine here
    n_iter=50,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=3,
    error_score='raise'
)

# Fit the model and search for best parameters
grid_search.fit(X_train, y_train)

# Get best parameters
best_params = grid_search.best_params_
print("Best Parameters:\n", best_params)

# Get best cross-validated R² score
best_r2_score = grid_search.best_score_
print("Best CV R² Score:", best_r2_score)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters:
 {'regressor__subsample': 1.0, 'regressor__n_estimators': 150, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 3, 'regressor__learning_rate': 0.05}
Best CV R² Score: 0.7261568107076464


In [31]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [32]:
grid_search

In [33]:
# Predict using the best estimator
y_pred = grid_search.predict(X_test)

# Calculate R² on test set
test_r2 = r2_score(y_test, y_pred)
print("R² score on test set:", test_r2)

R² score on test set: 0.8080771674099227


In [34]:
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'regressor__subsample': 1.0, 'regressor__n_estimators': 150, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 3, 'regressor__learning_rate': 0.05}


In [35]:
print("Model Comparison Metrics:")
print(metrics_df)
print(f"\nBest Model: {best_model_name}")
print("Best Hyperparameters from GridSearchCV:")
print(best_params)
print(f"Best Cross-Validated R2 Score: {best_score:.4f}")



Model Comparison Metrics:
              Model       MAE       RMSE        R2  CV_R2_Mean  CV_R2_Std
0      RandomForest  6.847050   9.582953  0.795119    0.696034   0.052932
1  GradientBoosting  6.598749   9.389548  0.803306    0.712950   0.066977
2           XGBoost  8.047232  10.949979  0.732497    0.650938   0.027185

Best Model: GradientBoosting
Best Hyperparameters from GridSearchCV:
{'regressor__subsample': 1.0, 'regressor__n_estimators': 150, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 3, 'regressor__learning_rate': 0.05}
Best Cross-Validated R2 Score: 0.7262


In [36]:
import joblib

# Save the best model
joblib.dump(grid_search.best_estimator_, "best_gradient_boosting_model.pkl")

['best_gradient_boosting_model.pkl']

In [35]:
# Load the model
loaded_model = joblib.load("best_gradient_boosting_model.pkl")

In [36]:
loaded_model

In [37]:
X_test

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
521,5.30,Clear,Low,Evening,Bike,16,5.0
737,10.46,Clear,,Evening,Bike,25,3.0
740,4.04,Rainy,High,Evening,Bike,14,6.0
660,3.33,,Medium,Evening,Scooter,24,2.0
411,17.44,,Low,Night,Car,23,0.0
...,...,...,...,...,...,...,...
408,15.62,Rainy,Medium,Afternoon,Scooter,23,7.0
332,1.80,Clear,,Night,Bike,14,1.0
208,7.39,Rainy,Medium,Morning,Scooter,25,6.0
613,9.70,Snowy,Low,Evening,Bike,6,


In [38]:
# testingdata
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Define the number of dummy rows
num_rows = 20

# Define dummy data options
weather_options = ['Clear', 'Rainy', 'Cloudy', 'Foggy', 'Snowy']
traffic_options = ['Low', 'Medium', 'High']
time_of_day_options = ['Morning', 'Afternoon', 'Evening', 'Night']
vehicle_types = ['Bike', 'Car', 'Truck', 'Scooter']

# Generate dummy data
data = {
    'Distance_km': np.random.uniform(1, 1000, num_rows).round(2),
    'Weather': np.random.choice(weather_options, num_rows),
    'Traffic_Level': np.random.choice(traffic_options, num_rows),
    'Time_of_Day': np.random.choice(time_of_day_options, num_rows),
    'Vehicle_Type': np.random.choice(vehicle_types, num_rows),
    'Preparation_Time_min': np.random.randint(5, 60, num_rows),
    'Courier_Experience_yrs': np.random.uniform(0, 10, num_rows).round(1)
}

# Create DataFrame
newdata = pd.DataFrame(data)

# Display the first few rows
# print(df.head())


In [23]:
newdata

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,375.17,Cloudy,Medium,Afternoon,Car,48,1.0
1,950.76,Rainy,Low,Night,Truck,12,3.7
2,732.26,Foggy,Medium,Afternoon,Bike,28,6.7
3,599.06,Foggy,High,Afternoon,Scooter,15,6.7
4,156.86,Cloudy,High,Afternoon,Car,55,5.9
5,156.84,Foggy,Low,Night,Bike,21,2.7
6,59.03,Foggy,High,Afternoon,Scooter,12,5.6
7,866.31,Clear,High,Evening,Scooter,39,3.8
8,601.51,Cloudy,Medium,Night,Scooter,39,9.7
9,708.36,Snowy,Low,Evening,Bike,37,8.5


In [39]:
# Make predictions
predictions = loaded_model.predict(newdata)  # new_data should be a DataFrame with the same structure as the training data


In [40]:

results = newdata.copy()
results["Predicted_Delivery_Time_min"] = predictions

results


Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Predicted_Delivery_Time_min
0,375.17,Cloudy,Medium,Afternoon,Car,48,1.0,92.201561
1,950.76,Rainy,Low,Night,Truck,12,3.7,76.808644
2,732.26,Foggy,Medium,Afternoon,Bike,28,6.7,92.042659
3,599.06,Foggy,High,Afternoon,Scooter,15,6.7,86.318239
4,156.86,Cloudy,High,Afternoon,Car,55,5.9,92.859748
5,156.84,Foggy,Low,Night,Bike,21,2.7,87.379471
6,59.03,Foggy,High,Afternoon,Scooter,12,5.6,83.662944
7,866.31,Clear,High,Evening,Scooter,39,3.8,102.09063
8,601.51,Cloudy,Medium,Night,Scooter,39,9.7,87.350998
9,708.36,Snowy,Low,Evening,Bike,37,8.5,89.068223
