In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler,OrdinalEncoder,RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [None]:
# Read the food-delivery CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="../Dataset/Food_Delivery_Times.csv")

In [4]:
# Summarise the columns: dtypes and non-null counts (reveals which columns have gaps)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 970 non-null    object 
 3   Traffic_Level           970 non-null    object 
 4   Time_of_Day             970 non-null    object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  970 non-null    float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


In [5]:
# Drop the Order_ID column as it's not useful for prediction.
# errors="ignore" keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["Order_ID"], errors="ignore")

In [6]:
# Display the frame to check its current columns and a sample of rows
df

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,19.03,Clear,Low,Morning,Bike,16,5.0,68
...,...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0,54
996,16.28,Rainy,Low,Morning,Scooter,8,9.0,71
997,15.62,Snowy,High,Evening,Scooter,26,2.0,81
998,14.17,Clear,Low,Afternoon,Bike,8,0.0,55


In [None]:
# Split the frame into the prediction target and the remaining feature columns
y = df["Delivery_Time_min"]
X = df.drop(columns=["Delivery_Time_min"])

In [8]:
# Plain-text dump of the features followed by the target, one after the other
print(X, y, sep="\n")

     Distance_km Weather Traffic_Level Time_of_Day Vehicle_Type  \
0           7.93   Windy           Low   Afternoon      Scooter   
1          16.42   Clear        Medium     Evening         Bike   
2           9.52   Foggy           Low       Night      Scooter   
3           7.44   Rainy        Medium   Afternoon      Scooter   
4          19.03   Clear           Low     Morning         Bike   
..           ...     ...           ...         ...          ...   
995         8.50   Clear          High     Evening          Car   
996        16.28   Rainy           Low     Morning      Scooter   
997        15.62   Snowy          High     Evening      Scooter   
998        14.17   Clear           Low   Afternoon         Bike   
999         6.63   Foggy           Low       Night      Scooter   

     Preparation_Time_min  Courier_Experience_yrs  
0                      12                     1.0  
1                      20                     2.0  
2                      28              

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Identify categorical and numerical columns.
# Numerical columns are picked with the generic "number" selector so every numeric
# dtype is caught, not only float64/int64. Every remaining column is treated as
# categorical, so the result does not depend on strings being stored as `object`
# and no feature ends up in neither list (the ColumnTransformer built from these
# lists drops unlisted columns by default).
# NOTE(review): non-numeric columns such as datetimes would also land in
# categorical_cols; this dataset has none, but confirm before reusing elsewhere.
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

In [34]:
# Show which feature columns were classified as numerical
numerical_cols

['Distance_km', 'Preparation_Time_min', 'Courier_Experience_yrs']

In [40]:
# Numerical branch: fill gaps with the column mean, scale with RobustScaler,
# then keep the 3 features that score best under f_regression.
# (With the three numerical columns listed above, k=3 keeps all of them.)
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", RobustScaler()),
    ("feature_selection", SelectKBest(f_regression, k=3)),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" prevents an error when a category unseen during fit
# shows up at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [41]:
# Route each column group through its own transformer and concatenate the outputs.
# remainder="drop" is the library default, spelled out here: columns that are in
# neither list are discarded.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

In [42]:
def modelPipeline(modelR,X_train, X_test, y_train, y_test):
    """Fit a preprocessing + regression pipeline and summarise its performance.

    The notebook-level ``preprocessor`` is chained with ``modelR``, the pipeline
    is fitted on the training split, scored on the test split, and additionally
    evaluated with 5-fold cross-validation (R2) on the training split.

    Parameters
    ----------
    modelR : estimator
        Regressor used as the final ``"regressor"`` step of the pipeline.
    X_train, X_test
        Feature tables passed to the pipeline's ``fit`` / ``predict``.
        Presumably DataFrames containing the columns ``preprocessor`` was
        configured with -- verify against the caller.
    y_train, y_test : array-like
        Target values for the training and test splits. Anything the
        scikit-learn metric functions accept works (Series, ndarray, list).

    Returns
    -------
    pandas.DataFrame
        One-row frame with the columns ``MAE``, ``RMSE``, ``R2``,
        ``CV_R2_Mean`` and ``CV_R2_Std``.

    Notes
    -----
    ``Pipeline.fit`` fits its steps in place, so both ``modelR`` and the shared
    module-level ``preprocessor`` are (re)fitted by every call.
    """
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", modelR),
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Hold-out metrics on the test split
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # cross_val_score clones the pipeline for each fold, so the model fitted
    # above is left untouched by this step.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="r2")

    return pd.DataFrame({
        "MAE": [mae],
        "RMSE": [rmse],
        "R2": [r2],
        "CV_R2_Mean": [cv_scores.mean()],
        "CV_R2_Std": [cv_scores.std()],
    })

In [43]:
# Evaluate a random forest regressor (fixed seed) with the shared pipeline helper
modelPipeline(
    RandomForestRegressor(random_state=42),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Unnamed: 0,MAE,RMSE,R2,CV_R2_Mean,CV_R2_Std
0,6.84705,9.582953,0.795119,0.696034,0.052932


In [44]:
# Evaluate a gradient boosting regressor (fixed seed) with the shared pipeline helper
modelPipeline(
    GradientBoostingRegressor(random_state=42),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Unnamed: 0,MAE,RMSE,R2,CV_R2_Mean,CV_R2_Std
0,6.598749,9.389548,0.803306,0.71295,0.066977


In [45]:
# Evaluate an XGBoost regressor (fixed seed, logging silenced) with the shared pipeline helper
modelPipeline(
    xgb.XGBRegressor(random_state=42, verbosity=0),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Unnamed: 0,MAE,RMSE,R2,CV_R2_Mean,CV_R2_Std
0,8.047232,10.949979,0.732497,0.650938,0.027185
