In [None]:
!pip install catboost

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import holidays
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import KNNImputer

import time
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [3]:
data = pd.read_csv("historical_data.csv")
data

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,1.0,2015-02-17 00:19:41,2015-02-17 01:24:48,2956,fast,4.0,3,1389,3,345,649,17.0,17.0,23.0,251,331.0
197424,1.0,2015-02-13 00:01:59,2015-02-13 00:58:22,2956,fast,4.0,6,3010,4,405,825,12.0,11.0,14.0,251,915.0
197425,1.0,2015-01-24 04:46:08,2015-01-24 05:36:16,2956,fast,4.0,5,1836,3,300,399,39.0,41.0,40.0,251,795.0
197426,1.0,2015-02-01 18:18:15,2015-02-01 19:23:22,3630,sandwich,1.0,1,1175,1,535,535,7.0,7.0,12.0,446,384.0


In [24]:
data.describe()

Unnamed: 0,market_id,store_id,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
count,196441.0,197428.0,196433.0,197428.0,197428.0,197428.0,197428.0,197428.0,181166.0,181166.0,181166.0,197428.0,196902.0
mean,2.978706,3530.510272,2.882352,3.196391,2682.331402,2.670791,686.21847,1159.58863,44.808093,41.739747,58.050065,308.560179,545.358935
std,1.524867,2053.496711,1.503771,2.666546,1823.093688,1.630255,522.038648,558.411377,34.526783,32.145733,52.66183,90.139653,219.352902
min,1.0,1.0,1.0,1.0,0.0,1.0,-86.0,0.0,-4.0,-5.0,-6.0,0.0,0.0
25%,2.0,1686.0,1.0,2.0,1400.0,1.0,299.0,800.0,17.0,15.0,17.0,251.0,382.0
50%,3.0,3592.0,3.0,3.0,2200.0,2.0,595.0,1095.0,37.0,34.0,41.0,251.0,544.0
75%,4.0,5299.0,4.0,4.0,3395.0,3.0,949.0,1395.0,65.0,62.0,85.0,446.0,702.0
max,6.0,6987.0,7.0,411.0,27100.0,20.0,14700.0,14700.0,171.0,154.0,285.0,2715.0,2088.0


### Feature Engineering

In [4]:
# Convert datetime columns
data['created_at'] = pd.to_datetime(data['created_at'])
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'])
data['delivery_duration_minutes'] = (
    (data['actual_delivery_time'] - data['created_at']).dt.total_seconds() / 60
)

# Time-Based Features
data['hour'] = data['created_at'].dt.hour
data['day_of_week_num'] = data['created_at'].dt.dayofweek
data['is_weekend'] = data['day_of_week_num'].isin([5, 6]).astype(int)

# Holiday Indicator
us_holidays = holidays.US()
data['is_holiday'] = data['created_at'].dt.date.astype(str).isin(us_holidays).astype(int)

# Dasher Features
data['total_busy_dashers'] = abs(data['total_busy_dashers'])  # Handle negative values
data['total_onshift_dashers'] = abs(data['total_onshift_dashers'])
data['dashers_per_order'] = data['total_onshift_dashers'] / (data['total_outstanding_orders'] + 1e-5)
data['%_dashers_avail'] = data['total_busy_dashers'] / (
    data['total_busy_dashers'] + data['total_onshift_dashers'] + 1e-5
)

# Price-Based Features
data['price_range'] = data['max_item_price'] - data['min_item_price']
data['avg_item_price'] = data['subtotal'] / (data['total_items'] + 1e-5)
data['price_volatility'] = data['price_range'] / (data['avg_item_price'] + 1e-5)

# Interaction Features
data['order_intensity'] = data['total_outstanding_orders'] / (data['total_busy_dashers'] + 1e-5)
data['delivery_difficulty'] = data['order_intensity'] * data['estimated_store_to_consumer_driving_duration']

# Delivery Speed
data['historical_avg_delivery_time'] = data.groupby(
    ['store_id', 'hour'])['delivery_duration_minutes'].transform('mean')
data['delivery_speed'] = data['historical_avg_delivery_time'] / (
    data['estimated_store_to_consumer_driving_duration'] / 60 + 1e-5)

# Log Transformations
data['log_subtotal'] = np.log1p(data['subtotal'])
data['log_outstanding_orders'] = np.log1p(data['total_outstanding_orders'].clip(lower=1e-5))

data = data.drop(columns=['created_at', 'actual_delivery_time'])

In [5]:
data

Unnamed: 0,market_id,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,...,%_dashers_avail,price_range,avg_item_price,price_volatility,order_intensity,delivery_difficulty,historical_avg_delivery_time,delivery_speed,log_subtotal,log_outstanding_orders
0,1.0,1845,american,1.0,4,3441,4,557,1239,33.0,...,0.297872,682,860.247849,0.792795,1.499999,1291.499078,62.983333,4.389079,8.143808,3.091042
1,2.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,...,0.666664,0,1899.981000,0.000000,0.999995,689.996550,59.158333,5.144198,7.550135,1.098612
2,3.0,5477,,1.0,1,1900,1,1900,1900,1.0,...,0.000000,0,1899.981000,0.000000,0.000000,0.000000,34.008333,2.957244,7.550135,0.000010
3,3.0,5477,,1.0,6,6900,5,600,1800,1.0,...,0.499998,1200,1149.998083,1.043480,1.999980,577.994220,59.158333,12.281981,8.839422,1.098612
4,3.0,5477,,1.0,3,3900,3,1100,1600,6.0,...,0.500000,500,1299.995667,0.384617,1.499998,974.998375,33.116667,3.056920,8.268988,2.302585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,1.0,2956,fast,4.0,3,1389,3,345,649,17.0,...,0.500000,304,462.998457,0.656590,1.352940,447.823266,58.338889,10.575006,7.237059,3.178054
197424,1.0,2956,fast,4.0,6,3010,4,405,825,12.0,...,0.478261,420,501.665831,0.837211,1.272726,1164.544396,58.338889,3.825498,8.010028,2.708050
197425,1.0,2956,fast,4.0,5,1836,3,300,399,39.0,...,0.512500,99,367.199266,0.269608,0.975610,775.609567,39.683333,2.994966,7.515889,3.713572
197426,1.0,3630,sandwich,1.0,1,1175,1,535,535,7.0,...,0.500000,0,1174.988250,0.000000,1.714283,658.284774,65.116667,10.174463,7.069874,2.564949


### Remove outliers

In [6]:
def remove_outliers_iqr(df, variables, threshold=1.5):

    for variable in variables:
        if variable in df.columns:
            Q1 = df[variable].quantile(0.25)
            Q3 = df[variable].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - (threshold * IQR)
            upper_bound = Q3 + (threshold * IQR)
            df = df[(df[variable] >= lower_bound) & (df[variable] <= upper_bound)]
    return df

outlier_columns = [
    'subtotal', 'delivery_duration_minutes', 'max_item_price', 'price_range',
    'avg_item_price', 'price_volatility', 'delivery_speed'
]
data = remove_outliers_iqr(data, outlier_columns)

### Optimized Label Encoding

In [7]:
def optimized_label_encoding(df, cat_cols):
    le_dict = {}

    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        le_dict[col] = le

    return df, le_dict

categorical_columns = ['store_primary_category']
data, encoders = optimized_label_encoding(data, categorical_columns)

data['store_primary_category'].unique()

array([ 4, 46, 49, 35, 38, 59, 69, 15, 56, 20, 61, 13, 10, 45, 39, 18, 28,
       33, 51, 40, 71,  6,  7, 27, 60, 73, 22, 66, 16, 23, 63, 72, 58, 54,
       67, 42, 34, 11, 50, 53,  2, 24, 62, 55, 70, 44, 25, 47,  0, 12, 31,
       29, 17, 21, 32, 30, 14, 48, 52, 65, 64, 68, 57,  9, 26, 19,  1,  5,
       37, 43, 41,  8, 36,  3])

### Training

In [8]:
# Target and feature variables
X = data.drop(columns=['delivery_duration_minutes'])
y = data['delivery_duration_minutes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
!pip install line_profiler

Collecting line_profiler
  Downloading line_profiler-4.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading line_profiler-4.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (750 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/750.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.2/750.2 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: line_profiler
Successfully installed line_profiler-4.2.0


In [21]:
from line_profiler import LineProfiler

In [32]:
# -----------------------------
# BASELINE (SEQUENTIAL)
# -----------------------------
def evaluate_baseline(name, model):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = model.predict(X_test)
    pred_time = time.time() - start_time

    mae = mean_absolute_error(y_test, y_pred)

    baseline_results[name] = {
        'MAE': round(mae, 4),
        'Train Time (s)': round(train_time, 4),
        'Predict Time (s)': round(pred_time, 4)
    }

# -----------------------------
# MODEL DEFINITIONS
# -----------------------------
model_defs = {
    "XGBoost": XGBRegressor(n_estimators=100, verbosity=0, n_jobs=1),
    "LightGBM": LGBMRegressor(n_estimators=100, n_jobs=1),
    "CatBoost_CPU": CatBoostRegressor(n_estimators=100, verbose=0, task_type='CPU'),
    "CatBoost_GPU": CatBoostRegressor(n_estimators=100, verbose=0, task_type='GPU'),
    "HistGradientBoosting": HistGradientBoostingRegressor(max_iter=100)
}

# -----------------------------
# RUN BASELINE
# -----------------------------
print("Running baseline (sequential)...")
lp = LineProfiler()
lp.add_function(evaluate_baseline)

baseline_results = {}
lp.enable()

for name, model in model_defs.items():
    evaluate_baseline(name, model)

lp.disable()
lp.print_stats()

baseline_df = pd.DataFrame(baseline_results).T

print("\n=== Baseline Timing ===")
print(baseline_df)

Running baseline (sequential)...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4830
[LightGBM] [Info] Number of data points in the train set: 125315, number of used features: 28
[LightGBM] [Info] Start training from score 45.038075




Timer unit: 1e-09 s

Total time: 16.0661 s
File: <ipython-input-32-a2455e7c15c9>
Function: evaluate_baseline at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
     4                                           def evaluate_baseline(name, model):
     5         5      13189.0   2637.8      0.0      start_time = time.time()
     6         5        2e+10    3e+09     96.3      model.fit(X_train, y_train)
     7         5      21740.0   4348.0      0.0      train_time = time.time() - start_time
     8                                           
     9         5       2669.0    533.8      0.0      start_time = time.time()
    10         5  584962547.0    1e+08      3.6      y_pred = model.predict(X_test)
    11         5      23440.0   4688.0      0.0      pred_time = time.time() - start_time
    12                                           
    13         5    9142366.0    2e+06      0.1      mae = mean_absolute_error(y_test, y_pred)
    14                             

In [34]:
from joblib import Parallel, delayed
from line_profiler import LineProfiler

# -----------------------------
# OPTIMIZED PARALLEL EVALUATION FUNCTION
# -----------------------------
def evaluate_optimized(name, model):
    train_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_start

    pred_start = time.time()
    y_pred = model.predict(X_test)
    pred_time = time.time() - pred_start

    mae = mean_absolute_error(y_test, y_pred)

    return {
        'Model': name,
        'MAE': round(mae, 4),
        'Train Time (s)': round(train_time, 4),
        'Predict Time (s)': round(pred_time, 4)
    }

# -----------------------------
# MODEL DEFINITIONS
# -----------------------------
model_defs = {
    "XGBoost": XGBRegressor(n_estimators=100, verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(n_estimators=100, n_jobs=-1),
    "CatBoost_CPU": CatBoostRegressor(n_estimators=100, verbose=0, task_type='CPU'),
    "CatBoost_GPU": CatBoostRegressor(n_estimators=100, verbose=0, task_type='GPU'),
    "HistGradientBoosting": HistGradientBoostingRegressor(max_iter=100)
}

# -----------------------------
# RUN OPTIMIZED MODELS IN PARALLEL
# -----------------------------
print("Running optimized models in parallel...")

# lp = LineProfiler()
# lp.add_function(evaluate_optimized)
# lp.enable()

results = Parallel(n_jobs=-1, backend='loky')(
    delayed(evaluate_optimized)(name, model)
    for name, model in model_defs.items()
)

# lp.disable()
# lp.print_stats()

# Convert list of dicts to DataFrame
optimized_df = pd.DataFrame(results).set_index("Model")

print("\n=== Optimized Timing Results ===")
print(optimized_df)

Running optimized models in parallel...

=== Optimized Timing Results ===
                         MAE  Train Time (s)  Predict Time (s)
Model                                                         
XGBoost               7.8710          6.5193            0.2190
LightGBM              7.8241         10.8059            0.2912
CatBoost_CPU          7.8381          5.8012            0.0227
CatBoost_GPU          7.8357          1.9133            0.0311
HistGradientBoosting  7.8339          3.1680            0.2642
