# DoorDash ETA Prediction

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import holidays
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv("/kaggle/input/doordash-eta-prediction/historical_data.csv")

In [3]:
data.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


### Convert and Extract Datetime Features

In [6]:
# Convert datetime columns
data['created_at'] = pd.to_datetime(data['created_at'])
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'])
data['delivery_duration_minutes'] = (
    (data['actual_delivery_time'] - data['created_at']).dt.total_seconds() / 60
)

# Time-Based Features
data['hour'] = data['created_at'].dt.hour
data['day_of_week_num'] = data['created_at'].dt.dayofweek
data['is_weekend'] = data['day_of_week_num'].isin([5, 6]).astype(int)

# Holiday Indicator
us_holidays = holidays.US()
data['is_holiday'] = data['created_at'].dt.date.astype(str).isin(us_holidays).astype(int)

## Feature Engineering

In [7]:
data['total_busy_dashers'] = abs(data['total_busy_dashers'])  # Handle negative values
data['total_onshift_dashers'] = abs(data['total_onshift_dashers'])
data['dashers_per_order'] = data['total_onshift_dashers'] / (data['total_outstanding_orders'] + 1e-5)
data['%_dashers_avail'] = data['total_busy_dashers'] / (
    data['total_busy_dashers'] + data['total_onshift_dashers'] + 1e-5
)

In [8]:
data['price_range'] = data['max_item_price'] - data['min_item_price']
data['avg_item_price'] = data['subtotal'] / (data['total_items'] + 1e-5)
data['price_volatility'] = data['price_range'] / (data['avg_item_price'] + 1e-5)

In [9]:
# Interaction Features
data['order_intensity'] = data['total_outstanding_orders'] / (data['total_busy_dashers'] + 1e-5)
data['delivery_difficulty'] = data['order_intensity'] * data['estimated_store_to_consumer_driving_duration']

In [10]:
data['historical_avg_delivery_time'] = data.groupby(
    ['store_id', 'hour'])['delivery_duration_minutes'].transform('mean')

data['delivery_speed'] = data['historical_avg_delivery_time'] / (
    data['estimated_store_to_consumer_driving_duration'] / 60 + 1e-5)


In [11]:
data['log_subtotal'] = np.log1p(data['subtotal'])
data['log_outstanding_orders'] = np.log1p(data['total_outstanding_orders'].clip(lower=1e-5))

In [12]:
data = data.drop(columns=['created_at', 'actual_delivery_time'])

# Outlier Removal Using IQR Method

In [14]:
def remove_outliers_iqr(df, variables, threshold=1.5):
   
    for variable in variables:
        if variable in df.columns:
            Q1 = df[variable].quantile(0.25)
            Q3 = df[variable].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - (threshold * IQR)
            upper_bound = Q3 + (threshold * IQR)
            df = df[(df[variable] >= lower_bound) & (df[variable] <= upper_bound)]
    return df

# Define numerical columns with potential outliers
outlier_columns = [
    'subtotal', 'delivery_duration_minutes', 'max_item_price', 'price_range',
    'avg_item_price', 'price_volatility', 'delivery_speed'
]

# Remove outliers
data = remove_outliers_iqr(data, outlier_columns)

# Handling Missing Values in the Dataset
### Using KNN Imputer


In [15]:
def handle_missing_values(df, n_neighbors=5):
    
    # Handle numerical columns using KNN Imputer
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    
    # Handle categorical columns using mode imputation
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    
    return df

# Apply missing value handling
data = handle_missing_values(data)
print("Missing values per column after imputation:")
print(data.isnull().sum())


Missing values per column after imputation:
market_id                                       0
store_id                                        0
store_primary_category                          0
order_protocol                                  0
total_items                                     0
subtotal                                        0
num_distinct_items                              0
min_item_price                                  0
max_item_price                                  0
total_onshift_dashers                           0
total_busy_dashers                              0
total_outstanding_orders                        0
estimated_order_place_duration                  0
estimated_store_to_consumer_driving_duration    0
delivery_duration_minutes                       0
hour                                            0
day_of_week_num                                 0
is_weekend                                      0
is_holiday                                      0
dasher

In [17]:
def optimized_label_encoding(df, cat_cols):
    le_dict = {} 
    
    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        le_dict[col] = le 
    
    return df, le_dict
    
categorical_columns = ['store_primary_category']
data, encoders = optimized_label_encoding(data, categorical_columns)

data['store_primary_category'].unique()

array([ 4, 46, 35, 38, 58, 68, 15, 55, 20, 60, 13, 10, 45, 39, 18, 28, 33,
       50, 40, 70,  6,  7, 27, 59, 72, 22, 65, 16, 23, 62, 71, 57, 53, 66,
       42, 34, 11, 49, 52,  2, 24, 61, 54, 69, 44, 25, 47,  0, 12, 31, 29,
       17, 21, 32, 30, 14, 48, 51, 64, 63, 67, 56,  9, 26, 19,  1,  5, 37,
       43, 41,  8, 36,  3])

In [18]:
# Target and feature variables
X = data.drop(columns=['delivery_duration_minutes'])
y = data['delivery_duration_minutes']

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest Model for Predicting Delivery Duration

In [24]:
# Define the Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42
)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using MAE and RMSE
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Mean Absolute Error (MAE): 8.02
Root Mean Squared Error (RMSE): 10.40


In [26]:
import time
from numba import njit, prange

In [29]:
# Sklearn
start = time.time()
mae_sklearn = mean_absolute_error(y_test, y_pred)
rmse_sklearn = np.sqrt(mean_squared_error(y_test, y_pred))
t_sklearn = time.time() - start
print(f"[Sklearn]   MAE: {mae_sklearn:.4f}  RMSE: {rmse_sklearn:.4f}  Time: {t_sklearn:.4f} sec")

# Numba
@njit(parallel=True)
def mae_numba(y_true, y_pred):
    error = 0.0
    for i in prange(len(y_true)):
        error += abs(y_true[i] - y_pred[i])
    return error / len(y_true)

@njit(parallel=True)
def rmse_numba(y_true, y_pred):
    total = 0.0
    for i in prange(len(y_true)):
        diff = y_true[i] - y_pred[i]
        total += diff * diff
    return np.sqrt(total / len(y_true))

start = time.time()
mae_n = mae_numba(y_test.values, y_pred)
rmse_n = rmse_numba(y_test.values, y_pred)
t_numba = time.time() - start
print(f"[Numba]     MAE: {mae_n:.4f}  RMSE: {rmse_n:.4f}  Time: {t_numba:.4f} sec")

print("Results match:", np.allclose(mae_sklearn, mae_n), np.allclose(rmse_sklearn, rmse_n))

[Sklearn]   MAE: 8.0174  RMSE: 10.3989  Time: 0.0031 sec
[Numba]     MAE: 8.0174  RMSE: 10.3989  Time: 0.8569 sec
Results match: True True


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from joblib import Parallel, delayed
import numpy as np
import time

# Assumes the following data already exists:
# X_train, X_test, y_train, y_test

# Function to train and evaluate a single Random Forest model
def train_rf(X_train, y_train, X_test, y_test, seed):
    model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return {"seed": seed, "mae": mae, "rmse": rmse}

seeds = [0, 1, 2]

# ✅ Parallel training using joblib
start_parallel = time.time()
results_parallel = Parallel(n_jobs=3)(
    delayed(train_rf)(X_train, y_train, X_test, y_test, seed) for seed in seeds
)
time_parallel = time.time() - start_parallel

# ✅ Sequential training using a regular for-loop
start_sequential = time.time()
results_sequential = []
for seed in seeds:
    results_sequential.append(train_rf(X_train, y_train, X_test, y_test, seed))
time_sequential = time.time() - start_sequential

# ✅ Output results for comparison
print("==== Parallel (n_jobs=3) ====")
for r in results_parallel:
    print(f"Seed: {r['seed']}  MAE: {r['mae']:.4f}  RMSE: {r['rmse']:.4f}")
print(f"Total time: {time_parallel:.2f} seconds\n")

print("==== Sequential (for-loop) ====")
for r in results_sequential:
    print(f"Seed: {r['seed']}  MAE: {r['mae']:.4f}  RMSE: {r['rmse']:.4f}")
print(f"Total time: {time_sequential:.2f} seconds")


  pid = os.fork()


==== Parallel (n_jobs=3) ====
Seed: 0  MAE: 8.0337  RMSE: 10.4160
Seed: 1  MAE: 8.0336  RMSE: 10.4184
Seed: 2  MAE: 8.0458  RMSE: 10.4334
Total time: 288.54 seconds

==== Sequential (for-loop) ====
Seed: 0  MAE: 8.0337  RMSE: 10.4160
Seed: 1  MAE: 8.0336  RMSE: 10.4184
Seed: 2  MAE: 8.0458  RMSE: 10.4334
Total time: 275.71 seconds


In [31]:
import time
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ------- Version 1: n_jobs=None -------
start_time = time.time()

rf_model_single = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=None  # default: single thread
)

rf_model_single.fit(X_train, y_train)
y_pred_single = rf_model_single.predict(X_test)

time_single = time.time() - start_time
mae_single = mean_absolute_error(y_test, y_pred_single)
rmse_single = np.sqrt(mean_squared_error(y_test, y_pred_single))

print("=== Random Forest (n_jobs=None) ===")
print(f"Training Time: {time_single:.2f} seconds")
print(f"MAE: {mae_single:.2f}")
print(f"RMSE: {rmse_single:.2f}\n")


# ------- Version 2: n_jobs=-1 -------
start_time = time.time()

rf_model_parallel = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1  # all available cores
)

rf_model_parallel.fit(X_train, y_train)
y_pred_parallel = rf_model_parallel.predict(X_test)

time_parallel = time.time() - start_time
mae_parallel = mean_absolute_error(y_test, y_pred_parallel)
rmse_parallel = np.sqrt(mean_squared_error(y_test, y_pred_parallel))

print("=== Random Forest (n_jobs=-1) ===")
print(f"Training Time: {time_parallel:.2f} seconds")
print(f"MAE: {mae_parallel:.2f}")
print(f"RMSE: {rmse_parallel:.2f}")

=== Random Forest (n_jobs=None) ===
Training Time: 250.53 seconds
MAE: 8.03
RMSE: 10.42

=== Random Forest (n_jobs=-1) ===
Training Time: 91.79 seconds
MAE: 8.03
RMSE: 10.42
