# Import Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [2]:
import dagshub
dagshub.init(repo_owner='kbs.kartik', repo_name='delivery-time-prediction', mlflow=True)

In [3]:
import mlflow

In [4]:
# set the tracking server

mlflow.set_tracking_uri("https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/")

In [5]:
# mlflow experiment

mlflow.set_experiment("Exp 2 - Model Selection")

2025/07/05 16:12:54 INFO mlflow.tracking.fluent: Experiment with name 'Exp 2 - Model Selection' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/0b7e63ca46e74df4b09dd1d762779c73', creation_time=1751712178535, experiment_id='2', last_update_time=1751712178535, lifecycle_stage='active', name='Exp 2 - Model Selection', tags={}>

In [6]:
from sklearn import set_config

set_config(transform_output="pandas")

# Load the Data

In [7]:
# load the data

df = pd.read_csv(r'C:\Users\KARTIK\Documents\delivery-time-prediction\data\raw\swiggy.csv')

# Clean Data

In [8]:
import numpy as np
import pandas as pd


columns_to_drop =  ['rider_id',
                    'restaurant_latitude',
                    'restaurant_longitude',
                    'delivery_latitude',
                    'delivery_longitude',
                    'order_date',
                    "order_time_hour",
                    "order_day",
                    "city_name",
                    "order_day_of_week",
                    "order_month"]


def change_column_names(data: pd.DataFrame):
    return (
        data.rename(str.lower,axis=1).rename({
            "delivery_person_id" : "rider_id",
            "delivery_person_age": "age",
            "delivery_person_ratings": "ratings",
            "delivery_location_latitude": "delivery_latitude",
            "delivery_location_longitude": "delivery_longitude",
            "time_orderd": "order_time",
            "time_order_picked": "order_picked_time",
            "weatherconditions": "weather",
            "road_traffic_density": "traffic",
            "city": "city_type",
            "time_taken(min)": "time_taken"},
            axis=1)
    )


def data_cleaning(data: pd.DataFrame):
    minors_data = data.loc[data['age'].astype('float') < 18]
    minor_index = minors_data.index.tolist()
    six_star_data = data.loc[data['ratings'] == "6"]
    six_star_index = six_star_data.index.tolist()

    return (
        data
        .drop(columns="id")
        .drop(index=minor_index)                                                # Minor riders in data dropped
        .drop(index=six_star_index)                                             # six star rated drivers dropped
        .replace("NaN ",np.nan)                                                 # missing values in the data
        .assign(
            # city column out of rider id
            city_name = lambda x: x['rider_id'].str.split("RES").str.get(0),
            # convert age to float
            age = lambda x: x['age'].astype(float),
            # convert ratings to float
            ratings = lambda x: x['ratings'].astype(float),
            # absolute values for location based columns
            restaurant_latitude = lambda x: x['restaurant_latitude'].abs(),
            restaurant_longitude = lambda x: x['restaurant_longitude'].abs(),
            delivery_latitude = lambda x: x['delivery_latitude'].abs(),
            delivery_longitude = lambda x: x['delivery_longitude'].abs(),
            # order date to datetime and feature extraction
            order_date = lambda x: pd.to_datetime(x['order_date'],
                                                  dayfirst=True),
            order_day = lambda x: x['order_date'].dt.day,
            order_month = lambda x: x['order_date'].dt.month,
            order_day_of_week = lambda x: x['order_date'].dt.day_name().str.lower(),
            is_weekend = lambda x: (x['order_date']
                                    .dt.day_name()
                                    .isin(["Saturday","Sunday"])
                                    .astype(int)),
            # time based columns
            order_time = lambda x: pd.to_datetime(x['order_time'],
                                                  format='mixed'),
            order_picked_time = lambda x: pd.to_datetime(x['order_picked_time'],
                                                         format='mixed'),
            # time taken to pick order
            pickup_time_minutes = lambda x: (
                                            (x['order_picked_time'] - x['order_time'])
                                            .dt.seconds / 60
                                            ),
            # hour in which order was placed
            order_time_hour = lambda x: x['order_time'].dt.hour,
            # time of the day when order was placed
            order_time_of_day = lambda x: (
                                x['order_time_hour'].pipe(time_of_day)),
            # categorical columns
            weather = lambda x: (
                                x['weather']
                                .str.replace("conditions ","")
                                .str.lower()
                                .replace("nan",np.nan)),
            traffic = lambda x: x["traffic"].str.rstrip().str.lower(),
            type_of_order = lambda x: x['type_of_order'].str.rstrip().str.lower(),
            type_of_vehicle = lambda x: x['type_of_vehicle'].str.rstrip().str.lower(),
            festival = lambda x: x['festival'].str.rstrip().str.lower(),
            city_type = lambda x: x['city_type'].str.rstrip().str.lower(),
            # multiple deliveries column
            multiple_deliveries = lambda x: x['multiple_deliveries'].astype(float),
            # target column modifications
            time_taken = lambda x: (x['time_taken']
                                     .str.replace("(min) ","")
                                     .astype(int)))
        .drop(columns=["order_time","order_picked_time"])
    )
    
    
    
def clean_lat_long(data: pd.DataFrame, threshold=1):
    location_columns = ['restaurant_latitude',
                        'restaurant_longitude',
                        'delivery_latitude',
                        'delivery_longitude']

    return (
        data
        .assign(**{
            col: (
                np.where(data[col] < threshold, np.nan, data[col].values)
            )
            for col in location_columns
        })
    )
    
    
# extract day, day name, month and year
def extract_datetime_features(ser):
    date_col = pd.to_datetime(ser,dayfirst=True)

    return (
        pd.DataFrame(
            {
                "day": date_col.dt.day,
                "month": date_col.dt.month,
                "year": date_col.dt.year,
                "day_of_week": date_col.dt.day_name(),
                "is_weekend": date_col.dt.day_name().isin(["Saturday","Sunday"]).astype(int)
            }
        ))
    
    
def time_of_day(ser):

    return(
        pd.cut(ser,bins=[0,6,12,17,20,24],right=True,
               labels=["after_midnight","morning","afternoon","evening","night"])
    )


def drop_columns(data: pd.DataFrame, columns: list) -> pd.DataFrame:
    df = data.drop(columns=columns)
    return df


def calculate_haversine_distance(df):
    location_columns = ['restaurant_latitude',
                        'restaurant_longitude',
                        'delivery_latitude',
                        'delivery_longitude']
    
    lat1 = df[location_columns[0]]
    lon1 = df[location_columns[1]]
    lat2 = df[location_columns[2]]
    lon2 = df[location_columns[3]]

    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(
        dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    distance = 6371 * c

    return (
        df.assign(
            distance = distance)
    )

def create_distance_type(data: pd.DataFrame):
    return(
        data
        .assign(
                distance_type = pd.cut(data["distance"],bins=[0,5,10,15,25],
                                        right=False,labels=["short","medium","long","very_long"])
    ))


def perform_data_cleaning(data: pd.DataFrame):
    
    cleaned_data = (
        data
        .pipe(change_column_names)
        .pipe(data_cleaning)
        .pipe(clean_lat_long)
        .pipe(calculate_haversine_distance)
        .pipe(create_distance_type)
        .pipe(drop_columns,columns=columns_to_drop)
    )
    
    return cleaned_data,cleaned_data.dropna()
    
    

if __name__ == "__main__":
    # data path for data
    #DATA_PATH = "swiggy.csv"
    
    # read the data from path
    #df = pd.read_csv(DATA_PATH)
    #print('swiggy data loaded successfuly')
    
    df_with_nans, df_without_nans = perform_data_cleaning(df)

# Drop Missing values

In [9]:
temp_df = df_without_nans.copy().dropna()

In [10]:
# split into X and y

X = temp_df.drop(columns='time_taken')
y = temp_df['time_taken']

X

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,is_weekend,pickup_time_minutes,order_time_of_day,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,1,15.0,morning,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,0,5.0,evening,20.183530,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,1,15.0,morning,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,0,10.0,evening,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,1,15.0,afternoon,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45587,35.0,4.2,windy,jam,2,drinks,motorcycle,1.0,no,metropolitian,0,10.0,night,16.600272,very_long
45588,30.0,4.8,windy,high,1,meal,motorcycle,0.0,no,metropolitian,0,10.0,morning,1.489846,short
45590,30.0,4.9,cloudy,low,1,drinks,scooter,0.0,no,metropolitian,0,15.0,night,4.657195,short
45591,20.0,4.7,cloudy,high,0,snack,motorcycle,1.0,no,metropolitian,0,5.0,afternoon,6.232393,medium


In [11]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
print("The size of train data is",X_train.shape)
print("The shape of test data is",X_test.shape)

The size of train data is (30156, 15)
The shape of test data is (7539, 15)


In [13]:
# missing values in train data

X_train.isna().sum()

age                    0
ratings                0
weather                0
traffic                0
vehicle_condition      0
type_of_order          0
type_of_vehicle        0
multiple_deliveries    0
festival               0
city_type              0
is_weekend             0
pickup_time_minutes    0
order_time_of_day      0
distance               0
distance_type          0
dtype: int64

In [14]:
# transform target column

pt = PowerTransformer()

y_train_pt = pt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = pt.transform(y_test.values.reshape(-1,1))

In [16]:
# percentage of rows in data having missing values

(
    X_train
    .isna()
    .any(axis=1)
    .mean()
    .round(2) * 100
)


np.float64(0.0)

# Pre Processing Pipeline

In [17]:
num_cols = ["age","ratings","pickup_time_minutes","distance"]

nominal_cat_cols = ['weather',
                    'type_of_order',
                    'type_of_vehicle',
                    "festival",
                    "city_type",
                    "is_weekend",
                    "order_time_of_day"]

ordinal_cat_cols = ["traffic","distance_type"]

In [18]:
nominal_cat_cols

['weather',
 'type_of_order',
 'type_of_vehicle',
 'festival',
 'city_type',
 'is_weekend',
 'order_time_of_day']

In [19]:
X_train.isna().sum()

age                    0
ratings                0
weather                0
traffic                0
vehicle_condition      0
type_of_order          0
type_of_vehicle        0
multiple_deliveries    0
festival               0
city_type              0
is_weekend             0
pickup_time_minutes    0
order_time_of_day      0
distance               0
distance_type          0
dtype: int64

In [20]:
# # features to fill values with mode

# features_to_fill_mode = ['multiple_deliveries','festival','city_type']
# features_to_fill_missing = [col for col in nominal_cat_cols if col not in features_to_fill_mode]

# features_to_fill_missing

In [21]:
# # simple imputer to fill categorical vars with mode

# simple_imputer = ColumnTransformer(transformers=[
#     ("mode_imputer",SimpleImputer(strategy="most_frequent",add_indicator=True),features_to_fill_mode),
#     ("missing_imputer",SimpleImputer(strategy="constant",fill_value="missing",add_indicator=True),features_to_fill_missing)
# ],remainder="passthrough",n_jobs=-1,force_int_remainder_cols=False,verbose_feature_names_out=False)

# simple_imputer

In [22]:
# simple_imputer.fit_transform(X_train)

In [23]:
# simple_imputer.fit_transform(X_train).isna().sum()

In [24]:
# knn imputer

# knn_imputer = KNNImputer(n_neighbors=5)

In [25]:
# do basic preprocessing

num_cols = ["age","ratings","pickup_time_minutes","distance"]

nominal_cat_cols = ['weather','type_of_order',
                    'type_of_vehicle',"festival",
                    "city_type",
                    "is_weekend",
                    "order_time_of_day"]

ordinal_cat_cols = ["traffic","distance_type"]

In [26]:
# generate order for ordinal encoding

traffic_order = ["low","medium","high","jam"]

distance_type_order = ["short","medium","long","very_long"]

In [27]:
# unique categories the ordinal columns

for col in ordinal_cat_cols:
    print(col,X_train[col].unique())

traffic ['jam' 'medium' 'high' 'low']
distance_type ['medium', 'short', 'long', 'very_long']
Categories (4, object): ['short' < 'medium' < 'long' < 'very_long']


In [28]:
# build a preprocessor

preprocessor = ColumnTransformer(transformers=[
    ("scale", MinMaxScaler(), num_cols),
    ("nominal_encode", OneHotEncoder(drop="first",handle_unknown="ignore",
                                     sparse_output=False), nominal_cat_cols),
    ("ordinal_encode", OrdinalEncoder(categories=[traffic_order,distance_type_order],
                                      encoded_missing_value=-999,
                                      handle_unknown="use_encoded_value",
                                      unknown_value=-1), ordinal_cat_cols)
],remainder="passthrough",n_jobs=-1,force_int_remainder_cols=False,verbose_feature_names_out=False)


preprocessor

0,1,2
,transformers,"[('scale', ...), ('nominal_encode', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['low', 'medium', ...], ['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-999
,min_frequency,
,max_categories,


In [29]:
# build the pipeline

processing_pipeline = Pipeline(steps=[
                                # ("simple_imputer",simple_imputer),
                                ("preprocess",preprocessor)
                                # ("knn_imputer",knn_imputer)
                            ])

processing_pipeline

0,1,2
,steps,"[('preprocess', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scale', ...), ('nominal_encode', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['low', 'medium', ...], ['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-999
,min_frequency,
,max_categories,


In [30]:
# do data preprocessing

X_train_trans = processing_pipeline.fit_transform(X_train)

X_test_trans = processing_pipeline.transform(X_test)



In [31]:
X_train_trans

Unnamed: 0,age,ratings,pickup_time_minutes,distance,weather_fog,weather_sandstorms,weather_stormy,weather_sunny,weather_windy,type_of_order_drinks,...,city_type_semi-urban,city_type_urban,is_weekend_1,order_time_of_day_evening,order_time_of_day_morning,order_time_of_day_night,traffic,distance_type,vehicle_condition,multiple_deliveries
8720,0.473684,0.56,1.0,0.404165,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0,2.0
25245,1.000000,0.76,0.0,0.154044,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,1.0
34118,0.473684,0.80,0.5,0.002461,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1,0.0
26036,1.000000,0.92,1.0,0.460411,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0,1.0
37194,0.526316,0.76,0.5,0.243676,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20275,0.578947,0.92,0.5,0.451895,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,3.0,2.0,0,0.0
7601,0.052632,1.00,1.0,0.612270,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,1,1.0
13632,0.526316,0.92,0.0,0.322877,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1,0.0
1045,0.947368,0.96,0.5,0.004486,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,1.0


In [32]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
from sklearn.metrics import r2_score, mean_absolute_error

In [34]:
def objective(trial):
    with mlflow.start_run(nested=True):
        model_name = trial.suggest_categorical("model",["SVM","RF","KNN","GB","XGB","LGBM"])

        if model_name == "SVM":
            kernel_svm = trial.suggest_categorical("kernel_svm",["linear","poly","rbf"])
            if kernel_svm == "linear":
                c_linear = trial.suggest_float("c_linear",0,10)
                model = SVR(C=c_linear,kernel="linear")

            elif kernel_svm == "poly":
                c_poly = trial.suggest_float("c_poly",0,10)
                degree_poly = trial.suggest_int("degree_poly",1,5)
                model = SVR(C=c_poly,degree=degree_poly,
                            kernel="poly")

            else:
                c_rbf = trial.suggest_float("c_rbf",0,100)
                gamma_rbf = trial.suggest_float("gamma_rbf",0,10)
                model = SVR(C=c_rbf,gamma=gamma_rbf,
                            kernel="rbf")

        elif model_name == "RF":
            n_estimators_rf = trial.suggest_int("n_estimators_rf",10,200)
            max_depth_rf = trial.suggest_int("max_depth_rf",2,20)
            model = RandomForestRegressor(n_estimators=n_estimators_rf,
                                        max_depth=max_depth_rf,
                                        random_state=42,
                                        n_jobs=-1)

        elif model_name == "GB":
            n_estimators_gb = trial.suggest_int("n_estimators_gb",10,200)
            learning_rate_gb = trial.suggest_float("learning_rate_gb",0,1)
            max_depth_gb = trial.suggest_int("max_depth_gb",2,20)
            model = GradientBoostingRegressor(n_estimators=n_estimators_gb,
                                                learning_rate=learning_rate_gb,
                                                max_depth=max_depth_gb,
                                                random_state=42)

        elif model_name == "KNN":
            n_neighbors_knn = trial.suggest_int("n_neighbors_knn",1,25)
            weights_knn = trial.suggest_categorical("weights_knn",["uniform","distance"])
            model = KNeighborsRegressor(n_neighbors=n_neighbors_knn,
                                        weights=weights_knn,n_jobs=-1)

        elif model_name == "XGB":
            n_estimators_xgb = trial.suggest_int("n_estimators_xgb",10,200)
            learning_rate_xgb = trial.suggest_float("learning_rate_xgb",0.1,0.5)
            max_depth_xgb = trial.suggest_int("max_depth_xgb",2,20)
            model = XGBRegressor(n_estimators=n_estimators_xgb,
                                    learning_rate=learning_rate_xgb,
                                    max_depth=max_depth_xgb,
                                    random_state=42,
                                    n_jobs=-1)

        elif model_name == "LGBM":
            n_estimators_lgbm = trial.suggest_int("n_estimators_lgbm",10,200)
            learning_rate_lgbm = trial.suggest_float("learning_rate_lgbm",0.1,0.5)
            max_depth_lgbm = trial.suggest_int("max_depth_lgbm",2,20)
            model = LGBMRegressor(n_estimators=n_estimators_lgbm,
                                    learning_rate=learning_rate_lgbm,
                                    max_depth=max_depth_lgbm,
                                    random_state=42)


        # train the model
        model.fit(X_train_trans,y_train_pt.values.ravel())

        # log model params
        mlflow.log_params(model.get_params())

        # get the predictions
        y_pred_train = model.predict(X_train_trans)
        y_pred_test = model.predict(X_test_trans)

        # get the actual predictions values
        y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1,1))
        y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1,1))

        # calculate the error
        error = mean_absolute_error(y_test,y_pred_test_org)

        # log model_name
        mlflow.log_param("model",model_name)

        # log error
        mlflow.log_metric("MAE",error)

        return error

In [35]:
# create optuna study
study = optuna.create_study(direction="minimize",study_name="model_selection")

with mlflow.start_run(run_name="Best Model") as parent:
    # optimize the objective function
    study.optimize(objective,n_trials=30,n_jobs=-1)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score",study.best_value)

[I 2025-07-05 16:19:48,997] A new study created in memory with name: model_selection


üèÉ View run selective-hound-953 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/cdafeb3d0d544ce49f3c14cd47cbf0f7
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:20:04,839] Trial 11 finished with value: 4.256543359524491 and parameters: {'model': 'KNN', 'n_neighbors_knn': 12, 'weights_knn': 'uniform'}. Best is trial 11 with value: 4.256543359524491.


üèÉ View run enchanting-fawn-423 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/894761447ac94f489e91a62fd24ef9af
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:20:10,734] Trial 12 finished with value: 5.865281828967605 and parameters: {'model': 'RF', 'n_estimators_rf': 32, 'max_depth_rf': 2}. Best is trial 11 with value: 4.256543359524491.


üèÉ View run carefree-shrike-595 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/3e67e844d99e4b388e8048225987372a
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:20:17,029] Trial 7 finished with value: 3.280879167791238 and parameters: {'model': 'LGBM', 'n_estimators_lgbm': 32, 'learning_rate_lgbm': 0.37164934065756317, 'max_depth_lgbm': 4}. Best is trial 7 with value: 3.280879167791238.


üèÉ View run gentle-lark-510 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/4ed219ad3fe24da6b1facdd78a0401ba
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:20:23,021] Trial 9 finished with value: 3.1757572337423166 and parameters: {'model': 'GB', 'n_estimators_gb': 142, 'learning_rate_gb': 0.3459560679425243, 'max_depth_gb': 4}. Best is trial 9 with value: 3.1757572337423166.


üèÉ View run orderly-ram-170 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/e4931b04ce614506bd6bd7c595b92bf8
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2
üèÉ View run serious-wolf-738 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/922e4f0af1a04e059c9fc49319d28f0c
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2
üèÉ View run brawny-ant-359 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/0f4255cc43f14ec1ad9e6e00db9cc238
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:20:49,946] Trial 6 finished with value: 3.0746649099202585 and parameters: {'model': 'LGBM', 'n_estimators_lgbm': 134, 'learning_rate_lgbm': 0.2583305451075149, 'max_depth_lgbm': 9}. Best is trial 6 with value: 3.0746649099202585.
[I 2025-07-05 16:20:50,967] Trial 4 finished with value: 3.212613372162881 and parameters: {'model': 'GB', 'n_estimators_gb': 157, 'learning_rate_gb': 0.5022887749604897, 'max_depth_gb': 5}. Best is trial 6 with value: 3.0746649099202585.
[I 2025-07-05 16:20:51,911] Trial 10 finished with value: 3.2012410595279404 and parameters: {'model': 'GB', 'n_estimators_gb': 157, 'learning_rate_gb': 0.4944858511151814, 'max_depth_gb': 5}. Best is trial 6 with value: 3.0746649099202585.


üèÉ View run powerful-auk-330 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/398e876a26284998978313a70aa5d49f
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:21:06,774] Trial 15 finished with value: 3.0650902462183103 and parameters: {'model': 'RF', 'n_estimators_rf': 167, 'max_depth_rf': 16}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run mercurial-sloth-498 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/754931878b084b0ba9823cafb529a5e3
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2
üèÉ View run angry-duck-463 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/dfd3f915e4504a8e8a93087c13f745a3
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:21:20,953] Trial 14 finished with value: 3.3160626888275146 and parameters: {'model': 'XGB', 'n_estimators_xgb': 83, 'learning_rate_xgb': 0.38502676411584413, 'max_depth_xgb': 13}. Best is trial 15 with value: 3.0650902462183103.
[I 2025-07-05 16:21:23,964] Trial 1 finished with value: 3.2115797996520996 and parameters: {'model': 'XGB', 'n_estimators_xgb': 167, 'learning_rate_xgb': 0.17680512754436895, 'max_depth_xgb': 13}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run flawless-wolf-81 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/2288e35128824b4ca45821306f6f9631
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:21:34,008] Trial 0 finished with value: 3.3255529403686523 and parameters: {'model': 'XGB', 'n_estimators_xgb': 80, 'learning_rate_xgb': 0.18043363815911906, 'max_depth_xgb': 20}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run serious-bird-247 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/34d68c47d5d4423893a7ac056bc88caf
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2
üèÉ View run nebulous-squid-891 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/f779dc2ddce545c5853bddf7d9f1385b
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2
üèÉ View run serious-snail-541 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/cb3eff7535d44674a65bef608c1be70b
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:21:38,997] Trial 16 finished with value: 4.718999958841066 and parameters: {'model': 'KNN', 'n_neighbors_knn': 2, 'weights_knn': 'distance'}. Best is trial 15 with value: 3.0650902462183103.
[I 2025-07-05 16:21:40,033] Trial 13 finished with value: 3.652743504826753 and parameters: {'model': 'GB', 'n_estimators_gb': 93, 'learning_rate_gb': 0.6673055106352541, 'max_depth_gb': 16}. Best is trial 15 with value: 3.0650902462183103.
[I 2025-07-05 16:21:40,960] Trial 17 finished with value: 3.650558483447045 and parameters: {'model': 'GB', 'n_estimators_gb': 26, 'learning_rate_gb': 0.04845878997571451, 'max_depth_gb': 10}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run treasured-mare-482 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/24f7f055853e492ab6f8ed1c15f29d71
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:21:54,176] Trial 20 finished with value: 3.079497462767803 and parameters: {'model': 'RF', 'n_estimators_rf': 196, 'max_depth_rf': 20}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run popular-foal-830 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/ffa16470faea4ff2ac102292592d55d3
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:22:03,064] Trial 21 finished with value: 3.078948833212855 and parameters: {'model': 'RF', 'n_estimators_rf': 187, 'max_depth_rf': 20}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run gregarious-zebra-348 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/8eb46dc22dd84735be54a10f01fdf7ae
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:22:08,108] Trial 22 finished with value: 3.079151970723889 and parameters: {'model': 'RF', 'n_estimators_rf': 195, 'max_depth_rf': 20}. Best is trial 15 with value: 3.0650902462183103.


üèÉ View run resilient-conch-702 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/36186a0f8809453f98611ae83613f6d1
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2
üèÉ View run stylish-bug-121 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/1fc20b0f119e479ab43a67d350972dd5
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:22:26,957] Trial 19 finished with value: 3.6305185443908425 and parameters: {'model': 'GB', 'n_estimators_gb': 199, 'learning_rate_gb': 0.7520306912628969, 'max_depth_gb': 6}. Best is trial 15 with value: 3.0650902462183103.
[I 2025-07-05 16:22:29,280] Trial 24 finished with value: 3.026907253255544 and parameters: {'model': 'LGBM', 'n_estimators_lgbm': 178, 'learning_rate_lgbm': 0.11688924080837143, 'max_depth_lgbm': 16}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run bright-fly-228 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/2cad657ccb664714923f67e1c74bf026
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:22:32,980] Trial 26 finished with value: 3.0299581003818417 and parameters: {'model': 'LGBM', 'n_estimators_lgbm': 179, 'learning_rate_lgbm': 0.13330903069845249, 'max_depth_lgbm': 16}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run classy-smelt-527 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/955979bb028a4a13a0e6315f72d7fa38
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:22:36,990] Trial 18 finished with value: 3.4573138113644926 and parameters: {'model': 'GB', 'n_estimators_gb': 134, 'learning_rate_gb': 0.46199018300109085, 'max_depth_gb': 15}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run bold-hen-436 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/dda2342e185748768cb4f9addc50bc69
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:26:03,152] Trial 3 finished with value: 4.675408769001732 and parameters: {'model': 'SVM', 'kernel_svm': 'linear', 'c_linear': 1.6516146937824283}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run stately-sow-222 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/7696b75a377a42f3ad8cc76561c5d024
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:26:14,870] Trial 27 finished with value: 4.675378892825293 and parameters: {'model': 'SVM', 'kernel_svm': 'poly', 'c_poly': 5.596373535823074, 'degree_poly': 1}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run nosy-hog-281 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/d25d73930e13475a87d2649978fa4de2
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:28:28,384] Trial 2 finished with value: 4.675498605114556 and parameters: {'model': 'SVM', 'kernel_svm': 'linear', 'c_linear': 3.7877138036139044}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run mercurial-cub-685 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/2533e5e1ab794d5cb04e5254d356a648
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:31:04,167] Trial 5 finished with value: 3.73584416028952 and parameters: {'model': 'SVM', 'kernel_svm': 'poly', 'c_poly': 5.490436430728728, 'degree_poly': 3}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run industrious-ray-205 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/1b0cc74679694b7f963b9e50e2ef7d17
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:31:55,680] Trial 8 finished with value: 4.675444373555786 and parameters: {'model': 'SVM', 'kernel_svm': 'linear', 'c_linear': 7.108033721060027}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run capable-slug-402 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/769e38eaae7e49e5aa3887e8203809ba
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:35:14,953] Trial 25 finished with value: 4.675456807102963 and parameters: {'model': 'SVM', 'kernel_svm': 'linear', 'c_linear': 9.142753907702918}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run smiling-tern-219 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/68033646a782447aaa46a9d7f267f938
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:36:26,103] Trial 29 finished with value: 6.3933756472981065 and parameters: {'model': 'SVM', 'kernel_svm': 'rbf', 'c_rbf': 74.73501831233327, 'gamma_rbf': 5.638517959674813}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run carefree-moth-384 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/2e9fc82a6d854a1bb02819383c291b10
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:36:54,049] Trial 28 finished with value: 5.827440786072113 and parameters: {'model': 'SVM', 'kernel_svm': 'rbf', 'c_rbf': 57.64664934925733, 'gamma_rbf': 2.8840985200599496}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run judicious-perch-7 at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/7ef000ddf76d4b2ea7abf6703d9b1c29
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


[I 2025-07-05 16:38:57,176] Trial 23 finished with value: 4.964488071747157 and parameters: {'model': 'SVM', 'kernel_svm': 'rbf', 'c_rbf': 89.63939458442059, 'gamma_rbf': 1.6290381841955357}. Best is trial 24 with value: 3.026907253255544.


üèÉ View run Best Model at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2/runs/151ba0311c3244a4be837198221f56d0
üß™ View experiment at: https://dagshub.com/kbs.kartik/delivery-time-prediction.mlflow/#/experiments/2


In [36]:
# best score

study.best_value

3.026907253255544

In [37]:
lgbm_params = {
    "n_estimators": 145,
    "learning_rate": 0.16632111599858262,
    "max_depth": 17
}

In [38]:
# train the model on best parameters

lgbm = LGBMRegressor(**lgbm_params)

lgbm.fit(X_train_trans,y_train_pt.values.ravel())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 30156, number of used features: 25
[LightGBM] [Info] Start training from score -0.000000


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,17
,learning_rate,0.16632111599858262
,n_estimators,145
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [39]:
# get the predictions
y_pred_train = lgbm.predict(X_train_trans)
y_pred_test = lgbm.predict(X_test_trans)

In [40]:
# get the actual predictions values

y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1,1))
y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1,1))

In [41]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"The train error is {mean_absolute_error(y_train,y_pred_train_org):.2f} minutes")
print(f"The test error is {mean_absolute_error(y_test,y_pred_test_org):.2f} minutes")

The train error is 2.78 minutes
The test error is 3.02 minutes


In [42]:
print(f"The train r2 score is {r2_score(y_train,y_pred_train_org):.2f}")
print(f"The test r2 score is {r2_score(y_test,y_pred_test_org):.2f}")

The train r2 score is 0.86
The test r2 score is 0.84


In [43]:
# dataframe of results

study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_c_linear,params_c_poly,params_c_rbf,params_degree_poly,params_gamma_rbf,...,params_max_depth_rf,params_max_depth_xgb,params_model,params_n_estimators_gb,params_n_estimators_lgbm,params_n_estimators_rf,params_n_estimators_xgb,params_n_neighbors_knn,params_weights_knn,state
0,0,3.325553,2025-07-05 16:19:49.924442,2025-07-05 16:21:34.008356,0 days 00:01:44.083914,,,,,,...,,20.0,XGB,,,,80.0,,,COMPLETE
1,1,3.21158,2025-07-05 16:19:49.928435,2025-07-05 16:21:23.962359,0 days 00:01:34.033924,,,,,,...,,13.0,XGB,,,,167.0,,,COMPLETE
2,2,4.675499,2025-07-05 16:19:49.929234,2025-07-05 16:28:28.384511,0 days 00:08:38.455277,3.787714,,,,,...,,,SVM,,,,,,,COMPLETE
3,3,4.675409,2025-07-05 16:19:49.929234,2025-07-05 16:26:03.150745,0 days 00:06:13.221511,1.651615,,,,,...,,,SVM,,,,,,,COMPLETE
4,4,3.212613,2025-07-05 16:19:49.933758,2025-07-05 16:20:50.967654,0 days 00:01:01.033896,,,,,,...,,,GB,157.0,,,,,,COMPLETE
5,5,3.735844,2025-07-05 16:19:49.937316,2025-07-05 16:31:04.167374,0 days 00:11:14.230058,,5.490436,,3.0,,...,,,SVM,,,,,,,COMPLETE
6,6,3.074665,2025-07-05 16:19:49.938458,2025-07-05 16:20:49.946526,0 days 00:01:00.008068,,,,,,...,,,LGBM,,134.0,,,,,COMPLETE
7,7,3.280879,2025-07-05 16:19:49.940938,2025-07-05 16:20:17.029796,0 days 00:00:27.088858,,,,,,...,,,LGBM,,32.0,,,,,COMPLETE
8,8,4.675444,2025-07-05 16:19:49.940938,2025-07-05 16:31:55.680032,0 days 00:12:05.739094,7.108034,,,,,...,,,SVM,,,,,,,COMPLETE
9,9,3.175757,2025-07-05 16:19:49.945434,2025-07-05 16:20:23.021177,0 days 00:00:33.075743,,,,,,...,,,GB,142.0,,,,,,COMPLETE


In [44]:
# model frequency

study.trials_dataframe()['params_model'].value_counts()

params_model
SVM     9
GB      7
RF      5
LGBM    4
XGB     3
KNN     2
Name: count, dtype: int64

In [45]:
# avg scores for all tested models

study.trials_dataframe().groupby("params_model")['value'].mean().sort_values()

params_model
LGBM    3.103102
XGB     3.284398
GB      3.425821
RF      3.633594
KNN     4.487772
SVM     4.922037
Name: value, dtype: float64

In [46]:
from sklearn.compose import TransformedTargetRegressor

model = TransformedTargetRegressor(regressor=lgbm,
                                    transformer=pt)

In [47]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model,
                         X_train_trans,
                         y_train,
                         scoring="neg_mean_absolute_error",
                         cv=5,n_jobs=-1)

scores

array([-3.06435673, -3.04327813, -3.0700516 , -3.06852842, -3.05699881])

In [48]:
# mean score

- scores.mean()

np.float64(3.060642738025659)