In [85]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
import xgboost as xgb

delivery_data = pl.read_csv('/Users/ryanquach/Downloads/deliverytime.csv')

In [86]:
delivery_data.head()

ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Type_of_order,Type_of_vehicle,Time_taken(min)
str,str,i64,f64,f64,f64,f64,f64,str,str,i64
"""4607""","""INDORES13DEL02…",37,4.9,22.745049,75.892471,22.765049,75.912471,"""Snack ""","""motorcycle """,24
"""B379""","""BANGRES18DEL02…",34,4.5,12.913041,77.683237,13.043041,77.813237,"""Snack ""","""scooter """,33
"""5D6D""","""BANGRES19DEL01…",23,4.4,12.914264,77.6784,12.924264,77.6884,"""Drinks ""","""motorcycle """,26
"""7A6A""","""COIMBRES13DEL0…",38,4.7,11.003669,76.976494,11.053669,77.026494,"""Buffet ""","""motorcycle """,21
"""70A2""","""CHENRES12DEL01…",32,4.6,12.972793,80.249982,13.012793,80.289982,"""Snack ""","""scooter """,30


In [87]:
def haversine(long_start, lat_start, long_end, lat_end):
    long_start, lat_start, long_end, lat_end = map(np.radians, [long_start, lat_start, long_end, lat_end])
    
    long_dist = long_end - long_start 
    lat_dist = lat_end - lat_start 
    a = np.sin(lat_dist/2) ** 2 + np.cos(lat_start) * np.cos(lat_end) * np.sin(long_dist/2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    
    km = 6371 * c
    return km

delivery_data = (delivery_data
                .select(pl.exclude(['ID', 'Delivery_person_ID']))
                .with_columns(pl.col('Type_of_order').str.to_lowercase())
                .select(pl.all().name.map(lambda col_name: col_name.lower()))
                .rename({'restaurant_latitude': 'orig_lat',
                         'restaurant_longitude': 'orig_long',
                         'delivery_location_latitude': 'dest_lat',
                         'delivery_location_longitude': 'dest_long',
                         'time_taken(min)': 'time_taken'})
                .with_columns(haversine(pl.col('orig_lat'), pl.col('orig_long'), 
                                        pl.col('dest_lat'), pl.col('dest_long')).alias('distance').round(2))
                .with_columns([pl.col('type_of_order').str.strip_chars(),
                               pl.col('type_of_vehicle').str.strip_chars()])
                .select(pl.exclude(['orig_lat', 'orig_long', 'dest_lat', 'dest_long']))
                .select(pl.exclude('time_taken'), pl.col('time_taken'))
                )

In [88]:
delivery_data.head()

delivery_person_age,delivery_person_ratings,type_of_order,type_of_vehicle,distance,time_taken
i64,f64,str,str,f64,i64
37,4.9,"""snack""","""motorcycle""",2.29,24
34,4.5,"""snack""","""scooter""",14.78,33
23,4.4,"""drinks""","""motorcycle""",1.14,26
38,4.7,"""buffet""","""motorcycle""",5.7,21
32,4.6,"""snack""","""scooter""",4.51,30


In [89]:
data_matrix = delivery_data.to_dummies(pl.selectors.string(), drop_first = True)

In [90]:
data_matrix.head(n = 10)

delivery_person_age,delivery_person_ratings,type_of_order_buffet,type_of_order_drinks,type_of_order_meal,type_of_vehicle_bicycle,type_of_vehicle_electric_scooter,type_of_vehicle_scooter,distance,time_taken
i64,f64,u8,u8,u8,u8,u8,u8,f64,i64
37,4.9,0,0,0,0,0,0,2.29,24
34,4.5,0,0,0,0,0,1,14.78,33
23,4.4,0,1,0,0,0,0,1.14,26
38,4.7,1,0,0,0,0,0,5.7,21
32,4.6,0,0,0,0,0,1,4.51,30
22,4.8,1,0,0,0,0,0,3.4,26
33,4.7,0,0,1,0,0,1,12.27,40
35,4.6,0,0,1,0,0,0,14.83,32
22,4.8,1,0,0,0,0,0,14.74,34
36,4.2,0,0,0,0,0,0,7.95,46


In [91]:
X = data_matrix.select(pl.exclude('time_taken')).to_numpy()
Y = data_matrix.select(pl.col('time_taken')).to_numpy()

In [92]:
mod_lm = LinearRegression()
mod_pred = cross_val_score(mod_lm, X, Y, scoring = 'neg_root_mean_squared_error', cv = 10)
mod_pred

array([-8.32524964, -8.34356557, -8.37533418, -8.18651471, -8.34011921,
       -8.47839206, -8.27939427, -8.2728591 , -8.46300842, -8.47027317])

In [93]:
mod_lasso = Lasso()
mod_pred = cross_val_score(mod_lasso, X, Y, scoring = 'neg_root_mean_squared_error', cv = 10)
mod_pred

array([-8.95637655, -9.05208362, -9.04958513, -8.88853084, -8.96229077,
       -9.01875726, -8.82599209, -8.90767089, -9.07696874, -9.00822028])

In [95]:
mod_ridge = Ridge()
mod_pred = cross_val_score(mod_ridge, X, Y, scoring = 'neg_root_mean_squared_error', cv = 10)
mod_pred

array([-8.3252488 , -8.34362846, -8.37535888, -8.18662878, -8.3400367 ,
       -8.47842379, -8.27926522, -8.27289537, -8.46298984, -8.47014022])