In [22]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import hopsworks

In [23]:

# Log in to the Hopsworks project
project = hopsworks.login()
# Get the feature store associated with the project
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903317
Connected. Call `.close()` to terminate connection gracefully.


In [24]:
# Retrieve final
final_data = fs.get_feature_group('final_data', version=1)

In [25]:
# Select the query
query = final_data.select_all()

In [26]:
# Read all data
final_merge = query.read(read_options={"use_hive": True})


Finished: Reading data from Hopsworks, using Hive (15.12s) 


In [27]:
# First five rows
final_merge.head()

Unnamed: 0,unique_id,truck_id,route_id,departure_date,estimated_arrival,delay,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,...,driver_id,name,gender,age,experience,driving_style,ratings,vehicle_no,average_speed_mph,is_midnight
0,3725,10497416,R-2c5432ed,2019-02-10 07:00:00,2019-02-12 16:54:00,1,28.818182,6.727273,0.0,51.272727,...,3fff1b6e-e,Brandon Cruz,male,53,23,proactive,6,10497416,59.77,1
1,4299,28978466,R-f8640cff,2019-01-19 07:00:00,2019-01-19 12:44:24,0,46.0,9.0,0.0,75.0,...,c3362ffc-f,Brian Garcia,male,49,5,conservative,8,28978466,45.73,0
2,4448,12911518,R-18468971,2019-01-25 07:00:00,2019-01-25 23:40:12,1,54.0,9.75,0.0,52.25,...,0ce6d439-4,Jeremy Hurst PhD,male,57,19,proactive,7,12911518,60.84,0
3,9783,61984883,R-d87e53cd,2019-02-09 07:00:00,2019-02-09 20:38:24,0,80.5,10.0,0.0,63.5,...,42aa7479-5,Jerry Powers,male,41,7,conservative,2,61984883,56.94,0
4,12209,30312694,R-483bf9db,2019-01-16 07:00:00,2019-01-16 09:03:36,0,67.5,7.5,0.0,88.5,...,94ed3e6d-f,Manuel Wise,male,48,9,proactive,7,30312694,57.36,0


### Data Processing

In [28]:
# Basic Information on the dataframe
final_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12308 entries, 0 to 12307
Data columns (total 49 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   unique_id                       12308 non-null  int64         
 1   truck_id                        12308 non-null  int64         
 2   route_id                        12308 non-null  object        
 3   departure_date                  12308 non-null  datetime64[ns]
 4   estimated_arrival               12308 non-null  datetime64[ns]
 5   delay                           12308 non-null  int64         
 6   route_avg_temp                  12308 non-null  float64       
 7   route_avg_wind_speed            12308 non-null  float64       
 8   route_avg_precip                12308 non-null  float64       
 9   route_avg_humidity              12308 non-null  float64       
 10  route_avg_visibility            12308 non-null  float64       
 11  ro

In [29]:
# Number of null values
final_merge.isna().sum()

unique_id                           0
truck_id                            0
route_id                            0
departure_date                      0
estimated_arrival                   0
delay                               0
route_avg_temp                      0
route_avg_wind_speed                0
route_avg_precip                    0
route_avg_humidity                  0
route_avg_visibility                0
route_avg_pressure                  0
route_description                   0
estimated_arrival_nearest_hour      0
departure_date_nearest_hour         0
origin_id                           0
destination_id                      0
distance                            0
average_hours                       0
origin_temp                         4
origin_wind_speed                   4
origin_description                  0
origin_precip                       4
origin_humidity                     4
origin_visibility                   4
origin_pressure                     4
destination_

In [30]:
# Let's check the rows where origin temp is null
final_merge[final_merge['origin_temp'].isnull()]

Unnamed: 0,unique_id,truck_id,route_id,departure_date,estimated_arrival,delay,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,...,driver_id,name,gender,age,experience,driving_style,ratings,vehicle_no,average_speed_mph,is_midnight
566,7661,18091756,R-112b790b,2019-01-25 07:00:00,2019-01-27 02:40:48,1,66.555556,6.888889,0.0,90.888889,...,e975a383-c,Neil Herring,male,45,7,proactive,3,18091756,58.02,1
3766,8163,24746768,R-b5f9418a,2019-01-25 07:00:00,2019-01-27 14:35:24,0,47.454545,9.090909,0.0,70.636364,...,3d91387f-2,William Anderson III,male,50,0,conservative,4,24746768,40.69,1
9720,11359,22916520,R-78ee1f97,2019-01-25 07:00:00,2019-01-28 10:08:24,0,57.5,10.142857,0.0,78.214286,...,ffedbf74-a,Thomas Ochoa,male,57,19,proactive,6,22916520,63.64,1
12077,7721,24654257,R-21472caf,2019-01-25 07:00:00,2019-01-27 16:50:24,0,69.0,12.363636,0.018182,79.181818,...,f110642c-1,Marc Walters,male,47,5,proactive,3,24654257,61.93,1


In [31]:
# Let's check the rows where origin humidity is null
# Looks like we have null values in the same rows, let's find out which origin city is this
final_merge[final_merge['origin_humidity'].isnull()]

Unnamed: 0,unique_id,truck_id,route_id,departure_date,estimated_arrival,delay,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,...,driver_id,name,gender,age,experience,driving_style,ratings,vehicle_no,average_speed_mph,is_midnight
566,7661,18091756,R-112b790b,2019-01-25 07:00:00,2019-01-27 02:40:48,1,66.555556,6.888889,0.0,90.888889,...,e975a383-c,Neil Herring,male,45,7,proactive,3,18091756,58.02,1
3766,8163,24746768,R-b5f9418a,2019-01-25 07:00:00,2019-01-27 14:35:24,0,47.454545,9.090909,0.0,70.636364,...,3d91387f-2,William Anderson III,male,50,0,conservative,4,24746768,40.69,1
9720,11359,22916520,R-78ee1f97,2019-01-25 07:00:00,2019-01-28 10:08:24,0,57.5,10.142857,0.0,78.214286,...,ffedbf74-a,Thomas Ochoa,male,57,19,proactive,6,22916520,63.64,1
12077,7721,24654257,R-21472caf,2019-01-25 07:00:00,2019-01-27 16:50:24,0,69.0,12.363636,0.018182,79.181818,...,f110642c-1,Marc Walters,male,47,5,proactive,3,24654257,61.93,1


In [32]:
# Fetch the routes data
routes_data = fs.get_feature_group('routes_details_fg', version=1)

routes_data_query = routes_data.select_all()

routes_df = routes_data_query.read(read_options={"use_hive": True})


Finished: Reading data from Hopsworks, using Hive (1.26s) 


In [33]:
# Find the rows with the routes ids which has no info on origin city's weather on 25th jan
# Only 1 city is there in all these rows
routes_df[routes_df.route_id.isin(['R-112b790b', 'R-78ee1f97','R-b5f9418a', 'R-21472caf'])]

Unnamed: 0,route_id,origin_id,destination_id,distance,average_hours,event_time
6,R-b5f9418a,C-f8f01604,C-4fe0fa24,2779.33,55.59,2023-08-23
438,R-21472caf,C-f8f01604,C-2e349ccd,2892.14,57.84,2023-08-23
702,R-112b790b,C-f8f01604,C-d3bb431c,2183.94,43.68,2023-08-23
1289,R-78ee1f97,C-f8f01604,C-f5ed4c15,3757.02,75.14,2023-08-23


In [34]:
# Let's check if we have any information on this city
# Fetching the weather data
weather_data = fs.get_feature_group('city_weather_details_fg', version=1)

weather_query = weather_data.select_all()

weather_df = weather_query.read(read_options={"use_hive": True})


Finished: Reading data from Hopsworks, using Hive (5.45s) 


In [35]:
# Filter the weather data with city and date
# We don't have any information on this, we will remove these rows
# It is important to check with the business regarding the information though 
weather_df[(weather_df.city_id=='C-f8f01604')&(weather_df.date==pd.to_datetime('2019-01-25'))]

Unnamed: 0,city_id,date,hour,temp,wind_speed,description,precip,humidity,visibility,pressure,chanceofrain,chanceoffog,chanceofsnow,chanceofthunder


In [36]:
# Drop the rows

final_merge=final_merge.dropna(subset =  ['origin_temp', 'origin_wind_speed', 'origin_precip',
                                'origin_humidity', 'origin_visibility', 'origin_pressure' ] ).reset_index(drop=True)

In [37]:
# Let's verify the dropped null values
final_merge.isna().sum()

unique_id                           0
truck_id                            0
route_id                            0
departure_date                      0
estimated_arrival                   0
delay                               0
route_avg_temp                      0
route_avg_wind_speed                0
route_avg_precip                    0
route_avg_humidity                  0
route_avg_visibility                0
route_avg_pressure                  0
route_description                   0
estimated_arrival_nearest_hour      0
departure_date_nearest_hour         0
origin_id                           0
destination_id                      0
distance                            0
average_hours                       0
origin_temp                         0
origin_wind_speed                   0
origin_description                  0
origin_precip                       0
origin_humidity                     0
origin_visibility                   0
origin_pressure                     0
destination_

In [38]:
final_merge

Unnamed: 0,unique_id,truck_id,route_id,departure_date,estimated_arrival,delay,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,...,driver_id,name,gender,age,experience,driving_style,ratings,vehicle_no,average_speed_mph,is_midnight
0,3725,10497416,R-2c5432ed,2019-02-10 07:00:00,2019-02-12 16:54:00,1,28.818182,6.727273,0.000000,51.272727,...,3fff1b6e-e,Brandon Cruz,male,53,23,proactive,6,10497416,59.77,1
1,4299,28978466,R-f8640cff,2019-01-19 07:00:00,2019-01-19 12:44:24,0,46.000000,9.000000,0.000000,75.000000,...,c3362ffc-f,Brian Garcia,male,49,5,conservative,8,28978466,45.73,0
2,4448,12911518,R-18468971,2019-01-25 07:00:00,2019-01-25 23:40:12,1,54.000000,9.750000,0.000000,52.250000,...,0ce6d439-4,Jeremy Hurst PhD,male,57,19,proactive,7,12911518,60.84,0
3,9783,61984883,R-d87e53cd,2019-02-09 07:00:00,2019-02-09 20:38:24,0,80.500000,10.000000,0.000000,63.500000,...,42aa7479-5,Jerry Powers,male,41,7,conservative,2,61984883,56.94,0
4,12209,30312694,R-483bf9db,2019-01-16 07:00:00,2019-01-16 09:03:36,0,67.500000,7.500000,0.000000,88.500000,...,94ed3e6d-f,Manuel Wise,male,48,9,proactive,7,30312694,57.36,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12299,5411,31312028,R-20f95619,2019-01-13 07:00:00,2019-01-13 14:50:24,0,60.666667,16.333333,0.000000,60.000000,...,a17d5cbf-7,Kenneth Johnson,male,50,7,proactive,8,31312028,62.11,0
12300,621,22913195,R-00b69c21,2019-01-19 07:00:00,2019-01-19 10:13:12,0,56.500000,9.500000,0.050000,79.000000,...,4b6a3c38-a,Roy Banks,male,54,25,conservative,7,22913195,48.23,0
12301,8806,78735626,R-3c2b451d,2019-01-26 07:00:00,2019-01-27 16:05:24,0,55.000000,4.857143,0.000000,65.428571,...,3e53cc64-c,Geoffrey Barber,male,44,4,proactive,7,78735626,61.18,1
12302,9405,18493697,R-5c9a7270,2019-02-05 07:00:00,2019-02-06 09:24:00,1,74.833333,5.166667,0.016667,63.333333,...,2ac3deaa-5,Michael Barber,male,64,30,proactive,6,18493697,57.13,1


In [39]:
#selecting necessary columns and removing id columns

# final_merge.select_dtypes(include='object').columns
# final_merge.select_dtypes(exclude='object').columns

cts_cols=['route_avg_temp', 'route_avg_wind_speed',
       'route_avg_precip', 'route_avg_humidity', 'route_avg_visibility',
       'route_avg_pressure', 'distance', 'average_hours',
       'origin_temp', 'origin_wind_speed', 'origin_precip', 'origin_humidity',
       'origin_visibility', 'origin_pressure',
       'destination_temp','destination_wind_speed','destination_precip',
       'destination_humidity', 'destination_visibility','destination_pressure',
        'avg_no_of_vehicles', 'truck_age','load_capacity_pounds', 'mileage_mpg',
        'age', 'experience','average_speed_mph']


cat_cols=['route_description',
       'origin_description', 'destination_description',
        'accident', 'fuel_type',
       'gender', 'driving_style', 'ratings','is_midnight']


target=['delay']



In [40]:
# Checking the date range
final_merge['estimated_arrival'].min(), final_merge['estimated_arrival'].max()

(Timestamp('2019-01-01 07:04:48'), Timestamp('2019-02-14 16:06:00'))

In [41]:
# Splitting the data into training, validation, and test sets based on date

# An entire month of january for training 
train_df = final_merge[final_merge['estimated_arrival'] <= pd.to_datetime('2019-01-30')]

# Next week for validating
validation_df = final_merge[(final_merge['estimated_arrival'] > pd.to_datetime('2019-01-30')) &

                            (final_merge['estimated_arrival'] <= pd.to_datetime('2019-02-07'))]

# Last week for training
test_df = final_merge[final_merge['estimated_arrival'] > pd.to_datetime('2019-02-07')]

In [42]:
X_train=train_df[cts_cols+cat_cols]

y_train=train_df['delay']



In [43]:
validation_df

Unnamed: 0,unique_id,truck_id,route_id,departure_date,estimated_arrival,delay,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,...,driver_id,name,gender,age,experience,driving_style,ratings,vehicle_no,average_speed_mph,is_midnight
6,3923,22421199,R-cfa0867d,2019-02-06 07:00:00,2019-02-06 08:22:48,1,71.500000,4.500000,0.050000,81.000000,...,cbd29c89-3,Justin Johnson,male,42,10,proactive,7,22421199,60.27,0
7,8,35477069,R-54510e55,2019-01-29 07:00:00,2019-02-03 13:30:00,0,66.913043,5.782609,0.000000,72.086957,...,e06e8543-b,Jeremy Thomas,male,48,7,conservative,8,35477069,38.65,1
10,1852,32255566,R-800e2702,2019-02-03 07:00:00,2019-02-03 15:41:24,1,76.666667,7.666667,0.000000,81.666667,...,11ecb456-e,Michael Jones,male,45,11,conservative,4,32255566,35.32,0
11,3342,28973978,R-899d9b94,2019-02-06 07:00:00,2019-02-06 22:31:48,0,42.000000,6.250000,0.000000,76.750000,...,2f6b4176-3,Danny Short,male,49,1,proactive,3,28973978,57.87,0
19,7292,27384735,R-ece28f05,2019-02-03 07:00:00,2019-02-04 05:02:24,1,68.400000,7.200000,0.000000,96.400000,...,466eb5da-5,Jeremiah Edwards,male,40,2,conservative,3,27384735,51.78,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12292,11341,21260443,R-ef208446,2019-01-31 07:00:00,2019-01-31 20:54:00,1,49.500000,12.250000,0.000000,67.750000,...,cd0764ef-b,Chad Harper,male,54,17,proactive,8,21260443,59.56,0
12294,8275,12642421,R-05a6b7e1,2019-02-06 07:00:00,2019-02-06 14:28:48,0,46.000000,5.000000,0.000000,50.000000,...,bc8ebe9f-5,Jeffery Johnson,male,55,13,conservative,5,12642421,56.84,0
12297,5955,14730346,R-c6fece3d,2019-01-31 07:00:00,2019-01-31 16:30:36,0,42.333333,7.666667,0.033333,85.000000,...,09665f38-8,Robert Russell DDS,male,40,6,conservative,2,14730346,46.49,0
12298,7936,10234289,R-b05ef8c9,2019-01-31 07:00:00,2019-01-31 12:55:12,0,55.333333,4.333333,0.000000,39.000000,...,5050d6b3-b,Maurice Howe,male,53,26,proactive,6,10234289,61.97,0


In [44]:
X_valid = validation_df[cts_cols + cat_cols]

y_valid = validation_df['delay']

X_test=test_df[cts_cols+cat_cols]

y_test=test_df['delay']

In [45]:
load_capacity_mode = X_train['load_capacity_pounds'].mode()

load_capacity_mode

0    3000.0
Name: load_capacity_pounds, dtype: float64

In [46]:
X_train['load_capacity_pounds']=X_train['load_capacity_pounds'].fillna(load_capacity_mode.iloc[0])
X_valid['load_capacity_pounds']=X_valid['load_capacity_pounds'].fillna(load_capacity_mode.iloc[0])
X_test['load_capacity_pounds']=X_test['load_capacity_pounds'].fillna(load_capacity_mode.iloc[0])

In [47]:
X_train.isna().sum()

route_avg_temp             0
route_avg_wind_speed       0
route_avg_precip           0
route_avg_humidity         0
route_avg_visibility       0
route_avg_pressure         0
distance                   0
average_hours              0
origin_temp                0
origin_wind_speed          0
origin_precip              0
origin_humidity            0
origin_visibility          0
origin_pressure            0
destination_temp           0
destination_wind_speed     0
destination_precip         0
destination_humidity       0
destination_visibility     0
destination_pressure       0
avg_no_of_vehicles         0
truck_age                  0
load_capacity_pounds       0
mileage_mpg                0
age                        0
experience                 0
average_speed_mph          0
route_description          0
origin_description         0
destination_description    0
accident                   0
fuel_type                  0
gender                     0
driving_style              0
ratings       

In [48]:
X_valid.isna().sum()

route_avg_temp             0
route_avg_wind_speed       0
route_avg_precip           0
route_avg_humidity         0
route_avg_visibility       0
route_avg_pressure         0
distance                   0
average_hours              0
origin_temp                0
origin_wind_speed          0
origin_precip              0
origin_humidity            0
origin_visibility          0
origin_pressure            0
destination_temp           0
destination_wind_speed     0
destination_precip         0
destination_humidity       0
destination_visibility     0
destination_pressure       0
avg_no_of_vehicles         0
truck_age                  0
load_capacity_pounds       0
mileage_mpg                0
age                        0
experience                 0
average_speed_mph          0
route_description          0
origin_description         0
destination_description    0
accident                   0
fuel_type                  0
gender                     0
driving_style              0
ratings       

In [49]:
X_test.isna().sum()


route_avg_temp             0
route_avg_wind_speed       0
route_avg_precip           0
route_avg_humidity         0
route_avg_visibility       0
route_avg_pressure         0
distance                   0
average_hours              0
origin_temp                0
origin_wind_speed          0
origin_precip              0
origin_humidity            0
origin_visibility          0
origin_pressure            0
destination_temp           0
destination_wind_speed     0
destination_precip         0
destination_humidity       0
destination_visibility     0
destination_pressure       0
avg_no_of_vehicles         0
truck_age                  0
load_capacity_pounds       0
mileage_mpg                0
age                        0
experience                 0
average_speed_mph          0
route_description          0
origin_description         0
destination_description    0
accident                   0
fuel_type                  0
gender                     0
driving_style              0
ratings       

In [50]:
# Importing Standard Scaler and One-Hot Encoder
from sklearn.preprocessing import OneHotEncoder
from pickle import dump


In [51]:
#! pip install streamlit==1.29.0 joblib==1.3.2 wandb==0.16.1 xgboost==2.0.2 scikit_learn==1.2.2
#! pip install pandas==1.5.3

In [52]:
# Importing Standard Scaler and One-Hot Encoder
from sklearn.preprocessing import OneHotEncoder
from pickle import dump


In [53]:
# Creating the One-Hot Encoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [54]:
# Specifying columns to be encoded
encode_columns = ['route_description', 'origin_description', 'destination_description', 'fuel_type', 'gender', 'driving_style']

In [55]:
# Fitting the encoder on the training data
encoder.fit(X_train[encode_columns])

In [56]:
# Generating names for the new one-hot encoded features
encoded_features = list(encoder.get_feature_names_out(encode_columns))

In [57]:
encoded_features

['route_description_Blizzard',
 'route_description_Blowing snow',
 'route_description_Clear',
 'route_description_Cloudy',
 'route_description_Fog',
 'route_description_Freezing drizzle',
 'route_description_Freezing fog',
 'route_description_Heavy rain',
 'route_description_Heavy rain at times',
 'route_description_Heavy snow',
 'route_description_Light drizzle',
 'route_description_Light freezing rain',
 'route_description_Light rain',
 'route_description_Light rain shower',
 'route_description_Light sleet',
 'route_description_Light sleet showers',
 'route_description_Light snow',
 'route_description_Mist',
 'route_description_Moderate or heavy freezing rain',
 'route_description_Moderate or heavy rain shower',
 'route_description_Moderate or heavy rain with thunder',
 'route_description_Moderate or heavy sleet',
 'route_description_Moderate or heavy sleet showers',
 'route_description_Moderate or heavy snow showers',
 'route_description_Moderate or heavy snow with thunder',
 'route

In [58]:
# Transforming the training, validation, and test sets

X_train[encoded_features] = encoder.transform(X_train[encode_columns])

X_valid[encoded_features] = encoder.transform(X_valid[encode_columns])

X_test[encoded_features] = encoder.transform(X_test[encode_columns])

In [59]:
# Dumping the encoder for future use
dump(encoder, open('truck_data_encoder.pkl', 'wb'))

In [60]:
# Dropping the original categorical features

X_train = X_train.drop(encode_columns, axis=1)

X_valid = X_valid.drop(encode_columns, axis=1)

X_test = X_test.drop(encode_columns, axis=1)

### Scaling

In [61]:
# Import Scaler
from sklearn.preprocessing import StandardScaler

In [62]:
scaler = StandardScaler()

In [63]:
# Scale Separate Columns

# train

X_train[cts_cols] = scaler.fit_transform(X_train[cts_cols])

In [64]:
# valid

X_valid[cts_cols] = scaler.transform(X_valid[cts_cols])


# test

X_test[cts_cols] = scaler.transform(X_test[cts_cols])

In [65]:
# Dump the scaler to use in transforming test data

dump(scaler, open('truck_data_scaler.pkl', 'wb'))

### Connecting to WAB

In [66]:
# Import Libraries
import wandb
import joblib
import os
import pandas as pd
import credentials
import wandb
#! wandb login <>

In [67]:
wandb.login()

True

In [68]:
# constants for interacting with W&B

USER_NAME = credentials.wb_username
PROJECT_NAME = credentials.wb_proj_name

In [69]:
# Importing training libraries and evaluation metrics

from sklearn.metrics import f1_score, recall_score, confusion_matrix, roc_auc_score

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

In [70]:
# Evaluation function
# #Columns needed to compare metrics
comparison_columns = ['Model_Name', 'Train_F1score', 'Train_Recall', 'Valid_F1score', 'Valid_Recall', 'Test_F1score', 'Test_Recall']

comparison_df = pd.DataFrame()



def evaluate_models(model_name, model_defined_var, X_train, y_train, X_valid, y_valid, X_test, y_test):
  ''' This function predicts and evaluates various models for classification'''

  # train predictions
  y_train_pred = model_defined_var.predict(X_train)
  # train performance
  train_f1_score = f1_score(y_train, y_train_pred)
  train_recall = recall_score(y_train, y_train_pred)

  # validation predictions
  y_valid_pred = model_defined_var.predict(X_valid)
  # validation performance
  valid_f1_score = f1_score(y_valid, y_valid_pred)
  valid_recall = recall_score(y_valid, y_valid_pred)

  # test predictions
  y_pred = model_defined_var.predict(X_test)
  # test performance
  test_f1_score = f1_score(y_test, y_pred)
  test_recall = recall_score(y_test, y_pred)

  # Printing performance
  print("Train Results")
  print(f'F1 Score: {train_f1_score}')
  print(f'Recall Score: {train_recall}')
  print(f'Confusion Matrix: \n{confusion_matrix(y_train, y_train_pred)}')
  print(f'Area Under Curve: {roc_auc_score(y_train, y_train_pred)}')

  print(" ")

  print("Validation Results")
  print(f'F1 Score: {valid_f1_score}')
  print(f'Recall Score: {valid_recall}')
  print(f'Confusion Matrix: \n{confusion_matrix(y_valid, y_valid_pred)}')
  print(f'Area Under Curve: {roc_auc_score(y_valid, y_valid_pred)}')

  print(" ")

  print("Test Results")
  print(f'F1 Score: {test_f1_score}')
  print(f'Recall Score: {test_recall}')
  print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
  print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')

  # Saving our results
  global comparison_columns
  metric_scores = [model_name, train_f1_score, train_recall, valid_f1_score, valid_recall, test_f1_score, test_recall]
  final_dict = dict(zip(comparison_columns, metric_scores))
  return final_dict


final_list = []
def add_dic_to_final_df(final_dict):
  global final_list
  final_list.append(final_dict)
  global comparison_df
  comparison_df = pd.DataFrame(final_list, columns=comparison_columns)


In [71]:
y_train.value_counts().to_dict()

{0: 5551, 1: 2651}

In [72]:
# 2 = numebr of classes 
weights = len(X_train)/(2*(y_train.value_counts().to_dict()[0])), len(X_train)/(2*(y_train.value_counts().to_dict()[1]))
weights

(0.7387858043595749, 1.5469634100339495)

In [73]:
# Define model
log_reg = LogisticRegression(random_state=13, class_weight={0:weights[0], 1:weights[1]})
# fit it
log_reg.fit(X_train,y_train)

In [74]:
logistic_results = evaluate_models("Logistic Regression", log_reg, X_train, y_train, X_valid, y_valid, X_test, y_test)
add_dic_to_final_df(logistic_results)

Train Results
F1 Score: 0.5748006379585326
Recall Score: 0.6797434930215013
Confusion Matrix: 
[[3734 1817]
 [ 849 1802]]
Area Under Curve: 0.6762075418629394
 
Validation Results
F1 Score: 0.6087408949011446
Recall Score: 0.6956004756242569
Confusion Matrix: 
[[999 496]
 [256 585]]
Area Under Curve: 0.6819139501867103
 
Test Results
F1 Score: 0.7251675807434491
Recall Score: 0.7428214731585518
Confusion Matrix: 
[[720 245]
 [206 595]]
Area Under Curve: 0.7444677313979288


In [79]:
import joblib

w = {0: weights[0], 1: weights[1]}

def train_logistic_model(X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test, y_test=y_test):
    features = X_train.columns

    with wandb.init(project=PROJECT_NAME) as run:
        config = wandb.config
        params= {"random_state":13,
    "class_weight":w}

        model = LogisticRegression(**params)

        model.fit(X_train, y_train)
        
        # train predictions
        y_train_pred = model.predict(X_train)
        # train performance
        train_f1_score = f1_score(y_train, y_train_pred)


        # validation predictions
        y_valid_pred = model.predict(X_valid)
        # validation performance
        valid_f1_score = f1_score(y_valid, y_valid_pred)

        
        # test predictions
        y_preds = model.predict(X_test)
        y_probas = model.predict_proba(X_test)

        score = f1_score(y_test, y_preds)
        print(f"F1_score Train: {round(train_f1_score, 4)}")
        print(f"F1_score Valid: {round(valid_f1_score, 4)}")
        print(f"F1_score Test: {round(score, 4)}")


        wandb.log({"f1_score_train": train_f1_score})
        wandb.log({"f1_score_valid": valid_f1_score})
        wandb.log({"f1_score": score})

        wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test,
                                            y_preds, y_probas, labels= None, model_name='LogisticRegression', feature_names=features)

        model_artifact = wandb.Artifact(
                    "LogisticRegression", type="model",metadata=dict(config))

        joblib.dump(model, "log-truck-model.pkl")
        model_artifact.add_file("log-truck-model.pkl")
        wandb.save("log-truck-model.pkl")
        run.log_artifact(model_artifact)

In [80]:
train_logistic_model(X_train, y_train,X_valid, y_valid, X_test, y_test)




F1_score Train: 0.5748
F1_score Valid: 0.6087
F1_score Test: 0.7252


wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged confusion matrix.





wandb: Logged summary metrics.
wandb: Logged class proportions.





wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision-recall curve.
Traceback (most recent call last):
  File "C:\Users\harsh\AppData\Local\Temp\ipykernel_29052\1851004068.py", line 51, in train_logistic_model
    wandb.save("log-truck-model.pkl")
  File "c:\Users\harsh\anaconda3\envs\truck-delay\lib\site-packages\wandb\sdk\wandb_run.py", line 371, in wrapper_fn
    return func(self, *args, **kwargs)
  File "c:\Users\harsh\anaconda3\envs\truck-delay\lib\site-packages\wandb\sdk\wandb_run.py", line 361, in wrapper
    return func(self, *args, **kwargs)
  File "c:\Users\harsh\anaconda3\envs\truck-delay\lib\site-packages\wandb\sdk\wandb_run.py", line 1852, in save
    return self._save(glob_str, base_path, policy)
  File "c:\Users\harsh\anaconda3\envs\truck-delay\lib\site-packages\wandb\sdk\wandb_run.py", line 1906, in _save
    os.symlink(abs_path, wandb_path)
OSError: [WinError 1314] A required privilege is not held by the client: 'e:\\Artificial Intelligence\




VBox(children=(Label(value='0.070 MB of 0.070 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))




0,1
f1_score,▁
f1_score_train,▁
f1_score_valid,▁

0,1
f1_score,0.72517
f1_score_train,0.5748
f1_score_valid,0.60874


OSError: [WinError 1314] A required privilege is not held by the client: 'e:\\Artificial Intelligence\\Projects\\Predictive-Truck-Delay-Management-in-Logistics\\log-truck-model.pkl' -> 'e:\\Artificial Intelligence\\Projects\\Predictive-Truck-Delay-Management-in-Logistics\\wandb\\run-20240802_185736-o16ci57v\\files\\log-truck-model.pkl'

In [78]:
#from pathlib import Path

#Path('e:\\Artificial Intelligence\\Projects\\Predictive-Truck-Delay-Management-in-Logistics\\log-truck-model.pkl').symlink_to('e:\\Artificial Intelligence\\Projects\\Predictive-Truck-Delay-Management-in-Logistics\\wandb\\run-20240802_184218-57eks6hd\\files\\log-truck-model.pkl')

# https://github.com/wandb/wandb/issues/1370
# 

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'e:\\Artificial Intelligence\\Projects\\Predictive-Truck-Delay-Management-in-Logistics\\wandb\\run-20240802_184218-57eks6hd\\files\\log-truck-model.pkl' -> 'e:\\Artificial Intelligence\\Projects\\Predictive-Truck-Delay-Management-in-Logistics\\log-truck-model.pkl'