### Imports

In [None]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE
#import pickle

import warnings 
warnings.filterwarnings('ignore')

In [None]:
!unzip model_dataset.zip

Archive:  model_dataset.zip
  inflating: model_dataset.csv       


In [None]:
df = pd.read_csv('model_dataset.csv')

# We move the target variable to the front, for simplicity
df.insert(0, "accident_severity", df.pop("accident_severity"))

df.head()

Unnamed: 0.1,accident_severity,Unnamed: 0,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,Hospitalized wounded,0,14,Full day,Out of intersection,Cloudy weather,By the side,In built-up areas,Driver,Male,not specified,Departmental Road,Bidirectional,Dish,Curved right,normal,On the road
1,Hospitalized wounded,1,18,Full day,In intersection,Normal,Other,In built-up areas,Passenger,Female,not specified,Departmental Road,One way,Dish,Curved left,normal,On the road
2,Hospitalized wounded,2,19,Full day,Out of intersection,Normal,Other,Out of agglomeration,Pedestrian,Male,Opposite direction of the vehicle,Departmental Road,Bidirectional,Dish,Curved right,not normal,Off the road
3,Hospitalized wounded,3,19,Twilight or dawn,Out of intersection,Dazzling weather,By the side,In built-up areas,Driver,Male,not specified,Communal Way,Bidirectional,Dish,Straight part,normal,On the road
4,Hospitalized wounded,4,11,Full day,In intersection,Normal,By the side,In built-up areas,Passenger,Female,not specified,Communal Way,Bidirectional,Dish,Straight part,normal,On the road


### Encoding the features
Ideally we would use an nomical encoding technique like one-hot encoding, to avoid misleading our model. But given the amount of features and unique values, one-hot encoding might be more detrimental than beneficial, in terms of memory and computional power consumption 

In [None]:
le = LabelEncoder()

for column in df.columns:
    df[column] = le.fit_transform(df[column])
    
df.head()

Unnamed: 0.1,accident_severity,Unnamed: 0,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,0,0,14,0,1,0,0,0,0,1,7,1,0,0,1,0,1
1,0,1,18,0,0,5,4,0,1,0,7,1,1,0,0,0,1
2,0,2,19,0,1,5,4,1,2,1,3,1,0,0,1,1,0
3,0,3,19,4,1,1,0,0,0,1,7,0,0,0,3,0,1
4,0,4,11,0,0,5,0,0,1,0,7,0,0,0,3,0,1


### Scaling and Train Test split

In [None]:
X = df.drop(['accident_severity'], axis=1) 
Y = df['accident_severity']

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0.1,Unnamed: 0,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,-1.732049,0.081432,-0.631485,0.632882,-4.34632,-1.338037,-0.448257,-0.603092,0.721062,0.42591,0.261653,-0.615365,-0.435524,-1.702742,-0.484417,0.309949
1,-1.732045,0.820654,-0.631485,-1.580073,0.316759,0.541847,-0.448257,0.680063,-1.386843,0.42591,0.261653,0.749132,-0.435524,-2.770194,-0.484417,0.309949
2,-1.73204,1.00546,-0.631485,0.632882,0.316759,0.541847,2.230864,1.963217,0.721062,-1.251721,0.261653,-0.615365,-0.435524,-1.702742,2.064336,-3.226341
3,-1.732036,1.00546,2.47301,0.632882,-3.413704,-1.338037,-0.448257,-0.603092,0.721062,0.42591,-0.750008,-0.615365,-0.435524,0.432162,-0.484417,0.309949
4,-1.732032,-0.472985,-0.631485,-1.580073,0.316759,-1.338037,-0.448257,0.680063,-1.386843,0.42591,-0.750008,-0.615365,-0.435524,0.432162,-0.484417,0.309949


In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify=Y)

### Modelling (initial)
We try out several well known models with their default hyperparameters<br>
Our key metrics are the
-  **macro** average F1-score 
-  recall for class 1(severity=killed)

In [None]:
# !! IMPORTANT !!
# Double check the train and test dataset that's in memory before using the functions below 

def run_model_reports(model):
  """A helper function to avoid duplicating code"""

  name = type(model).__name__
    
  # Fit the model
  print(f"Fitting {name} model...")
  model.fit(x_train, y_train)
        
  # Make predictions
  print("Making predictions...")
  y_pred = model.predict(x_test)
  
  # Evaluate metrics
  report = classification_report(y_test, y_pred)
  # The dictionary format is neccesary for extracting our key metrics
  report_dict = classification_report(y_test, y_pred, output_dict=True)

  return report, report_dict



def get_key_metrics(report_dict):
  """A helper function to extract key metrics from the report"""

  report_df =  pd.DataFrame(report_dict)

  class_1_recall = report_df['1'].loc['recall']
  macro_f1 = report_df['macro avg'].loc['f1-score']

  return class_1_recall, macro_f1

In [None]:
models = [LogisticRegression(n_jobs=-1),
          DecisionTreeClassifier(),
          RandomForestClassifier(n_jobs=-1),
          GradientBoostingClassifier(),
          AdaBoostClassifier(),
          XGBClassifier(n_jobs=-1),
          LGBMClassifier()]

#SVC(),
#KNeighborsClassifier(n_jobs=-1),
# These models were removed because they take way too long to train

In [None]:
# for m in models[:1]:
#     report, report_dict = run_model_reports(m)
#     recall, f1 = get_key_metrics(report_dict)
    
#     print(report)
#     print(f'Class 1 Recall: {round(recall, 4)}')
#     print(f'Macro F1-Score: {round(f1, 4)}')
#     print('-----------------------------------------------------')

### Resampling
From the evaluation above, the decision tree model had the best recall for class 1 (score of 0.17), and most of the models achieved a macro average F1-score of 0.45.<br>
These are poor metrics, and are due to the imbalanced dataset, thus we proceeded to upsample our minority classes: 1(killed) and class 0(hospitalized)

In [None]:
sm = SMOTE(sampling_strategy='all', n_jobs=-1)
resampled_X, resampled_Y = sm.fit_resample(X, Y)

# Check the new class distribution
resampled_Y.value_counts()

0    471695
2    471695
1    471695
Name: accident_severity, dtype: int64

In [None]:
resampled_df = pd.concat([resampled_Y, resampled_X], axis=1)
# We shuffle the dataset
resampled_df = resampled_df.sample(frac=1).reset_index(drop=True)

To avoid re-running the SMOTE resample function

In [None]:
resampled_df.to_csv('resampled_df.csv', index=False)

In [None]:
resampled_df = pd.read_csv('resampled_df.csv')
resampled_df.head()

Unnamed: 0.1,accident_severity,Unnamed: 0,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,1,0.328891,-2.087182,1.696886,0.632882,0.316759,0.071876,2.230864,-0.603092,0.721062,0.42591,1.004789,-0.615365,-0.435524,0.432162,-0.484417,0.309949
1,0,1.701713,0.820654,1.696886,0.632882,-2.481088,1.48179,2.230864,-0.603092,0.721062,0.42591,0.261653,-0.615365,-0.435524,-1.702742,2.064336,0.309949
2,2,-0.259565,-0.472985,-0.631485,0.632882,0.316759,-1.338037,-0.448257,-0.603092,-1.386843,0.42591,-0.750008,-0.615365,-0.435524,0.432162,-0.484417,0.309949
3,1,0.350206,-1.866458,1.696886,0.632882,0.316759,0.541847,2.230864,0.680063,-1.386843,0.42591,0.261653,-0.615365,-0.435524,-1.991066,-0.484417,-3.226341
4,1,-0.455673,-0.74306,-0.631485,0.632882,0.316759,1.023911,-0.448257,-0.603092,0.721062,0.42591,0.261653,-0.615365,-0.435524,-1.702742,2.064336,0.309949


### Train test split on the new resampled dataset

In [None]:
resampled_X = resampled_df.drop(['accident_severity'], axis=1) 
resampled_Y = resampled_df['accident_severity']

x_train, x_test, y_train, y_test = train_test_split(resampled_X, resampled_Y, test_size=0.3)

### Modelling (second)

In [None]:
for m in models[:1]:
    report, report_dict = run_model_reports(m)
    recall, f1 = get_key_metrics(report_dict)
    
    print(report)
    print(f'Class 1 Recall: {round(recall, 4)}')
    print(f'Macro F1-Score: {round(f1, 4)}')
    print('-----------------------------------------------------')

Fitting LogisticRegression model...
Making predictions...
              precision    recall  f1-score   support

           0       0.39      0.17      0.24    141648
           1       0.58      0.62      0.60    141016
           2       0.52      0.79      0.63    141862

    accuracy                           0.52    424526
   macro avg       0.50      0.52      0.49    424526
weighted avg       0.50      0.52      0.49    424526

Class 1 Recall: 0.6153
Macro F1-Score: 0.4885
-----------------------------------------------------


### Voting Classifier
To properly utilize each model's strenght, and decrease the overall error, a voting classifier was used as our final model choice, this classifier consists of:
-  Random forrest

### Hyperparameter tunning

In [None]:
grid = {'max_features': ['auto', 'log2', 'none'],
                    'min_samples_leaf': [4, 6, 8],
                    'min_samples_split': [2, 5, 7],
                    'n_estimators': [100, 300, 500, 1000]}

In [None]:
rs_cv = RandomizedSearchCV(estimator=xgbc, param_distributions=grid, cv=5, n_iter=10, scoring="accuracy", n_jobs=-1, verbose=5)
rs_cv.fit(x_train, y_train)

print(f'Best hyperparameters: {rs_cv.best_params_}')    

rs_cv.best_estimator_.fit(x_train, y_train)
y_pred = rs_cv.best_estimator_.predict(x_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

NameError: ignored

### Save model to pickel file

In [None]:
with open('model.pkl', 'wb') as file:
  pickle.dump(lgbmc, file)