### Imports

In [20]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import SMOTE
import pickle

import warnings 
warnings.filterwarnings('ignore')

# Random seed
np.random.seed(42)

In [2]:
!unzip datasets.zip

Archive:  datasets.zip
  inflating: model_dataset.csv       
  inflating: resampled_df.csv        


In [3]:
df = pd.read_csv('model_dataset.csv')

# We move the target variable to the front, for simplicity
df.insert(0, "accident_severity", df.pop("accident_severity"))

df.head()

Unnamed: 0,accident_severity,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,Hospitalized wounded,14,Full day,Out of intersection,Cloudy weather,By the side,In built-up areas,Driver,Male,not specified,Departmental Road,Bidirectional,Dish,Curved right,normal,On the road
1,Hospitalized wounded,18,Full day,In intersection,Normal,Other,In built-up areas,Passenger,Female,not specified,Departmental Road,One way,Dish,Curved left,normal,On the road
2,Hospitalized wounded,19,Full day,Out of intersection,Normal,Other,Out of agglomeration,Pedestrian,Male,Opposite direction of the vehicle,Departmental Road,Bidirectional,Dish,Curved right,not normal,Off the road
3,Hospitalized wounded,19,Twilight or dawn,Out of intersection,Dazzling weather,By the side,In built-up areas,Driver,Male,not specified,Communal Way,Bidirectional,Dish,Straight part,normal,On the road
4,Hospitalized wounded,11,Full day,In intersection,Normal,By the side,In built-up areas,Passenger,Female,not specified,Communal Way,Bidirectional,Dish,Straight part,normal,On the road


### Encoding the features
Ideally we would use an nomical encoding technique like one-hot encoding, to avoid misleading our model. But given the amount of features and unique values, one-hot encoding might be more detrimental than beneficial, in terms of memory and computional power consumption 

In [None]:
le = LabelEncoder()

for column in df.columns:
    df[column] = le.fit_transform(df[column])
    
df.head()

Unnamed: 0,accident_severity,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,0,14,0,1,0,0,0,0,1,7,1,0,0,1,0,1
1,0,18,0,0,5,4,0,1,0,7,1,1,0,0,0,1
2,0,19,0,1,5,4,1,2,1,3,1,0,0,1,1,0
3,0,19,4,1,1,0,0,0,1,7,0,0,0,3,0,1
4,0,11,0,0,5,0,0,1,0,7,0,0,0,3,0,1


### Scaling and Train Test split

In [None]:
X = df.drop(['accident_severity'], axis=1) 
Y = df['accident_severity']

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,0.092066,-0.63354,0.636007,-4.362589,-1.374964,-0.463834,-0.618895,0.713196,0.43657,0.290857,-0.620199,-0.443015,-1.678986,-0.48746,0.314232
1,0.830415,-0.63354,-1.572309,0.331258,0.53174,-0.463834,0.645396,-1.40214,0.43657,0.290857,0.742234,-0.443015,-2.73693,-0.48746,0.314232
2,1.015003,-0.63354,0.636007,0.331258,0.53174,2.155946,1.909688,0.713196,-1.214558,0.290857,-0.620199,-0.443015,-1.678986,2.051452,-3.18212
3,1.015003,2.439367,0.636007,-3.42382,-1.374964,-0.463834,-0.618895,0.713196,0.43657,-0.775021,-0.620199,-0.443015,0.436903,-0.48746,0.314232
4,-0.461696,-0.63354,-1.572309,0.331258,-1.374964,-0.463834,0.645396,-1.40214,0.43657,-0.775021,-0.620199,-0.443015,0.436903,-0.48746,0.314232


In [None]:
Y.value_counts()

2    178436
0    129119
1     19713
Name: accident_severity, dtype: int64

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify=Y)

### Modelling (initial)
We try out several well known models with their default hyperparameters<br>
Our key metrics are the
-  **macro** average F1-score 
-  recall for class 1(severity=killed)

In [13]:
# !! IMPORTANT !!
# Double check the train and test dataset that's in memory before using the functions below 

def run_model_reports(model):
  """Fits model, makes prediction, and evaluates the result"""

  name = type(model).__name__
    
  # Fit the model
  print(f"Fitting {name} model...")
  model.fit(x_train, y_train)
        
  # Make predictions
  print("Making predictions...")
  y_pred = model.predict(x_test)
  
  # Evaluate metrics
  report = classification_report(y_test, y_pred)
  # The dictionary format is neccesary for extracting our key metrics
  report_dict = classification_report(y_test, y_pred, output_dict=True)

  return report, report_dict



def get_key_metrics(report_dict):
  """Extracts key metrics from the report"""

  report_df =  pd.DataFrame(report_dict)

  class_1_recall = report_df['1'].loc['recall']
  macro_f1 = report_df['macro avg'].loc['f1-score']

  return class_1_recall, macro_f1

In [None]:
models = [LogisticRegression(n_jobs=-1),
          DecisionTreeClassifier(),
          RandomForestClassifier(n_jobs=-1),
          GradientBoostingClassifier(),
          AdaBoostClassifier(),
          XGBClassifier(n_jobs=-1),
          LGBMClassifier()]

In [None]:
for m in models:
    report, report_dict = run_model_reports(m)
    recall, f1 = get_key_metrics(report_dict)
    
    print(report)
    print(f'Class 1 Recall: {round(recall, 4)}')
    print(f'Macro F1-Score: {round(f1, 4)}')
    print('-----------------------------------------------------')

Fitting LogisticRegression model...
Making predictions...
              precision    recall  f1-score   support

           0       0.64      0.40      0.49     38736
           1       0.00      0.00      0.00      5914
           2       0.66      0.91      0.76     53531

    accuracy                           0.65     98181
   macro avg       0.43      0.44      0.42     98181
weighted avg       0.61      0.65      0.61     98181

Class 1 Recall: 0.0
Macro F1-Score: 0.4181
-----------------------------------------------------
Fitting DecisionTreeClassifier model...
Making predictions...
              precision    recall  f1-score   support

           0       0.53      0.49      0.51     38736
           1       0.17      0.11      0.13      5914
           2       0.68      0.74      0.71     53531

    accuracy                           0.60     98181
   macro avg       0.46      0.45      0.45     98181
weighted avg       0.59      0.60      0.60     98181

Class 1 Recall: 0.111

### Resampling
From the evaluation above, the decision tree model had the best recall for class 1 (score of 0.11), and most of the models achieved a macro average F1-score of 0.45.<br>
These are poor metrics, and are due to the imbalanced dataset, thus we proceeded to upsample our minority classes: 1(killed) and class 0(hospitalized)

In [None]:
sm = SMOTE(sampling_strategy='all', n_jobs=-1)
resampled_X, resampled_Y = sm.fit_resample(X, Y)

# Check the new class distribution
resampled_Y.value_counts()

0    471695
2    471695
1    471695
Name: accident_severity, dtype: int64

In [None]:
resampled_df = pd.concat([resampled_Y, resampled_X], axis=1)
# Shuffle the dataset
resampled_df = resampled_df.sample(frac=1).reset_index(drop=True)

To avoid re-running the SMOTE resample function

In [None]:
#resampled_df.to_csv('resampled_df.csv', index=False)

In [4]:
resampled_df = pd.read_csv('resampled_df.csv')
resampled_df.head()

Unnamed: 0,accident_severity,hour,lighting,intersection,atmosphere,collision,localisation,user_category,user_sex,pedestrian_action,road_category,traff_regime,longitud_profile,drawing_plan,surface_cond,acc_situation
0,1,0.812896,-0.631485,0.632882,-0.615857,0.071876,2.230864,-0.603092,0.721062,0.42591,0.261653,-0.615365,-0.435524,0.432162,2.064336,0.309949
1,0,0.451043,-0.631485,0.632882,0.316759,-1.338037,2.230864,-0.603092,0.721062,0.42591,2.284975,2.113628,-0.435524,0.432162,-0.484417,0.309949
2,1,-0.472985,-0.631485,-1.580073,0.316759,-1.338037,-0.448257,-0.603092,-1.386843,0.42591,0.261653,-0.615365,-0.435524,0.432162,-0.484417,0.309949
3,1,-0.272351,-0.631485,0.632882,0.316759,-0.398095,2.230864,-0.603092,0.721062,0.42591,2.284975,-0.615365,2.426502,-2.770194,-0.484417,0.309949
4,1,-1.014696,0.091278,-1.580073,0.316759,0.979507,-0.448257,-0.005625,-1.386843,0.42591,0.261653,0.019977,-0.435524,-2.770194,-0.484417,-3.226341


### Train test split on the new resampled dataset

In [5]:
resampled_X = resampled_df.drop(['accident_severity'], axis=1) 
resampled_Y = resampled_df['accident_severity']

x_train, x_test, y_train, y_test = train_test_split(resampled_X, resampled_Y, test_size=0.3)

### Modelling (second)

In [None]:
for m in models:
    report, report_dict = run_model_reports(m)
    recall, f1 = get_key_metrics(report_dict)
    
    print(report)
    print(f'Class 1 Recall: {round(recall, 4)}')
    print(f'Macro F1-Score: {round(f1, 4)}')
    print('-----------------------------------------------------')

Fitting LogisticRegression model...
Making predictions...
              precision    recall  f1-score   support

           0       0.40      0.17      0.24    141964
           1       0.58      0.62      0.60    140965
           2       0.52      0.79      0.63    141597

    accuracy                           0.53    424526
   macro avg       0.50      0.53      0.49    424526
weighted avg       0.50      0.53      0.49    424526

Class 1 Recall: 0.6172
Macro F1-Score: 0.4898
-----------------------------------------------------
Fitting DecisionTreeClassifier model...
Making predictions...
              precision    recall  f1-score   support

           0       0.53      0.46      0.49    141964
           1       0.78      0.79      0.79    140965
           2       0.62      0.71      0.66    141597

    accuracy                           0.65    424526
   macro avg       0.65      0.65      0.65    424526
weighted avg       0.65      0.65      0.65    424526

Class 1 Recall: 0.

We can see a great inprovement in our metrics after resampling<br>
Our best model so far is the Random Forest model (Class 1 Recall: 0.8247
Macro F1-Score: 0.6698)<br>
However we were still not satisfied with it's performance, our next approach was to select our top 3 models and find the best results with their best hyperparamters.<br>
This time our key metrics is to improve **accuracy**

### Hyperparameter tunning
The best performing models were trained using a random search CV

In [19]:
def randomsearch_cv(model, grid, cv, n_iter):
  """Performs the random search and returns the best model"""

  rs_cv = RandomizedSearchCV(estimator=model, param_distributions=grid, cv=cv,
                             n_iter=n_iter, scoring="accuracy", n_jobs=-1, verbose=1)
  
  rs_cv.fit(x_train, y_train)
  
  print(f'Best hyperparameters: {rs_cv.best_params_}')

  return rs_cv.best_estimator_

- Random Forest

In [None]:
# Define model and hyperparameters
rfc = RandomForestClassifier(n_jobs=-1)
rfc_grid = {'n_estimators': [50, 100, 200, 500, 1000],
            'min_samples_split': [2, 5, 7, 10],
            'min_samples_leaf': [2, 5, 7, 10],
            'max_features': ['auto', 'sqrt', 'log2', 'none']}

In [None]:
# Fit the model with the best hyperparameters
rfc_best = randomsearch_cv(rfc, rfc_grid, 1, 2)

rfc_best.fit(x_train, y_train)
y_pred_rfc = rfc_best.predict(x_test)

In [None]:
# Evaluate the model
report_dict = classification_report(y_test, y_pred_rfc, output_dict=True)

print(f"Accuracy: {accuracy_score(y_test, y_pred_rfc)}")
print(get_key_metrics(report_dict))

- Decision Tree

In [None]:
models[1].tree_.max_depth

In [21]:
# Define model and hyperparameters
dtc = DecisionTreeClassifier()
dtc_grid = {'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt', 'log2', 'none'], 
            'max_depth': [25, 40, None]}

In [22]:
# Fit the model with the best hyperparameters
dtc_best = randomsearch_cv(dtc, dtc_grid, 5, 10)

dtc_best.fit(x_train, y_train)
y_pred_dtc = dtc_best.predict(x_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters: {'splitter': 'best', 'max_features': 'auto', 'max_depth': None, 'criterion': 'entropy'}


In [23]:
# Evaluate the model
report = classification_report(y_test, y_pred_dtc)
print(report)

              precision    recall  f1-score   support

           0       0.54      0.47      0.50    141619
           1       0.79      0.79      0.79    141298
           2       0.62      0.70      0.66    141609

    accuracy                           0.65    424526
   macro avg       0.65      0.65      0.65    424526
weighted avg       0.65      0.65      0.65    424526



- LightGBM

In [24]:
# Define model and hyperparameters
lgbmc = LGBMClassifier()
lgbmc_grid = {'num_leaves': [6, 10, 20, 30, 50],
              'min_child_samples': [100, 200, 300, 500],
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
# Fit the model with the best hyperparameters
lgbmc_best = randomsearch_cv(lgbmc, lgbmc_grid, 3, 3)

lgbmc_best.fit(x_train, y_train)
y_pred_lgbmc = lgbmc_best.predict(x_test)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [None]:
# Evaluate the model
report = classification_report(y_test, y_pred_lgbmc)
print(report)

### Voting Classifier
To properly utilize each model's strenght, and decrease the overall error, a voting classifier was used as our final model choice

In [None]:
voting_clf = VotingClassifier(estimators=[('rfc', rfc_best), ('dtc', dtc_best), ('lgbmc', lgbmc_best)],
                              voting='hard')

voting_clf.fit(x_train, y_train)

In [None]:
y_pred = voting_clf.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

### Save model to pickel file

In [None]:
with open('model.pkl', 'wb') as file:
  pickle.dump(voting_clf, file)