In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Import classifiers from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
# Import preprocessing tools
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Import classification metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, f1_score, roc_auc_score, matthews_corrcoef
# Presets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
DF = pd.read_csv('data_for_ml.csv')
DF.head(2)

Unnamed: 0,INDEX_,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,3366652,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,Major
1,3370334,19,weekday,Rain,Dark,Wet,4.8,1.0,Midblock,Traffic Signal including Transit,Major Arterial,60,351.0,63.0,Residential,452,205,183,Over 65,Normal,Crossing without ROW,automobile,speeding,Major


In [3]:
df = DF.drop('INDEX_', axis=1)
df.head(1)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,Major


In [6]:
# Fill missing values
df['INJURY'] = df['INJURY'].fillna('None')

In [7]:
df['INJURY'].isna().sum()

0

In [8]:
# Check proportions of target variables
df['INJURY'].value_counts(normalize=True)

INJURY
Major      0.776743
Fatal      0.163508
Minor      0.035014
Minimal    0.015740
None       0.008995
Name: proportion, dtype: float64

In [9]:
df['INJURY'].value_counts()

INJURY
Major      2418
Fatal       509
Minor       109
Minimal      49
None         28
Name: count, dtype: int64

# Stratified Sampling

In [22]:
samples = []
for injury in df['INJURY'].unique():
    group_data = df[df['INJURY']==injury]
    sampled_group = group_data.sample(n=100, replace=True, random_state=101)
    samples.append(sampled_group)

In [25]:
stratified_df = pd.concat(samples)
new_df = stratified_df.reset_index().drop('index',axis=1)
new_df.head(2)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,16,weekday,Clear,Daylight,Dry,19.4,0.41,Intersection,Traffic Signal including Transit,Major Arterial,60,466.0,7.0,Parks,457,279,265,45 to 64,Unknown,Crossing without ROW,transit vehicle,speeding,Major
1,22,weekday,Clear,Dark,Dry,12.8,0.63,Midblock,No Control,Major Arterial,60,430.0,19.0,Unknown,1085,607,568,45 to 64,Impaired - Alcohol (BAC = Normal),Crossing with ROW and no control,automobile,speeding,Major


In [37]:
injury_map = {
    'Fatal' : 4,
    'Major' : 3,
    'Minor' : 2,
    'Minimal' : 1,
    'None' : 0
}

new_df['INJURY'] = new_df['INJURY'].replace(injury_map)
new_df.head(2)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,16,weekday,Clear,Daylight,Dry,19.4,0.41,Intersection,Traffic Signal including Transit,Major Arterial,60,466.0,7.0,Parks,457,279,265,45 to 64,Unknown,Crossing without ROW,transit vehicle,speeding,3
1,22,weekday,Clear,Dark,Dry,12.8,0.63,Midblock,No Control,Major Arterial,60,430.0,19.0,Unknown,1085,607,568,45 to 64,Impaired - Alcohol (BAC = Normal),Crossing with ROW and no control,automobile,speeding,3


# Data Preprocessing

In [38]:
X = new_df.iloc[:, :-1].values
y = new_df.iloc[:,-1].values

In [39]:
cat_columns_index = [1, 2, 3, 4, 7, 8, 9, 13, 17, 18, 19, 20, 21]
encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X[:, cat_columns_index])

In [40]:
num_columns_index = [0, 5, 6, 10, 11, 12, 14, 15, 16]
X_encoded = np.concatenate((X_cat, X[:, num_columns_index].astype(float)), axis=1)
print(X_encoded)

[[1.000e+00 0.000e+00 1.000e+00 ... 4.570e+02 2.790e+02 2.650e+02]
 [1.000e+00 0.000e+00 1.000e+00 ... 1.085e+03 6.070e+02 5.680e+02]
 [0.000e+00 1.000e+00 1.000e+00 ... 4.040e+02 1.110e+02 1.040e+02]
 ...
 [1.000e+00 0.000e+00 0.000e+00 ... 4.210e+02 2.040e+02 1.890e+02]
 [0.000e+00 1.000e+00 1.000e+00 ... 9.980e+02 3.220e+02 3.100e+02]
 [1.000e+00 0.000e+00 1.000e+00 ... 2.830e+02 1.490e+02 1.240e+02]]


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.20, random_state=100)

In [42]:
scaler = StandardScaler()
X_train[:, -len(num_columns_index):] = scaler.fit_transform(X_train[:, -len(num_columns_index):])
X_test[:, -len(num_columns_index):] = scaler.transform(X_test[:, -len(num_columns_index):])

# ML Modeling

## KNN

In [43]:
# Instantiate KNN object
knn = KNeighborsClassifier()
# Define a parameter grid for tuning
param_grid_knn = {
    'n_neighbors' : [i for i in range(1,36)],
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan']
}
# Define grid search object for KNN
random_search_knn = RandomizedSearchCV(knn, param_grid_knn, n_iter=100, cv=5, scoring='accuracy')

In [44]:
# Fit the model
random_search_knn.fit(X_train, y_train)

In [45]:
# Print the classification report
y_ped_knn = random_search_knn.predict(X_test)
print(classification_report(y_test, y_ped_knn))

              precision    recall  f1-score   support

           0       0.73      1.00      0.84        19
           1       0.62      0.82      0.71        22
           2       0.50      0.36      0.42        14
           3       0.43      0.45      0.44        20
           4       0.79      0.44      0.56        25

    accuracy                           0.62       100
   macro avg       0.61      0.61      0.59       100
weighted avg       0.63      0.62      0.60       100



In [46]:
# Print other metrics
y_ped_knn_proba = random_search_knn.predict_proba(X_test)
print(matthews_corrcoef(y_test, y_ped_knn))
print(roc_auc_score(y_test, y_ped_knn_proba, multi_class='ovr'))

0.5303952677678889
0.7978172039201109


## Random Forest

In [47]:
# Instantiate RF object
rf = RandomForestClassifier()

# Define parameter grid
param_grid_rf = {
    'n_estimators': [50, 100, 200],                # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],      # Number of features to consider at each split
    'max_depth': [None, 10, 20, 30],               # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],               # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],                 # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],                     # Whether bootstrap samples are used when building trees
    'criterion': ['gini', 'entropy']               # Function to measure the quality of a split
}

# Define grid search object for RF
random_search_rf = RandomizedSearchCV(rf, param_grid_rf, n_iter=100, cv=5, scoring='accuracy')

In [48]:
# Train the model
random_search_rf.fit(X_train, y_train)

In [49]:
# Print Classification Report
y_ped_rf = random_search_rf.predict(X_test)
print(classification_report(y_test, y_ped_rf))

              precision    recall  f1-score   support

           0       0.83      1.00      0.90        19
           1       0.80      0.73      0.76        22
           2       0.50      0.36      0.42        14
           3       0.38      0.45      0.41        20
           4       0.70      0.64      0.67        25

    accuracy                           0.65       100
   macro avg       0.64      0.63      0.63       100
weighted avg       0.65      0.65      0.65       100



In [50]:
# Print other metrics
y_ped_rf_proba = random_search_rf.predict_proba(X_test)
print(matthews_corrcoef(y_test, y_ped_rf))
print(roc_auc_score(y_test, y_ped_rf_proba, multi_class='ovr'))

0.5605115202236649
0.8696889317523977


In [51]:
# Best parameters
random_search_rf.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

## XGBoost

In [52]:
# Instatiate the XGB object
xgb = XGBClassifier()

# Define parameter grid
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

# Define random search for XGB
random_search_xgb = RandomizedSearchCV(xgb, param_grid_xgb, n_iter=100, cv=5, scoring='accuracy')

In [53]:
# Train the model
random_search_xgb.fit(X_train, y_train)

In [54]:
# Print Classification Report
y_ped_xgb = random_search_rf.predict(X_test)
print(classification_report(y_test, y_ped_xgb))

              precision    recall  f1-score   support

           0       0.83      1.00      0.90        19
           1       0.80      0.73      0.76        22
           2       0.50      0.36      0.42        14
           3       0.38      0.45      0.41        20
           4       0.70      0.64      0.67        25

    accuracy                           0.65       100
   macro avg       0.64      0.63      0.63       100
weighted avg       0.65      0.65      0.65       100



In [55]:
# Print other metrics
y_ped_xgb_proba = random_search_xgb.predict_proba(X_test)
print(matthews_corrcoef(y_test, y_ped_xgb))
print(roc_auc_score(y_test, y_ped_xgb_proba, multi_class='ovr'))

0.5605115202236649
0.8562284306382224


In [56]:
# Best parameters
random_search_xgb.best_params_

{'subsample': 0.8,
 'min_child_weight': 1,
 'max_depth': 5,
 'learning_rate': 0.3,
 'colsample_bytree': 1.0}

# Summaries

In [57]:
# Precise Scores
print(f'KNN Model Training Accuracy: {random_search_knn.score(X_train, y_train)*100}%')
print(f'KNN Model Testing Accuracy: {random_search_knn.score(X_test, y_test)*100}%\n')
print(f'RandomForest Model Training Accuracy: {random_search_rf.score(X_train, y_train)*100} %')
print(f'RandomForest Model Testing Accuracy: {random_search_rf.score(X_test, y_test)*100} %\n')
print(f'XGBoost Model Training Accuracy: {random_search_xgb.score(X_train, y_train)*100} %')
print(f'XGBoost Model Testing Accuracy: {random_search_xgb.score(X_test, y_test)*100} %')

KNN Model Training Accuracy: 98.25%
KNN Model Testing Accuracy: 62.0%

RandomForest Model Training Accuracy: 98.0 %
RandomForest Model Testing Accuracy: 65.0 %

XGBoost Model Training Accuracy: 98.25 %
XGBoost Model Testing Accuracy: 59.0 %
