In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Import classifiers from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
# Import preprocessing tools
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Import classification metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, f1_score, roc_auc_score, matthews_corrcoef
# Presets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')
%matplotlib inline

# Steps in Performing ML Modeling
1. Loading Data and Minor Adjustments
2. Data Preprocessing
3. ML Modeling
4. Model Performance Metrics

# 1 Import Data

In [2]:
DF = pd.read_csv('data_for_ml.csv')
DF.head(2)

Unnamed: 0,INDEX_,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,3366652,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,Major
1,3370334,19,weekday,Rain,Dark,Wet,4.8,1.0,Midblock,Traffic Signal including Transit,Major Arterial,60,351.0,63.0,Residential,452,205,183,Over 65,Normal,Crossing without ROW,automobile,speeding,Major


In [3]:
df = DF.drop('INDEX_',axis=1)
df.head(1)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,Major


In [4]:
# Map the target variable (INJURY) to the following dictionary below:
injury_map = {
    'Fatal' : 4,
    'Major' : 3,
    'Minor' : 2,
    'Minimal' : 1,
    np.nan : 0
}
df['INJURY'] = df['INJURY'].replace(injury_map)
df['INJURY'] = df['INJURY'].astype('int64')
df.head(1)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,3


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3113 entries, 0 to 3112
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   HOUR        3113 non-null   int64  
 1   DAYOFWEEK   3113 non-null   object 
 2   VISIBILITY  3113 non-null   object 
 3   LIGHT       3113 non-null   object 
 4   RDSFCOND    3113 non-null   object 
 5   TEMP        3113 non-null   float64
 6   REL_HUMID   3113 non-null   float64
 7   LOCCOORD    3113 non-null   object 
 8   TRAFFCTL    3113 non-null   object 
 9   ROADCLASS   3113 non-null   object 
 10  SPEEDLMT    3113 non-null   int64  
 11  VEH_ADT     3113 non-null   float64
 12  PED_ADT     3113 non-null   float64
 13  LAND_USE    3113 non-null   object 
 14  POP_2021    3113 non-null   int64  
 15  PRIV_DWELL  3113 non-null   int64  
 16  LAND_AREA   3113 non-null   int64  
 17  INVAGE      3113 non-null   object 
 18  PEDCOND     3113 non-null   object 
 19  PEDACT      3113 non-null  

# 2 Data Preprocessing

In [6]:
# Split into X and y variables
X = df.iloc[:, :-1].values
y = df.iloc[:,-1].values

In [7]:
# One hot encode the categorical columns
cat_columns_index = [1, 2, 3, 4, 7, 8, 9, 13, 17, 18, 19, 20, 21]
encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X[:, cat_columns_index])

In [8]:
# Combine the categorical and numerical columns
num_columns_index = [0, 5, 6, 10, 11, 12, 14, 15, 16]
X_encoded = np.concatenate((X_cat, X[:, num_columns_index].astype(float)), axis=1)
print(X_encoded)

[[1.000e+00 0.000e+00 0.000e+00 ... 5.040e+02 2.630e+02 2.250e+02]
 [1.000e+00 0.000e+00 0.000e+00 ... 4.520e+02 2.050e+02 1.830e+02]
 [1.000e+00 0.000e+00 1.000e+00 ... 8.070e+02 3.750e+02 3.220e+02]
 ...
 [1.000e+00 0.000e+00 0.000e+00 ... 4.820e+02 2.520e+02 2.310e+02]
 [0.000e+00 1.000e+00 1.000e+00 ... 1.981e+03 1.195e+03 1.112e+03]
 [1.000e+00 0.000e+00 1.000e+00 ... 1.170e+03 4.820e+02 4.670e+02]]


In [9]:
# Split encoded X and y to train test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.20, random_state=100)

In [10]:
# Scale the numerical features
scaler = StandardScaler()
X_train[:, -len(num_columns_index):] = scaler.fit_transform(X_train[:, -len(num_columns_index):])
X_test[:, -len(num_columns_index):] = scaler.transform(X_test[:, -len(num_columns_index):])

# 3 ML Modeling
We will be using three (3) machine learning models: XGBoost (eXtreme Gradient Boosting), RF (Random Forest Classifier), and KNN (K Nearest Neighbors). We will also perform hyperparameter using GridSearchCV for each model. 

## 3.1. KNN

In [11]:
# Instantiate KNN object
knn = KNeighborsClassifier()

In [12]:
# Define a parameter grid for tuning
param_grid_knn = {
    'n_neighbors' : [i for i in range(1,36)],
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan']
}

In [13]:
# Define grid search object for KNN
random_search_knn = RandomizedSearchCV(knn, param_grid_knn, n_iter=100, cv=5, scoring='accuracy')

In [14]:
# Fit the model
random_search_knn.fit(X_train, y_train)

In [15]:
# Print the classification report
y_ped_knn = random_search_knn.predict(X_test)
print(classification_report(y_test, y_ped_knn))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00        20
           3       0.79      0.98      0.87       488
           4       0.38      0.06      0.11        95

    accuracy                           0.78       623
   macro avg       0.23      0.21      0.20       623
weighted avg       0.68      0.78      0.70       623



In [16]:
# Print other metrics
y_ped_knn_proba = random_search_knn.predict_proba(X_test)
print(matthews_corrcoef(y_test, y_ped_knn))
print(roc_auc_score(y_test, y_ped_knn_proba, multi_class='ovr'))

0.08464133392095374
0.5764442284899054


## 3.2. Random Forest

In [17]:
# Instantiate RF object
rf = RandomForestClassifier()

In [18]:
# Define parameter grid
param_grid_rf = {
    'n_estimators': [50, 100, 200],                # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],      # Number of features to consider at each split
    'max_depth': [None, 10, 20, 30],               # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],               # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],                 # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],                     # Whether bootstrap samples are used when building trees
    'criterion': ['gini', 'entropy']               # Function to measure the quality of a split
}

In [19]:
# Define grid search object for RF
random_search_rf = RandomizedSearchCV(rf, param_grid_rf, n_iter=100, cv=5, scoring='accuracy')

In [20]:
# Train the model
random_search_rf.fit(X_train, y_train)

In [21]:
# Print Classification Report
y_ped_rf = random_search_rf.predict(X_test)
print(classification_report(y_test, y_ped_rf))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00        20
           3       0.79      0.99      0.88       488
           4       0.57      0.04      0.08        95

    accuracy                           0.78       623
   macro avg       0.27      0.21      0.19       623
weighted avg       0.70      0.78      0.70       623



In [22]:
# Print other metrics
y_ped_rf_proba = random_search_rf.predict_proba(X_test)
print(matthews_corrcoef(y_test, y_ped_rf))
print(roc_auc_score(y_test, y_ped_rf_proba, multi_class='ovr'))

0.09698811897928299
0.6693531370853061


In [23]:
# Best parameters
random_search_rf.best_params_

{'n_estimators': 50,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'criterion': 'gini',
 'bootstrap': True}

## 3.3. XGBoost

In [24]:
# Instatiate the XGB object
xgb = XGBClassifier()

In [25]:
# Define parameter grid
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

In [26]:
# Define random search for XGB
random_search_xgb = RandomizedSearchCV(xgb, param_grid_xgb, n_iter=100, cv=5, scoring='accuracy')

In [27]:
# Train the model
random_search_xgb.fit(X_train, y_train)

In [28]:
# Print Classification Report
y_ped_xgb = random_search_rf.predict(X_test)
print(classification_report(y_test, y_ped_xgb))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00        20
           3       0.79      0.99      0.88       488
           4       0.57      0.04      0.08        95

    accuracy                           0.78       623
   macro avg       0.27      0.21      0.19       623
weighted avg       0.70      0.78      0.70       623



In [29]:
# Print other metrics
y_ped_xgb_proba = random_search_xgb.predict_proba(X_test)
print(matthews_corrcoef(y_test, y_ped_xgb))
print(roc_auc_score(y_test, y_ped_xgb_proba, multi_class='ovr'))

0.09698811897928299
0.6761503307725427


In [30]:
# Best parameters
random_search_xgb.best_params_

{'subsample': 0.8,
 'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.1,
 'colsample_bytree': 0.8}

# Summaries

In [34]:
# Precise Scores
print(f'KNN Model Training Accuracy: {random_search_knn.score(X_train, y_train)*100}%')
print(f'KNN Model Testing Accuracy: {random_search_knn.score(X_test, y_test)*100}%\n')
print(f'RandomForest Model Training Accuracy: {random_search_rf.score(X_train, y_train)*100} %')
print(f'RandomForest Model Testing Accuracy: {random_search_rf.score(X_test, y_test)*100} %\n')
print(f'XGBoost Model Training Accuracy: {random_search_xgb.score(X_train, y_train)*100} %')
print(f'XGBoost Model Testing Accuracy: {random_search_xgb.score(X_test, y_test)*100} %')

KNN Model Training Accuracy: 78.47389558232932%
KNN Model Testing Accuracy: 77.84911717495987%

RandomForest Model Training Accuracy: 82.16867469879519 %
RandomForest Model Testing Accuracy: 78.49117174959872 %

XGBoost Model Training Accuracy: 81.88755020080322 %
XGBoost Model Testing Accuracy: 78.00963081861958 %
