# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Import classifiers from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
# Import preprocessing tools
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Import classification metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, f1_score, roc_auc_score, matthews_corrcoef
# Presets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
np.random.seed(100)

# Steps in ML Modeling
1. Load Data and Minor Processing
2. Data Preprocessing
3. ML Modeling
4. Model Performance Metrics

In [3]:
DF = pd.read_csv('data_for_ml.csv')
DF.head(2)

Unnamed: 0,INDEX_,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,3366652,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,Major
1,3370334,19,weekday,Rain,Dark,Wet,4.8,1.0,Midblock,Traffic Signal including Transit,Major Arterial,60,351.0,63.0,Residential,452,205,183,Over 65,Normal,Crossing without ROW,automobile,speeding,Major


In [4]:
df = DF.drop('INDEX_',axis=1)
df.head(1)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,Major


In [5]:
# Integer encode the INJURY column
injury_map = {
    'Fatal' : 4,
    'Major' : 3,
    'Minor' : 2,
    'Minimal' : 1,
    np.nan : 0
}
df['INJURY'] = df['INJURY'].replace(injury_map).astype(int)
df.head(1)

Unnamed: 0,HOUR,DAYOFWEEK,VISIBILITY,LIGHT,RDSFCOND,TEMP,REL_HUMID,LOCCOORD,TRAFFCTL,ROADCLASS,SPEEDLMT,VEH_ADT,PED_ADT,LAND_USE,POP_2021,PRIV_DWELL,LAND_AREA,INVAGE,PEDCOND,PEDACT,VEHINV,VIOL,INJURY
0,7,weekday,Other,Dark,Wet,1.5,0.99,Midblock,No Control,Major Arterial,60,498.0,219.0,Mixed Use,504,263,225,45 to 64,Distracted,Crossing without ROW,automobile,speeding,3


# Data Preprocessing

In [6]:
# Split into X and y variables
X = df.iloc[:, :-1].values
y = df.iloc[:,-1].values

In [7]:
# One hot encode the categorical columns
cat_columns_index = [1, 2, 3, 4, 7, 8, 9, 13, 17, 18, 19, 20, 21]
encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X[:, cat_columns_index])

In [8]:
# Combine the categorical and numerical columns
num_columns_index = [0, 5, 6, 10, 11, 12, 14, 15, 16]
X_encoded = np.concatenate((X_cat, X[:, num_columns_index]), axis=1)
print(X_encoded)

[[1.0 0.0 0.0 ... 504 263 225]
 [1.0 0.0 0.0 ... 452 205 183]
 [1.0 0.0 1.0 ... 807 375 322]
 ...
 [1.0 0.0 0.0 ... 482 252 231]
 [0.0 1.0 1.0 ... 1981 1195 1112]
 [1.0 0.0 1.0 ... 1170 482 467]]


In [9]:
# Split encoded X and y to train test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.20, random_state=100)

In [10]:
# Scale the numerical features
scaler = StandardScaler()
X_train[:, -len(num_columns_index):] = scaler.fit_transform(X_train[:, -len(num_columns_index):])
X_test[:, -len(num_columns_index):] = scaler.transform(X_test[:, -len(num_columns_index):])
print(X_train)
print(X_test)

[[1.0 0.0 1.0 ... -0.5867732574402085 -0.5265834440260664
  -0.5226000650737648]
 [1.0 0.0 1.0 ... 2.0284752641268318 1.433142186594836 1.5585892351770785]
 [1.0 0.0 1.0 ... 1.7854821802357512 2.0608224248081974
  2.0504771899165526]
 ...
 [1.0 0.0 0.0 ... -0.6056832250581914 -0.4655195294487484
  -0.4803918577363841]
 [1.0 0.0 0.0 ... -0.32297920916934647 -0.45841907426533934
  -0.4511707911181976]
 [0.0 1.0 1.0 ... -0.44400300192443726 -0.049432855700977114
  -0.03395889551408934]]
[[1.0 0.0 1.0 ... -0.6151382088671828 -0.5336838992094755
  -0.5339638132030595]
 [0.0 1.0 1.0 ... -0.7739819368582396 -0.6714327297676114
  -0.6833159314737909]
 [1.0 0.0 1.0 ... -0.37781811526149695 -0.3717935210277487
  -0.36350759126363785]
 ...
 [1.0 0.0 1.0 ... -0.27570429012438913 0.1025168852239769
  0.03260020067178006]
 [0.0 1.0 1.0 ... -0.6208111991525778 -0.4413779818251576
  -0.4609111466575931]
 [1.0 0.0 1.0 ... -0.7248160210514839 -0.5919076317134297
  -0.6005229093889289]]


# ML Modeling and Metrics

In [11]:
models = [KNeighborsClassifier(), RandomForestClassifier(), XGBClassifier()]

In [12]:
# Define parameter grids
param_grid_knn = {
    'n_neighbors' : [i for i in range(1,36)],
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan']
}

param_grid_rf = {
    'n_estimators': [50, 100, 200],                
    'max_features': ['auto', 'sqrt', 'log2'],      
    'max_depth': [None, 10, 20, 30],               
    'min_samples_split': [2, 5, 10],               
    'min_samples_leaf': [1, 2, 4],                 
    'bootstrap': [True, False],                    
    'criterion': ['gini', 'entropy']               
}

param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

param_grids = [param_grid_knn, param_grid_rf, param_grid_xgb]

In [13]:
# Iterate on both grids and models to fit the model with our training data
for model, grid in zip(models, param_grids):
    random_search = RandomizedSearchCV(estimator=model,
                                      param_distributions=grid,
                                      n_iter=100,
                                      cv=5,
                                      scoring='accuracy',
                                      random_state=100)
    random_search.fit(X_train, y_train)
    training_accuracy = random_search.score(X_train, y_train)
    best_parameters = random_search.best_params_
    print(f'{model} Modeling Results:\n')
    print('Best Parameters:')
    print(f'{best_parameters}\n')
    # Performance metrics
    y_pred = random_search.predict(X_test)
    y_pred_proba = random_search.predict_proba(X_test)
    # Print classification report
    print('Classification Report:')
    print(f'{classification_report(y_test, y_pred)}\n')
    # Print remaining metrics
    print(f"Training Accuracy: {training_accuracy:.2f}\n")

KNeighborsClassifier() Modeling Results:

Best Parameters:
{'weights': 'uniform', 'n_neighbors': 12, 'metric': 'manhattan'}

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00        20
           3       0.79      0.98      0.87       488
           4       0.38      0.06      0.11        95

    accuracy                           0.78       623
   macro avg       0.23      0.21      0.20       623
weighted avg       0.68      0.78      0.70       623


Training Accuracy: 0.78

RandomForestClassifier() Modeling Results:

Best Parameters:
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00 