In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('../Machine_Learning/updated.csv')

df = df[df['County'] == 'Dallas']
df.drop(['County', 'Distance(mi)'], axis=1, inplace=True)

df.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Street,Zipcode,Temperature(F),Humidity(%),Pressure(in),Wind_Direction,Wind_Speed(mph),...,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Station,Stop,Astronomical_Twilight,Accident_Duration
0,2,32.662193,-96.943153,local_roads,75249,60.1,24.0,30.0,,5.8,...,False,False,False,False,False,False,False,False,Day,74.933
1,3,32.77879,-96.782021,highway,75226,61.0,22.0,30.01,NW,4.6,...,False,False,False,False,False,False,False,False,Day,78.467
2,2,32.724277,-96.762245,highway,75215,61.0,22.0,30.01,NW,4.6,...,False,False,False,False,False,False,False,False,Day,76.233
3,2,32.708355,-96.700043,local_roads,75217,59.0,23.0,30.02,NW,10.4,...,False,False,False,False,False,False,False,False,Day,92.017
4,3,32.864021,-96.66114,highway,75228,61.0,22.0,30.01,NW,4.6,...,False,False,False,False,False,False,False,False,Day,74.633


In [2]:
df = df[(df['Severity'] == 2) | (df['Severity'] == 4)]
df['Severity'] = df['Severity'].replace([2, 4], [0, 1])

In [3]:
# Look at all the columns
df.columns

Index(['Severity', 'Start_Lat', 'Start_Lng', 'Street', 'Zipcode',
       'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Station', 'Stop',
       'Astronomical_Twilight', 'Accident_Duration'],
      dtype='object')

In [4]:
# Get separate numerical and categorical categories
num_features = df.select_dtypes(include=['int64', 'float64']).copy()
num_features = num_features.drop(['Severity'], axis=1, inplace=False)
output = df['Severity'].copy()
cat_features = df.select_dtypes(include=['object', 'bool']).copy()

In [5]:
# Tuen zipcode into object for encoding
df = df.astype({'Zipcode':'object'})

In [6]:
need_impute = []
not_missing = []
missing = dict(df.isna().mean().round(5))

# Check if any features need imputation
for k, v in missing.items():
   if k == 'Severity':
      continue
   if v > 0:
      need_impute.append(k)
   else:
      not_missing.append(k)

print(f'Need Imputation: {need_impute}')
print(f'Not Missing: {not_missing}')

Need Imputation: ['Street', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Wind_Direction', 'Wind_Speed(mph)', 'Weather_Condition']
Not Missing: ['Start_Lat', 'Start_Lng', 'Zipcode', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Station', 'Stop', 'Astronomical_Twilight', 'Accident_Duration']


In [7]:
# For Mutlivariate Imputation
impute_df = df[need_impute].copy()
impute_num = impute_df.select_dtypes(include=['int64', 'float64'])
impute_cat = impute_df.select_dtypes(include=['object', 'bool'])

print(f'Missing Numerical: {impute_num.columns}')
print(f'Missing Categorical: {impute_cat.columns}')
   

Missing Numerical: Index(['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Wind_Speed(mph)'], dtype='object')
Missing Categorical: Index(['Street', 'Wind_Direction', 'Weather_Condition'], dtype='object')


In [8]:
import copy 

cat_col_impute = copy.deepcopy(not_missing)
num_col_impute = copy.deepcopy(not_missing)


# Identify columns used for multivariate imputation
for n in impute_num.columns:
   num_col_impute.append(n)

for c in impute_cat.columns:
   cat_col_impute.append(c)

cat_col_impute.pop(2) # Get rid of zipcode

print(f'Categorical Impute: {cat_col_impute}')
print(f'Numerical Impute: {num_col_impute}')

Categorical Impute: ['Start_Lat', 'Start_Lng', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Station', 'Stop', 'Astronomical_Twilight', 'Accident_Duration', 'Street', 'Wind_Direction', 'Weather_Condition']
Numerical Impute: ['Start_Lat', 'Start_Lng', 'Zipcode', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Station', 'Stop', 'Astronomical_Twilight', 'Accident_Duration', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Wind_Speed(mph)']


In [9]:
# Encode binary features
df.loc[:, 'Amenity':'Stop'] = df.loc[:, 'Amenity':'Stop'].replace([True, False], [1, 0])
df['Astronomical_Twilight'] = df['Astronomical_Twilight'].replace(['Day', 'Night'], [1, 0])

In [10]:
from sklearn.model_selection import train_test_split

new_df = df.copy()
y = new_df['Severity']
new_df.drop(['Severity'], axis=1, inplace=True)
X = new_df

# Stratify split since data is highly imbalanced
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

In [11]:
from category_encoders.cat_boost import CatBoostEncoder

# Catboost Encoder Used
cbe = CatBoostEncoder()
X_train['Zipcode'] = cbe.fit_transform(X_train['Zipcode'], y_train)
X_test['Zipcode'] = cbe.transform(X_test['Zipcode'], y_test)

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Numerical Imputation
mf = IterativeImputer(
   initial_strategy='median',
   verbose=2,
   random_state=0)
   
mf.fit(X_train[num_col_impute])

[IterativeImputer] Completing matrix with shape (59269, 18)
[IterativeImputer] Ending imputation round 1/10, elapsed time 1.68
[IterativeImputer] Change: 26.180307130098385, scaled tolerance: 0.84355 
[IterativeImputer] Ending imputation round 2/10, elapsed time 3.17
[IterativeImputer] Change: 3.0344569898477403, scaled tolerance: 0.84355 
[IterativeImputer] Ending imputation round 3/10, elapsed time 4.67
[IterativeImputer] Change: 0.46444685556354415, scaled tolerance: 0.84355 
[IterativeImputer] Early stopping criterion reached.


In [13]:
# Impute numerical values
X_train[num_col_impute] = mf.transform(X_train[num_col_impute])
X_test[num_col_impute] = mf.transform(X_test[num_col_impute])

[IterativeImputer] Completing matrix with shape (59269, 18)
[IterativeImputer] Ending imputation round 1/3, elapsed time 0.02
[IterativeImputer] Ending imputation round 2/3, elapsed time 0.05
[IterativeImputer] Ending imputation round 3/3, elapsed time 0.07
[IterativeImputer] Completing matrix with shape (14818, 18)
[IterativeImputer] Ending imputation round 1/3, elapsed time 0.01
[IterativeImputer] Ending imputation round 2/3, elapsed time 0.01
[IterativeImputer] Ending imputation round 3/3, elapsed time 0.02


In [14]:
# Find correlation based on (parametric) pearson test
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
import warnings

np.warnings = warnings
boxCox = PowerTransformer(method='box-cox', standardize=True)
yeo = PowerTransformer(method='yeo-johnson', standardize=True)

neg_cols = []
pos_cols = []
for c in num_features.columns:
   if df.loc[:,c].min() > 0:  
      pos_cols.append(c)
   else:
      neg_cols.append(c)

print(pos_cols, neg_cols)

X_train[neg_cols] = yeo.fit_transform(X_train[neg_cols])
X_train[pos_cols] = boxCox.fit_transform(X_train[pos_cols])

X_test[neg_cols] = yeo.transform(X_test[neg_cols])
X_test[pos_cols] = boxCox.transform(X_test[pos_cols])

['Start_Lat', 'Zipcode', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Accident_Duration'] ['Start_Lng', 'Wind_Speed(mph)']


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [15]:
from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder()
missing = ['Street', 'Wind_Direction', 'Weather_Condition']

# Label Encode everything for imputation
features_miss = {}
for m in missing:
   non_null_train = X_train[m].dropna().unique()
   le.fit(non_null_train)
   X_train[m] = X_train[m].apply(lambda x: le.transform([x])[0] if pd.notna(x) else np.nan)
   X_test[m] = X_test[m].apply(lambda x: le.transform([x])[0] if pd.notna(x) else np.nan)
   mapping = dict(zip(le.transform(le.classes_), le.classes_))
   features_miss[m] = mapping
   print(f'{m}: {mapping}')

Street: {0: 'highway', 1: 'local_roads', 2: 'other', 3: 'pikes'}
Wind_Direction: {0: 'Calm', 1: 'E', 2: 'N', 3: 'NE', 4: 'NW', 5: 'S', 6: 'SE', 7: 'SW', 8: 'W'}
Weather_Condition: {0: 'Clear', 1: 'Cloudy', 2: 'Fog', 3: 'Ice', 4: 'Rain', 5: 'Thunder', 6: 'Windy'}


In [16]:
# Change everything to integers
X_train[missing] = X_train[missing].astype(pd.Int64Dtype())
X_test[missing] = X_test[missing].astype(pd.Int64Dtype())

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Iterative Imputer using Decisison Tree Classifier
mf = IterativeImputer(estimator=DecisionTreeClassifier(),
                      initial_strategy='most_frequent',
                      verbose=2,
                      random_state=0)
print(cat_features.columns)
mf.fit(X_train[cat_features.columns])

Index(['Street', 'Wind_Direction', 'Weather_Condition', 'Amenity', 'Bump',
       'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Station',
       'Stop', 'Astronomical_Twilight'],
      dtype='object')
[IterativeImputer] Completing matrix with shape (59269, 13)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.92
[IterativeImputer] Change: 6.0, scaled tolerance: 0.008 
[IterativeImputer] Ending imputation round 2/10, elapsed time 1.92
[IterativeImputer] Change: 0.0, scaled tolerance: 0.008 
[IterativeImputer] Early stopping criterion reached.


In [18]:
# Impute Categorical Features
X_train[cat_features.columns] = mf.transform(X_train[cat_features.columns])
X_test[cat_features.columns] = mf.transform(X_test[cat_features.columns])

[IterativeImputer] Completing matrix with shape (59269, 13)
[IterativeImputer] Ending imputation round 1/2, elapsed time 0.02
[IterativeImputer] Ending imputation round 2/2, elapsed time 0.03
[IterativeImputer] Completing matrix with shape (14818, 13)
[IterativeImputer] Ending imputation round 1/2, elapsed time 0.01
[IterativeImputer] Ending imputation round 2/2, elapsed time 0.01


In [19]:
# One hot encode values using get_dummies() function
for k, v in features_miss.items():
   dummy_train = pd.get_dummies(X_train[k])
   dummy_train = dummy_train.rename(columns=v)
   dummy_test = pd.get_dummies(X_test[k])
   dummy_test = dummy_test.rename(columns=v)
   X_train.drop([k],axis=1, inplace=True)
   X_test.drop([k],axis=1, inplace=True)
   X_train = pd.concat([X_train, dummy_train], axis=1)
   X_test = pd.concat([X_test, dummy_test], axis=1)


In [20]:
# Oversampling (SMOTE) with power transformation seem to get best results
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

oversample = SMOTE()
print(f'y_train: {Counter(y_train)}')
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(f'y_train: {Counter(y_train)}')

y_train: Counter({0: 58472, 1: 797})
y_train: Counter({0: 58472, 1: 58472})


In [None]:
'''
# Undersampling isn't as good as oversampling
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import RandomUnderSampler

undersample = NeighbourhoodCleaningRule()
print(f'y_train: {Counter(y_train)}')
X_train, y_train = undersample.fit_resample(X_train, y_train)
print(f'y_train: {Counter(y_train)}')
'''

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# define KNN model
knn = KNeighborsClassifier(n_neighbors=2, p=1)

knn.fit(X_train.values, y_train.values)

In [22]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Hyperparamter tuning

# Weights from 0 to 99
weights = np.linspace(0.0,0.99,10)

#Creating a dictionary grid for grid search
param_grid = {'class_weight': [{0:x, 1:1.0-x} for x in weights]}

cv = StratifiedKFold()

# define model
rf = RandomForestClassifier()

#Fitting grid search to the train data with 5 folds
gridsearch_rf = GridSearchCV(
   estimator= rf,
   param_grid= param_grid,
   cv=StratifiedKFold(), 
   n_jobs=-1, 
   scoring='f1',
   verbose=2)

gridsearch_rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [23]:
import xgboost as xgb

# Create regression matrices
train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
test = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Hyperparameter tuning for XGBoost
grid = pd.DataFrame({'eta':[0.01,0.05,0.1]*2, 'subsample':np.repeat([0.1,0.3],3)})

def fit(x):
    params = {'objective':'binary:logistic',
              'eval_metric':'logloss',
              'eta':x[0],
              'subsample':x[1]
    }
    
    xgb_cv = xgb.cv(
        dtrain=train, 
        params=params, 
        nfold=5,
        metrics = 'logloss',
        seed=42
    )
    
    return xgb_cv[-1:].values[0]

cols = ['train-logloss-mean','train-logloss-std', 'test-logloss-mean','test-logloss-std']
grid[cols] = grid.apply(fit, axis=1, result_type='expand')

grid

Unnamed: 0,eta,subsample,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.01,0.1,0.634213,2.1e-05,0.634302,0.000143
1,0.05,0.1,0.475448,0.000536,0.475858,0.000693
2,0.1,0.1,0.369181,0.001199,0.370053,0.000947
3,0.01,0.3,0.633743,7.4e-05,0.633864,0.000156
4,0.05,0.3,0.474005,0.000387,0.474582,0.000663
5,0.1,0.3,0.368351,0.00083,0.369309,0.000262


In [24]:
from xgboost import XGBClassifier

# Default XGBoost
model_xgb = XGBClassifier(
  objective='binary:logistic',
  eval_metric='logloss'
)

model_xgb.fit(X_train, y_train)

In [25]:
from xgboost import XGBClassifier

# XGBoost with Hyperparameter Tuning
model2_xgb = XGBClassifier(
  objective='binary:logistic',
  eval_metric='logloss',
  eta=0.01,
  subsample=0.3
)

model2_xgb.fit(X_train, y_train)

This shows that XGBoost is the best model etc.

In [27]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Results for KNN
y_pred = knn.predict(X_test.values)
classification_rep = classification_report(y_test, y_pred)
print(f"\nClassification Report (KNN):\n", classification_rep)

# Results for Random Forest with Gridsearch Hyperparameter Tuning
y_pred = gridsearch_rf.predict(X_test)
classification_rep = classification_report(y_test, y_pred)
print(f"\nClassification Report (Random Forest):\n", classification_rep)


Classification Report (KNN):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     14619
           1       0.32      0.32      0.32       199

    accuracy                           0.98     14818
   macro avg       0.65      0.65      0.65     14818
weighted avg       0.98      0.98      0.98     14818


Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     14619
           1       0.66      0.35      0.45       199

    accuracy                           0.99     14818
   macro avg       0.82      0.67      0.72     14818
weighted avg       0.99      0.99      0.99     14818



In [28]:
# Results for XGBoost (Default)
y_pred = model_xgb.predict(X_test)
classification_rep = classification_report(y_test, y_pred)
print(f"\nClassification Report (XGBoost):\n", classification_rep)

# Results for XGBoost with Hyperparameter Tuning
y_pred = model2_xgb.predict(X_test)
classification_rep = classification_report(y_test, y_pred)
print(f"\nClassification Report (XGBoost with Hyperparameter Tuning):\n", classification_rep)


Classification Report (XGBoost):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     14619
           1       0.59      0.40      0.48       199

    accuracy                           0.99     14818
   macro avg       0.79      0.70      0.74     14818
weighted avg       0.99      0.99      0.99     14818


Classification Report (XGBoost with Hyperparameter Tuning):
               precision    recall  f1-score   support

           0       1.00      0.81      0.89     14619
           1       0.06      0.84      0.11       199

    accuracy                           0.81     14818
   macro avg       0.53      0.83      0.50     14818
weighted avg       0.98      0.81      0.88     14818

