In [1]:
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [33]:
%matplotlib inline
# Import our dependencies

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
import hvplot.pandas
from collections import Counter

# Let pandas render up to 100 rows and 100 columns before truncating a displayed frame
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)


#sklearn
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression


#imblearn.ensemble
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

#neural network
import tensorflow as tf
import kerastuner as kt
from tensorflow.keras.callbacks import ModelCheckpoint


In [3]:
# Columns kept from the raw CSV (the list deliberately includes the target
# column, since it is used to select columns at load time) and the target itself.

features = [
    'severity',
    'region',
    'division',
    'temperature_f',
    'visibility_mi',
    'wind_speed_mph',
    'precipitation_in',
    'weather_condition',
    'sunrise_sunset',
    'classification',
]

target = ["severity"]

In [4]:
# Load the data
# `index_col` is a pandas.read_csv option, not a Path argument: Path silently
# ignored it on older Pythons and rejects extra keyword arguments on newer ones,
# so it is dropped here (read_csv keeps its default index handling, as before).
file_path = Path('../accidents_sample_joined.csv')
df = pd.read_csv(file_path)
# Keep only the modelling columns; copy so later edits do not touch the raw frame
df = df.loc[:, features].copy()
df.head()

Unnamed: 0,severity,region,division,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,Pacific,76.0,10.0,9.0,0.0,clear_weather,Day,PreCOVID
1,short_delay,West,Pacific,67.0,10.0,0.0,0.0,clear_weather,Day,PreCOVID
2,long_delay,South,South Atlantic,52.0,10.0,8.0,0.0,clear_weather,Day,PreCOVID
3,short_delay,West,Pacific,66.0,10.0,6.0,0.0,clear_weather,Night,PreCOVID
4,short_delay,West,Pacific,59.0,10.0,0.0,0.0,clear_weather,Night,PreCOVID


In [5]:
# Inspect the column dtypes to tell the categorical (object) columns from the numeric ones
df.dtypes

severity              object
region                object
division              object
temperature_f        float64
visibility_mi        float64
wind_speed_mph       float64
precipitation_in     float64
weather_condition     object
sunrise_sunset        object
classification        object
dtype: object

In [6]:
# Class balance of the target column, most frequent value first
df['severity'].value_counts(sort=True, ascending=False)

short_delay    446404
long_delay      53596
Name: severity, dtype: int64

In [7]:
# Number of rows per region, most frequent value first
df['region'].value_counts(sort=True, ascending=False)

West         253387
South        146752
Midwest       53391
Northeast     46470
Name: region, dtype: int64

In [8]:
# Number of rows per division, most frequent value first
df['division'].value_counts(sort=True, ascending=False)

Pacific               221568
South Atlantic        106838
Middle Atlantic        38170
Mountain               31819
West North Central     31278
West South Central     27903
East North Central     22113
East South Central     12011
New England             8300
Name: division, dtype: int64

In [9]:
# Number of rows per weather condition, most frequent value first
df['weather_condition'].value_counts(sort=True, ascending=False)

clear_weather    422459
bad_weather       77541
Name: weather_condition, dtype: int64

In [10]:
# Number of rows per sunrise/sunset value, most frequent value first
df['sunrise_sunset'].value_counts(sort=True, ascending=False)

Day      318107
Night    181893
Name: sunrise_sunset, dtype: int64

In [11]:
# Number of rows per classification value, most frequent value first
df['classification'].value_counts(sort=True, ascending=False)

PreCOVID    250000
COVID       250000
Name: classification, dtype: int64

In [12]:
# Collect the names of the object-dtype (categorical) columns, leaving out the
# 'severity' column so it is not one-hot encoded with the inputs
df_cat = [
    column
    for column, dtype in df.dtypes.items()
    if dtype == "object" and column != 'severity'
]
df_cat

['region', 'division', 'weather_condition', 'sunrise_sunset', 'classification']

In [13]:
# Create a OneHotEncoder instance that returns a dense array.
# The dense-output flag was renamed from `sparse` to `sparse_output` in newer
# scikit-learn releases, so try the new spelling first and fall back to the old one.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# Reuse df's index so the index-based merge that follows lines rows up correctly.
encode_df = pd.DataFrame(enc.fit_transform(df[df_cat]), index=df.index)

# Add the encoded variable names to the dataframe
# (`get_feature_names` was replaced by `get_feature_names_out` in newer scikit-learn).
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(df_cat)
else:
    encode_df.columns = enc.get_feature_names(df_cat)
encode_df.head()

Unnamed: 0,region_Midwest,region_Northeast,region_South,region_West,division_East North Central,division_East South Central,division_Middle Atlantic,division_Mountain,division_New England,division_Pacific,division_South Atlantic,division_West North Central,division_West South Central,weather_condition_bad_weather,weather_condition_clear_weather,sunrise_sunset_Day,sunrise_sunset_Night,classification_COVID,classification_PreCOVID
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [14]:
# Attach the one-hot columns to the frame (rows are matched on the index),
# then remove the original categorical columns they replace.
df = df.merge(encode_df, left_index=True, right_index=True)
# Drop by keyword: the positional `axis` argument of DataFrame.drop is no longer
# accepted by recent pandas versions.
df = df.drop(columns=df_cat)
df.head()

Unnamed: 0,severity,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,region_Midwest,region_Northeast,region_South,region_West,division_East North Central,division_East South Central,division_Middle Atlantic,division_Mountain,division_New England,division_Pacific,division_South Atlantic,division_West North Central,division_West South Central,weather_condition_bad_weather,weather_condition_clear_weather,sunrise_sunset_Day,sunrise_sunset_Night,classification_COVID,classification_PreCOVID
0,short_delay,76.0,10.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,short_delay,67.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,long_delay,52.0,10.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,short_delay,66.0,10.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,short_delay,59.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


# Random Forest Classifier

In [15]:
# Split our preprocessed data into our features and target arrays
y = df[target]
# Drop by keyword: the positional `axis` argument of DataFrame.drop is no longer
# accepted by recent pandas versions.
X = df.drop(columns=target)

# Split the preprocessed data into a training and testing dataset
# (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Show the class counts in the training split
Counter(y_train['severity'])

Counter({'short_delay': 334866, 'long_delay': 40134})

In [16]:
# Fit a plain RandomForestClassifier on the training split.
# The labels are flattened to a 1-D array with .values.ravel(), which is the
# shape scikit-learn expects for y (a column vector triggers a conversion warning).

model = RandomForestClassifier(random_state=1).fit(X_train, y_train.values.ravel())

In [17]:
# Metrics for the fitted model on the held-out test split
y_pred = model.predict(X_test)
print('Random Forest Classifier \n')
# The score below comes from balanced_accuracy_score, so label it as balanced accuracy
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred)} \n')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)} \n')
print(f'Imbalanced Classification Report: \n {classification_report_imbalanced(y_test, y_pred)}')

Random Forest Classifier 

Accuracy: 0.674527642556916 

Confusion Matrix: 
 [[  5161   8301]
 [  3828 107710]] 

Imbalanced Classification Report: 
                    pre       rec       spe        f1       geo       iba       sup

 long_delay       0.57      0.38      0.97      0.46      0.61      0.35     13462
short_delay       0.93      0.97      0.38      0.95      0.61      0.39    111538

avg / total       0.89      0.90      0.45      0.89      0.61      0.39    125000



In [18]:
# Rank the input columns from most to least important according to the fitted model
feature_importances = model.feature_importances_
# NOTE(review): this rebinds `features`, which earlier in the notebook holds the
# list of column names used when loading the data.
features = pd.DataFrame(
    sorted(
        zip(X.columns, feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
features

Unnamed: 0,0,1
0,temperature_f,0.312608
1,wind_speed_mph,0.168434
2,classification_COVID,0.079609
3,visibility_mi,0.071484
4,classification_PreCOVID,0.065824
5,division_Pacific,0.051264
6,division_East North Central,0.044326
7,precipitation_in,0.041898
8,region_West,0.040872
9,region_South,0.026345


# Balanced Random Forest Classifier

In [19]:
# Split our preprocessed data into our features and target arrays
y = df[target]
# Drop by keyword: the positional `axis` argument of DataFrame.drop is no longer
# accepted by recent pandas versions.
X = df.drop(columns=target)

# Split the preprocessed data into a training and testing dataset
# (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Show the class counts in the training split
Counter(y_train['severity'])

Counter({'short_delay': 334866, 'long_delay': 40134})

In [20]:
# Resample the training data with the BalancedRandomForestClassifier.
# The labels are flattened to a 1-D array with .values.ravel(), which is the
# shape expected for y (a column vector triggers a conversion warning).

model = BalancedRandomForestClassifier(random_state=1).fit(X_train, y_train.values.ravel())

In [21]:
# Metrics for the fitted model on the held-out test split
y_pred = model.predict(X_test)
print('Balanced Random Forest Classifier \n')
# The score below comes from balanced_accuracy_score, so label it as balanced accuracy
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred)} \n')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)} \n')
print(f'Imbalanced Classification Report: \n {classification_report_imbalanced(y_test, y_pred)}')

Balanced Random Forest Classifier 

Accuracy: 0.7801367239178205 

Confusion Matrix: 
 [[10115  3347]
 [21315 90223]] 

Imbalanced Classification Report: 
                    pre       rec       spe        f1       geo       iba       sup

 long_delay       0.32      0.75      0.81      0.45      0.78      0.60     13462
short_delay       0.96      0.81      0.75      0.88      0.78      0.61    111538

avg / total       0.90      0.80      0.76      0.83      0.78      0.61    125000



In [22]:
# Rank the input columns from most to least important according to the fitted model
feature_importances = model.feature_importances_
# NOTE(review): this rebinds `features`, which earlier in the notebook holds the
# list of column names used when loading the data.
features = pd.DataFrame(
    sorted(
        zip(X.columns, feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
features

Unnamed: 0,0,1
0,temperature_f,0.293318
1,wind_speed_mph,0.142464
2,classification_PreCOVID,0.102263
3,classification_COVID,0.098835
4,division_Pacific,0.081956
5,region_West,0.058443
6,visibility_mi,0.053664
7,division_East North Central,0.030076
8,precipitation_in,0.029646
9,division_Mountain,0.023365
