In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score

In [5]:
# Load the data
df = pd.read_csv('https://raw.githubusercontent.com/justinkirk8/Accidentally_Late/main/accidents_sample_joined.csv')
df.head()

Unnamed: 0,severity,state_code,region,division,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,year,date,classification
0,short_delay,CA,West,Pacific,76.0,10.0,9.0,0.0,clear_weather,Day,2019,2019-06-27,PreCOVID
1,short_delay,OR,West,Pacific,67.0,10.0,0.0,0.0,clear_weather,Day,2019,2019-06-05,PreCOVID
2,long_delay,VA,South,South Atlantic,52.0,10.0,8.0,0.0,clear_weather,Day,2019,2019-04-01,PreCOVID
3,short_delay,CA,West,Pacific,66.0,10.0,6.0,0.0,clear_weather,Night,2019,2019-10-06,PreCOVID
4,short_delay,CA,West,Pacific,59.0,10.0,0.0,0.0,clear_weather,Night,2020,2020-02-06,PreCOVID


In [6]:
# Check dtypes, features should be float64
df.dtypes

severity              object
state_code            object
region                object
division              object
temperature_f        float64
visibility_mi        float64
wind_speed_mph       float64
precipitation_in     float64
weather_condition     object
sunrise_sunset        object
year                   int64
date                  object
classification        object
dtype: object

In [7]:
# Get the number of rows
df.shape[0]

500000

In [8]:
# Get count of unique values in the 'Severity' column
print(df['severity'].value_counts())

short_delay    446404
long_delay      53596
Name: severity, dtype: int64


In [9]:
# Get count of unique values in the 'state_code', 'region', 'division'
print(df['state_code'].value_counts())
print(df['region'].value_counts())
print(df['division'].value_counts())

CA    175500
FL     52755
OR     40126
MN     24114
TX     18629
NY     16277
VA     14482
UT     14440
PA     12501
SC     11533
NC     11286
NJ      9392
IL      7949
MI      7783
TN      7225
MD      7061
AZ      6866
GA      6617
LA      6446
CO      6158
WA      5942
CT      4645
MO      3653
AL      2943
IN      2723
OH      2378
MT      1983
AR      1831
IA      1355
WI      1280
ID      1267
DC      1252
WV      1164
KS      1152
OK       997
MA       993
NH       986
KY       959
MS       884
RI       845
NE       725
ME       689
DE       688
NV       619
NM       360
ND       235
VT       142
WY       126
SD        44
Name: state_code, dtype: int64
West         253387
South        146752
Midwest       53391
Northeast     46470
Name: region, dtype: int64
Pacific               221568
South Atlantic        106838
Middle Atlantic        38170
Mountain               31819
West North Central     31278
West South Central     27903
East North Central     22113
East South Central    

In [10]:
# Drop state_code, year, date and division columns
df.drop('state_code', axis=1, inplace=True)
df.drop('date', axis=1, inplace=True)
df.drop('year', axis=1, inplace=True)
df.drop('division', axis=1, inplace=True)
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,76.0,10.0,9.0,0.0,clear_weather,Day,PreCOVID
1,short_delay,West,67.0,10.0,0.0,0.0,clear_weather,Day,PreCOVID
2,long_delay,South,52.0,10.0,8.0,0.0,clear_weather,Day,PreCOVID
3,short_delay,West,66.0,10.0,6.0,0.0,clear_weather,Night,PreCOVID
4,short_delay,West,59.0,10.0,0.0,0.0,clear_weather,Night,PreCOVID


In [11]:
# Get count of unique values in the 'Weather_Condition' column
print(df['weather_condition'].value_counts())
print(df['sunrise_sunset'].value_counts())
print(df['classification'].value_counts())

clear_weather    422459
bad_weather       77541
Name: weather_condition, dtype: int64
Day      318107
Night    181893
Name: sunrise_sunset, dtype: int64
PreCOVID    250000
COVID       250000
Name: classification, dtype: int64


In [12]:
# Transform "sunrise_sunset", "weather_condition", and "classification" to binary feature
df.sunrise_sunset = df.sunrise_sunset.replace({'Day': 0, 'Night': 1}) 
df.weather_condition = df.weather_condition.replace({'clear_weather': 0, 'bad_weather': 1})
df.classification = df.classification.replace({'PreCOVID': 0, 'COVID': 1})
df.severity = df.severity.replace({'short_delay': 0, 'long_delay': 1})
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,0,West,76.0,10.0,9.0,0.0,0,0,0
1,0,West,67.0,10.0,0.0,0.0,0,0,0
2,1,South,52.0,10.0,8.0,0.0,0,0,0
3,0,West,66.0,10.0,6.0,0.0,0,1,0
4,0,West,59.0,10.0,0.0,0.0,0,1,0


In [13]:
# Get count of unique values in the 'Weather_Condition' column
print(df['weather_condition'].value_counts())
print(df['sunrise_sunset'].value_counts())

0    422459
1     77541
Name: weather_condition, dtype: int64
0    318107
1    181893
Name: sunrise_sunset, dtype: int64


In [14]:
df = pd.get_dummies(df, columns=['region'])
df.head()

Unnamed: 0,severity,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification,region_Midwest,region_Northeast,region_South,region_West
0,0,76.0,10.0,9.0,0.0,0,0,0,0,0,0,1
1,0,67.0,10.0,0.0,0.0,0,0,0,0,0,0,1
2,1,52.0,10.0,8.0,0.0,0,0,0,0,0,1,0
3,0,66.0,10.0,6.0,0.0,0,1,0,0,0,0,1
4,0,59.0,10.0,0.0,0.0,0,1,0,0,0,0,1


In [15]:
# Calculate Correlations
df.corr()

Unnamed: 0,severity,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification,region_Midwest,region_Northeast,region_South,region_West
severity,1.0,0.003532,0.01319,0.058773,0.023826,0.035403,-0.011416,-0.216009,0.118147,0.05437,0.062147,-0.161163
temperature_f,0.003532,1.0,0.257321,0.051462,-0.015129,-0.226509,-0.330502,0.100471,-0.323552,-0.110474,0.280313,0.00871
visibility_mi,0.01319,0.257321,1.0,0.010734,-0.213459,-0.618028,-0.053275,0.037869,-0.092183,-0.037348,0.063104,0.021158
wind_speed_mph,0.058773,0.051462,0.010734,1.0,0.051952,0.081066,-0.201681,0.017769,0.141519,0.047612,0.033598,-0.145672
precipitation_in,0.023826,-0.015129,-0.213459,0.051952,1.0,0.278432,-0.004037,-0.013433,-0.003201,0.019234,0.034869,-0.040953
weather_condition,0.035403,-0.226509,-0.618028,0.081066,0.278432,1.0,0.022299,-0.051997,0.07725,0.047618,-0.042656,-0.036522
sunrise_sunset,-0.011416,-0.330502,-0.053275,-0.201681,-0.004037,0.022299,1.0,0.041826,0.016464,-0.003467,-0.036265,0.024874
classification,-0.216009,0.100471,0.037869,0.017769,-0.013433,-0.051997,0.041826,1.0,-0.070555,0.078692,0.362204,-0.332018
region_Midwest,0.118147,-0.323552,-0.092183,0.141519,-0.003201,0.07725,0.016464,-0.070555,1.0,-0.110676,-0.222855,-0.350473
region_Northeast,0.05437,-0.110474,-0.037348,0.047612,0.019234,0.047618,-0.003467,0.078692,-0.110676,1.0,-0.206317,-0.324465


## **SPLIT THE DATA INTO TRAINING AND TESTING**

In [16]:
# Create our target

y = df["severity"].ravel()
X = df.drop(columns="severity")
y[:5]

array([0, 0, 1, 0, 0])

In [17]:
X.describe()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification,region_Midwest,region_Northeast,region_South,region_West
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,60.577466,8.977068,7.068161,0.006234,0.155082,0.363786,0.5,0.106782,0.09294,0.293504,0.506774
std,18.617263,2.813977,5.454424,0.045174,0.361983,0.481089,0.500001,0.308836,0.290349,0.455368,0.499955
min,-27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.0,10.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62.0,10.0,7.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0
75%,75.0,10.0,10.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
max,196.0,100.0,169.0,9.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
from sklearn.model_selection import train_test_split
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Counter(y_train)
Counter(y_train)



Counter({0: 357193, 1: 42807})

### **Random Forest Classifier**

In [19]:
#from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
#scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
#X_scaler = scaler.fit(X_train)

# Scaling the data.
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [20]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [21]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [22]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8989


In [24]:
# Display the confusion matrix - RandomFC
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual short_delay", "Actual long_delay"], columns=["Predicted short_delay", "Predicted long_delay"])
cm_df

Unnamed: 0,Predicted short_delay,Predicted long_delay
Actual short_delay,86573,2638
Actual long_delay,7472,3317


In [25]:
# Print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     89211
           1       0.56      0.31      0.40     10789

    accuracy                           0.90    100000
   macro avg       0.74      0.64      0.67    100000
weighted avg       0.88      0.90      0.89    100000



### **Hyperparameter - GridSearchCV **

In [26]:
# Number of tress in random forest
# n_estimators = (20, 60, 100, 120)
n_estimators = [20, 60, 100]

# Number of features to consider at every split
max_features = [0.2, 0.6, 1.0]

# Maximum number of levels in tree
max_depth = [2, 8, None]

# Number of samples
max_samples = [0.5, 0.75, 1.0]

# 108 diff random forest train 4 x 3 x 3 x 3 = 108

In [27]:
# define param grid
param_grid = {'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'max_samples': max_samples}


In [28]:
rf = RandomForestClassifier()

In [29]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator = rf,
                      param_grid = param_grid,
                      cv = 5,
                      verbose = 2,
                      n_jobs = -1)

In [30]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [31]:
rf_grid.best_params_

{'max_depth': 8, 'max_features': 0.6, 'max_samples': 0.75, 'n_estimators': 60}

In [32]:
rf_grid.best_score_

0.9000724999999999

### **Hyperparameter - RandomSearchCV **

In [33]:
# Number of tress in random forest
#n_estimators = (20, 60, 100, 120)
n_estimators = [20, 60, 100]

# Number of features to consider at every split
max_features = [0.2, 0.6, 1.0]

# Maximum number of levels in tree
max_depth = [2, 8, None]

# Number of samples
max_samples = [0.5, 0.75, 1.0]

# Bootstrap samples
bootstrap = [True, False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

# 108 diff random forest train 4 x 3 x 3 x 3 = 108



In [34]:
# define param grid
param_grid = {'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'max_samples': max_samples,
             'bootstrap': bootstrap,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf}
print(param_grid)

{'n_estimators': [20, 60, 100], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [35]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator=rf, 
                             param_distributions = param_grid,
                            cv=5,
                            verbose=2,
                            n_jobs = -1,)

In [36]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=None, max_features=0.6, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 2.7min
[CV] END bootstrap=False, max_depth=8, max_features=1.0, max_samples=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=8, max_features=1.0, max_samples=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, max_features=1.0, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time= 1.2min
[CV] END bootstrap=True, max_depth=None, max_features=1.0, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time= 1.1min
[CV] END bootstrap=True, max_depth=None, max_features=0.2, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 1.4min
[CV] END bootstrap=True

In [37]:
rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_samples': 0.5,
 'max_features': 0.6,
 'max_depth': 8,
 'bootstrap': True}

[CV] END bootstrap=True, max_depth=None, max_features=0.6, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 2.8min
[CV] END bootstrap=True, max_depth=None, max_features=1.0, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time= 1.2min
[CV] END bootstrap=True, max_depth=None, max_features=0.2, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 1.0min
[CV] END bootstrap=True, max_depth=None, max_features=0.2, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 1.4min
[CV] END bootstrap=True, max_depth=None, max_features=1.0, max_samples=1.0, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.1min
[CV] END bootstrap=True, max_depth=None, max_features=1.0, max_samples=1.0, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.6min
[CV] END bootstrap=True, max_depth=8, max_features=0.6, max_samples=0.5, min_sam

In [38]:
rf_grid.best_score_

0.8999525

[CV] END bootstrap=True, max_depth=None, max_features=0.6, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 2.7min
[CV] END bootstrap=True, max_depth=None, max_features=0.6, max_samples=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 2.3min
[CV] END bootstrap=True, max_depth=None, max_features=0.2, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time= 1.4min
[CV] END bootstrap=True, max_depth=None, max_features=1.0, max_samples=1.0, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.1min
[CV] END bootstrap=False, max_depth=8, max_features=0.6, max_samples=1.0, min_samples_leaf=1, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END bootstrap=False, max_depth=8, max_features=0.6, max_samples=1.0, min_samples_leaf=1, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END bootstrap=False, max_depth=8, max_features=0.6, max_samples=1.0, min_samples