In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
df = pd.read_csv('https://raw.githubusercontent.com/justinkirk8/Accidentally_Late/main/accidents_sample_joined.csv')
df.head()

Unnamed: 0,severity,state_code,region,division,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,year,date,classification
0,short_delay,CA,West,Pacific,76.0,10.0,9.0,0.0,clear_weather,Day,2019,2019-06-27,PreCOVID
1,short_delay,OR,West,Pacific,67.0,10.0,0.0,0.0,clear_weather,Day,2019,2019-06-05,PreCOVID
2,long_delay,VA,South,South Atlantic,52.0,10.0,8.0,0.0,clear_weather,Day,2019,2019-04-01,PreCOVID
3,short_delay,CA,West,Pacific,66.0,10.0,6.0,0.0,clear_weather,Night,2019,2019-10-06,PreCOVID
4,short_delay,CA,West,Pacific,59.0,10.0,0.0,0.0,clear_weather,Night,2020,2020-02-06,PreCOVID


In [5]:
# Check dtypes, features should be float64
df.dtypes

severity              object
state_code            object
region                object
division              object
temperature_f        float64
visibility_mi        float64
wind_speed_mph       float64
precipitation_in     float64
weather_condition     object
sunrise_sunset        object
year                   int64
date                  object
classification        object
dtype: object

In [6]:
# Get the number of rows
df.shape[0]

500000

In [7]:
# Get count of unique values in the 'Severity' column
print(df['severity'].value_counts())

short_delay    446404
long_delay      53596
Name: severity, dtype: int64


In [8]:
# Get count of unique values in the 'state_code', 'region', 'division'
print(df['state_code'].value_counts())
print(df['region'].value_counts())
print(df['division'].value_counts())

CA    175500
FL     52755
OR     40126
MN     24114
TX     18629
NY     16277
VA     14482
UT     14440
PA     12501
SC     11533
NC     11286
NJ      9392
IL      7949
MI      7783
TN      7225
MD      7061
AZ      6866
GA      6617
LA      6446
CO      6158
WA      5942
CT      4645
MO      3653
AL      2943
IN      2723
OH      2378
MT      1983
AR      1831
IA      1355
WI      1280
ID      1267
DC      1252
WV      1164
KS      1152
OK       997
MA       993
NH       986
KY       959
MS       884
RI       845
NE       725
ME       689
DE       688
NV       619
NM       360
ND       235
VT       142
WY       126
SD        44
Name: state_code, dtype: int64
West         253387
South        146752
Midwest       53391
Northeast     46470
Name: region, dtype: int64
Pacific               221568
South Atlantic        106838
Middle Atlantic        38170
Mountain               31819
West North Central     31278
West South Central     27903
East North Central     22113
East South Central    

In [9]:
# Drop state_code, year, date and division columns
df.drop('state_code', axis=1, inplace=True)
df.drop('date', axis=1, inplace=True)
df.drop('year', axis=1, inplace=True)
df.drop('division', axis=1, inplace=True)
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,76.0,10.0,9.0,0.0,clear_weather,Day,PreCOVID
1,short_delay,West,67.0,10.0,0.0,0.0,clear_weather,Day,PreCOVID
2,long_delay,South,52.0,10.0,8.0,0.0,clear_weather,Day,PreCOVID
3,short_delay,West,66.0,10.0,6.0,0.0,clear_weather,Night,PreCOVID
4,short_delay,West,59.0,10.0,0.0,0.0,clear_weather,Night,PreCOVID


## **PreCovid Data Processing**

In [10]:
# Filter dataframe for just "PreCOVID" data
df = df.loc[df['classification']== 'COVID']
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
250000,short_delay,West,69.0,10.0,10.0,0.0,clear_weather,Day,COVID
250001,short_delay,West,54.0,10.0,7.0,0.0,clear_weather,Night,COVID
250002,short_delay,South,57.0,10.0,9.0,0.0,clear_weather,Day,COVID
250003,short_delay,South,53.0,7.0,3.0,0.0,clear_weather,Day,COVID
250004,short_delay,South,80.0,10.0,17.0,0.0,clear_weather,Night,COVID


In [11]:
# Drop classification - all are PreCOVID
df = df.drop('classification', axis=1)
df.head

<bound method NDFrame.head of            severity     region  temperature_f  visibility_mi  wind_speed_mph  \
250000  short_delay       West           69.0           10.0            10.0   
250001  short_delay       West           54.0           10.0             7.0   
250002  short_delay      South           57.0           10.0             9.0   
250003  short_delay      South           53.0            7.0             3.0   
250004  short_delay      South           80.0           10.0            17.0   
...             ...        ...            ...            ...             ...   
499995  short_delay       West           91.0           10.0             9.0   
499996   long_delay       West           64.0            1.0             3.0   
499997  short_delay  Northeast           64.0           10.0             3.0   
499998  short_delay    Midwest            9.0            2.0            21.0   
499999  short_delay    Midwest           71.0           10.0             9.0   

        p

In [12]:
# Get count of unique values in the 'Weather_Condition' column
print(df['weather_condition'].value_counts())
print(df['sunrise_sunset'].value_counts())
# print(df['classification'].value_counts())

clear_weather    215935
bad_weather       34065
Name: weather_condition, dtype: int64
Day      154023
Night     95977
Name: sunrise_sunset, dtype: int64


In [13]:
# Transform "sunrise_sunset", "weather_condition", and "classification" to binary feature
df.sunrise_sunset = df.sunrise_sunset.replace({'Day': 0, 'Night': 1}) 
df.weather_condition = df.weather_condition.replace({'clear_weather': 0, 'bad_weather': 1})
# df.classification = df.classification.replace({'PreCOVID': 0, 'COVID': 1})
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset
250000,short_delay,West,69.0,10.0,10.0,0.0,0,0
250001,short_delay,West,54.0,10.0,7.0,0.0,0,1
250002,short_delay,South,57.0,10.0,9.0,0.0,0,0
250003,short_delay,South,53.0,7.0,3.0,0.0,0,0
250004,short_delay,South,80.0,10.0,17.0,0.0,0,1


In [14]:
df = pd.get_dummies(df, columns=['region'])
df.head()

Unnamed: 0,severity,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,region_Midwest,region_Northeast,region_South,region_West
250000,short_delay,69.0,10.0,10.0,0.0,0,0,0,0,0,1
250001,short_delay,54.0,10.0,7.0,0.0,0,1,0,0,0,1
250002,short_delay,57.0,10.0,9.0,0.0,0,0,0,0,1,0
250003,short_delay,53.0,7.0,3.0,0.0,0,0,0,0,1,0
250004,short_delay,80.0,10.0,17.0,0.0,0,1,0,0,1,0


In [15]:
# Calculate Correlations
df.corr()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,region_Midwest,region_Northeast,region_South,region_West
temperature_f,1.0,0.2142,0.111459,0.002586,-0.154233,-0.376921,-0.240471,-0.160726,0.293939,-0.059019
visibility_mi,0.2142,1.0,0.043667,-0.238532,-0.631399,-0.052823,-0.044514,-0.016788,0.089855,-0.056938
wind_speed_mph,0.111459,0.043667,1.0,0.049211,0.039977,-0.232379,0.109564,0.036635,0.020735,-0.110994
precipitation_in,0.002586,-0.238532,0.049211,1.0,0.295336,-0.020038,-0.006655,0.006116,0.029271,-0.030985
weather_condition,-0.154233,-0.631399,0.039977,0.295336,1.0,0.007844,0.033521,0.024549,-0.05681,0.023427
sunrise_sunset,-0.376921,-0.052823,-0.232379,-0.020038,0.007844,1.0,0.007159,0.014652,-0.04963,0.038071
region_Midwest,-0.240471,-0.044514,0.109564,-0.006655,0.033521,0.007159,1.0,-0.110288,-0.280411,-0.219128
region_Northeast,-0.160726,-0.016788,0.036635,0.006116,0.024549,0.014652,-0.110288,1.0,-0.332944,-0.260181
region_South,0.293939,0.089855,0.020735,0.029271,-0.05681,-0.04963,-0.280411,-0.332944,1.0,-0.661515
region_West,-0.059019,-0.056938,-0.110994,-0.030985,0.023427,0.038071,-0.219128,-0.260181,-0.661515,1.0


## **SPLIT THE DATA INTO TRAINING AND TESTING**

In [16]:
# Create our target

y = df["severity"].ravel()
X = df.drop(columns="severity")
y[:5]

array(['short_delay', 'short_delay', 'short_delay', 'short_delay',
       'short_delay'], dtype=object)

In [17]:
X.describe()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,region_Midwest,region_Northeast,region_South,region_West
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,62.447956,9.08363,7.16508,0.005627,0.13626,0.383908,0.084992,0.115788,0.45844,0.34078
std,18.330288,2.553535,5.372287,0.041814,0.343065,0.486337,0.27887,0.319971,0.498271,0.473972
min,-26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50.0,10.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,64.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,77.0,10.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,196.0,100.0,131.0,2.47,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
from sklearn.model_selection import train_test_split
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
Counter(y_train)



Counter({'short_delay': 179994, 'long_delay': 7506})

### **Random Forest Classifier**

In [19]:
from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [21]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [22]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.956256


In [24]:
# Display the confusion matrix - RandomFC
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual long_delay", "Actual short_delay"], columns=["Predicted long_delay", "Predicted short_delay"])
cm_df

Unnamed: 0,Predicted long_delay,Predicted short_delay
Actual long_delay,84,2502
Actual short_delay,232,59682


In [25]:
# Print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

  long_delay       0.27      0.03      0.06      2586
 short_delay       0.96      1.00      0.98     59914

    accuracy                           0.96     62500
   macro avg       0.61      0.51      0.52     62500
weighted avg       0.93      0.96      0.94     62500



### **Ranked Features by Importance - PreCOVID**

In [26]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.49306216, 0.12018944, 0.23679093, 0.07926814, 0.0130602 ,
       0.01346171, 0.01518544, 0.01217909, 0.00726419, 0.0095387 ])

In [27]:
# We can sort the features by their importance. Can try improving model by dropping lower ranked features. 
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4930621591992331, 'temperature_f'),
 (0.23679093046410468, 'wind_speed_mph'),
 (0.12018944344226999, 'visibility_mi'),
 (0.07926814076392695, 'precipitation_in'),
 (0.01518544023572357, 'region_Midwest'),
 (0.013461709006597922, 'sunrise_sunset'),
 (0.013060196350102731, 'weather_condition'),
 (0.0121790921085715, 'region_Northeast'),
 (0.009538702552422743, 'region_West'),
 (0.007264185877046743, 'region_South')]

In [29]:
# Creat dataframe of features ranked
export = pd.DataFrame(importances, X.columns)
export.head()

Unnamed: 0,0
temperature_f,0.493062
visibility_mi,0.120189
wind_speed_mph,0.236791
precipitation_in,0.079268
weather_condition,0.01306


In [30]:
# Creating CSV file for features ranked 
export.to_csv('COVID_Features_ranked.csv')