In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [5]:
# Load the data
df = pd.read_csv('https://raw.githubusercontent.com/justinkirk8/Accidentally_Late/main/accidents_sample_joined.csv')
df.head()

Unnamed: 0,severity,state_code,region,division,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,year,date,classification
0,short_delay,CA,West,Pacific,76.0,10.0,9.0,0.0,clear_weather,Day,2019,2019-06-27,PreCOVID
1,short_delay,OR,West,Pacific,67.0,10.0,0.0,0.0,clear_weather,Day,2019,2019-06-05,PreCOVID
2,long_delay,VA,South,South Atlantic,52.0,10.0,8.0,0.0,clear_weather,Day,2019,2019-04-01,PreCOVID
3,short_delay,CA,West,Pacific,66.0,10.0,6.0,0.0,clear_weather,Night,2019,2019-10-06,PreCOVID
4,short_delay,CA,West,Pacific,59.0,10.0,0.0,0.0,clear_weather,Night,2020,2020-02-06,PreCOVID


In [6]:
# Check dtypes, features should be float64
df.dtypes

severity              object
state_code            object
region                object
division              object
temperature_f        float64
visibility_mi        float64
wind_speed_mph       float64
precipitation_in     float64
weather_condition     object
sunrise_sunset        object
year                   int64
date                  object
classification        object
dtype: object

In [7]:
# Get the number of rows
df.shape[0]

500000

In [8]:
# Get count of unique values in the 'Severity' column
print(df['severity'].value_counts())

short_delay    446404
long_delay      53596
Name: severity, dtype: int64


In [9]:
# Get count of unique values in the 'state_code', 'region', 'division'
print(df['state_code'].value_counts())
print(df['region'].value_counts())
print(df['division'].value_counts())

CA    175500
FL     52755
OR     40126
MN     24114
TX     18629
NY     16277
VA     14482
UT     14440
PA     12501
SC     11533
NC     11286
NJ      9392
IL      7949
MI      7783
TN      7225
MD      7061
AZ      6866
GA      6617
LA      6446
CO      6158
WA      5942
CT      4645
MO      3653
AL      2943
IN      2723
OH      2378
MT      1983
AR      1831
IA      1355
WI      1280
ID      1267
DC      1252
WV      1164
KS      1152
OK       997
MA       993
NH       986
KY       959
MS       884
RI       845
NE       725
ME       689
DE       688
NV       619
NM       360
ND       235
VT       142
WY       126
SD        44
Name: state_code, dtype: int64
West         253387
South        146752
Midwest       53391
Northeast     46470
Name: region, dtype: int64
Pacific               221568
South Atlantic        106838
Middle Atlantic        38170
Mountain               31819
West North Central     31278
West South Central     27903
East North Central     22113
East South Central    

In [10]:
# Drop state_code, year, date and division columns
df.drop('state_code', axis=1, inplace=True)
df.drop('date', axis=1, inplace=True)
df.drop('year', axis=1, inplace=True)
df.drop('division', axis=1, inplace=True)
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,76.0,10.0,9.0,0.0,clear_weather,Day,PreCOVID
1,short_delay,West,67.0,10.0,0.0,0.0,clear_weather,Day,PreCOVID
2,long_delay,South,52.0,10.0,8.0,0.0,clear_weather,Day,PreCOVID
3,short_delay,West,66.0,10.0,6.0,0.0,clear_weather,Night,PreCOVID
4,short_delay,West,59.0,10.0,0.0,0.0,clear_weather,Night,PreCOVID


## **PreCovid Data Processing**

In [11]:
# Filter dataframe for just "PreCOVID" data
df = df.loc[df['classification']== 'PreCOVID']
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,76.0,10.0,9.0,0.0,clear_weather,Day,PreCOVID
1,short_delay,West,67.0,10.0,0.0,0.0,clear_weather,Day,PreCOVID
2,long_delay,South,52.0,10.0,8.0,0.0,clear_weather,Day,PreCOVID
3,short_delay,West,66.0,10.0,6.0,0.0,clear_weather,Night,PreCOVID
4,short_delay,West,59.0,10.0,0.0,0.0,clear_weather,Night,PreCOVID


In [13]:
# Drop classification - all are PreCOVID
df = df.drop('classification', axis=1)
df.head

<bound method NDFrame.head of            severity region  temperature_f  visibility_mi  wind_speed_mph  \
0       short_delay   West           76.0           10.0             9.0   
1       short_delay   West           67.0           10.0             0.0   
2        long_delay  South           52.0           10.0             8.0   
3       short_delay   West           66.0           10.0             6.0   
4       short_delay   West           59.0           10.0             0.0   
...             ...    ...            ...            ...             ...   
249995  short_delay   West           61.0           10.0             0.0   
249996  short_delay   West           61.0            7.0             0.0   
249997  short_delay   West           44.0           10.0             0.0   
249998  short_delay   West           73.0           10.0             3.0   
249999  short_delay   West           72.0           10.0             7.0   

        precipitation_in weather_condition sunrise_sunset

In [15]:
# Get count of unique values in the 'Weather_Condition' column
print(df['weather_condition'].value_counts())
print(df['sunrise_sunset'].value_counts())
# print(df['classification'].value_counts())

clear_weather    206524
bad_weather       43476
Name: weather_condition, dtype: int64
Day      164084
Night     85916
Name: sunrise_sunset, dtype: int64


In [16]:
# Transform "sunrise_sunset", "weather_condition", and "classification" to binary feature
df.sunrise_sunset = df.sunrise_sunset.replace({'Day': 0, 'Night': 1}) 
df.weather_condition = df.weather_condition.replace({'clear_weather': 0, 'bad_weather': 1})
# df.classification = df.classification.replace({'PreCOVID': 0, 'COVID': 1})
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset
0,short_delay,West,76.0,10.0,9.0,0.0,0,0
1,short_delay,West,67.0,10.0,0.0,0.0,0,0
2,long_delay,South,52.0,10.0,8.0,0.0,0,0
3,short_delay,West,66.0,10.0,6.0,0.0,0,1
4,short_delay,West,59.0,10.0,0.0,0.0,0,1


In [18]:
df = pd.get_dummies(df, columns=['region'])
df.head()

Unnamed: 0,severity,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,region_Midwest,region_Northeast,region_South,region_West
0,short_delay,76.0,10.0,9.0,0.0,0,0,0,0,0,1
1,short_delay,67.0,10.0,0.0,0.0,0,0,0,0,0,1
2,long_delay,52.0,10.0,8.0,0.0,0,0,0,0,1,0
3,short_delay,66.0,10.0,6.0,0.0,0,1,0,0,0,1
4,short_delay,59.0,10.0,0.0,0.0,0,1,0,0,0,1


In [19]:
# Calculate Correlations
df.corr()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,region_Midwest,region_Northeast,region_South,region_West
temperature_f,1.0,0.289861,-0.008565,-0.027839,-0.28375,-0.296526,-0.384721,-0.070562,0.231665,0.147585
visibility_mi,0.289861,1.0,-0.017203,-0.194744,-0.607474,-0.05728,-0.121402,-0.067046,0.011998,0.114529
wind_speed_mph,-0.008565,-0.017203,1.0,0.054876,0.119235,-0.173205,0.171068,0.059034,0.04288,-0.18475
precipitation_in,-0.027839,-0.194744,0.054876,1.0,0.264775,0.011181,-0.00236,0.036535,0.063403,-0.063427
weather_condition,-0.28375,-0.607474,0.119235,0.264775,1.0,0.040083,0.104397,0.084479,0.014641,-0.130887
sunrise_sunset,-0.296526,-0.05728,-0.173205,0.011181,0.040083,1.0,0.030184,-0.034376,-0.066052,0.044291
region_Midwest,-0.384721,-0.121402,0.171068,-0.00236,0.104397,0.030184,1.0,-0.105456,-0.147539,-0.55076
region_Northeast,-0.070562,-0.067046,0.059034,0.036535,0.084479,-0.034376,-0.105456,1.0,-0.105454,-0.393658
region_South,0.231665,0.011998,0.04288,0.063403,0.014641,-0.066052,-0.147539,-0.105454,1.0,-0.55075
region_West,0.147585,0.114529,-0.18475,-0.063427,-0.130887,0.044291,-0.55076,-0.393658,-0.55075,1.0


## **SPLIT THE DATA INTO TRAINING AND TESTING**

In [20]:
# Create our target

y = df["severity"].ravel()
X = df.drop(columns="severity")
y[:5]

array(['short_delay', 'short_delay', 'long_delay', 'short_delay',
       'short_delay'], dtype=object)

In [21]:
X.describe()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,region_Midwest,region_Northeast,region_South,region_West
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,58.706975,8.870506,6.971241,0.006841,0.173904,0.343664,0.128572,0.070092,0.128568,0.672768
std,18.713882,3.048559,5.533657,0.048294,0.379028,0.474932,0.334726,0.255303,0.334722,0.469204
min,-27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,10.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,60.0,10.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,72.0,10.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,113.0,90.0,169.0,9.99,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
from sklearn.model_selection import train_test_split
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
Counter(y_train)



Counter({'short_delay': 154710, 'long_delay': 32790})

### **Random Forest Classifier**

In [23]:
from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [25]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [26]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8412


In [28]:
# Display the confusion matrix - RandomFC
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual long_delay", "Actual short_delay"], columns=["Predicted long_delay", "Predicted short_delay"])
cm_df

Unnamed: 0,Predicted long_delay,Predicted short_delay
Actual long_delay,4014,6700
Actual short_delay,3225,48561


In [29]:
# Print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

  long_delay       0.55      0.37      0.45     10714
 short_delay       0.88      0.94      0.91     51786

    accuracy                           0.84     62500
   macro avg       0.72      0.66      0.68     62500
weighted avg       0.82      0.84      0.83     62500



### **Ranked Features by Importance - PreCOVID**

In [30]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.33609426, 0.10279783, 0.18863953, 0.0515212 , 0.00856138,
       0.02429524, 0.01970688, 0.011804  , 0.09694812, 0.15963157])

In [31]:
# We can sort the features by their importance. Can try improving model by dropping lower ranked features. 
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.336094263553285, 'temperature_f'),
 (0.1886395278224602, 'wind_speed_mph'),
 (0.15963156626278513, 'region_West'),
 (0.1027978259810792, 'visibility_mi'),
 (0.09694812229498559, 'region_South'),
 (0.05152119682946069, 'precipitation_in'),
 (0.024295235219560906, 'sunrise_sunset'),
 (0.019706876742237398, 'region_Midwest'),
 (0.011804000670941088, 'region_Northeast'),
 (0.00856138462320479, 'weather_condition')]

In [32]:
# Creat dataframe of features ranked
export = pd.DataFrame(importances, X.columns)
export.head()

Unnamed: 0,0
temperature_f,0.336094
visibility_mi,0.102798
wind_speed_mph,0.18864
precipitation_in,0.051521
weather_condition,0.008561


In [33]:
# Creating CSV file for features ranked 
export.to_csv('PreCOVID_Features_ranked.csv')