In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library warnings that may matter
# further down (e.g. solver/convergence or deprecation warnings) - consider
# narrowing this to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
# The source CSV location is named so the read call stays short and the
# provenance of the dataset is easy to spot.
data_url = 'https://raw.githubusercontent.com/justinkirk8/Accidentally_Late/main/accidents_sample_joined.csv'
df = pd.read_csv(data_url)
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,severity,state_code,region,division,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,year,date,classification
0,short_delay,CA,West,Pacific,76.0,10.0,9.0,0.0,clear_weather,Day,2019,2019-06-27,PreCOVID
1,short_delay,OR,West,Pacific,67.0,10.0,0.0,0.0,clear_weather,Day,2019,2019-06-05,PreCOVID
2,long_delay,VA,South,South Atlantic,52.0,10.0,8.0,0.0,clear_weather,Day,2019,2019-04-01,PreCOVID
3,short_delay,CA,West,Pacific,66.0,10.0,6.0,0.0,clear_weather,Night,2019,2019-10-06,PreCOVID
4,short_delay,CA,West,Pacific,59.0,10.0,0.0,0.0,clear_weather,Night,2020,2020-02-06,PreCOVID


In [5]:
# Check dtypes: the numeric weather measurements (temperature_f, visibility_mi,
# wind_speed_mph, precipitation_in) should be float64; the remaining
# categorical columns are still strings (object) at this point.
df.dtypes


severity              object
state_code            object
region                object
division              object
temperature_f        float64
visibility_mi        float64
wind_speed_mph       float64
precipitation_in     float64
weather_condition     object
sunrise_sunset        object
year                   int64
date                  object
classification        object
dtype: object

In [6]:
# Check the number of rows again; the upstream data cleanup is expected to
# have already dropped NAs, so this is just a sanity check on the sample size.
len(df)

500000

In [7]:
# Count how many rows fall under each unique value of the 'severity' column
severity_counts = df['severity'].value_counts()
print(severity_counts)

short_delay    446404
long_delay      53596
Name: severity, dtype: int64


In [8]:
# Count the unique values in the 'state_code', 'region' and 'division' columns,
# printing one frequency table per column.
for location_column in ('state_code', 'region', 'division'):
    print(df[location_column].value_counts())

CA    175500
FL     52755
OR     40126
MN     24114
TX     18629
NY     16277
VA     14482
UT     14440
PA     12501
SC     11533
NC     11286
NJ      9392
IL      7949
MI      7783
TN      7225
MD      7061
AZ      6866
GA      6617
LA      6446
CO      6158
WA      5942
CT      4645
MO      3653
AL      2943
IN      2723
OH      2378
MT      1983
AR      1831
IA      1355
WI      1280
ID      1267
DC      1252
WV      1164
KS      1152
OK       997
MA       993
NH       986
KY       959
MS       884
RI       845
NE       725
ME       689
DE       688
NV       619
NM       360
ND       235
VT       142
WY       126
SD        44
Name: state_code, dtype: int64
West         253387
South        146752
Midwest       53391
Northeast     46470
Name: region, dtype: int64
Pacific               221568
South Atlantic        106838
Middle Atlantic        38170
Mountain               31819
West North Central     31278
West South Central     27903
East North Central     22113
East South Central    

In [9]:
# Drop the state_code, date, year and division columns.
# All four are removed in a single call and `df` is re-bound to the result
# instead of mutating the frame in place four times.
columns_to_drop = ['state_code', 'date', 'year', 'division']
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,76.0,10.0,9.0,0.0,clear_weather,Day,PreCOVID
1,short_delay,West,67.0,10.0,0.0,0.0,clear_weather,Day,PreCOVID
2,long_delay,South,52.0,10.0,8.0,0.0,clear_weather,Day,PreCOVID
3,short_delay,West,66.0,10.0,6.0,0.0,clear_weather,Night,PreCOVID
4,short_delay,West,59.0,10.0,0.0,0.0,clear_weather,Night,PreCOVID


In [10]:
# Count the unique values in the 'weather_condition', 'sunrise_sunset' and
# 'classification' columns, printing one frequency table per column.
for categorical_column in ('weather_condition', 'sunrise_sunset', 'classification'):
    print(df[categorical_column].value_counts())

clear_weather    422459
bad_weather       77541
Name: weather_condition, dtype: int64
Day      318107
Night    181893
Name: sunrise_sunset, dtype: int64
PreCOVID    250000
COVID       250000
Name: classification, dtype: int64


In [11]:
# Transform "sunrise_sunset", "weather_condition", and "classification" into
# binary (1/0) features. Each column gets its own label -> number mapping.
binary_encodings = {
    'sunrise_sunset': {'Day': 1, 'Night': 0},
    'weather_condition': {'clear_weather': 1, 'bad_weather': 0},
    'classification': {'PreCOVID': 1, 'COVID': 0},
}
for column_name, encoding in binary_encodings.items():
    # replace() leaves any value not listed in the mapping untouched.
    df[column_name] = df[column_name].replace(encoding)
df.head()

Unnamed: 0,severity,region,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification
0,short_delay,West,76.0,10.0,9.0,0.0,1,1,1
1,short_delay,West,67.0,10.0,0.0,0.0,1,1,1
2,long_delay,South,52.0,10.0,8.0,0.0,1,1,1
3,short_delay,West,66.0,10.0,6.0,0.0,1,0,1
4,short_delay,West,59.0,10.0,0.0,0.0,1,0,1


In [12]:
# Re-check the value counts of 'weather_condition' and 'sunrise_sunset' now
# that they have been encoded as 1/0.
for encoded_column in ('weather_condition', 'sunrise_sunset'):
    print(df[encoded_column].value_counts())

1    422459
0     77541
Name: weather_condition, dtype: int64
1    318107
0    181893
Name: sunrise_sunset, dtype: int64


In [13]:
# One-hot encode 'region': the column is replaced by one indicator column per
# region value (region_Midwest, region_Northeast, ...).
categorical_columns = ['region']
df = pd.get_dummies(data=df, columns=categorical_columns)
df.head()

Unnamed: 0,severity,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification,region_Midwest,region_Northeast,region_South,region_West
0,short_delay,76.0,10.0,9.0,0.0,1,1,1,0,0,0,1
1,short_delay,67.0,10.0,0.0,0.0,1,1,1,0,0,0,1
2,long_delay,52.0,10.0,8.0,0.0,1,1,1,0,0,1,0
3,short_delay,66.0,10.0,6.0,0.0,1,0,1,0,0,0,1
4,short_delay,59.0,10.0,0.0,0.0,1,0,1,0,0,0,1


In [14]:
# Calculate pairwise correlations between the numeric features.
# 'severity' still holds string labels at this point, so it is excluded
# explicitly: recent pandas releases no longer drop non-numeric columns
# silently inside corr() and raise instead. On older releases the resulting
# matrix is the same as calling df.corr() directly.
df.drop(columns='severity').corr()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification,region_Midwest,region_Northeast,region_South,region_West
temperature_f,1.0,0.257321,0.051462,-0.015129,0.226509,0.330502,-0.100471,-0.323552,-0.110474,0.280313,0.00871
visibility_mi,0.257321,1.0,0.010734,-0.213459,0.618028,0.053275,-0.037869,-0.092183,-0.037348,0.063104,0.021158
wind_speed_mph,0.051462,0.010734,1.0,0.051952,-0.081066,0.201681,-0.017769,0.141519,0.047612,0.033598,-0.145672
precipitation_in,-0.015129,-0.213459,0.051952,1.0,-0.278432,0.004037,0.013433,-0.003201,0.019234,0.034869,-0.040953
weather_condition,0.226509,0.618028,-0.081066,-0.278432,1.0,0.022299,-0.051997,-0.07725,-0.047618,0.042656,0.036522
sunrise_sunset,0.330502,0.053275,0.201681,0.004037,0.022299,1.0,0.041826,-0.016464,0.003467,0.036265,-0.024874
classification,-0.100471,-0.037869,-0.017769,0.013433,-0.051997,0.041826,1.0,0.070555,-0.078692,-0.362204,0.332018
region_Midwest,-0.323552,-0.092183,0.141519,-0.003201,-0.07725,-0.016464,0.070555,1.0,-0.110676,-0.222855,-0.350473
region_Northeast,-0.110474,-0.037348,0.047612,0.019234,-0.047618,0.003467,-0.078692,-0.110676,1.0,-0.206317,-0.324465
region_South,0.280313,0.063104,0.033598,0.034869,0.042656,0.036265,-0.362204,-0.222855,-0.206317,1.0,-0.653336


## **SPLIT THE DATA INTO TRAINING AND TESTING - Pre-COVID YEARS**

In [15]:
# Create our target (y) and feature matrix (X).
# NOTE(review): the section heading labels this split as Pre-COVID, but `df`
# is not filtered on 'classification' here, so both periods are included -
# confirm this is intended.
target_column = "severity"
X = df.drop(columns=target_column)
y = df[target_column]

In [16]:
# Summary statistics for every feature column.
# NOTE(review): the recorded output shows extreme maxima (temperature_f 196.0,
# visibility_mi 100.0, wind_speed_mph 169.0) - confirm whether these are valid
# readings or outliers that should be filtered before modelling.
X.describe()

Unnamed: 0,temperature_f,visibility_mi,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset,classification,region_Midwest,region_Northeast,region_South,region_West
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,60.577466,8.977068,7.068161,0.006234,0.844918,0.636214,0.5,0.106782,0.09294,0.293504,0.506774
std,18.617263,2.813977,5.454424,0.045174,0.361983,0.481089,0.500001,0.308836,0.290349,0.455368,0.499955
min,-27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.0,10.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62.0,10.0,7.0,0.0,1.0,1.0,0.5,0.0,0.0,0.0,1.0
75%,75.0,10.0,10.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,196.0,100.0,169.0,9.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Check the balance of our target values.
# The recorded output shows short_delay heavily outnumbering long_delay
# (446404 vs 53596), which is why the training data is resampled further down.
y.value_counts()

short_delay    446404
long_delay      53596
Name: severity, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

# Hold out a test set. stratify=y keeps the short/long delay ratio the same in
# both splits; test_size=0.25 is scikit-learn's default, spelled out here so
# the split proportion is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)
# Class counts in the training split.
Counter(y_train)

Counter({'short_delay': 334803, 'long_delay': 40197})

### **Naive Random Oversampling - Pre-COVID Years**

In [19]:
from imblearn.over_sampling import RandomOverSampler

# Resample the training data with the RandomOverSampler.
# Only the training split is resampled; the test split keeps its original
# class distribution.
ros = RandomOverSampler(random_state=1)
resampled_split = ros.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled_split
# Class counts after resampling (both classes should now be the same size).
Counter(y_resampled)


Counter({'short_delay': 334803, 'long_delay': 334803})

In [20]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model using the resampled data.
# NOTE(review): the features are not scaled and warnings are globally
# suppressed at the top of the notebook, so a solver convergence warning would
# go unnoticed here - worth confirming the fit converges.
model = LogisticRegression(random_state=1, solver='lbfgs')
model.fit(X_resampled, y_resampled)

In [21]:
# Predict the severity label for every row of the held-out test split.
y_pred = model.predict(X_test)

In [22]:
# Calculate the balanced accuracy score on the held-out test split.
# balanced_accuracy_score is already imported with the other metrics at the
# top of the notebook.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7327128117014536

In [23]:
# Display the confusion matrix - Naive_R_O
# The class order is pinned explicitly and the row/column captions are built
# from the same list, so the captions can never drift from the order in which
# confusion_matrix lays out its rows and columns.
# confusion_matrix is already imported at the top of the notebook, and y_pred
# holds the test-set predictions computed in the earlier prediction cell.
class_labels = ["long_delay", "short_delay"]
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted long_delay,Predicted short_delay
Actual long_delay,8032,5367
Actual short_delay,14957,96644


In [24]:
# Print the imbalanced classification report (precision, recall, specificity,
# f1, geometric mean and index balanced accuracy per class).
# classification_report_imbalanced is already imported at the top of the notebook.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

 long_delay       0.35      0.60      0.87      0.44      0.72      0.51     13399
short_delay       0.95      0.87      0.60      0.90      0.72      0.53    111601

avg / total       0.88      0.84      0.63      0.86      0.72      0.53    125000

