In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Columns kept from the raw CSV, in the order they should appear in the frame.
columns = [
    # label
    "Severity",
    # location
    "State",
    # numeric weather measurements
    "Temperature(F)",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    # categorical conditions
    "Weather_Condition",
    "Sunrise_Sunset",
    # time of the accident
    "year",
    "date",
]

# Name of the label column, wrapped in a list.
target = ["Severity"]

In [5]:
# Load the pre-COVID accident records and keep only the columns listed in
# `columns` (in that order).
#
# `index_col` is an option of `pd.read_csv`, not of `Path`; handing it to the
# `Path` constructor had no effect on how the file was parsed, and extra
# keyword arguments to `Path` are deprecated since Python 3.12. It is therefore
# dropped here, which leaves the parsing behaviour (pandas defaults) unchanged.
file_path = Path('PreCOVID_accidents.csv')
df = pd.read_csv(file_path)
# .copy() so later in-place edits never touch the frame returned by read_csv.
df = df.loc[:, columns].copy()
df.head()

Unnamed: 0,Severity,State,Temperature(F),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Sunrise_Sunset,year,date
0,long_delay,NJ,89.0,10.0,5.0,0.0,clear_weather,Day,2019,10/2/19
1,long_delay,LA,54.0,10.0,5.0,0.0,clear_weather,Day,2019,11/1/19
2,long_delay,AZ,41.0,10.0,10.0,0.0,clear_weather,Night,2020,2/16/20
3,long_delay,TX,79.0,10.0,25.0,0.0,bad_weather,Day,2019,6/4/19
4,long_delay,VA,37.0,10.0,0.0,0.0,clear_weather,Night,2019,10/19/19


In [6]:
# Inspect the column dtypes. The numeric weather measurements should be
# float64; Severity, State, Weather_Condition, Sunrise_Sunset and date are
# still strings (object) at this point.
df.dtypes

Severity              object
State                 object
Temperature(F)       float64
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Sunrise_Sunset        object
year                   int64
date                  object
dtype: object

In [7]:
# Number of rows before removing missing values
len(df)

277041

In [8]:
# Remove every row that has at least one missing value
# (dropna defaults: axis=0, how="any").
df = df.dropna()
df.head()

Unnamed: 0,Severity,State,Temperature(F),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Sunrise_Sunset,year,date
0,long_delay,NJ,89.0,10.0,5.0,0.0,clear_weather,Day,2019,10/2/19
1,long_delay,LA,54.0,10.0,5.0,0.0,clear_weather,Day,2019,11/1/19
2,long_delay,AZ,41.0,10.0,10.0,0.0,clear_weather,Night,2020,2/16/20
3,long_delay,TX,79.0,10.0,25.0,0.0,bad_weather,Day,2019,6/4/19
4,long_delay,VA,37.0,10.0,0.0,0.0,clear_weather,Night,2019,10/19/19


In [9]:
# Row count after dropna; it matches the earlier count because the upstream
# data cleanup already removed rows with missing values.
len(df.index)

277041

In [10]:
# Class distribution of the 'Severity' label
severity_counts = df['Severity'].value_counts()
print(severity_counts)

short_delay    228852
long_delay      48189
Name: Severity, dtype: int64


In [11]:
# Strip the "(unit)" suffix from the measurement columns so the names are
# easier to use when plotting.
df.rename(
    columns={
        'Temperature(F)': 'Temperature',
        'Wind_Speed(mph)': 'Wind_Speed',
        'Precipitation(in)': 'Precipitation',
        'Visibility(mi)': 'Visibility',
    },
    inplace=True,
)
df.head()

Unnamed: 0,Severity,State,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset,year,date
0,long_delay,NJ,89.0,10.0,5.0,0.0,clear_weather,Day,2019,10/2/19
1,long_delay,LA,54.0,10.0,5.0,0.0,clear_weather,Day,2019,11/1/19
2,long_delay,AZ,41.0,10.0,10.0,0.0,clear_weather,Night,2020,2/16/20
3,long_delay,TX,79.0,10.0,25.0,0.0,bad_weather,Day,2019,6/4/19
4,long_delay,VA,37.0,10.0,0.0,0.0,clear_weather,Night,2019,10/19/19


In [12]:
# Drop State, year and date columns to reduce factors for NOW.
# One drop call instead of three in-place ones; errors="ignore" keeps the cell
# re-runnable (a second run would otherwise raise KeyError because the columns
# are already gone).
df = df.drop(columns=['State', 'date', 'year'], errors='ignore')
df.head()

Unnamed: 0,Severity,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
0,long_delay,89.0,10.0,5.0,0.0,clear_weather,Day
1,long_delay,54.0,10.0,5.0,0.0,clear_weather,Day
2,long_delay,41.0,10.0,10.0,0.0,clear_weather,Night
3,long_delay,79.0,10.0,25.0,0.0,bad_weather,Day
4,long_delay,37.0,10.0,0.0,0.0,clear_weather,Night


In [13]:
# Transform Sunrise_Sunset and Weather_Condition to binary input.
# The explicit astype("int64") makes the result numeric regardless of whether
# `replace` downcasts the object column on its own (that implicit downcast is
# deprecated in recent pandas). It also fails loudly if a label other than the
# mapped ones is still present, instead of leaving strings in a model feature.
# Re-running the cell is safe: already-encoded 0/1 values pass through as-is.
df['Sunrise_Sunset'] = (
    df['Sunrise_Sunset'].replace({'Day': 1, 'Night': 0}).astype('int64')
)
df['Weather_Condition'] = (
    df['Weather_Condition']
    .replace({'clear_weather': 1, 'bad_weather': 0})
    .astype('int64')
)
df.head()

Unnamed: 0,Severity,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
0,long_delay,89.0,10.0,5.0,0.0,1,1
1,long_delay,54.0,10.0,5.0,0.0,1,1
2,long_delay,41.0,10.0,10.0,0.0,1,0
3,long_delay,79.0,10.0,25.0,0.0,0,1
4,long_delay,37.0,10.0,0.0,0.0,1,0


In [14]:
# Distribution of the encoded 'Weather_Condition' flag (1 = clear, 0 = bad)
weather_counts = df['Weather_Condition'].value_counts()
print(weather_counts)

1    228911
0     48130
Name: Weather_Condition, dtype: int64


In [15]:
# Calculate pairwise correlations between the numeric columns.
# `df` still holds the string-valued 'Severity' label; selecting the numeric
# columns explicitly keeps it out of the computation (newer pandas raises on
# non-numeric columns instead of silently skipping them).
df.select_dtypes(include='number').corr()

Unnamed: 0,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
Temperature,1.0,0.288617,-0.008902,-0.02855,0.282423,0.296735
Visibility,0.288617,1.0,-0.017753,-0.198338,0.606688,0.057301
Wind_Speed,-0.008902,-0.017753,1.0,0.056201,-0.119258,0.172319
Precipitation,-0.02855,-0.198338,0.056201,1.0,-0.270177,-0.011266
Weather_Condition,0.282423,0.606688,-0.119258,-0.270177,1.0,0.039278
Sunrise_Sunset,0.296735,0.057301,0.172319,-0.011266,0.039278,1.0


## **SPLIT THE DATA INTO TRAINING AND TESTING - PreCOVID YEARS**

In [16]:
# Separate the label (y) from the feature matrix (X)
X = df.drop('Severity', axis=1)
y = df.loc[:, 'Severity']

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature.
# NOTE(review): the Visibility and Wind_Speed maxima in the output are far above
# their 75th percentiles; confirm whether these are outliers worth handling.
X.describe()

Unnamed: 0,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
count,277041.0,277041.0,277041.0,277041.0,277041.0,277041.0
mean,58.705572,8.871667,6.97308,0.006797,0.826271,0.656267
std,18.713722,3.047073,5.532183,0.047183,0.378877,0.474954
min,-27.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,10.0,3.0,0.0,1.0,0.0
50%,60.0,10.0,6.0,0.0,1.0,1.0
75%,72.0,10.0,10.0,0.0,1.0,1.0
max,113.0,90.0,169.0,9.99,1.0,1.0


In [18]:
# Check the balance of our target values; the classes are imbalanced
# (short_delay far outnumbers long_delay), which motivates the resampling below.
y.value_counts()

short_delay    228852
long_delay      48189
Name: Severity, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

# Stratified split so both partitions keep the original class ratio;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
Counter(y_train)

Counter({'short_delay': 171638, 'long_delay': 36142})

### **SMOTE Oversampling - PreCOVID Years**

In [20]:
from imblearn.over_sampling import SMOTE

# Oversample only the training partition with SMOTE; the test partition is
# left as-is.
smote = SMOTE(sampling_strategy='auto', random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'short_delay': 171638, 'long_delay': 171638})

In [22]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the SMOTE-resampled training data
model = LogisticRegression(random_state=1, solver='lbfgs')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [23]:
# Calculate the balanced accuracy score on the held-out test set
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.5650504024291036

In [24]:
# Display the confusion matrix.
# The class order is pinned explicitly and reused for the row/column names, so
# the axis labels can never drift out of sync with the matrix layout (without
# `labels=` the order is whatever sorted order scikit-learn infers).
class_labels = ["long_delay", "short_delay"]
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix: rows = actual, columns = predicted.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted long_delay,Predicted short_delay
Actual long_delay,6398,5649
Actual short_delay,22942,34272


In [25]:
# Per-class report from imbalanced-learn (the pre/rec/spe/f1/geo/iba/sup columns)
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

 long_delay       0.22      0.53      0.60      0.31      0.56      0.32     12047
short_delay       0.86      0.60      0.53      0.71      0.56      0.32     57214

avg / total       0.75      0.59      0.54      0.64      0.56      0.32     69261

