In [1]:
import warnings
# Install a blanket 'ignore' filter: no category, module or message is given,
# so every warning raised after this point is silenced (pandas warnings included).
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Columns retained from the raw accidents file, in the order they should
# appear in the working DataFrame.
columns = [
    "Severity",
    "Start_Time",
    "State",
    "Timezone",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Sunrise_Sunset",
]

# Name of the label column, kept as a one-element list.
target = ["Severity"]

In [5]:
# Load the data
# `index_col` is a read_csv option, not a Path argument: handed to Path() it
# never reaches the parser (and newer Python versions deprecate/reject extra
# keyword arguments to Path), so it is passed to read_csv here instead.
file_path = Path('US_Accidents_Dec21_updated.csv')

# Parse only the columns this analysis needs so any other columns in the CSV
# are never loaded into memory.
df = pd.read_csv(file_path, usecols=columns, index_col=False)

# read_csv ignores the element order of `usecols`, so re-select with `columns`
# to get the declared column order; .copy() keeps `df` an independent frame.
df = df.loc[:, columns].copy()
df.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset
0,3,2016-02-08 00:37:08,OH,US/Eastern,10.0,10.4,0.0,False,False,False,Night
1,2,2016-02-08 05:56:20,OH,US/Eastern,10.0,,0.02,False,False,False,Night
2,2,2016-02-08 06:15:39,OH,US/Eastern,10.0,,0.02,False,False,False,Night
3,2,2016-02-08 06:51:45,OH,US/Eastern,10.0,,,False,False,False,Night
4,3,2016-02-08 07:53:43,OH,US/Eastern,10.0,10.4,0.01,False,False,False,Day


In [6]:
# Show the dtype pandas assigned to each selected column.
df.dtypes


Severity               int64
Start_Time            object
State                 object
Timezone              object
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Stop                    bool
Traffic_Calming         bool
Traffic_Signal          bool
Sunrise_Sunset        object
dtype: object

In [8]:
# Discard every row that has a missing value in any column
# (row-wise / "any" is dropna's default behaviour).
df = df.dropna()
df.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset
0,3,2016-02-08 00:37:08,OH,US/Eastern,10.0,10.4,0.0,False,False,False,Night
4,3,2016-02-08 07:53:43,OH,US/Eastern,10.0,10.4,0.01,False,False,False,Day
7,2,2016-02-08 11:51:46,OH,US/Eastern,0.5,3.5,0.08,False,False,False,Day
9,2,2016-02-08 15:16:43,OH,US/Eastern,0.5,3.5,0.05,False,False,False,Day
10,2,2016-02-08 15:43:50,OH,US/Eastern,3.0,4.6,0.03,False,False,False,Day


In [9]:
# Number of rows left in df
len(df)

2257531

In [10]:
# Derive a 'year' column holding the calendar year of each 'Start_Time' value
from datetime import datetime
start_times = pd.DatetimeIndex(df['Start_Time'])
df['year'] = start_times.year
df.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
0,3,2016-02-08 00:37:08,OH,US/Eastern,10.0,10.4,0.0,False,False,False,Night,2016
4,3,2016-02-08 07:53:43,OH,US/Eastern,10.0,10.4,0.01,False,False,False,Day,2016
7,2,2016-02-08 11:51:46,OH,US/Eastern,0.5,3.5,0.08,False,False,False,Day,2016
9,2,2016-02-08 15:16:43,OH,US/Eastern,0.5,3.5,0.05,False,False,False,Day,2016
10,2,2016-02-08 15:43:50,OH,US/Eastern,3.0,4.6,0.03,False,False,False,Day,2016


In [11]:
# Keep only the accidents whose 'year' is 2019
is_2019 = df["year"].eq(2019)
df_2019 = df[is_2019]
df_2019.head()
 

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
582028,4,2019-10-02 12:50:00,NJ,US/Eastern,10.0,5.0,0.0,False,False,False,Day,2019
1295810,4,2019-11-01 16:18:00,LA,US/Pacific,10.0,5.0,0.0,False,False,True,Day,2019
1568027,4,2019-06-04 16:00:00.000000000,TX,US/Central,10.0,25.0,0.0,False,False,False,Day,2019
1756843,4,2019-10-19 04:30:00.000000000,VA,US/Eastern,10.0,0.0,0.0,False,False,False,Night,2019
1935372,4,2019-12-17 09:12:52,AZ,US/Mountain,10.0,14.0,0.0,True,False,False,Day,2019


In [12]:
# 'Start_Time' has already been reduced to the 'year' column, so drop the raw timestamp
df_2019 = df_2019.drop(columns=['Start_Time'])
df_2019.head()

Unnamed: 0,Severity,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
582028,4,NJ,US/Eastern,10.0,5.0,0.0,False,False,False,Day,2019
1295810,4,LA,US/Pacific,10.0,5.0,0.0,False,False,True,Day,2019
1568027,4,TX,US/Central,10.0,25.0,0.0,False,False,False,Day,2019
1756843,4,VA,US/Eastern,10.0,0.0,0.0,False,False,False,Night,2019
1935372,4,AZ,US/Mountain,10.0,14.0,0.0,True,False,False,Day,2019


In [13]:
# Number of rows in the 2019 subset
len(df_2019)

200101

In [14]:
# Encode the boolean road-feature flags as 0/1 integers.
# The conversion reads the flags from df_2019 itself rather than from the full
# `df`, so it only touches the 2019 rows and does not depend on index
# alignment to pick the matching rows back out. astype() returns a new frame,
# which also avoids attribute-style column assignment on a derived frame.
flag_columns = ["Traffic_Calming", "Traffic_Signal", "Stop"]
df_2019 = df_2019.astype({flag: "int64" for flag in flag_columns})

df_2019.head()

Unnamed: 0,Severity,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
582028,4,NJ,US/Eastern,10.0,5.0,0.0,0,0,0,Day,2019
1295810,4,LA,US/Pacific,10.0,5.0,0.0,0,0,1,Day,2019
1568027,4,TX,US/Central,10.0,25.0,0.0,0,0,0,Day,2019
1756843,4,VA,US/Eastern,10.0,0.0,0.0,0,0,0,Night,2019
1935372,4,AZ,US/Mountain,10.0,14.0,0.0,1,0,0,Day,2019


In [19]:
# One-hot encode the categorical columns of the 2019 subset; all other
# columns are carried over unchanged.
# NOTE(review): 'Severity' and 'year' (always 2019 in this subset) are still
# part of X — TODO confirm they are removed before X is used as a feature matrix.
categorical_columns = ['State', 'Timezone', 'Sunrise_Sunset']
X = pd.get_dummies(df_2019, columns=categorical_columns)
X.head()

Unnamed: 0,Severity,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,year,State_AL,State_AR,...,State_WA,State_WI,State_WV,State_WY,Timezone_US/Central,Timezone_US/Eastern,Timezone_US/Mountain,Timezone_US/Pacific,Sunrise_Sunset_Day,Sunrise_Sunset_Night
582028,4,10.0,5.0,0.0,0,0,0,2019,0,0,...,0,0,0,0,0,1,0,0,1,0
1295810,4,10.0,5.0,0.0,0,0,1,2019,0,0,...,0,0,0,0,0,0,0,1,1,0
1568027,4,10.0,25.0,0.0,0,0,0,2019,0,0,...,0,0,0,0,1,0,0,0,1,0
1756843,4,10.0,0.0,0.0,0,0,0,2019,0,0,...,0,0,0,0,0,1,0,0,0,1
1935372,4,10.0,14.0,0.0,1,0,0,2019,0,0,...,0,0,0,0,0,0,1,0,1,0


# <span style='color:Blue'> <b>2020-2021 COVID YEARS</b> </span>

In [20]:
# Keep only the accidents whose 'year' is 2020 or later
is_covid_year = df["year"].ge(2020)
df_Covid = df[is_covid_year]
df_Covid.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
224945,2,2021-03-10 19:57:00,MA,US/Eastern,10.0,12.0,0.0,False,False,False,Night,2021
224946,2,2021-07-30 23:37:00,CA,US/Pacific,2.0,6.0,0.0,False,False,False,Night,2021
224947,2,2021-10-15 16:42:36,MD,US/Eastern,10.0,9.0,0.0,False,False,False,Day,2021
224948,2,2021-12-21 11:42:00,WA,US/Pacific,10.0,0.0,0.0,False,False,False,Day,2021
224949,2,2021-12-09 08:51:00,CA,US/Pacific,10.0,9.0,0.0,False,False,False,Day,2021


In [21]:
# 'Start_Time' has already been reduced to the 'year' column, so drop the raw timestamp
df_Covid = df_Covid.drop(columns=['Start_Time'])
df_Covid.head()

Unnamed: 0,Severity,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
224945,2,MA,US/Eastern,10.0,12.0,0.0,False,False,False,Night,2021
224946,2,CA,US/Pacific,2.0,6.0,0.0,False,False,False,Night,2021
224947,2,MD,US/Eastern,10.0,9.0,0.0,False,False,False,Day,2021
224948,2,WA,US/Pacific,10.0,0.0,0.0,False,False,False,Day,2021
224949,2,CA,US/Pacific,10.0,9.0,0.0,False,False,False,Day,2021


In [23]:
# Number of rows in the 2020+ subset
len(df_Covid)

2005591

In [24]:
# Encode the boolean road-feature flags as 0/1 integers.
# The conversion reads the flags from df_Covid itself rather than from the
# full `df`, so it only touches the 2020+ rows and does not depend on index
# alignment to pick the matching rows back out. astype() returns a new frame,
# which also avoids attribute-style column assignment on a derived frame.
flag_columns = ["Traffic_Calming", "Traffic_Signal", "Stop"]
df_Covid = df_Covid.astype({flag: "int64" for flag in flag_columns})

df_Covid.head()

Unnamed: 0,Severity,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
224945,2,MA,US/Eastern,10.0,12.0,0.0,0,0,0,Night,2021
224946,2,CA,US/Pacific,2.0,6.0,0.0,0,0,0,Night,2021
224947,2,MD,US/Eastern,10.0,9.0,0.0,0,0,0,Day,2021
224948,2,WA,US/Pacific,10.0,0.0,0.0,0,0,0,Day,2021
224949,2,CA,US/Pacific,10.0,9.0,0.0,0,0,0,Day,2021


In [25]:
# One-hot encode the categorical columns of the 2020+ subset; all other
# columns are carried over unchanged.
# NOTE(review): this rebinds X, so any X built earlier in the notebook is no
# longer reachable under that name; 'Severity' is also still a column of X —
# TODO confirm both are intended.
categorical_columns = ['State', 'Timezone', 'Sunrise_Sunset']
X = pd.get_dummies(df_Covid, columns=categorical_columns)
X.head()

Unnamed: 0,Severity,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,year,State_AL,State_AR,...,State_WA,State_WI,State_WV,State_WY,Timezone_US/Central,Timezone_US/Eastern,Timezone_US/Mountain,Timezone_US/Pacific,Sunrise_Sunset_Day,Sunrise_Sunset_Night
224945,2,10.0,12.0,0.0,0,0,0,2021,0,0,...,0,0,0,0,0,1,0,0,0,1
224946,2,2.0,6.0,0.0,0,0,0,2021,0,0,...,0,0,0,0,0,0,0,1,0,1
224947,2,10.0,9.0,0.0,0,0,0,2021,0,0,...,0,0,0,0,0,1,0,0,1,0
224948,2,10.0,0.0,0.0,0,0,0,2021,0,0,...,1,0,0,0,0,0,0,1,1,0
224949,2,10.0,9.0,0.0,0,0,0,2021,0,0,...,0,0,0,0,0,0,0,1,1,0
