In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Columns of the accident dataset that this notebook keeps.
columns = [
    "Severity",
    "Start_Time",
    "State",
    "Timezone",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Sunrise_Sunset",
]

# Label column name, held in a one-element list.
target = ["Severity"]

In [5]:
# Load the data, reading only the columns listed in `columns`.
# `index_col` is a `pd.read_csv` option, not a `Path` argument: depending on the
# Python version, `Path` either silently ignores or rejects extra keyword
# arguments, so the option is passed to the reader where it takes effect.
file_path = Path('US_Accidents_Dec21_updated.csv')
df = pd.read_csv(file_path, index_col=False, usecols=columns)
# `usecols` returns columns in file order; reselect to get the order of `columns`.
df = df.loc[:, columns].copy()
df.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset
0,3,2016-02-08 00:37:08,OH,US/Eastern,10.0,10.4,0.0,False,False,False,Night
1,2,2016-02-08 05:56:20,OH,US/Eastern,10.0,,0.02,False,False,False,Night
2,2,2016-02-08 06:15:39,OH,US/Eastern,10.0,,0.02,False,False,False,Night
3,2,2016-02-08 06:51:45,OH,US/Eastern,10.0,,,False,False,False,Night
4,3,2016-02-08 07:53:43,OH,US/Eastern,10.0,10.4,0.01,False,False,False,Day


In [6]:
df.dtypes


Severity               int64
Start_Time            object
State                 object
Timezone              object
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Stop                    bool
Traffic_Calming         bool
Traffic_Signal          bool
Sunrise_Sunset        object
dtype: object

In [8]:
# Discard every row that has a missing value in any of the kept columns
# (row-wise, "any" are the `dropna` defaults).
df = df.dropna()
df.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset
0,3,2016-02-08 00:37:08,OH,US/Eastern,10.0,10.4,0.0,False,False,False,Night
4,3,2016-02-08 07:53:43,OH,US/Eastern,10.0,10.4,0.01,False,False,False,Day
7,2,2016-02-08 11:51:46,OH,US/Eastern,0.5,3.5,0.08,False,False,False,Day
9,2,2016-02-08 15:16:43,OH,US/Eastern,0.5,3.5,0.05,False,False,False,Day
10,2,2016-02-08 15:43:50,OH,US/Eastern,3.0,4.6,0.03,False,False,False,Day


In [9]:
# Number of rows left after dropping incomplete records.
len(df)

2257531

In [10]:
# Derive a 'year' column from the 'Start_Time' timestamp strings.
from datetime import datetime
start_times = pd.DatetimeIndex(df['Start_Time'])
df['year'] = start_times.year
df.head()

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
0,3,2016-02-08 00:37:08,OH,US/Eastern,10.0,10.4,0.0,False,False,False,Night,2016
4,3,2016-02-08 07:53:43,OH,US/Eastern,10.0,10.4,0.01,False,False,False,Day,2016
7,2,2016-02-08 11:51:46,OH,US/Eastern,0.5,3.5,0.08,False,False,False,Day,2016
9,2,2016-02-08 15:16:43,OH,US/Eastern,0.5,3.5,0.05,False,False,False,Day,2016
10,2,2016-02-08 15:43:50,OH,US/Eastern,3.0,4.6,0.03,False,False,False,Day,2016


In [11]:
# Keep only the accidents whose start year is 2019.
# `.copy()` makes `df_2019` an independent frame, so the column assignments made
# on it in later cells are unambiguous and can never write through to `df`.
df_2019 = df.loc[df["year"] == 2019].copy()
df_2019.head()
 

Unnamed: 0,Severity,Start_Time,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
582028,4,2019-10-02 12:50:00,NJ,US/Eastern,10.0,5.0,0.0,False,False,False,Day,2019
1295810,4,2019-11-01 16:18:00,LA,US/Pacific,10.0,5.0,0.0,False,False,True,Day,2019
1568027,4,2019-06-04 16:00:00.000000000,TX,US/Central,10.0,25.0,0.0,False,False,False,Day,2019
1756843,4,2019-10-19 04:30:00.000000000,VA,US/Eastern,10.0,0.0,0.0,False,False,False,Night,2019
1935372,4,2019-12-17 09:12:52,AZ,US/Mountain,10.0,14.0,0.0,True,False,False,Day,2019


In [12]:
# The raw timestamp is no longer needed once 'year' has been derived from it.
df_2019 = df_2019.drop(columns='Start_Time')
df_2019.head()

Unnamed: 0,Severity,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
582028,4,NJ,US/Eastern,10.0,5.0,0.0,False,False,False,Day,2019
1295810,4,LA,US/Pacific,10.0,5.0,0.0,False,False,True,Day,2019
1568027,4,TX,US/Central,10.0,25.0,0.0,False,False,False,Day,2019
1756843,4,VA,US/Eastern,10.0,0.0,0.0,False,False,False,Night,2019
1935372,4,AZ,US/Mountain,10.0,14.0,0.0,True,False,False,Day,2019


In [13]:
# Number of rows in the 2019 subset.
len(df_2019)

200101

In [14]:
# Encode the boolean road-feature flags as 0/1 integers.
# The values are read from `df_2019` itself rather than from the unfiltered `df`,
# so the result no longer depends on index alignment against the full frame.
flag_columns = ["Traffic_Calming", "Traffic_Signal", "Stop"]
df_2019[flag_columns] = df_2019[flag_columns].astype("int64")
# 'Sunrise_Sunset' is left as its string values in this cell.
df_2019.head()

Unnamed: 0,Severity,State,Timezone,Visibility(mi),Wind_Speed(mph),Precipitation(in),Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,year
582028,4,NJ,US/Eastern,10.0,5.0,0.0,0,0,0,Day,2019
1295810,4,LA,US/Pacific,10.0,5.0,0.0,0,0,1,Day,2019
1568027,4,TX,US/Central,10.0,25.0,0.0,0,0,0,Day,2019
1756843,4,VA,US/Eastern,10.0,0.0,0.0,0,0,0,Night,2019
1935372,4,AZ,US/Mountain,10.0,14.0,0.0,1,0,0,Day,2019


In [15]:
# Show the distinct state codes that occur in the 2019 subset.
state_codes = df_2019['State'].unique()
print(state_codes)

['NJ' 'LA' 'TX' 'VA' 'AZ' 'OR' 'NC' 'MN' 'IL' 'ID' 'CA' 'VT' 'MI' 'MA'
 'RI' 'NY' 'NH' 'PA' 'ME' 'MD' 'KY' 'CT' 'DC' 'TN' 'SC' 'GA' 'FL' 'WI'
 'IA' 'NE' 'MS' 'AR' 'CO' 'UT' 'NM' 'WA' 'NV' 'IN' 'AL' 'MO' 'KS' 'WV'
 'WY' 'DE' 'OH' 'OK' 'MT' 'SD' 'ND']


In [None]:
import re  # NOTE(review): unused in this cell now; kept in case other cells rely on it

# State codes for the 2019 subset (presumably pasted from an earlier run of
# `df_2019['State'].unique()` - confirm against the current data).
# The elements must be comma-separated: Python joins adjacent string literals,
# so without commas the array would hold one long string instead of 49 codes.
state_array = np.array([
    'NJ', 'LA', 'TX', 'VA', 'AZ', 'OR', 'PA', 'MD', 'NC', 'MN', 'IL', 'ID', 'CA', 'VT',
    'MI', 'MA', 'RI', 'NY', 'NH', 'ME', 'KY', 'IN', 'CT', 'DC', 'TN', 'SC', 'GA', 'FL',
    'WI', 'IA', 'NE', 'MS', 'AR', 'UT', 'CO', 'NM', 'WA', 'NV', 'AL', 'MO', 'KS', 'WV',
    'WY', 'DE', 'OH', 'OK', 'MT', 'SD', 'ND',
])
# `state_array` already has one code per element, so there is no text to strip
# or split (ndarrays have no `.replace` method). The name is kept as a
# string-typed copy of the same codes.
split_state_array = state_array.astype(str)
print(split_state_array)
# Alphabetical order of the codes.
sorted_state_array = np.sort(split_state_array)
print(sorted_state_array)