In [45]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

Load Dataset

In [46]:
# Relative path to the sampled accidents extract read by this notebook.
DATA_PATH = '../data/US_Accidents_March23_sample.csv'

# Load the extract into the working frame and preview its first ten rows.
df = pd.read_csv(DATA_PATH)
df.head(n=10)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-227993,Source2,2,2016-05-18 16:28:12,2016-05-18 16:58:12,39.81562,-82.822304,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
1,A-5584682,Source1,2,2021-10-17 00:25:30,2021-10-17 00:52:30,35.110082,-80.97522,35.107982,-80.978715,0.245,...,False,False,False,False,False,False,Night,Night,Night,Night
2,A-309630,Source2,2,2016-11-13 16:55:24,2016-11-13 17:40:24,47.234718,-122.487633,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
3,A-3580182,Source1,2,2017-03-15 09:21:23,2017-03-15 15:21:23,43.17657,-71.61108,43.177915,-71.624505,0.683,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-6523880,Source1,2,2021-02-12 04:27:00,2021-02-12 08:44:00,41.944932,-83.555199,41.945482,-83.559469,0.223,...,False,False,False,False,False,False,Night,Night,Night,Night
5,A-7100647,Source1,2,2020-06-19 16:26:01,2020-06-19 16:55:16,43.61386,-116.26264,43.61386,-116.26264,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
6,A-4190196,Source1,2,2022-02-22 09:02:46,2022-02-22 11:34:23,25.732933,-80.340203,25.733035,-80.335226,0.31,...,False,True,False,False,False,False,Day,Day,Day,Day
7,A-5962252,Source1,2,2021-08-28 21:19:00.000000000,2021-08-28 23:32:17.000000000,32.786155,-96.812695,32.783963,-96.813574,0.16,...,False,False,False,False,False,False,Night,Night,Night,Day
8,A-4658270,Source1,2,2022-06-19 10:50:00,2022-06-19 12:08:23,41.157002,-76.797775,41.175349,-76.791129,1.314,...,False,False,False,False,False,False,Day,Day,Day,Day
9,A-2099924,Source2,2,2019-05-13 06:15:11,2019-05-13 07:47:15,34.05426,-118.237823,,,0.0,...,False,False,False,False,True,False,Day,Day,Day,Day


Handle Missing Values

In [47]:
# Columns imputed and scaled as continuous values.
num_cols = [
    'Start_Lat', 'Start_Lng',
    'End_Lat', 'End_Lng',
    'Distance(mi)',
]

# Columns imputed with their most frequent value and then one-hot encoded.
# NOTE(review): the loaded frame's header also shows 'Station' and
# 'Traffic_Calming', which are not listed here -- confirm that is intentional.
cat_cols = [
    'Severity',
    'Country',
    'Timezone',
    'Weather_Condition',
    # day/night indicators
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
    # road-feature flags
    'Amenity',
    'Bump',
    'Crossing',
    'Junction',
    'Railway',
    'Roundabout',
    'Stop',
    'Traffic_Signal',
    'Turning_Loop',
]

In [48]:
# One imputer per column family: column mean for the numeric columns,
# most frequent value for the categorical ones.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its own columns and write the filled values back in place.
# NOTE(review): the categorical group mixes dtypes, so the values written back
# are presumably object-typed afterwards -- confirm before relying on the
# original dtypes of these columns downstream.
for imputer, columns in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    df[columns] = imputer.fit_transform(df[columns])

Feature Encoding

In [49]:
# One-hot encode the categorical columns, dropping the first level of each
# so the indicator columns are not perfectly collinear.
df_encoded = pd.get_dummies(df[cat_cols], drop_first=True)

# Swap the raw categorical columns for their indicator columns.
# This rebinds `df`: the raw categorical columns are gone afterwards.
non_categorical = df.drop(columns=cat_cols)
df = pd.concat([non_categorical, df_encoded], axis=1)

  df_encoded = pd.get_dummies(df[cat_cols], drop_first=True)


Normalization of Numerical Data

In [50]:
# Z-score each numeric column: subtract its mean and divide by its standard
# deviation, both computed from the frame as it stands at this point.
for column in num_cols:
    series = df[column]
    center = series.mean()
    spread = series.std()
    df[column] = (series - center) / spread

Feature Engineering

In [51]:
# Parse Start_Time and derive calendar features from it.
start_time = df['Start_Time']

# Only parse when the column is not already datetime. Without this guard,
# re-running the cell raises because `.str` is unavailable on datetime values.
if not pd.api.types.is_datetime64_any_dtype(start_time):
    # Some rows carry a fractional-seconds suffix (e.g. "21:19:00.000000000")
    # and others do not; strip it so every value has the same layout before parsing.
    start_time = pd.to_datetime(start_time.str.replace(r'\.\d+', '', regex=True))

df['Start_Time'] = start_time
df['Month'] = df['Start_Time'].dt.month
df['Year'] = df['Start_Time'].dt.year

Removing outliers using IQR

In [52]:
# Per-column quartiles and interquartile range of the numeric columns.
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles; a row is an outlier if any
# numeric column falls outside its fences.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((df[num_cols] < lower_fence) | (df[num_cols] > upper_fence)).any(axis=1)

# Keep the remaining rows as an independent frame. The explicit copy lets
# later cells assign columns on `df` without SettingWithCopyWarning.
# The original index labels are kept, so the index has gaps afterwards.
df = df.loc[~is_outlier].copy()

In [53]:
# Preview the first ten rows that remain after the preceding steps.
df.head(n=10)

Unnamed: 0,ID,Source,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Amenity_True,Bump_True,Crossing_True,Junction_True,Railway_True,Roundabout_True,Stop_True,Traffic_Signal_True,Month,Year
0,A-227993,Source2,2016-05-18 16:28:12,2016-05-18 16:58:12,0.712455,0.684889,-1.803202e-15,0.0,-0.335002,Accident on County Hwy-39 Lithopolis Rd at OH-...,...,0,0,0,0,0,0,0,0,5,2016
1,A-5584682,Source1,2021-10-17 00:25:30,2021-10-17 00:52:30,-0.215829,0.791081,-0.2952821,1.092613,-0.185739,Stationary traffic on NC-49 from Carowinds Blv...,...,0,0,0,0,0,0,0,0,10,2021
2,A-309630,Source2,2016-11-13 16:55:24,2016-11-13 17:40:24,2.176056,-1.595559,-1.803202e-15,0.0,-0.32891,Accident on WA-16 Eastbound at Union Ave.,...,0,0,0,0,0,0,0,0,11,2016
7,A-5962252,Source1,2021-08-28 21:19:00,2021-08-28 23:32:17.000000000,-0.674281,-0.11945,-0.8850675,-0.076388,-0.237524,Slow traffic from Northeast Dallas to N Riverf...,...,0,0,0,1,0,0,0,0,8,2021
9,A-2099924,Source2,2019-05-13 06:15:11,2019-05-13 07:47:15,-0.424116,-1.351228,-1.803202e-15,0.0,-0.335002,#3 and #4 lane blocked due to recovery work an...,...,0,0,1,0,0,0,0,1,5,2019
10,A-4170357,Source1,2022-11-11 13:04:18,2022-11-11 13:24:51,0.859949,1.173516,1.087502,1.584218,-0.037085,Slow traffic on Garden State Pkwy S from Middl...,...,0,0,0,0,0,0,0,0,11,2022
11,A-2020562,Source2,2019-08-13 17:31:14,2019-08-13 19:07:32,-0.114534,-0.163869,-1.803202e-15,0.0,-0.335002,Accident on 150th St Eastbound at Portland Ave.,...,0,0,1,0,0,0,0,1,8,2019
12,A-3236519,Source2,2017-10-10 13:50:21,2017-10-10 14:34:59,-0.652611,-0.124364,-1.803202e-15,0.0,-0.335002,Accident on I-35E Northbound at Exit 439 Royal...,...,0,0,0,0,0,0,0,0,10,2017
15,A-275942,Source2,2016-11-01 22:19:48,2016-11-01 22:49:36,-1.307138,-0.209131,-1.803202e-15,0.0,-0.32891,Accident on FM-2252 Nacogdoches Rd at Judson Rd.,...,0,0,0,0,0,0,0,1,11,2016
16,A-7620651,Source1,2018-02-16 07:14:04,2018-02-16 13:14:04,0.584238,1.034865,0.7366226,1.406481,0.10243,At MD-100 - Accident. Left lane blocked.,...,0,0,0,0,0,0,0,0,2,2018
