In [46]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

Load Dataset

In [60]:
# Read the sampled US accidents CSV (path is relative to the notebook's
# working directory) and preview the first ten rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the file -- confirm whether it is needed.
accidents_csv = '../data/US_Accidents_March23_sample.csv'
df = pd.read_csv(accidents_csv)
df.head(n=10)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-227993,Source2,2,2016-05-18 16:28:12,2016-05-18 16:58:12,39.81562,-82.822304,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
1,A-5584682,Source1,2,2021-10-17 00:25:30,2021-10-17 00:52:30,35.110082,-80.97522,35.107982,-80.978715,0.245,...,False,False,False,False,False,False,Night,Night,Night,Night
2,A-309630,Source2,2,2016-11-13 16:55:24,2016-11-13 17:40:24,47.234718,-122.487633,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
3,A-3580182,Source1,2,2017-03-15 09:21:23,2017-03-15 15:21:23,43.17657,-71.61108,43.177915,-71.624505,0.683,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-6523880,Source1,2,2021-02-12 04:27:00,2021-02-12 08:44:00,41.944932,-83.555199,41.945482,-83.559469,0.223,...,False,False,False,False,False,False,Night,Night,Night,Night
5,A-7100647,Source1,2,2020-06-19 16:26:01,2020-06-19 16:55:16,43.61386,-116.26264,43.61386,-116.26264,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
6,A-4190196,Source1,2,2022-02-22 09:02:46,2022-02-22 11:34:23,25.732933,-80.340203,25.733035,-80.335226,0.31,...,False,True,False,False,False,False,Day,Day,Day,Day
7,A-5962252,Source1,2,2021-08-28 21:19:00.000000000,2021-08-28 23:32:17.000000000,32.786155,-96.812695,32.783963,-96.813574,0.16,...,False,False,False,False,False,False,Night,Night,Night,Day
8,A-4658270,Source1,2,2022-06-19 10:50:00,2022-06-19 12:08:23,41.157002,-76.797775,41.175349,-76.791129,1.314,...,False,False,False,False,False,False,Day,Day,Day,Day
9,A-2099924,Source2,2,2019-05-13 06:15:11,2019-05-13 07:47:15,34.05426,-118.237823,,,0.0,...,False,False,False,False,True,False,Day,Day,Day,Day


Handle Missing Values

In [48]:
# One imputer per column kind: numeric gaps are replaced by the column mean,
# categorical gaps by the most frequent value of the column.
num_imputer, cat_imputer = (
    SimpleImputer(strategy=strategy) for strategy in ('mean', 'most_frequent')
)

In [49]:
# Numeric columns that get imputed, scaled and outlier-filtered.
num_cols = [
    'Start_Lat',
    'Start_Lng',
    'End_Lat',
    'End_Lng',
    'Distance(mi)',
]

# Categorical columns that get imputed and one-hot encoded.
cat_cols = [
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]

In [50]:
# Fit each imputer on its column group and write the filled values back into
# df; the numeric group is processed first, then the categorical group.
# NOTE(review): End_Lat / End_Lng are missing in several preview rows, so
# mean-filling yields the same synthetic coordinate for all of them -- confirm
# this is acceptable for downstream use.
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    df[cols] = imputer.fit_transform(df[cols])

Feature Encoding

In [51]:
# One-hot encode the columns listed in cat_cols and swap the encoded columns
# in for the originals.
encoder = OneHotEncoder(sparse_output=False)
encoded_columns = encoder.fit_transform(df[cat_cols])

# Reuse df's index for the encoded frame: pd.concat(axis=1) aligns on index
# labels, so a fresh RangeIndex would pair encoded rows with the wrong records
# (and introduce all-NaN rows) whenever df's index is not 0..n-1, e.g. after
# rows have been filtered out.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

df = df.drop(cat_cols, axis=1)
df = pd.concat([df, encoded_df], axis=1)

Normalization of Numerical Data

In [52]:
# Standardise the numeric columns: learn per-column statistics from df, then
# overwrite the columns with their scaled values.
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

Feature Engineering

In [61]:
# Some Start_Time strings carry fractional seconds (e.g. '.000000000'); strip
# them so every value has the same layout, then parse to datetimes.
start_time = pd.to_datetime(
    df['Start_Time'].str.replace(r'\.\d+', '', regex=True)
)
df['Start_Time'] = start_time

# Calendar features derived from the parsed start time.
df['Month'] = start_time.dt.month
df['Year'] = start_time.dt.year

Removing outliers using IQR

In [62]:
# Per-column quartiles and interquartile range of the numeric columns.
numeric = df[num_cols]
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR outside [Q1, Q3].
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = numeric.lt(lower_bound) | numeric.gt(upper_bound)

# Keep only rows with no outlying value in any numeric column. Comparisons
# against NaN are False, so rows with missing values are not dropped here.
# The surviving rows keep their original index labels.
df = df[~is_outlier.any(axis=1)]

In [64]:
# Preview the first ten rows of the processed frame.
df.head(n=10)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Month,Year
0,A-227993,Source2,2,2016-05-18 16:28:12,2016-05-18 16:58:12,39.81562,-82.822304,,,0.0,...,False,False,False,False,Day,Day,Day,Day,5,2016
1,A-5584682,Source1,2,2021-10-17 00:25:30,2021-10-17 00:52:30,35.110082,-80.97522,35.107982,-80.978715,0.245,...,False,False,False,False,Night,Night,Night,Night,10,2021
2,A-309630,Source2,2,2016-11-13 16:55:24,2016-11-13 17:40:24,47.234718,-122.487633,,,0.01,...,False,False,False,False,Night,Day,Day,Day,11,2016
3,A-3580182,Source1,2,2017-03-15 09:21:23,2017-03-15 15:21:23,43.17657,-71.61108,43.177915,-71.624505,0.683,...,False,False,False,False,Day,Day,Day,Day,3,2017
4,A-6523880,Source1,2,2021-02-12 04:27:00,2021-02-12 08:44:00,41.944932,-83.555199,41.945482,-83.559469,0.223,...,False,False,False,False,Night,Night,Night,Night,2,2021
5,A-7100647,Source1,2,2020-06-19 16:26:01,2020-06-19 16:55:16,43.61386,-116.26264,43.61386,-116.26264,0.0,...,False,False,False,False,Day,Day,Day,Day,6,2020
6,A-4190196,Source1,2,2022-02-22 09:02:46,2022-02-22 11:34:23,25.732933,-80.340203,25.733035,-80.335226,0.31,...,False,False,False,False,Day,Day,Day,Day,2,2022
7,A-5962252,Source1,2,2021-08-28 21:19:00,2021-08-28 23:32:17.000000000,32.786155,-96.812695,32.783963,-96.813574,0.16,...,False,False,False,False,Night,Night,Night,Day,8,2021
9,A-2099924,Source2,2,2019-05-13 06:15:11,2019-05-13 07:47:15,34.05426,-118.237823,,,0.0,...,False,False,True,False,Day,Day,Day,Day,5,2019
10,A-4170357,Source1,2,2022-11-11 13:04:18,2022-11-11 13:24:51,40.563274,-74.323283,40.556774,-74.319617,0.489,...,False,False,False,False,Day,Day,Day,Day,11,2022
