In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

Load Dataset

In [21]:
# Read the sampled accidents CSV (path is relative to the notebook's working
# directory) into the working DataFrame used by every later cell.
df = pd.read_csv(
    filepath_or_buffer='../data/US_Accidents_March23_sample.csv',
)

Handle Missing Values

In [22]:
# Two imputers, one per column kind:
#   - categorical columns: fill gaps with the most frequent value
#   - numeric columns:     fill gaps with the column mean
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

In [23]:
# Numeric columns: imputed with the mean, standardized, and used by the
# IQR outlier filter further down.
num_cols = [
    'Start_Lat',
    'Start_Lng',
    'End_Lat',
    'End_Lng',
    'Distance(mi)',
]

# Categorical columns: imputed with the most frequent value, then one-hot
# encoded.
cat_cols = [
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]

In [24]:
# Fit each imputer on its own column group and write the filled values back
# into the same columns. The two groups are disjoint, so the fits are
# independent of each other.
numeric_filled = num_imputer.fit_transform(df[num_cols])
categorical_filled = cat_imputer.fit_transform(df[cat_cols])

df[num_cols] = numeric_filled
df[cat_cols] = categorical_filled

Feature Encoding

In [25]:
# One-hot encode the columns listed in `cat_cols` and replace the original
# columns with the encoded indicator columns.
#
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, where passing it raises TypeError. Prefer the new keyword and
# fall back to the old one only on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

encoded_columns = encoder.fit_transform(df[cat_cols])

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# RangeIndex would misalign (and introduce NaNs) whenever df does not have a
# default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

df = df.drop(cat_cols, axis=1)
df = pd.concat([df, encoded_df], axis=1)



Standardization of Numerical Data (z-score)

In [26]:
# Standardize the numeric columns in place with a StandardScaler fitted on
# those same columns.
# NOTE(review): the scaler is fit on every row, including rows that the IQR
# outlier filter in a later cell removes, so the saved columns will not have
# exactly zero mean / unit variance -- confirm this ordering is intended.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

Feature Engineering

In [27]:
# Parse Start_Time once, store the parsed datetimes back on the frame, and
# derive calendar features from them (Month first, then Year, which fixes the
# order of the new columns).
start_times = pd.to_datetime(df['Start_Time'])
df['Start_Time'] = start_times
df['Month'] = start_times.dt.month
df['Year'] = start_times.dt.year

Removing Outliers Using IQR

In [28]:
# Tukey-style IQR filter: a row is dropped if ANY numeric column falls more
# than 1.5 * IQR below the first quartile or above the third quartile.
numeric_values = df[num_cols]

Q1, Q3 = numeric_values.quantile(0.25), numeric_values.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Per-row flag: True when at least one numeric column lies outside its fences.
is_outlier_row = (
    numeric_values.lt(lower_fence) | numeric_values.gt(upper_fence)
).any(axis=1)

df = df[~is_outlier_row]

In [29]:
# Persist the preprocessed frame next to the input sample; the index is not
# written to the file.
df.to_csv(
    path_or_buf='../data/US_Accidents_March23_sample_preprocessed.csv',
    index=False,
)