In [64]:
import pandas as pd
from sklearn.utils import resample

In [65]:
# Load the full accidents CSV into memory and preview the first rows.
raw_csv_path = '../data/US_Accidents_March23.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [66]:
# Display the freshly loaded frame's dimensions as (rows, columns).
df.shape

(7728394, 46)

In [67]:
# Convert the timestamp columns from text to datetime.
# Start/End times first have every '.<digits>' run removed (presumably
# fractional seconds -- confirm against the raw data) and are then parsed.
for time_col in ('Start_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col].str.replace(r'\.\d+', '', regex=True))
# Weather_Timestamp is parsed as-is, without the stripping step.
df['Weather_Timestamp'] = pd.to_datetime(df['Weather_Timestamp'])

In [68]:
# Export a seeded (random_state=42) random sample of 100000 rows for the web app.
# The sample is drawn from df as it stands at this point in the notebook;
# index=False keeps the row labels out of the CSV.
web_app_sample = df.sample(n=100000, random_state=42)
web_app_sample.to_csv('../data/US_Accidents_March23_random_sample.csv', index=False)

In [69]:
# Percentage of missing values per column, restricted to columns that have any.
# After reset_index() the row labels are each column's position in df.
missing_values = (
    df.isnull().sum()
    .div(df.shape[0])
    .mul(100)
    .rename_axis('Feature')
    .reset_index(name='Missing_Percent(%)')
    .loc[lambda pct: pct['Missing_Percent(%)'] > 0]
)
missing_values

Unnamed: 0,Feature,Missing_Percent(%)
7,End_Lat,44.029355
8,End_Lng,44.029355
10,Description,6.5e-05
11,Street,0.140637
12,City,0.003274
15,Zipcode,0.024779
17,Timezone,0.10103
18,Airport_Code,0.292881
19,Weather_Timestamp,1.555666
20,Temperature(F),2.120143


In [70]:
# Keep only rows that have a precipitation reading.
# Going by the outputs saved in this notebook (7,728,394 rows loaded,
# 5,524,808 counted afterwards) this discards roughly 2.2M rows.
df = df[df['Precipitation(in)'].notna()]

In [71]:
# Count rows per Severity value; value_counts() lists the most frequent class first.
# The smallest of these counts is what the undersampling step uses as its target.
severity_distribution = df['Severity'].value_counts()
severity_distribution

Severity
2    4667015
3     658955
4     135388
1      63450
Name: count, dtype: int64

In [72]:
# Undersampling target: the row count of the rarest severity class, so every
# class has at least this many rows to draw from without replacement.
target_size = severity_distribution.min()

def undersample(df, severity, target_size, random_state=42):
    """Draw a fixed-size random sample of the rows belonging to one severity class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Severity' column.
    severity
        The 'Severity' value whose rows are sampled.
    target_size : int
        Number of rows to draw; sampling is without replacement.
    random_state : int, default 42
        Seed forwarded to ``resample``. The default reproduces the draw this
        function made when the seed was hard-coded.

    Returns
    -------
    The sampled rows, as returned by ``resample``.
    """
    severity_rows = df[df['Severity'] == severity]
    # replace=False: a row can be picked at most once.
    return resample(
        severity_rows,
        replace=False,
        n_samples=target_size,
        random_state=random_state,
    )

# Balance the classes: take target_size rows from every severity level and
# stack the pieces. Rows stay grouped by severity, in the order of
# severity_distribution.index, and keep their original row labels.
undersampled_df = pd.concat(
    [
        undersample(df, level, target_size)
        for level in severity_distribution.index
    ]
)
# Re-count per class to confirm every severity now has the same number of rows.
undersampled_distribution = undersampled_df['Severity'].value_counts()
undersampled_distribution

Severity
2    63450
3    63450
4    63450
1    63450
Name: count, dtype: int64

In [77]:
# Write the class-balanced frame to disk (index=False leaves the row labels
# out of the file), then preview its first rows.
undersample_csv_path = '../data/US_Accidents_March23_undersample.csv'
undersampled_df.to_csv(undersample_csv_path, index=False)
undersampled_df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
5325586,A-5366042,Source1,2,2022-02-09 15:30:35,2022-02-09 17:47:21,42.744212,-73.777739,42.733496,-73.788485,0.92,...,False,False,False,False,False,False,Day,Day,Day,Day
3959397,A-3989112,Source1,2,2022-03-08 15:06:07,2022-03-08 16:42:56,39.882025,-102.243444,39.878531,-102.246944,0.304,...,False,False,False,False,False,False,,,,
4384365,A-4417542,Source1,2,2022-05-02 06:23:30,2022-05-02 19:25:30,33.866554,-117.998246,33.892132,-118.043715,3.151,...,False,False,False,False,False,False,Day,Day,Day,Day
4780151,A-4816426,Source1,2,2022-07-02 19:31:34,2022-07-02 20:16:34,39.090377,-94.804167,39.091236,-94.805199,0.081,...,False,False,False,False,False,False,Day,Day,Day,Day
2246947,A-2256814,Source2,2,2019-01-31 10:05:24,2019-01-31 10:34:50,34.47356,-120.205147,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day


In [78]:
# Display the balanced frame's dimensions as (rows, columns); the row count
# should equal the number of severity classes times target_size.
undersampled_df.shape

(253800, 46)