In [10]:
import pandas as pd
from sklearn.utils import resample

In [7]:
# Read the full accidents CSV into memory and preview its first rows.
raw_csv_path = '../data/US_Accidents_March23.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [None]:
# Show the raw frame's dimensions as a (rows, columns) tuple.
df.shape

(7728394, 46)

In [8]:
# Count the number of rows for each distinct 'Severity' value.
severity_distribution = df['Severity'].value_counts()
severity_distribution

Severity
2    6156981
3    1299337
4     204710
1      67366
Name: count, dtype: int64

In [14]:
# Per-class sample size: the row count of the least frequent severity value.
target_size = severity_distribution.min()

def undersample(df, severity, target_size, random_state=42):
    """Draw a random sample, without replacement, from one severity class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'Severity' column.
    severity
        Value of 'Severity' that selects the rows to sample from.
    target_size : int or None
        Number of rows to draw. ``None`` is forwarded unchanged to
        ``resample``.
    random_state : int, default 42
        Seed forwarded to ``resample`` so the draw is repeatable. The default
        keeps the previously hard-coded seed.

    Returns
    -------
    The value returned by ``resample`` for the selected rows.

    Raises
    ------
    ValueError
        If the selected class has fewer than ``target_size`` rows; sampling
        without replacement cannot produce more rows than exist.
    """
    subset = df[df['Severity'] == severity]
    # Fail early with a message that names the offending class instead of
    # relying on the resampler's generic size error.
    if target_size is not None and target_size > len(subset):
        raise ValueError(
            f"Severity {severity!r} has only {len(subset)} rows; "
            f"cannot draw {target_size} without replacement"
        )
    return resample(subset, replace=False, n_samples=target_size, random_state=random_state)

# Take an equally sized sample from every severity class, in the order the
# classes appear in severity_distribution, and stack the pieces into one frame.
class_samples = map(
    lambda level: undersample(df, level, target_size),
    severity_distribution.index,
)
undersampled_df = pd.concat(class_samples)

# Recount rows per severity to confirm the classes are now the same size.
undersampled_distribution = undersampled_df['Severity'].value_counts()
undersampled_distribution

Severity
2    67366
3    67366
4    67366
1    67366
Name: count, dtype: int64

In [15]:
# Keep a random half of the balanced frame so the exported file is small
# enough for testing (the original note targets roughly 50 MB).
# NOTE: this is a plain random draw, so per-class counts are only
# approximately equal afterwards.
sample_df = undersampled_df.sample(frac=0.5, random_state=10)

# Normalise the time columns: strip any fractional-seconds suffix from the raw
# strings, then parse them as datetimes.
for time_col in ('Start_Time', 'End_Time'):
    without_fraction = sample_df[time_col].str.replace(r'\.\d+', '', regex=True)
    sample_df[time_col] = pd.to_datetime(without_fraction)
# Weather_Timestamp is parsed directly, without the stripping step.
sample_df['Weather_Timestamp'] = pd.to_datetime(sample_df['Weather_Timestamp'])

# Write the reduced dataset to disk (row index omitted) and preview it.
sample_df.to_csv('../data/US_Accidents_March23_undersample.csv', index=False)
sample_df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
6670783,A-6719495,Source1,4,2020-12-12 21:30:00,2020-12-13 00:49:20,37.992617,-77.769182,37.993281,-77.767959,0.081,...,False,False,False,False,False,False,Night,Night,Night,Night
562038,A-565753,Source2,1,2022-07-15 07:46:29,2022-07-15 08:35:22,35.15815,-80.734772,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
3416523,A-3426406,Source1,4,2016-12-15 10:42:57,2016-12-15 16:42:57,38.50928,-90.622916,38.50721,-90.62325,0.144,...,False,False,False,False,True,False,Day,Day,Day,Day
557746,A-561207,Source2,1,2022-07-20 07:18:27,2022-07-20 08:18:16,35.084339,-82.468781,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
5381678,A-5422559,Source1,4,2022-11-04 02:22:00,2022-11-04 03:46:19,33.999802,-118.280993,33.999334,-118.280945,0.032,...,False,False,False,False,False,False,Night,Night,Night,Night


In [12]:
# Show the reduced sample frame's dimensions as a (rows, columns) tuple.
sample_df.shape

(131383, 46)