In [106]:
import numpy as np
import pandas as pd

df = pd.read_csv("US_Accidents_Dec21_updated.csv")

In [107]:
# Check how many values are missing in each column
df.isnull().sum()

ID                             0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                        0
End_Lng                        0
Distance(mi)                   0
Description                    0
Number                   1743911
Street                         2
Side                           0
City                         137
County                         0
State                          0
Zipcode                     1319
Country                        0
Timezone                    3659
Airport_Code                9549
Weather_Timestamp          50736
Temperature(F)             69274
Wind_Chill(F)             469643
Humidity(%)                73092
Pressure(in)               59200
Visibility(mi)             70546
Wind_Direction             73775
Wind_Speed(mph)           157944
Precipitation(in)         549458
Weather_Condition          70636
Amenity   

In [108]:
# Drop unnecessary columns
df = df.drop(columns=['Number', 'Sunrise_Sunset', 'Nautical_Twilight', 'Astronomical_Twilight', 'Street', 'City', 
                      'Zipcode', 'Timezone','ID', 'End_Lat', 'End_Lng', 'Country', 'Airport_Code', 
                      'Weather_Timestamp', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Wind_Direction', 
                      'Wind_Speed(mph)'])

In [111]:
df.head()

Unnamed: 0,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Side,County,State,Temperature(F),Visibility(mi),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Civil_Twilight
0,3,2016-02-08 00:37:08,2016-02-08 06:37:08,40.10891,-83.09286,3.23,Between Sawmill Rd/Exit 20 and OH-315/Olentang...,R,Franklin,OH,42.1,10.0,0.0,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night
1,2,2016-02-08 05:56:20,2016-02-08 11:56:20,39.86542,-84.0628,0.747,At OH-4/OH-235/Exit 41 - Accident.,R,Montgomery,OH,36.9,10.0,0.02,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night
2,2,2016-02-08 06:15:39,2016-02-08 12:15:39,39.10266,-84.52468,0.055,At I-71/US-50/Exit 1 - Accident.,R,Hamilton,OH,36.0,10.0,0.02,Overcast,False,False,False,False,True,False,False,False,False,False,False,False,False,Night
3,2,2016-02-08 06:51:45,2016-02-08 12:51:45,41.06213,-81.53784,0.123,At Dart Ave/Exit 21 - Accident.,R,Summit,OH,39.0,10.0,,Overcast,False,False,False,False,False,False,False,False,False,False,False,False,False,Night
4,3,2016-02-08 07:53:43,2016-02-08 13:53:43,39.172393,-84.492792,0.5,At Mitchell Ave/Exit 6 - Accident.,R,Hamilton,OH,37.0,10.0,0.01,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Day


In [110]:
precip_missing = df[df['Precipitation(in)'].isnull()]
precip_missing['Weather_Condition'].value_counts()
# Looks like the reason precipitation is missing in some rows is because there was no accumulated precipitation 
# during the time of the accident.

Clear                           172786
Overcast                         72357
Mostly Cloudy                    72029
Partly Cloudy                    51271
Fair                             45103
Scattered Clouds                 44052
Cloudy                            8959
Haze                              6675
Light Rain                        3908
Fog                               3230
Rain                              2159
Light Snow                        2115
Smoke                             1170
Thunderstorm                       727
Light Drizzle                      609
Fair / Windy                       474
Mist                               454
Light Freezing Fog                 350
Snow                               341
Patches of Fog                     292
Drizzle                            281
Heavy Rain                         243
Cloudy / Windy                     177
Showers in the Vicinity            156
Light Thunderstorms and Rain       149
Shallow Fog              

In [118]:
# We can replace the null values in the precipitation column with 0
df['Precipitation(in)'] = df['Precipitation(in)'].fillna(0.0)

In [119]:
# Now we can drop the rest of the columns with missing values.
df = df.dropna()
df.isnull().sum()

Severity             0
Start_Time           0
End_Time             0
Start_Lat            0
Start_Lng            0
Distance(mi)         0
Description          0
Side                 0
County               0
State                0
Temperature(F)       0
Visibility(mi)       0
Precipitation(in)    0
Weather_Condition    0
Amenity              0
Bump                 0
Crossing             0
Give_Way             0
Junction             0
No_Exit              0
Railway              0
Roundabout           0
Station              0
Stop                 0
Traffic_Calming      0
Traffic_Signal       0
Turning_Loop         0
Civil_Twilight       0
dtype: int64

In [141]:
# There are over 100 types of weather conditions. We only want the most popular. Additionally, some of them
# mean the same thing ("Rain Shower" and "Rain Showers", for example.)

condition_value_counts = df['Weather_Condition'].value_counts()

# Fair <- clear, fair / windy
# Cloudy <- overcast, cloudy / windy, Mostly Cloudy, Mostly Cloudy / Windy
# Partly Cloudy <- Scattered Clouds, Partly Cloudy / Windy
# Fog <- haze, patches of fog, mist, shallow fog, haze / windy, fog / windy

In [151]:
# Drop all columns with weather conditions under 600 occurrences
condition_value_counts = condition_value_counts[condition_value_counts > 600]
condition_value_counts

Fair                            1097777
Mostly Cloudy                    362226
Cloudy                           346177
Partly Cloudy                    248337
Clear                            172509
Light Rain                       127827
Overcast                          84172
Scattered Clouds                  44840
Light Snow                        43480
Fog                               40946
Haze                              36070
Rain                              30812
Fair / Windy                      14983
Heavy Rain                        11712
Smoke                              7171
Light Drizzle                      6989
Thunder in the Vicinity            6856
Cloudy / Windy                     6731
T-Storm                            6514
Mostly Cloudy / Windy              6277
Thunder                            5968
Light Rain with Thunder            5271
Snow                               5237
Partly Cloudy / Windy              3847
Wintry Mix                         3833


In [154]:
df[df['Weather_Condition'] == condition_value_counts.index.all()]

Unnamed: 0,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Side,County,State,Temperature(F),Visibility(mi),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Civil_Twilight
210242,2,2017-06-12 13:22:13,2017-06-12 19:22:13,40.745721,-112.649028,0.518,At Skull Valley Rd/Exit 77 - Accident.,R,Tooele,UT,63.0,10.0,0.0,Showers in the Vicinity,False,False,False,False,False,False,False,False,False,False,False,False,False,Day
225988,2,2021-12-13 13:34:00,2021-12-13 14:12:30,37.486649,-122.14223,5.277,Stationary traffic on CA-84 E - Bayfront Expy ...,R,San Mateo,CA,61.0,7.0,0.0,Showers in the Vicinity,False,False,True,False,False,False,False,False,False,False,False,True,False,Day
228259,2,2021-11-10 15:01:00,2021-11-10 16:16:11,28.062276,-82.000811,0.215,Incident on I-4 EB near MM 29 Drive with caution.,R,Polk,FL,73.0,10.0,0.0,Showers in the Vicinity,False,False,False,False,False,False,False,False,False,False,False,False,False,Day
228964,2,2021-12-30 11:57:00,2021-12-30 14:01:38,33.836531,-117.331483,0.025,Accident from Harley John Rd (Cajalco Rd) to W...,R,Riverside,CA,59.0,10.0,0.0,Showers in the Vicinity,False,False,False,False,False,False,False,False,False,False,False,False,False,Day
241447,2,2021-09-06 13:32:00.000000000,2021-09-06 14:47:17.000000000,27.953757,-81.968811,0.147,Incident on S FLORIDA AVE near CR-540A Drive w...,L,Polk,FL,73.0,10.0,0.0,Showers in the Vicinity,False,False,True,False,False,False,False,False,False,False,False,True,False,Day
241725,2,2021-11-05 16:27:00.000000000,2021-11-05 17:43:01.000000000,26.935247,-80.153964,0.283,Incident on FL-9 NB near INDIANTOWN RD Drive w...,R,Palm Beach,FL,75.0,7.0,0.0,Showers in the Vicinity,False,False,False,False,True,False,False,False,False,False,False,False,False,Day
244899,2,2021-09-24 12:52:30,2021-09-24 13:30:30,27.125305,-80.280079,0.913,Slow traffic on FL-91 N - Florida's Tpke N fro...,R,Martin,FL,79.0,7.0,0.0,Showers in the Vicinity,False,False,False,False,False,False,False,False,False,False,False,False,False,Day
246748,2,2021-08-24 21:43:00.000000000,2021-08-25 01:58:53.000000000,28.054821,-81.965834,0.02,Incident on W MEMORIAL BLVD near N LINCOLN AVE...,L,Polk,FL,77.0,10.0,0.0,Showers in the Vicinity,False,False,True,False,False,False,False,False,False,False,False,True,False,Night
250749,2,2021-09-24 20:37:00,2021-09-24 21:52:46,26.980078,-80.183293,0.406,Incident on I-95 SB near MM 90 Drive with caut...,R,Martin,FL,79.0,7.0,0.0,Showers in the Vicinity,False,False,False,False,False,False,False,False,False,False,False,False,False,Night
253253,2,2021-04-23 13:26:30,2021-04-23 15:32:30,39.064048,-104.855627,1.886,Slow traffic on US-85 N - US-87 N - I-25 N fro...,R,El Paso,CO,52.0,10.0,0.0,Showers in the Vicinity,False,False,False,False,False,False,False,False,False,False,False,False,False,Day
