# US Accidents - Severity Prediction

## Team Members
- Aditya Kamble
- Sidharth Panda

## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


## Data Preparation

In [2]:
data_df = pd.read_csv('data/US_Accidents_Dec19.csv')

In [3]:
data_df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [4]:
data_df.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [5]:
data_df.describe()

Unnamed: 0,TMC,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Number,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
count,2246264.0,2974335.0,2974335.0,2974335.0,728071.0,728071.0,2974335.0,1056730.0,2918272.0,1121712.0,2915162.0,2926193.0,2908644.0,2533495.0,975977.0
mean,207.8316,2.36019,36.49361,-95.42625,37.580871,-99.976032,0.2855654,5837.004,62.3512,51.32685,65.40542,29.8319,9.15077,8.298064,0.020495
std,20.32959,0.5414733,4.918849,17.21881,5.004757,18.416647,1.548392,15159.28,18.78855,25.19127,22.55676,0.7213808,2.892114,5.138546,0.23577
min,200.0,1.0,24.55527,-124.6238,24.57011,-124.497829,0.0,0.0,-77.8,-65.9,1.0,0.0,0.0,0.0,0.0
25%,201.0,2.0,33.5504,-117.292,33.957554,-118.28661,0.0,837.0,50.0,32.0,49.0,29.82,10.0,4.6,0.0
50%,201.0,2.0,35.84969,-90.25083,37.90367,-96.63169,0.0,2717.0,64.4,54.0,67.0,29.98,10.0,7.0,0.0
75%,201.0,3.0,40.37026,-80.91891,41.37263,-82.32385,0.01,7000.0,76.0,73.0,84.0,30.11,10.0,10.4,0.0
max,406.0,4.0,49.0022,-67.11317,49.075,-67.109242,333.63,9999997.0,170.6,115.0,100.0,33.04,140.0,822.8,25.0


In [6]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2974335 entries, 0 to 2974334
Data columns (total 49 columns):
ID                       object
Source                   object
TMC                      float64
Severity                 int64
Start_Time               object
End_Time                 object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance(mi)             float64
Description              object
Number                   float64
Street                   object
Side                     object
City                     object
County                   object
State                    object
Zipcode                  object
Country                  object
Timezone                 object
Airport_Code             object
Weather_Timestamp        object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi

## Feature Preprocessing

In [7]:
data_df['Start_Time'] = pd.to_datetime(data_df['Start_Time'], errors='coerce')
data_df['End_Time'] = pd.to_datetime(data_df['End_Time'], errors='coerce')

In [8]:
# Extract year, month, day, hour and weekday

data_df['Year'] = data_df['Start_Time'].dt.year
data_df['Month'] = data_df['Start_Time'].dt.strftime('%b')
data_df['Day'] = data_df['Start_Time'].dt.day
data_df['Hour'] = data_df['Start_Time'].dt.hour
data_df['Weekday'] = data_df['Start_Time'].dt.strftime('%a')

**Time in the unit of minutes for each accident**

In [9]:
time_duration = 'Time_Duration_Min'

In [10]:
data_df[time_duration] = round((data_df['End_Time'] - data_df['Start_Time'])/np.timedelta64(1,'m'))

In [11]:
data_df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Year,Month,Day,Hour,Weekday,Time_Duration_Min
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,Night,Night,Night,Night,2016,Feb,8,5,Mon,314.0
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,Night,Night,Night,Day,2016,Feb,8,6,Mon,30.0
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,Night,Night,Day,Day,2016,Feb,8,6,Mon,30.0
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,Night,Day,Day,Day,2016,Feb,8,7,Mon,30.0
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,Day,Day,Day,Day,2016,Feb,8,7,Mon,30.0


In [12]:
data_df.tail()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Year,Month,Day,Hour,Weekday,Time_Duration_Min
2974330,A-2974354,Bing,,2,2019-08-23 18:03:25,2019-08-23 18:32:01,34.00248,-117.37936,33.99888,-117.37094,...,Day,Day,Day,Day,2019,Aug,23,18,Fri,29.0
2974331,A-2974355,Bing,,2,2019-08-23 19:11:30,2019-08-23 19:38:23,32.76696,-117.14806,32.76555,-117.15363,...,Day,Day,Day,Day,2019,Aug,23,19,Fri,27.0
2974332,A-2974356,Bing,,2,2019-08-23 19:00:21,2019-08-23 19:28:49,33.77545,-117.84779,33.7774,-117.85727,...,Day,Day,Day,Day,2019,Aug,23,19,Fri,28.0
2974333,A-2974357,Bing,,2,2019-08-23 19:00:21,2019-08-23 19:29:42,33.99246,-118.40302,33.98311,-118.39565,...,Day,Day,Day,Day,2019,Aug,23,19,Fri,29.0
2974334,A-2974358,Bing,,2,2019-08-23 18:52:06,2019-08-23 19:21:31,34.13393,-117.23092,34.13736,-117.23934,...,Day,Day,Day,Day,2019,Aug,23,18,Fri,29.0


### Remove Entries with negative time duration

In [13]:
neg_outliers = data_df[time_duration] <= 0
data_df[neg_outliers] = np.nan
data_df.dropna(subset=[time_duration],axis=0,inplace=True)

In [14]:
features_to_use = ['TMC','Severity','Start_Lng','Start_Lat','Distance(mi)','Side','City','County','State',
                   'Timezone','Temperature(F)','Humidity(%)','Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
                   'Weather_Condition','Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway',
                   'Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Turning_Loop','Sunrise_Sunset',
                   'Hour','Weekday', 'Time_Duration_Min']

In [15]:
features_imp = ['TMC',
                'Start_Lng',
                'Start_Lat',
                'Distance(mi)',
                'Temperature(F)',
                'Humidity(%)',
                'Pressure(in)',
                'Hour',
                'State',
                'Severity',
                'Time_Duration_Min']

In [16]:
data_df_ml = data_df[features_imp].copy()

In [17]:
data_df_ml.dropna(subset=data_df_ml.columns[data_df_ml.isnull().mean()!=0], how='any', axis=0, inplace=True)

In [18]:
data_df_ml.head()

Unnamed: 0,TMC,Start_Lng,Start_Lat,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Hour,State,Severity,Time_Duration_Min
0,201.0,-84.058723,39.865147,0.01,36.9,91.0,29.68,5.0,OH,3.0,314.0
1,201.0,-82.831184,39.928059,0.01,37.9,100.0,29.65,6.0,OH,2.0,30.0
2,201.0,-84.032608,39.063148,0.01,36.0,100.0,29.67,6.0,OH,2.0,30.0
3,201.0,-84.205582,39.747753,0.01,35.1,96.0,29.64,7.0,OH,3.0,30.0
4,201.0,-84.188354,39.627781,0.01,36.0,89.0,29.65,7.0,OH,2.0,30.0


## Modelling

Note - Processing all states at a time is resource heavy using normal data preprocessing. Using PySpark would resolve the issue

In [19]:
target = 'Severity'

In [20]:
data_df_state = data_df_ml[data_df_ml['State'] == 'CA']

In [21]:
data_df_state_en = pd.get_dummies(data_df_state, drop_first=True)

In [22]:
y = data_df_state_en[target]
X = data_df_state_en.drop(target, axis=1)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

### Featuer Selection

In [24]:
random_forest_classifier = RandomForestClassifier(n_estimators=10)
random_forest_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
y_pred = random_forest_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [26]:
accuracy

0.8883131113672941