# US Accidents - Severity Prediction

## Team Members
- Aditya Kamble
- Sidharth Panda

## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

  from numpy.core.umath_tests import inner1d


In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

## Data Preparation

In [5]:
%%bigquery data_df
select * from `kbs-2020.usaccidents.accidents`

In [6]:
data_df.shape

(2974335, 49)

In [8]:
data_df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-3418,MapQuest,222.0,2,2016-07-18 17:05:24+00:00,2016-07-18 18:20:24+00:00,38.932369,-121.090813,,,...,False,False,False,False,True,False,Day,Day,Day,Day
1,A-25968,MapQuest,247.0,2,2016-08-05 12:42:34+00:00,2016-08-05 13:27:34+00:00,37.788002,-122.393623,,,...,False,False,False,False,True,False,Day,Day,Day,Day
2,A-28018,MapQuest,222.0,3,2016-08-19 20:53:29+00:00,2016-08-19 22:08:29+00:00,38.773979,-121.241997,,,...,False,False,False,False,False,False,Night,Night,Day,Day
3,A-38797,MapQuest,248.0,2,2016-05-17 18:25:22+00:00,2016-05-17 18:55:22+00:00,37.800373,-122.447121,,,...,False,False,False,False,True,False,Day,Day,Day,Day
4,A-1180743,MapQuest,246.0,3,2019-01-19 12:45:07+00:00,2019-01-19 13:29:26+00:00,33.941284,-84.504784,,,...,False,False,False,False,False,False,Day,Day,Day,Day


In [9]:
data_df.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance_mi_',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature_F_', 'Wind_Chill_F_', 'Humidity___', 'Pressure_in_',
       'Visibility_mi_', 'Wind_Direction', 'Wind_Speed_mph_',
       'Precipitation_in_', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [10]:
data_df.describe()

Unnamed: 0,TMC,Severity,Start_Lat,Start_Lng,Distance_mi_,Number,Temperature_F_,Wind_Chill_F_,Humidity___,Pressure_in_,Visibility_mi_,Wind_Speed_mph_,Precipitation_in_
count,2246264.0,2974335.0,2974335.0,2974335.0,2974335.0,1056730.0,2918272.0,1121712.0,2915162.0,2926193.0,2908644.0,2533495.0,975977.0
mean,207.8316,2.36019,36.49361,-95.42625,0.2855654,5837.004,62.3512,51.32685,65.40542,29.8319,9.15077,8.298064,0.020495
std,20.32959,0.5414733,4.918849,17.21881,1.548392,15159.28,18.78855,25.19127,22.55676,0.7213808,2.892114,5.138546,0.23577
min,200.0,1.0,24.55527,-124.6238,0.0,0.0,-77.8,-65.9,1.0,0.0,0.0,0.0,0.0
25%,201.0,2.0,33.5504,-117.292,0.0,837.0,50.0,32.0,49.0,29.82,10.0,4.6,0.0
50%,201.0,2.0,35.84969,-90.25083,0.0,2717.0,64.4,54.0,67.0,29.98,10.0,7.0,0.0
75%,201.0,3.0,40.37026,-80.91891,0.01,7000.0,76.0,73.0,84.0,30.11,10.0,10.4,0.0
max,406.0,4.0,49.0022,-67.11317,333.63,9999997.0,170.6,115.0,100.0,33.04,140.0,822.8,25.0


In [11]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2974335 entries, 0 to 2974334
Data columns (total 49 columns):
ID                       object
Source                   object
TMC                      float64
Severity                 int64
Start_Time               datetime64[ns, UTC]
End_Time                 datetime64[ns, UTC]
Start_Lat                float64
Start_Lng                float64
End_Lat                  object
End_Lng                  object
Distance_mi_             float64
Description              object
Number                   float64
Street                   object
Side                     object
City                     object
County                   object
State                    object
Zipcode                  object
Country                  object
Timezone                 object
Airport_Code             object
Weather_Timestamp        datetime64[ns, UTC]
Temperature_F_           float64
Wind_Chill_F_            float64
Humidity___              float64
Pressure_

## Feature Preprocessing

In [12]:
data_df['Start_Time'] = pd.to_datetime(data_df['Start_Time'], errors='coerce')
data_df['End_Time'] = pd.to_datetime(data_df['End_Time'], errors='coerce')

In [13]:
# Extract year, month, day, hour and weekday

data_df['Year'] = data_df['Start_Time'].dt.year
data_df['Month'] = data_df['Start_Time'].dt.strftime('%b')
data_df['Day'] = data_df['Start_Time'].dt.day
data_df['Hour'] = data_df['Start_Time'].dt.hour
data_df['Weekday'] = data_df['Start_Time'].dt.strftime('%a')

**Time in the unit of minutes for each accident**

In [14]:
time_duration = 'Time_Duration_Min'

In [15]:
data_df[time_duration] = round((data_df['End_Time'] - data_df['Start_Time'])/np.timedelta64(1,'m'))

In [16]:
data_df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Year,Month,Day,Hour,Weekday,Time_Duration_Min
0,A-3418,MapQuest,222.0,2,2016-07-18 17:05:24+00:00,2016-07-18 18:20:24+00:00,38.932369,-121.090813,,,...,Day,Day,Day,Day,2016,Jul,18,17,Mon,75.0
1,A-25968,MapQuest,247.0,2,2016-08-05 12:42:34+00:00,2016-08-05 13:27:34+00:00,37.788002,-122.393623,,,...,Day,Day,Day,Day,2016,Aug,5,12,Fri,45.0
2,A-28018,MapQuest,222.0,3,2016-08-19 20:53:29+00:00,2016-08-19 22:08:29+00:00,38.773979,-121.241997,,,...,Night,Night,Day,Day,2016,Aug,19,20,Fri,75.0
3,A-38797,MapQuest,248.0,2,2016-05-17 18:25:22+00:00,2016-05-17 18:55:22+00:00,37.800373,-122.447121,,,...,Day,Day,Day,Day,2016,May,17,18,Tue,30.0
4,A-1180743,MapQuest,246.0,3,2019-01-19 12:45:07+00:00,2019-01-19 13:29:26+00:00,33.941284,-84.504784,,,...,Day,Day,Day,Day,2019,Jan,19,12,Sat,44.0


In [17]:
data_df.tail()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Year,Month,Day,Hour,Weekday,Time_Duration_Min
2974330,A-643009,MapQuest,245.0,3,2019-11-09 17:57:49+00:00,2019-11-09 20:05:14+00:00,30.439508,-91.195831,,,...,Night,Night,Day,Day,2019,Nov,9,17,Sat,127.0
2974331,A-825446,MapQuest,245.0,3,2019-07-28 17:02:02+00:00,2019-07-28 17:30:23+00:00,30.484991,-91.166161,,,...,Day,Day,Day,Day,2019,Jul,28,17,Sun,28.0
2974332,A-1668519,MapQuest,245.0,3,2018-05-19 15:49:07+00:00,2018-05-19 16:48:41+00:00,30.499462,-91.159531,,,...,Day,Day,Day,Day,2018,May,19,15,Sat,60.0
2974333,A-1966071,MapQuest,245.0,3,2017-12-15 19:02:43+00:00,2017-12-15 19:32:28+00:00,30.442533,-91.007462,,,...,Night,Night,Night,Night,2017,Dec,15,19,Fri,30.0
2974334,A-2094111,MapQuest,245.0,3,2017-10-22 17:25:57+00:00,2017-10-22 17:55:38+00:00,30.422909,-91.139572,,,...,Day,Day,Day,Day,2017,Oct,22,17,Sun,30.0


### Remove Entries with negative time duration

In [18]:
neg_outliers = data_df[time_duration] <= 0
data_df[neg_outliers] = np.nan
data_df.dropna(subset=[time_duration],axis=0,inplace=True)

In [27]:
features_imp = ['TMC',
                'Start_Lng',
                'Start_Lat',
                'Distance_mi_',
                'Temperature_F_',
                'Humidity___',
                'Pressure_in_',
                'Hour',
                'State',
                'Severity',
                'Time_Duration_Min']

In [28]:
data_df_ml = data_df[features_imp].copy()

In [29]:
data_df_ml.dropna(subset=data_df_ml.columns[data_df_ml.isnull().mean()!=0], how='any', axis=0, inplace=True)

In [30]:
data_df_ml.head()

Unnamed: 0,TMC,Start_Lng,Start_Lat,Distance_mi_,Temperature_F_,Humidity___,Pressure_in_,Hour,State,Severity,Time_Duration_Min
0,222.0,-121.090813,38.932369,0.0,80.6,26.0,30.05,17.0,CA,2.0,75.0
1,247.0,-122.393623,37.788002,0.0,64.9,70.0,29.94,12.0,CA,2.0,45.0
2,222.0,-121.241997,38.773979,0.0,80.6,37.0,29.8,20.0,CA,3.0,75.0
3,248.0,-122.447121,37.800373,0.0,77.0,74.0,29.89,18.0,CA,2.0,30.0
4,246.0,-84.504784,33.941284,0.0,59.2,100.0,29.77,12.0,GA,3.0,44.0


## Modelling

Note - Processing all states at a time is resource heavy using normal data preprocessing. Using PySpark would resolve the issue

In [31]:
target = 'Severity'

In [32]:
data_df_ml_en = pd.get_dummies(data_df_ml, drop_first=True)

In [34]:
data_df_ml_en.to_csv('data-processed/us_accidents_dec19.csv', index=False)

In [49]:
sparkContext = SparkContext.getOrCreate()
spark = SparkSession(sparkContext)

In [50]:
data_df = spark.read.csv('data-processed/us_accidents_dec19.csv', header = True, inferSchema = True)
data_df.printSchema()

root
 |-- TMC: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Distance_mi_: double (nullable = true)
 |-- Temperature_F_: double (nullable = true)
 |-- Humidity___: double (nullable = true)
 |-- Pressure_in_: double (nullable = true)
 |-- Hour: double (nullable = true)
 |-- Severity: double (nullable = true)
 |-- Time_Duration_Min: double (nullable = true)
 |-- State_AR: integer (nullable = true)
 |-- State_AZ: integer (nullable = true)
 |-- State_CA: integer (nullable = true)
 |-- State_CO: integer (nullable = true)
 |-- State_CT: integer (nullable = true)
 |-- State_DC: integer (nullable = true)
 |-- State_DE: integer (nullable = true)
 |-- State_FL: integer (nullable = true)
 |-- State_GA: integer (nullable = true)
 |-- State_IA: integer (nullable = true)
 |-- State_ID: integer (nullable = true)
 |-- State_IL: integer (nullable = true)
 |-- State_IN: integer (nullable = true)
 |-- State_KS: integer (nullable = true)
 |

In [51]:
input_features = data_df.columns
input_features.remove('Severity')

In [52]:
assembler = VectorAssembler(inputCols = input_features , outputCol = 'features')

In [53]:
final_df = assembler.transform(data_df)

In [54]:
train, test = final_df.randomSplit([0.7, 0.3])

In [56]:
random_forest_classifier = RandomForestClassifier(labelCol = 'Severity', featuresCol = 'features')

In [57]:
decision_tree_classifier = DecisionTreeClassifier(labelCol = 'Severity', featuresCol = 'features')

In [58]:
rf_model = random_forest_classifier.fit(train)

In [59]:
rf_predictions = rf_model.transform(test)

In [60]:
dt_model = decision_tree_classifier.fit(train)

In [61]:
dt_predictions = dt_model.transform(test)

In [62]:
multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'Severity', metricName = 'accuracy')

In [64]:
logistic_regression = LogisticRegression(labelCol = 'Severity', featuresCol = 'features')

In [65]:
lr_model = logistic_regression.fit(train)

In [66]:
lr_predictions = lr_model.transform(test)

In [68]:
print('Decision Tree Accuracy:', multi_evaluator.evaluate(dt_predictions))
print('Random Forest Accuracy:', multi_evaluator.evaluate(rf_predictions))
print('Logistic Regression:', multi_evaluator.evaluate(lr_predictions))

Decision Tree Accuracy: 0.6929468555172205
Random Forest Accuracy: 0.6657784032443557
Logistic Regression: 0.6859224623206828
