In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('acc.csv')

In [3]:
df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [4]:
#Only using accidents reported from 'MapQuest', as different sources tend to have different opinions about the 'severity' of the accident.
df = df[df.Source=="MapQuest"]
df = df.drop(columns=['Source'])

In [5]:
#Removing features that are not required
df=df.drop(columns=['ID','TMC','Description','Distance(mi)','Start_Time','End_Time','End_Lat','End_Lng','Country','Turning_Loop','Weather_Timestamp','Number','Wind_Chill(F)','Bump','Give_Way','No_Exit','Roundabout','Traffic_Calming','Street', 'City', 'County', 'Zipcode', 'Airport_Code'])

In [6]:
#Using one hot encoding for 'Weather_Condition' after combining similar weather conditions
#Replacing na values with 'False'
df['Clear'] = df['Weather_Condition'].str.contains('Clear',case=False,na=False)
df['Cloud'] = df['Weather_Condition'].str.contains('Cloud|Overcast',case=False,na=False)
df['Rain'] = df['Weather_Condition'].str.contains('Rain|Storm',case=False,na=False)
df['Heavy_Rain'] = df['Weather_Condition'].str.contains('Heavy T-Storm|Heavy Rain|Heavy Thunderstorms|Rain Shower',case=False,na=False)
df['Snow'] = df['Weather_Condition'].str.contains('Snow|Sleet|Ice',case=False,na=False)
df['Heavy_Snow'] = df['Weather_Condition'].str.contains('Heavy Snow|Snow Showers|Heavy Sleet|Heavy Ice Pellets|Squalls',case=False,na=False)
df['Fog'] = df['Weather_Condition'].str.contains('fog',case=False,na=False)

In [7]:
#Assign NA to created weather features where 'Weather_Condition' is NA
mask=df.loc[df['Weather_Condition'].isnull()]['Weather_Condition']
for weather in ['Clear','Cloud','Rain','Heavy_Rain','Snow','Heavy_Snow','Fog']:
    df.loc[df['Weather_Condition'].isnull(),weather] = mask

In [8]:
#We can drop the 'Weather_Condition' as we have used one hot encoding for it.
df = df.drop(columns=['Weather_Condition'])

In [9]:
#Replacing missing values in 'Precipitation(in)' with the median value
df['Precipitation(in)'] = df['Precipitation(in)'].fillna(df['Precipitation(in)'].median())

In [10]:
#Dropping the rows where any one of the below features are NA as the count of missing values are very low in these features
df = df.dropna(subset=['Sunrise_Sunset','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight',
                      'Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)',
                      'Wind_Direction','Clear','Cloud','Rain','Heavy_Rain','Snow','Heavy_Snow','Fog'])

In [11]:
#Severity Level 4 is condsidered as High Severity accident and the rest of the Levels are considered as Low Severity accident
df['Severity4'] = 1
df.loc[df['Severity']!=4,'Severity4'] = 0
df = df.drop(['Severity'],axis=1)

In [12]:
# One-hot encode categorical features
df = df.replace([True, False], [1,0])
categorical_data = ['Side','State','Timezone','Wind_Direction']
df[categorical_data] = df[categorical_data].astype('category')
df = pd.get_dummies(df, columns=categorical_data, drop_first=True)

In [13]:
df = df.replace(['Day','Night'], [1,0])

In [14]:
df['Severity4'].value_counts()

0    2049335
1       6281
Name: Severity4, dtype: int64

In [15]:
##The dataset is severely skewed in the distribution of severity4 (ratio of 1:326)
#Therefore we oversample(duplicate random entries) the minority class and undersample(delete random entries) the majority class
df_bl = pd.concat([df[df['Severity4']==1].sample(40000, replace = True),df[df['Severity4']==0].sample(40000)],axis=0)

In [16]:
#Train Test Split
X = df_bl.drop(columns=['Severity4'])
y= df_bl['Severity4']

from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)

# split train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [17]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
param_grid = {'max_depth': [2, 4, 6, 8]}
CV_DT = GridSearchCV(DecisionTreeClassifier(random_state=0),param_grid)
CV_DT.fit(X_train, y_train)

# Training step, on X_train with y_train
tree_clf = tree.DecisionTreeClassifier(min_samples_split = 5)
tree_clf = tree_clf.fit(X_train,y_train)

tree_accuracy_train = tree_clf.score(X_train, y_train)
print("Train Accuracy: %f"% (tree_accuracy_train*100))
tree_accuracy_test = tree_clf.score(X_test,y_test)
print("Test Accuracy: %f"% (tree_accuracy_test*100))

Train Accuracy: 99.745313
Test Accuracy: 93.950000
