# Capstone project Notebook: Seattle collisions

In [166]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

In [167]:
# Data has been fetched from the Seattle city web page
df = pd.read_csv('Seattle_Collisions.csv')
print(df.shape)
df.columns

(221006, 40)


Index(['X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO', 'STATUS',
       'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC',
       'SEVERITYCODE', 'SEVERITYDESC', 'COLLISIONTYPE', 'PERSONCOUNT',
       'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES',
       'FATALITIES', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND',
       'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE',
       'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

In [210]:
#SEVERITYCODE is our target variable. The example dataset only contained values "1" & "2". The new dataset sampleswith all the values
df['SEVERITYCODE'].value_counts()

1     137335
2      58628
0      21601
2b      3092
3        349
Name: SEVERITYCODE, dtype: int64

In [211]:
# lets have a look at the relation severity code vs weather
df.groupby(['SEVERITYCODE'])['WEATHER'].value_counts(normalize=True)


SEVERITYCODE  WEATHER                 
0             Raining                     1.000000
1             Clear                       0.568606
              Raining                     0.165957
              Overcast                    0.143581
              Unknown                     0.106940
              Snowing                     0.005529
              Other                       0.005409
              Fog/Smog/Smoke              0.002862
              Sleet/Hail/Freezing Rain    0.000637
              Blowing Sand/Dirt           0.000307
              Severe Crosswind            0.000135
              Partly Cloudy               0.000037
2             Clear                       0.627649
              Raining                     0.195668
              Overcast                    0.153272
              Unknown                     0.014161
              Fog/Smog/Smoke              0.003263
              Snowing                     0.002968
              Other                       0

In [212]:
# lets have a look at the relation severity code vs road conditions
df.groupby(['SEVERITYCODE'])['ROADCOND'].value_counts(normalize=True)

SEVERITYCODE  ROADCOND      
0             Wet               1.000000
1             Dry               0.637806
              Wet               0.239456
              Unknown           0.107417
              Ice               0.007017
              Snow/Slush        0.006283
              Other             0.000674
              Standing Water    0.000644
              Sand/Mud/Dirt     0.000404
              Oil               0.000300
2             Dry               0.701469
              Wet               0.275703
              Unknown           0.013079
              Ice               0.004770
              Snow/Slush        0.002897
              Other             0.000746
              Standing Water    0.000520
              Oil               0.000416
              Sand/Mud/Dirt     0.000399
2b            Dry               0.730946
              Wet               0.248686
              Unknown           0.009855
              Ice               0.005913
              Snow/Slush    

## Data preprocessing

Select the feature attributes for our model: prediction of the severity accident based on weather and road conditions

In [213]:
dfSeattle = df[['SEVERITYCODE', 'WEATHER', 'ROADCOND']].copy()
print(dfSeattle.shape)
dfSeattle.dtypes


(221006, 3)


SEVERITYCODE    object
WEATHER         object
ROADCOND        object
dtype: object


Identify and drop all the missing values

In [214]:
# Drop all the rows with no values in SEVERITYCODE
dfSeattle.dropna(subset=["SEVERITYCODE"], axis=0, inplace=True)
dfSeattle.dropna(subset=["WEATHER"], axis=0, inplace=True)
dfSeattle.dropna(subset=["ROADCOND"], axis=0, inplace=True)
print(dfSeattle.shape)

dfSeattle.drop_duplicates()
dfSeattle['SEVERITYCODE'].value_counts()

(194438, 3)


1     133449
2      57611
2b      3039
3        338
0          1
Name: SEVERITYCODE, dtype: int64

After the cleaning, most of the unknown cases (0) are gone

## Convert Categorical features to numerical values

In classification models, the target variable can be categorical. Therefore no changes are required for SEVERITYCODE


One Hot Encoding will be used to convert the categorical features to numerical variables

In [185]:
dummyWeather = pd.get_dummies(dfSeattle['WEATHER'])
dfSeattle = pd.concat([dfSeattle, dummyWeather], axis=1)
dfSeattle.drop(['WEATHER'], axis=1, inplace= True)

dfSeattle.head()


Unnamed: 0,SEVERITYCODE,ROADCOND,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Other,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing,Unknown
2,1,Dry,0,1,0,0,0,0,0,0,0,0,0
3,1,Dry,0,1,0,0,0,0,0,0,0,0,0
5,2,Dry,0,1,0,0,0,0,0,0,0,0,0
7,1,Dry,0,1,0,0,0,0,0,0,0,0,0
8,1,Dry,0,1,0,0,0,0,0,0,0,0,0


In [216]:
dummyRoad = pd.get_dummies(dfSeattle['ROADCOND'])
dfSeattle = pd.concat([dfSeattle, dummyRoad], axis=1)
dfSeattle.drop(['ROADCOND'], axis=1, inplace= True)

dfSeattle.head()


Unnamed: 0,SEVERITYCODE,WEATHER,Dry,Ice,Oil,Other,Sand/Mud/Dirt,Snow/Slush,Standing Water,Unknown,Wet
2,1,Clear,1,0,0,0,0,0,0,0,0
3,1,Clear,1,0,0,0,0,0,0,0,0
5,2,Clear,1,0,0,0,0,0,0,0,0
7,1,Clear,1,0,0,0,0,0,0,0,0
8,1,Clear,1,0,0,0,0,0,0,0,0


In [149]:
print(dfSeattle.shape)
dfSeattle.dtypes

(194438, 21)


SEVERITYCODE                int32
Blowing Sand/Dirt           uint8
Clear                       uint8
Fog/Smog/Smoke              uint8
Other                       uint8
Overcast                    uint8
Partly Cloudy               uint8
Raining                     uint8
Severe Crosswind            uint8
Sleet/Hail/Freezing Rain    uint8
Snowing                     uint8
Unknown                     uint8
Dry                         uint8
Ice                         uint8
Oil                         uint8
Other                       uint8
Sand/Mud/Dirt               uint8
Snow/Slush                  uint8
Standing Water              uint8
Unknown                     uint8
Wet                         uint8
dtype: object

Variables have now the right format and missing values have been dropped  
It is time to prepare the variables for the model

In [217]:
y = dfSeattle['SEVERITYCODE'].values


In [193]:
dfSeattle.drop(['SEVERITYCODE'], axis=1, inplace= True)
X = dfSeattle
X[0:5]

Unnamed: 0,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Other,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing,Unknown,Dry,Ice,Oil,Other.1,Sand/Mud/Dirt,Snow/Slush,Standing Water,Unknown.1,Wet
2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


## Building a classification model: Decision Tree

A Decision Tree machine learning algorithm will be used to build a model from the historical collision data already prepared 

In [228]:
# We split the X into train and test to find the best k
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (155550, 20) (155550,)
Test set: (38888, 20) (38888,)


In [229]:
from sklearn.tree import DecisionTreeClassifier
DT_Seattle = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
DT_Seattle.fit(X_train,y_train)
DT_Seattle

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [230]:
yhat = DT_Seattle.predict(X_test)
yhat

array(['1', '1', '1', ..., '1', '1', '1'], dtype=object)

In [231]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score

DT_yhat = DT_Seattle.predict(X_test)
print("DT Jaccard index: %.2f" % jaccard_similarity_score(y_test, DT_yhat))
print("DT F1-score: %.2f" % f1_score(y_test, DT_yhat, average='weighted', labels=np.unique(DT_yhat)) )



DT Jaccard index: 0.69
DT F1-score: 0.82


The model is able to predict the correct values in the test dataset 80% of the times