# Seattle Collisions Data Clean-Up, Feature Engineering

This notebook covers data work and feature engineering.

In [None]:
#!conda install -c conda-forge pygeohash -y
#!conda install -c conda-forge shapely -y
#!conda install -c conda-forge astral -y

In [3]:
from shapely.geometry import Polygon
from shapely.geometry import shape, Point
from shapely.ops import nearest_points
from astral import Astral
import pygeohash as gh

import json
import pandas as pd
import types
import itertools
import numpy as np
from datetime import datetime, timedelta, timezone 
import time

import os
import folium
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
%matplotlib inline

In [4]:
# Load the merged collisions + weather extract into the working frame.
# `read_csv` has no `index` keyword (passing it raises TypeError), so it is
# dropped here. The file's saved 'Unnamed: 0' column is therefore read as a
# regular column and the frame gets a default RangeIndex, which is what the
# recorded outputs in this notebook show.
df = pd.read_csv('Seattle_Collisions_Weather.csv', low_memory=False)
print('File downloaded')

File downloaded


In [5]:
# Preview the first five rows, transposed so the wide frame reads top to bottom.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
X,-122.323,-122.347,-122.335,-122.335,-122.306
Y,47.7031,47.6472,47.6079,47.6048,47.5457
OBJECTID,1,2,3,4,5
ADDRTYPE,Intersection,Block,Block,Block,Intersection
LOCATION,5TH AVE NE AND NE 103RD ST,AURORA BR BETWEEN RAYE ST AND BRIDGE WAY N,4TH AVE BETWEEN SENECA ST AND UNIVERSITY ST,2ND AVE BETWEEN MARION ST AND MADISON ST,SWIFT AVE S AND SWIFT AV OFF RP
SEVERITYCODE,2,1,1,1,2
SEVERITYDESC,Injury Collision,Property Damage Only Collision,Property Damage Only Collision,Property Damage Only Collision,Injury Collision
COLLISIONTYPE,Angles,Sideswipe,Parked Car,Other,Angles
PERSONCOUNT,2,2,4,3,2


In [6]:
# Remove the state collision code and its description from the working frame.
# Other removals that were considered are kept here, disabled, for reference:
#df.drop(columns=['Unnamed: 0', 'OBJECTID','SEVERITYDESC','INCDATE'], inplace = True)
#df.drop(columns=['DATE'], inplace = True)
df.drop(['ST_COLCODE', 'ST_COLDESC'], axis='columns', inplace=True)

In [7]:
# Column-by-column summary of the frame: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189339 entries, 0 to 189338
Data columns (total 42 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         189339 non-null  int64  
 1   X                  189339 non-null  float64
 2   Y                  189339 non-null  float64
 3   OBJECTID           189339 non-null  float64
 4   ADDRTYPE           189339 non-null  object 
 5   LOCATION           189339 non-null  object 
 6   SEVERITYCODE       189339 non-null  float64
 7   SEVERITYDESC       189339 non-null  object 
 8   COLLISIONTYPE      184582 non-null  object 
 9   PERSONCOUNT        189339 non-null  float64
 10  PEDCOUNT           189339 non-null  float64
 11  PEDCYLCOUNT        189339 non-null  float64
 12  VEHCOUNT           189339 non-null  float64
 13  INCDATE            189339 non-null  object 
 14  INCDTTM            189339 non-null  object 
 15  JUNCTIONTYPE       185146 non-null  object 
 16  SD

In [8]:
# Missing weather measurements are replaced with 0 in each of these columns.
for weather_col in ("WINDCHILL", "SNOWDEPTH", "VISIBILITY"):
    df[weather_col] = df[weather_col].fillna(0)

In [9]:
def peak_offpeak(input_date):
    """Flag whether a timestamp falls inside the peak window.

    Parameters
    ----------
    input_date : object exposing an ``hour`` attribute
        Typically a ``datetime`` or pandas ``Timestamp``.

    Returns
    -------
    int
        1 when ``8 <= hour <= 18``, otherwise 0. Inputs without a usable
        hour (e.g. ``None``, or a missing timestamp) also yield 0.
    """
    try:
        # The attribute read sits inside the guard: it is the step that
        # can actually fail, and the fallback is meant to cover it.
        hour = input_date.hour
        return 1 if 8 <= hour <= 18 else 0
    except (AttributeError, TypeError):
        return 0

In [10]:
# Parse the incident timestamp once, then derive the calendar features from it.
df['INCDTTM'] = pd.to_datetime(df['INCDTTM'])
incident_dt = df['INCDTTM'].dt
df['YEAR'] = incident_dt.year
df['MONTH'] = incident_dt.month
df['WEEKDAY'] = incident_dt.weekday
# Hour of day as an integer, taken from the zero-padded '%H' text.
df['HOUR'] = incident_dt.strftime('%H').astype(int)
# 1/0 flag produced by peak_offpeak for every incident timestamp.
df['PEAKOROFFPEAK'] = df['INCDTTM'].apply(peak_offpeak)

In [11]:
# Apply major clean-up here
df["SEVERITYCODE"] = df["SEVERITYCODE"].apply(lambda x : 0 if x == 1 else 1)
df["HITPARKEDCAR"] = df["HITPARKEDCAR"].apply(lambda x : 1 if x == 'Y' else 0)
df["PEDROWNOTGRNT"] = df["PEDROWNOTGRNT"].apply(lambda x : 1 if x == 'Y' else 0)
df["SPEEDING"] = df["SPEEDING"].apply(lambda x : 1 if x == 'Y' else 0)
df["INATTENTIONIND"] = df["INATTENTIONIND"].apply(lambda x : 1 if x == 'Y' else 0)
df["UNDERINFL"] = df["UNDERINFL"].fillna(0).apply(lambda x : 1 if x =='Y' else (0 if x == 'N' else int(x)))

The categorical features needed some clean-up as well.  See the EDA for details; all of them are good potential candidates for modeling.

In [12]:
# Keep the four named weather categories; every other value (including missing) becomes NOISE.
_weather_labels = {
    "Overcast": "OVERCAST",
    "Raining": "RAINING",
    "Clear": "CLEAR",
    "Unknown": "UNKNOWN",
}
df["WEATHER"] = df["WEATHER"].map(lambda raw: _weather_labels.get(raw, "NOISE"))

In [13]:
# Keep Dry / Wet / Unknown; every other road condition (including missing) becomes NOISE.
_roadcond_labels = {
    "Dry": "DRY",
    "Wet": "WET",
    "Unknown": "UNKNOWN",
}
df["ROADCOND"] = df["ROADCOND"].map(lambda raw: _roadcond_labels.get(raw, "NOISE"))

In [14]:
# Keep the four named light conditions; every other value (including missing) becomes NOISE.
_lightcond_labels = {
    "Daylight": "DAYLIGHT",
    "Dark - Street Lights On": "DARK-STREET-LIGHTS-ON",
    "Unknown": "UNKNOWN",
    "Dusk": "DUSK",
}
df["LIGHTCOND"] = df["LIGHTCOND"].map(lambda raw: _lightcond_labels.get(raw, "NOISE"))

In [15]:
# Keep the four named junction types; every other value (including missing) becomes NOISE.
_junction_labels = {
    "Mid-Block (not related to intersection)": "MID-BLOCK-UNRELATED",
    "At Intersection (intersection related)": "INTERSECTION-RELATED",
    "Mid-Block (but intersection related)": "MID-BLOCK-RELATED",
    "Driveway Junction": "DRIVEWAY",
}
df["JUNCTIONTYPE"] = df["JUNCTIONTYPE"].map(lambda raw: _junction_labels.get(raw, "NOISE"))

In [16]:
# Binary address type: 'Block' becomes 0, every other value becomes 1.
df["ADDRTYPE"] = df["ADDRTYPE"].map(lambda kind: 1 if kind != 'Block' else 0)
# Alternative encoding, kept disabled for reference:
#df_features['ADDRTYPE'] = \
#df_features['ADDRTYPE'].fillna('Block').replace(\
#                                       ['Alley', 'Block', 'Intersection'], ['0', '1', '1']).astype('float64')

In [17]:
# Collision types in the order that defines their numeric code (position 0..9).
collision_labels = [
    'Angles', 'Cycles', 'Head On', 'Left Turn', 'Other',
    'Parked Car', 'Pedestrian', 'Rear Ended', 'Right Turn', 'Sideswipe',
]
# Label -> code as text, e.g. 'Angles' -> '0'; the final cast makes it numeric.
collision_codes = {label: str(code) for code, label in enumerate(collision_labels)}

# Missing types are treated as 'Other' before encoding.
collision_filled = df['COLLISIONTYPE'].fillna('Other')
df['COLLISIONTYPE'] = collision_filled.replace(collision_codes).astype('float64')

In [19]:
# List the distinct MCCP values present in the frame (NaN shows up among them).
df['MCCP'].unique()

array(['NORTHGATE', 'FREMONT', 'DOWNTOWN COMMERICAL', 'SOUTH BEACON HILL',
       'BALLARD NORTH', 'SLU/CASCADE', 'CAPITOL HILL', 'FAUNTLEROY SW',
       'GEORGETOWN', 'COMMERCIAL DUWAMISH', 'FIRST HILL', 'UNIVERSITY',
       'SODO', 'NORTH ADMIRAL', 'LAKECITY', 'COLUMBIA CITY',
       'JUDKINS PARK', 'CENTRAL AREA/SQUIRE PARK', 'MOUNT BAKER',
       'BRIGHTON/DUNLAP', 'ROOSEVELT/RAVENNA', 'RAINIER BEACH',
       'BALLARD SOUTH', 'MORGAN', 'SOUTH PARK', 'HIGHLAND PARK',
       'GREENWOOD', 'NORTH BEACON/JEFFERSON PARK', 'ALKI', 'SANDPOINT',
       'NORTH BEACON HILL', 'MILLER PARK',
       'ROXHILL/WESTWOOD/ARBOR HEIGHTS', 'NORTH DELRIDGE',
       'INTERNATIONAL DISTRICT - WEST', 'EASTLAKE - WEST', 'BELLTOWN',
       'QUEEN ANNE', 'HIGH POINT', 'MADRONA/LESCHI', nan, 'WALLINGFORD',
       'ALASKA JUNCTION', 'MAGNOLIA', 'MID BEACON HILL',
       'NORTH CAPITOL HILL', 'MONTLAKE/PORTAGE BAY', 'MADISON PARK',
       'PHINNEY RIDGE', 'PIONEER SQUARE', 'BITTERLAKE', 'HILLMAN CITY',
       'R

In [20]:
# Boolean mask of rows whose MCCP is missing, then show those rows.
null_series = df['MCCP'].isna()
df.loc[null_series]

Unnamed: 0.1,Unnamed: 0,X,Y,OBJECTID,ADDRTYPE,LOCATION,SEVERITYCODE,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,...,WINDCHILL,PRECIPITATION,SNOWDEPTH,VISIBILITY,CLOUDCOVER,WEATHERCONDITIONS,SUNSET,SUNRISE,MOONPHASE,PEAKOROFFPEAK
94,94,-122.311398,47.733936,113.0,0,NE 145TH ST BETWEEN 15TH AVE NE AND 17TH AVE NE,0,Property Damage Only Collision,4.0,1.0,...,0.0,0.00,0.0,9.9,42.4,Partially cloudy,2020-04-19T20:05:39-07:00,2020-04-19T06:11:50-07:00,0.96,0
124,124,-122.354917,47.734130,146.0,0,N 145TH ST BETWEEN GREENWOOD AVE N AND PHINNEY...,0,Property Damage Only Collision,3.0,3.0,...,0.0,0.00,0.0,9.9,0.0,Clear,2006-09-12T19:27:35-07:00,2006-09-12T06:42:34-07:00,0.66,1
140,140,-122.293809,47.733764,165.0,1,32ND AVE NE AND NE 145TH ST,1,Injury Collision,7.0,2.0,...,0.0,0.00,0.0,9.4,75.5,Overcast,2019-07-09T21:07:12-07:00,2019-07-09T05:21:29-07:00,0.24,0
247,247,-122.301940,47.733829,288.0,1,25TH AVE NE AND NE 145TH ST,1,Injury Collision,0.0,2.0,...,0.0,0.00,0.0,9.9,0.0,Clear,2006-05-03T20:24:20-07:00,2006-05-03T05:48:52-07:00,0.14,0
291,291,-122.345097,47.734128,340.0,1,AURORA AVE N AND N 145TH ST,1,Injury Collision,6.0,2.0,...,46.1,0.03,0.0,9.7,100.0,"Rain, Overcast",2005-12-21T16:20:20-08:00,2005-12-21T07:54:59-08:00,0.66,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188987,188987,-122.356511,47.517361,219124.0,1,17TH AVE SW AND SW ROXBURY ST,1,Injury Collision,0.0,3.0,...,36.9,0.00,0.0,9.9,90.0,Overcast,2018-12-15T16:18:06-08:00,2018-12-15T07:51:01-08:00,0.23,0
189040,189040,-122.312749,47.733952,219187.0,1,15TH AVE NE AND NE 145TH ST,0,Property Damage Only Collision,4.0,1.0,...,41.5,0.00,0.0,9.9,100.0,Overcast,2018-12-30T16:26:32-08:00,2018-12-30T07:57:33-08:00,0.84,0
189055,189055,-122.312749,47.733952,219204.0,1,15TH AVE NE AND NE 145TH ST,0,Property Damage Only Collision,7.0,2.0,...,0.0,0.00,0.0,1.9,100.0,Overcast,2018-10-08T18:34:31-07:00,2018-10-08T07:18:15-07:00,1.00,1
189250,189250,-122.352318,47.517344,219441.0,0,SW ROXBURY ST BETWEEN 13TH AVE SW AND 14TH AVE SW,0,Property Damage Only Collision,7.0,2.0,...,0.0,0.00,0.0,9.9,100.0,Overcast,2019-01-03T16:30:22-08:00,2019-01-03T07:57:31-08:00,0.98,1


Read about MCCPs here - http://www.seattle.gov/police/information-and-data/mcpp-about.  The SDOT data does not have this so it needed to be added.  Each MCCP is a few square miles and there are about 60 across Seattle.  I used these throughout the project but did not end up using MCCP for modeling; it was used for clustering though.  In the future it would be a good feature for choropleth mapping, but that was not attempted for this effort.

In general the data has no other regional grouping, so MCCP was worth adding.  The GeoJSON is available on GitHub.

In [21]:
# The boundary file can be fetched once with the (disabled) command below.
#!wget --quiet https://raw.githubusercontent.com/seattleio/seattle-boundaries-data/master/data/spd-mcpp-areas.geojson
#print('GeoJSON file downloaded!')

#df['MCCP'] = ''

with open('spd-mcpp-areas.geojson') as f:
    seattle_mccp_json = json.load(f)

# Build each area polygon exactly once. Constructing the shape inside the
# row loop repeats the same work for every collision record.
mccp_polygons = [
    (feature['properties']['NAME'], shape(feature['geometry']))
    for feature in seattle_mccp_json['features']
]

# Tag every collision with the first area whose polygon contains its point
# (X is used as the point's x coordinate, Y as its y coordinate). Rows that
# fall inside no polygon keep whatever MCCP value they already had.
for index, x_coord, y_coord in zip(df.index, df['X'], df['Y']):
    point = Point(x_coord, y_coord)
    for mccp_name, polygon in mccp_polygons:
        if polygon.contains(point):
            df.loc[index, 'MCCP'] = mccp_name
            break

In [22]:
# Discard, in place, the rows whose MCCP is still missing.
df.dropna(axis='index', subset=['MCCP'], inplace=True)

Geohashing was used to make it easier to work with the location data.  Something to revisit is adding another, lower-resolution geohash.
https://en.wikipedia.org/wiki/Geohash


In [23]:
# Geohash each collision at three precisions (one column per precision).
# Approximate cell error by precision:
#   6   ± 0.61 km,  0.3790364 mi
#   7   ± 0.076 km, 0.04722421 mi
#   8   ± 0.019 km, 0.01180605 mi
for gh_precision in (6, 7, 8):
    df['geohash_{}'.format(gh_precision)] = df.apply(
        lambda row, p=gh_precision: gh.encode(row.Y, row.X, precision=p),
        axis=1,
    )

Sun-related features - Seattle has a lot of northbound and southbound traffic.

In [24]:
# Astral instance used for the sun-position lookups in this notebook.
a = Astral()
# Select the 'civil' solar depression setting on that instance.
a.solar_depression = 'civil'
# Look up the 'Seattle' entry; the solar_* helpers call methods on this object.
seattle_astral  = a['Seattle']

def solar_azimuth(ts):
    """Return the solar azimuth reported by ``seattle_astral`` for ``ts``.

    Parameters
    ----------
    ts : timestamp-like
        Passed straight through to ``seattle_astral.solar_azimuth``.

    Returns
    -------
    The value returned by ``seattle_astral.solar_azimuth(ts)``, or
    ``np.nan`` when that call raises an ordinary exception.
    """
    try:
        return seattle_astral.solar_azimuth(ts)
    except Exception:
        # Best-effort: a timestamp that cannot be evaluated yields NaN.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long row-wise run can still be stopped.
        return np.nan

def solar_elevation(ts):
    """Return the solar elevation reported by ``seattle_astral`` for ``ts``.

    Parameters
    ----------
    ts : timestamp-like
        Passed straight through to ``seattle_astral.solar_elevation``.

    Returns
    -------
    The value returned by ``seattle_astral.solar_elevation(ts)``, or
    ``np.nan`` when that call raises an ordinary exception.
    """
    try:
        return seattle_astral.solar_elevation(ts)
    except Exception:
        # Best-effort: a timestamp that cannot be evaluated yields NaN.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long row-wise run can still be stopped.
        return np.nan

In [25]:
# Sun position at the time of each incident. Only INCDTTM is needed, so the
# helpers are applied to that column rather than to every full row.
df['solar_azimuth'] = df['INCDTTM'].apply(solar_azimuth)

# Rounded azimuth. solar_azimuth returns NaN when a timestamp cannot be
# evaluated, and round() raises on NaN, so missing values are passed through
# instead of crashing the cell.
df['solar_azimuth_round'] = df['solar_azimuth'].apply(
    lambda degrees: round(degrees) if pd.notna(degrees) else np.nan
)

df['solar_elevation'] = df['INCDTTM'].apply(solar_elevation)

In [None]:
# Correlation matrix of the numeric and boolean columns, shaded green.
# The columns are selected explicitly because newer pandas raises in
# `DataFrame.corr()` when text/datetime columns are present instead of
# skipping them. A plain format string replaces `Styler.set_precision`,
# which newer pandas removed. The unused RandomState object that used to be
# created here has been dropped.
corr = df.select_dtypes(include=['number', 'bool']).corr()
cm = sns.light_palette("green", as_cmap=True)
corr.style.background_gradient(cmap=cm).format("{:.2f}")

In [26]:
# Column-by-column summary (non-null counts and dtypes) after the feature engineering steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187913 entries, 0 to 189338
Data columns (total 49 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Unnamed: 0           187913 non-null  int64         
 1   X                    187913 non-null  float64       
 2   Y                    187913 non-null  float64       
 3   OBJECTID             187913 non-null  float64       
 4   ADDRTYPE             187913 non-null  int64         
 5   LOCATION             187913 non-null  object        
 6   SEVERITYCODE         187913 non-null  int64         
 7   SEVERITYDESC         187913 non-null  object        
 8   COLLISIONTYPE        187913 non-null  float64       
 9   PERSONCOUNT          187913 non-null  float64       
 10  PEDCOUNT             187913 non-null  float64       
 11  PEDCYLCOUNT          187913 non-null  float64       
 12  VEHCOUNT             187913 non-null  float64       
 13  INCDATE       

In [63]:
#df.to_csv('Seattle_Collisions_Final.csv')