In [2]:
import pandas as pd
from pandas.core import datetools
import numpy as np
import unicodedata
import time
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

%config InlineBackend.figure_format = 'png'
plt.rcParams['figure.dpi']= 300

## Pedestrians and Bicyclists
*Note: Ultimately decided against using this table - the vast majority of pedestrians and bicyclists represented here did not survive. :(*

In [None]:
# Read the pedestrian/bicyclist table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='2015-traffic-fatalities/pbtype.csv')

In [8]:
# Show the first five rows as a quick look at the loaded table.
df.head(n=5)

Unnamed: 0,STATE,ST_CASE,VEH_NO,PER_NO,PBPTYPE,PBAGE,PBSEX,PBCWALK,PBSWALK,PBSZONE,...,PEDPOS,BIKEPOS,PEDDIR,BIKEDIR,MOTDIR,MOTMAN,PEDLEG,PEDSNR,PEDCGP,BIKECGP
0,1,10008,0,1,5,38,1,0,0,0,...,3,7,8,7,8,8,8,8,990,0
1,1,10031,0,1,5,87,1,0,0,0,...,1,7,1,7,4,3,2,3a,750,0
2,1,10040,0,1,5,20,1,0,0,0,...,3,7,8,7,8,8,8,8,310,0
3,1,10042,0,1,5,50,1,0,0,0,...,3,7,1,7,2,3,2,3b,750,0
4,1,10050,0,1,5,52,1,0,0,0,...,3,7,8,7,8,8,8,8,400,0


In [15]:
# Number of ST_CASE entries for each PBAGE value.
# NOTE(review): the result also lists 998 and 999, which look like sentinel
# codes rather than real ages — confirm against the data dictionary before
# treating PBAGE as a numeric age.
(
    df
    .groupby(['PBAGE'])['ST_CASE']
    .count()
)

PBAGE
0        8
1       36
2       26
3       26
4       17
5       26
6       19
7       16
8       22
9       21
10      10
11      25
12      24
13      30
14      48
15      51
16      51
17      73
18      73
19      81
20     102
21     102
22     110
23      99
24     108
25     110
26      86
27      86
28      99
29     118
      ... 
69      66
70      67
71      53
72      65
73      38
74      44
75      52
76      40
77      42
78      41
79      36
80      30
81      29
82      34
83      35
84      20
85      27
86      26
87      28
88      25
89      20
90      14
91      14
92       9
93       9
94       5
95       3
96       3
998     43
999     29
Name: ST_CASE, Length: 99, dtype: int64

In [17]:
# PBSEX:   1 male, 2 female
# PBPTYPE: 5 ped, 6 bike, 7 other cycl, 8 other
# NOTE(review): PBSEX values 8 and 9 also show up in the result; presumably
# "not reported"/"unknown" — confirm against the data dictionary.
(
    df
    .groupby(['PBSEX', 'PBPTYPE'])['ST_CASE']
    .count()
)

PBSEX  PBPTYPE
1      5          3995
       6           723
       8           139
2      5          1787
       6           127
       7             1
       8            28
8      5             6
9      5             7
       6             1
Name: ST_CASE, dtype: int64

In [19]:
# Number of ST_CASE entries per person type.
# PBPTYPE: 5 ped, 6 bike, 7 other cycl, 8 other
(
    df
    .groupby(['PBPTYPE'])['ST_CASE']
    .count()
)

PBPTYPE
5    5795
6     851
7       1
8     167
Name: ST_CASE, dtype: int64

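**Data on Pedestrian Crash Group `PEDCGP` shows that most records with a pedestrian crash group are plain pedestrians (`PBPTYPE` 5); some are "other" (`PBPTYPE` 8).**  
None are 6 (bike) or 7 (other cyclist) — those records all have `PEDCGP` 0 (not a pedestrian).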

In [27]:
# Break each person type down by pedestrian crash group.
# PBPTYPE: 5 ped, 6 bike, 7 other cycl, 8 other
(
    df
    .groupby(['PBPTYPE', 'PEDCGP'])['ST_CASE']
    .count()
)

PBPTYPE  PEDCGP
5        100        664
         200         67
         310         61
         340         29
         350         50
         400        778
         460         20
         500         14
         600        389
         720         18
         740        432
         750       1957
         790        253
         800         67
         910        265
         990        731
6        0          851
7        0            1
8        100         26
         310          3
         400         43
         460          3
         600          2
         740          6
         750         52
         790          9
         800          1
         910          1
         990         21
Name: ST_CASE, dtype: int64

**Pedestrian Codes**  
0 Not a Pedestrian  
100 Unusual Circumstances  
200 Backing Vehicle  
310 Working or Playing in Roadway  
340 Bus-Related  
350 Unique Midblock  
400 Walking/Running Along Roadway  
460 Driveway Access/ Driveway Access Related  
500 Waiting to Cross  
600 Pedestrian in Roadway – Circumstances Unknown  
720 Multiple Threat/Trapped  
740 Dash/Dart-Out  
750 Crossing Roadway – Vehicle Not Turning  
790 Crossing Roadway – Vehicle Turning  
800 Non-Trafficway  
910 Crossing Expressway  
990 Other/Unknown – Insufficient Details  

In [25]:
# Tally by crash group and person type, listed from the most to the least frequent combination.
(
    df
    .groupby(['PEDCGP', 'PBPTYPE'])['ST_CASE']
    .count()
    .sort_values(ascending=False)
)

PEDCGP  PBPTYPE
750     5          1957
0       6           851
400     5           778
990     5           731
100     5           664
740     5           432
600     5           389
910     5           265
790     5           253
200     5            67
800     5            67
310     5            61
750     8            52
350     5            50
400     8            43
340     5            29
100     8            26
990     8            21
460     5            20
720     5            18
500     5            14
790     8             9
740     8             6
460     8             3
310     8             3
600     8             2
800     8             1
910     8             1
0       7             1
Name: ST_CASE, dtype: int64

## Person

In [29]:
# Read the person-level table; the path is relative to the notebook's working directory.
df2 = pd.read_csv(filepath_or_buffer='2015-traffic-fatalities/person.csv')

In [54]:
# List every column of the person table to see which fields are available.
df2.columns

Index(['STATE', 'ST_CASE', 'VE_FORMS', 'VEH_NO', 'PER_NO', 'STR_VEH', 'COUNTY',
       'DAY', 'MONTH', 'HOUR', 'MINUTE', 'RUR_URB', 'FUNC_SYS', 'HARM_EV',
       'MAN_COLL', 'SCH_BUS', 'MAKE', 'MAK_MOD', 'BODY_TYP', 'MOD_YEAR',
       'TOW_VEH', 'SPEC_USE', 'EMER_USE', 'ROLLOVER', 'IMPACT1', 'FIRE_EXP',
       'AGE', 'SEX', 'PER_TYP', 'INJ_SEV', 'SEAT_POS', 'REST_USE', 'REST_MIS',
       'AIR_BAG', 'EJECTION', 'EJ_PATH', 'EXTRICAT', 'DRINKING', 'ALC_DET',
       'ALC_STATUS', 'ATST_TYP', 'ALC_RES', 'DRUGS', 'DRUG_DET', 'DSTATUS',
       'DRUGTST1', 'DRUGTST2', 'DRUGTST3', 'DRUGRES1', 'DRUGRES2', 'DRUGRES3',
       'HOSPITAL', 'DOA', 'DEATH_DA', 'DEATH_MO', 'DEATH_YR', 'DEATH_HR',
       'DEATH_MN', 'DEATH_TM', 'LAG_HRS', 'LAG_MINS', 'P_SF1', 'P_SF2',
       'P_SF3', 'WORK_INJ', 'HISPANIC', 'RACE', 'LOCATION'],
      dtype='object')

In [30]:
# Show the first five rows of the person table.
df2.head(n=5)

Unnamed: 0,STATE,ST_CASE,VE_FORMS,VEH_NO,PER_NO,STR_VEH,COUNTY,DAY,MONTH,HOUR,...,DEATH_TM,LAG_HRS,LAG_MINS,P_SF1,P_SF2,P_SF3,WORK_INJ,HISPANIC,RACE,LOCATION
0,1,10001,1,1,1,0,127,1,1,2,...,240,0,0,0,0,0,0,7,1,0
1,1,10002,1,1,1,0,83,1,1,22,...,2213,0,0,0,0,0,0,7,1,0
2,1,10003,1,1,1,0,11,1,1,1,...,125,0,0,0,0,0,0,7,2,0
3,1,10003,1,1,2,0,11,1,1,1,...,8888,999,99,0,0,0,8,0,0,0
4,1,10004,1,1,1,0,45,4,1,0,...,57,0,0,0,0,0,0,7,1,0


**Person Type** (approximate – codes have changed from year to year)  
1 Driver (in trans)  
2 Passenger (in trans)  
3 Occupant (not in transport)  
4 Occupant - non motor vehicle  
5 Pedestrian  
6 Bicyclist  
7 Other cyclist  
8 Person on personal conveyance  
9 Unknown occupant type (in trans)  
10 Persons in/on buildings  
19 Unknown type of non-motorist

In [40]:
# Number of ST_CASE entries for each person-type code.
(
    df2
    .groupby(['PER_TYP'])['ST_CASE']
    .count()
)

PER_TYP
1     48613
2     24646
3       286
4        66
5      5795
6       851
7         1
8       167
9       131
10       30
19        1
Name: ST_CASE, dtype: int64

**Injury Severity**  
0 No apparent injury  
1 Possible injury  
2 Suspected minor injury  
3 Suspected serious injury  
4 Fatal injury  
5 Injured, severity unknown  
6 Died prior to crash  
8 Not reported  
9 Unknown

In [34]:
# Number of ST_CASE entries for each injury-severity code.
(
    df2
    .groupby(['INJ_SEV'])['ST_CASE']
    .count()
)

INJ_SEV
0    19704
1     7400
2     9045
3     8324
4    35092
5      250
6        2
9      770
Name: ST_CASE, dtype: int64

In [47]:
# Injury-severity breakdown within each person type.
(
    df2
    .groupby(['PER_TYP', 'INJ_SEV'])['ST_CASE']
    .count()
)

PER_TYP  INJ_SEV
1        0          13576
         1           3813
         2           4590
         3           3774
         4          22150
         5            103
         6              2
         9            605
2        0           5941
         1           3470
         2           4237
         3           4268
         4           6447
         5            142
         9            141
3        0            161
         1             30
         2             41
         3             17
         4             34
         9              3
4        0              8
         1              9
         2             11
         3             14
         4             24
5        0             15
         1             63
         2            128
         3            205
         4           5376
         5              4
         9              4
6        0              1
         1              3
         2             13
         3             16
         4           

## Vehicle Damage

In [None]:
# Load the vehicle-damage table. The path is relative to the notebook's working
# directory, matching the pbtype.csv and person.csv loads; a leading '/' would
# make pandas look for the data folder at the filesystem root instead.
df3 = pd.read_csv('2015-traffic-fatalities/damage.csv')