In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.externals import joblib
from collections import Counter



Note to reader: This is a nearly-unaltered line-by-line functional data cleaning, faithful to how I actually completed this work, not trying to be tidy or clever for an audience. It is not particularly interesting, nor especially easy to follow. It includes a lot of calls to .info() and .value_counts(), usually to check the number or type of data in a column to better understand the data set or to later make some functional change to those values.

The interpretation of this coded data is available on the SWIRTS website, and a copy is in this github repository for reference.

### Creating Dataframe: Merging collisions and parties data, filtering on "motorcycles involved"

In [5]:
parties = pd.read_pickle('parties.pkl')

In [6]:
collisions = pd.read_msgpack('collisions.msg')

In [46]:
parties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14851757 entries, 0 to 14851756
Data columns (total 20 columns):
CASE_ID                 object
PARTY_NUMBER            int64
PARTY_TYPE              object
AT_FAULT                object
PARTY_SEX               object
PARTY_AGE               int64
PARTY_SOBRIETY          object
PARTY_DRUG_PHYSICAL     object
PARTY_SAFETY_EQUIP_1    object
PARTY_SAFETY_EQUIP_2    object
SP_INFO_2               object
OAF_VIOL_CAT            object
OAF_1                   object
OAF_2                   object
PARTY_NUMBER_KILLED     int64
PARTY_NUMBER_INJURED    int64
MOVE_PRE_ACC            object
VEHICLE_YEAR            float64
STWD_VEHICLE_TYPE       object
INATTENTION             object
dtypes: float64(1), int64(4), object(15)
memory usage: 2.2+ GB


In [44]:
parties.CASE_ID = parties.CASE_ID.astype(str)

In [51]:
collisions.CASE_ID = collisions.CASE_ID.astype(str)

In [47]:
collisions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6982242 entries, 522562 to 7504808
Data columns (total 43 columns):
CASE_ID                    object
ACCIDENT_YEAR              int64
COLLISION_DATE             int64
COLLISION_TIME             int64
DAY_OF_WEEK                int64
INTERSECTION               object
WEATHER_1                  object
WEATHER_2                  object
LOCATION_TYPE              object
RAMP_INTERSECTION          object
COLLISION_SEVERITY         int64
NUMBER_KILLED              float64
NUMBER_INJURED             float64
PARTY_COUNT                int64
PRIMARY_COLL_FACTOR        object
PCF_CODE_OF_VIOL           object
PCF_VIOL_CATEGORY          object
PCF_VIOLATION              object
PCF_VIOL_SUBSECTION        object
TYPE_OF_COLLISION          object
MVIW                       object
PED_ACTION                 object
ROAD_SURFACE               object
ROAD_COND_1                object
ROAD_COND_2                object
LIGHTING                   object
PE

In [52]:
cycle_collisions = collisions[collisions['MOTORCYCLE_ACCIDENT'] == 'Y']

In [55]:
cycle_acc = parties.merge(cycle_collisions, how='left', left_on=parties.CASE_ID, right_on=cycle_collisions.CASE_ID)

In [56]:
cycle_acc.count()

CASE_ID_x                  14851757
PARTY_NUMBER               14851757
PARTY_TYPE                 14851757
AT_FAULT                   14851757
PARTY_SEX                  14851757
PARTY_AGE                  14851757
PARTY_SOBRIETY             14851757
PARTY_DRUG_PHYSICAL        14779340
PARTY_SAFETY_EQUIP_1       14851757
PARTY_SAFETY_EQUIP_2       13814531
SP_INFO_2                  14851757
OAF_VIOL_CAT               14851757
OAF_1                      14851757
OAF_2                      14851757
PARTY_NUMBER_KILLED        14851757
PARTY_NUMBER_INJURED       14851757
MOVE_PRE_ACC               14851757
VEHICLE_YEAR               13551488
STWD_VEHICLE_TYPE          14851757
INATTENTION                  540260
CASE_ID_y                    362366
ACCIDENT_YEAR                362366
COLLISION_DATE               362366
COLLISION_TIME               362366
DAY_OF_WEEK                  362366
INTERSECTION                 362366
WEATHER_1                    362366
WEATHER_2                   

In [57]:
cycle_acc1 = cycle_acc[cycle_acc['MOTORCYCLE_ACCIDENT'] == 'Y']

In [58]:
cycle_acc1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 362366 entries, 3462 to 14424929
Data columns (total 63 columns):
CASE_ID_x                  362366 non-null object
PARTY_NUMBER               362366 non-null int64
PARTY_TYPE                 362366 non-null object
AT_FAULT                   362366 non-null object
PARTY_SEX                  362366 non-null object
PARTY_AGE                  362366 non-null int64
PARTY_SOBRIETY             362366 non-null object
PARTY_DRUG_PHYSICAL        360916 non-null object
PARTY_SAFETY_EQUIP_1       362366 non-null object
PARTY_SAFETY_EQUIP_2       362366 non-null object
SP_INFO_2                  362366 non-null object
OAF_VIOL_CAT               362366 non-null object
OAF_1                      362366 non-null object
OAF_2                      362366 non-null object
PARTY_NUMBER_KILLED        362366 non-null int64
PARTY_NUMBER_INJURED       362366 non-null int64
MOVE_PRE_ACC               362366 non-null object
VEHICLE_YEAR               339677 non-

In [62]:
cycle_acc1.drop('CASE_ID_y', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [59]:
cycle_acc1.to_pickle("cycle_acc1.pkl")

In [2]:
cycle_acc1 = pd.read_pickle("cycle_acc1.pkl")

### Creating Dataframe: Only Motorcycle Drivers

In [76]:
#the above df is motorcycles involved - this one is motorcycles driven. Since we're filtering on drivers, it's not going to
#double-count incidents (presumably each vehicle has only one driver) (unless two motorcycles are in a crash)

In [3]:
cycle_acc1['PARTY_TYPE'] = cycle_acc1['PARTY_TYPE'].astype(str)

In [4]:
cycle_acc1['STWD_VEHICLE_TYPE'] = cycle_acc1['STWD_VEHICLE_TYPE'].astype(str)

In [5]:
cycle_acc1.STWD_VEHICLE_TYPE.value_counts()

C    205754
A    118880
D     19487
-      7158
G      1707
N      1657
F      1622
O      1571
L      1054
J       897
E       881
I       710
M       484
B       322
H       155
K        27
Name: STWD_VEHICLE_TYPE, dtype: int64

In [22]:
cycle_driv = cycle_acc1[(cycle_acc1['PARTY_TYPE'] == '1') & (cycle_acc1['STWD_VEHICLE_TYPE'] == 'C')]

In [7]:
cycle_driv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200833 entries, 3462 to 14424928
Data columns (total 63 columns):
CASE_ID_x                  200833 non-null object
PARTY_NUMBER               200833 non-null int64
PARTY_TYPE                 200833 non-null object
AT_FAULT                   200833 non-null object
PARTY_SEX                  200833 non-null object
PARTY_AGE                  200833 non-null int64
PARTY_SOBRIETY             200833 non-null object
PARTY_DRUG_PHYSICAL        200571 non-null object
PARTY_SAFETY_EQUIP_1       200833 non-null object
PARTY_SAFETY_EQUIP_2       200833 non-null object
SP_INFO_2                  200833 non-null object
OAF_VIOL_CAT               200833 non-null object
OAF_1                      200833 non-null object
OAF_2                      200833 non-null object
PARTY_NUMBER_KILLED        200833 non-null int64
PARTY_NUMBER_INJURED       200833 non-null int64
MOVE_PRE_ACC               200833 non-null object
VEHICLE_YEAR               193739 non-

### Dummy coding variables:

In [23]:
cycle_driv.AT_FAULT.value_counts()

Y    112924
N     87909
Name: AT_FAULT, dtype: int64

In [24]:
cycle_driv.loc[:,'AT_FAULT'].replace(to_replace="Y", value=1, inplace=True)
cycle_driv.AT_FAULT.replace(to_replace="N", value=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [25]:
for S in ['PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'TRUCK_ACCIDENT']:
    cycle_driv.loc[:,S].replace(to_replace="Y", value=1, inplace=True)
    cycle_driv.loc[:,S].replace(to_replace="N", value=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [27]:
cycle_driv.ALCOHOL_INVOLVED.value_counts()

Y    16816
Name: ALCOHOL_INVOLVED, dtype: int64

In [40]:
fatalities = cycle_driv.loc[(cycle_driv.NUMBER_KILLED > 0)]
fatalities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6803 entries, 6113 to 14424920
Data columns (total 31 columns):
CASE_ID_x               6803 non-null object
PARTY_NUMBER            6803 non-null int64
AT_FAULT                6803 non-null int64
PARTY_SEX               6803 non-null object
PARTY_AGE               6803 non-null int64
PARTY_SOBRIETY          6803 non-null object
PARTY_DRUG_PHYSICAL     6802 non-null object
PARTY_SAFETY_EQUIP_1    6803 non-null object
PARTY_SAFETY_EQUIP_2    6803 non-null object
SP_INFO_2               6803 non-null object
OAF_VIOL_CAT            6803 non-null object
ACCIDENT_YEAR           6803 non-null float64
COLLISION_DATE          6803 non-null float64
COLLISION_TIME          6803 non-null float64
DAY_OF_WEEK             6803 non-null float64
WEATHER_1               6803 non-null object
NUMBER_KILLED           6803 non-null float64
NUMBER_INJURED          6782 non-null float64
PARTY_COUNT             6803 non-null float64
PRIMARY_COLL_FACTOR     680

In [38]:
fatalities.ALCOHOL_INVOLVED.value_counts()

Y    2092
Name: ALCOHOL_INVOLVED, dtype: int64

In [39]:
fatalities.PARTY_SOBRIETY.value_counts()

A    3532
B    1375
G    1116
-     326
D     248
C     202
H       4
Name: PARTY_SOBRIETY, dtype: int64

In [84]:
cycle_driv.PARTY_SEX.value_counts()

M    181843
F     15465
-      3525
Name: PARTY_SEX, dtype: int64

In [26]:
cols = ['PARTY_SEX', 'PARTY_SOBRIETY', 'PARTY_DRUG_PHYSICAL', 'PARTY_SAFETY_EQUIP_1', 'PARTY_SAFETY_EQUIP_2',
        'SP_INFO_2', 'OAF_VIOL_CAT', 'WEATHER_1',
        'PRIMARY_COLL_FACTOR', 'MVIW', 'ROAD_SURFACE', 'ROAD_COND_1', 'LIGHTING', 'ALCOHOL_INVOLVED'] 
#these columns are going to be dummy-coded

In [98]:
cycle_driv.WEATHER_2.value_counts()

-    198750
C       966
G       517
B       334
F       158
E        95
D        13
Name: WEATHER_2, dtype: int64

In [56]:
cycle_driv.SP_INFO_2.value_counts() #this is.... less than useful

3    151263
-     26635
D     13069
C      7939
2      1484
1       234
B       209
Name: SP_INFO_2, dtype: int64

In [28]:
cycle_driv.ROAD_COND_1.value_counts()

H    190583
D      3150
B      1885
G      1523
-      1492
A      1172
C       802
E       178
F        48
Name: ROAD_COND_1, dtype: int64

In [29]:
cycle_driv.ROAD_COND_2.value_counts()

-    199756
H       548
E       193
D       147
G        89
B        75
C        22
F         3
Name: ROAD_COND_2, dtype: int64

In [90]:
cycle_driv.PARTY_SAFETY_EQUIP_1.value_counts()

P    120503
W     28210
-     22776
M     13375
A      5084
N      3314
G      2378
V      2000
L      1443
B      1111
Y       189
C       110
D        94
U        60
F        55
H        43
E        30
Q        18
X        18
T         7
J         6
R         5
S         2
K         2
Name: PARTY_SAFETY_EQUIP_1, dtype: int64

In [16]:
cycle_driv.PARTY_SAFETY_EQUIP_2.value_counts()

W    130108
-     46900
G     10330
V      5236
A      3936
P      1376
B      1244
Y       434
C       239
N       217
M       195
H       154
E       105
D        65
U        64
Q        61
X        58
L        46
F        20
J        15
K        13
S         8
T         5
R         4
Name: PARTY_SAFETY_EQUIP_2, dtype: int64

In [36]:
cycle_driv.loc[(cycle_driv["PARTY_SAFETY_EQUIP_1"]=='W') & (cycle_driv["PARTY_SAFETY_EQUIP_2"] == 'W')]
#these might just be data entry errors. It looks like the safety equipment section should be multiple sections

Unnamed: 0,CASE_ID_x,PARTY_NUMBER,PARTY_TYPE,AT_FAULT,PARTY_SEX,PARTY_AGE,PARTY_SOBRIETY,PARTY_DRUG_PHYSICAL,PARTY_SAFETY_EQUIP_1,PARTY_SAFETY_EQUIP_2,...,CHP_VEHTYPE_AT_FAULT,COUNT_SEVERE_INJ,COUNT_VISIBLE_INJ,COUNT_COMPLAINT_PAIN,COUNT_PED_KILLED,COUNT_PED_INJURED,COUNT_BICYCLIST_KILLED,COUNT_BICYCLIST_INJURED,COUNT_MC_KILLED,COUNT_MC_INJURED
152786,84657,1,1,1,M,44,A,-,W,W,...,02,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
226329,111898,1,1,1,M,27,A,-,W,W,...,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
507265,271638,2,1,0,M,53,A,-,W,W,...,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1135767,626370,2,1,0,M,51,A,-,W,W,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1165438,643760,2,1,0,M,27,A,-,W,W,...,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1277039,704625,1,1,0,F,40,A,-,W,W,...,-,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1483368,818997,1,1,1,F,32,A,-,W,W,...,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1899242,1050943,1,1,1,M,26,A,-,W,W,...,02,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
dropcols = ['OAF_1', 'OAF_2', 'PARTY_NUMBER_KILLED', 'PARTY_TYPE', 'STWD_VEHICLE_TYPE',
            'PARTY_NUMBER_INJURED', 'MOVE_PRE_ACC', 'VEHICLE_YEAR', 
            'INATTENTION', 'WEATHER_2', 'LOCATION_TYPE', 
            'RAMP_INTERSECTION', 'INTERSECTION', 'CASE_ID_y', 'COLLISION_SEVERITY',  'PCF_CODE_OF_VIOL', 'PCF_VIOL_CATEGORY', 'PCF_VIOLATION',
            'PCF_VIOL_SUBSECTION', 'TYPE_OF_COLLISION', 'PED_ACTION', 'ROAD_COND_2', 'NOT_PRIVATE_PROPERTY', 'STWD_VEHTYPE_AT_FAULT',
            'CHP_VEHTYPE_AT_FAULT', 'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN',
            'COUNT_PED_KILLED', 'COUNT_PED_INJURED', 'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED']
#these columns are either redundant or do not include any useful information, and will be dropped

In [28]:
cycle_driv.drop(dropcols, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [29]:
dums = pd.get_dummies(cycle_driv[cols])

In [30]:
c_driv_d = pd.concat([cycle_driv, dums], axis=1).drop(cols, axis=1)

In [31]:
c_driv_d.head(15)

Unnamed: 0,CASE_ID_x,PARTY_NUMBER,AT_FAULT,PARTY_AGE,ACCIDENT_YEAR,COLLISION_DATE,COLLISION_TIME,DAY_OF_WEEK,NUMBER_KILLED,NUMBER_INJURED,...,ROAD_COND_1_F,ROAD_COND_1_G,ROAD_COND_1_H,LIGHTING_-,LIGHTING_A,LIGHTING_B,LIGHTING_C,LIGHTING_D,LIGHTING_E,ALCOHOL_INVOLVED_Y
3462,1930,1,1,41,2002.0,20020121.0,1620.0,1.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,0
3538,1979,2,0,44,2002.0,20020212.0,1244.0,2.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,0
3971,2216,2,0,32,2002.0,20020212.0,1636.0,2.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,0
4010,2235,2,0,24,2002.0,20020213.0,1812.0,3.0,0.0,1.0,...,0,0,1,0,0,1,0,0,0,0
4136,2297,1,1,42,2002.0,20020201.0,1520.0,5.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,0
4271,2368,2,0,21,2002.0,20020213.0,1909.0,3.0,0.0,1.0,...,0,0,1,0,0,0,1,0,0,0
4582,2547,1,1,21,2002.0,20020211.0,2250.0,1.0,0.0,1.0,...,0,0,1,0,0,0,1,0,0,0
4897,2723,1,1,36,2002.0,20020125.0,839.0,5.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
5209,2895,1,0,52,2002.0,20020105.0,1205.0,6.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,0
5556,3088,1,1,39,2002.0,20020223.0,1618.0,6.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,0


### looking at top initial correlations:

In [42]:
corrs = c_driv_d.corr().round(2)

In [44]:
n = corrs[['NUMBER_KILLED', 'NUMBER_INJURED']]

In [45]:
s=n.unstack()
so = abs(s).sort_values(kind='quicksort', ascending=False)
so

NUMBER_INJURED  NUMBER_INJURED            1.00
NUMBER_KILLED   NUMBER_KILLED             1.00
                COUNT_MC_KILLED           0.98
NUMBER_INJURED  COUNT_MC_INJURED          0.82
NUMBER_KILLED   PARTY_DRUG_PHYSICAL_E     0.36
                COUNT_MC_INJURED          0.23
NUMBER_INJURED  PARTY_AGE                 0.21
NUMBER_KILLED   PARTY_DRUG_PHYSICAL_-     0.20
                PARTY_SOBRIETY_A          0.18
NUMBER_INJURED  PARTY_SEX_-               0.16
                PARTY_DRUG_PHYSICAL_-     0.16
                COUNT_MC_KILLED           0.16
                NUMBER_KILLED             0.15
                PARTY_SOBRIETY_G          0.15
                PARTY_DRUG_PHYSICAL_G     0.15
NUMBER_KILLED   ALCOHOL_INVOLVED_Y        0.15
                NUMBER_INJURED            0.15
                PARTY_SOBRIETY_B          0.13
NUMBER_INJURED  PARTY_NUMBER              0.12
                PARTY_SOBRIETY_A          0.12
NUMBER_KILLED   OAF_VIOL_CAT_-            0.11
NUMBER_INJURE

In [53]:
p = corrs[['PARTY_SAFETY_EQUIP_1_V', 'PARTY_SAFETY_EQUIP_2_V']]

In [54]:
q=p.unstack()
qo = abs(q).sort_values(kind='quicksort', ascending=False)
qo
#a lot of these are inverses of each other or otherwise obvious - will have to do a more in-depth analysis later

PARTY_SAFETY_EQUIP_1_V  PARTY_SAFETY_EQUIP_1_V    1.00
PARTY_SAFETY_EQUIP_2_V  PARTY_SAFETY_EQUIP_2_V    1.00
                        PARTY_SAFETY_EQUIP_2_W    0.22
PARTY_SAFETY_EQUIP_1_V  PARTY_SAFETY_EQUIP_2_-    0.18
                        SP_INFO_2_3               0.15
                        PARTY_SAFETY_EQUIP_2_W    0.14
                        ACCIDENT_YEAR             0.12
                        COLLISION_DATE            0.12
                        SP_INFO_2_D               0.12
                        PARTY_SAFETY_EQUIP_1_P    0.12
PARTY_SAFETY_EQUIP_2_V  PARTY_SOBRIETY_B          0.10
                        PARTY_SAFETY_EQUIP_2_-    0.09
                        PARTY_SOBRIETY_A          0.09
                        ALCOHOL_INVOLVED_Y        0.09
                        PARTY_SAFETY_EQUIP_1_P    0.08
                        OAF_VIOL_CAT_-            0.08
                        PARTY_SAFETY_EQUIP_1_W    0.07
                        AT_FAULT                  0.07
PARTY_SAFE

In [32]:
cycle_atfault = c_driv_d[(c_driv_d['AT_FAULT'] == 1)]

In [61]:
cycle_atfault.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112924 entries, 3462 to 14424928
Columns: 166 entries, CASE_ID_x to ALCOHOL_INVOLVED_Y
dtypes: float64(9), int64(3), object(9), uint8(145)
memory usage: 34.6+ MB


In [67]:
fault_corrs = cycle_atfault.corr().round(2)

In [68]:
f = fault_corrs[['NUMBER_KILLED', 'NUMBER_INJURED']]

In [70]:
g=f.unstack()
go = abs(g).sort_values(kind='quicksort', ascending=False)
go #interestingly, the results are pretty much identical to the not-necessarily-at-fault ones. 

NUMBER_KILLED   NUMBER_KILLED             1.00
NUMBER_INJURED  NUMBER_INJURED            1.00
NUMBER_KILLED   COUNT_MC_KILLED           0.99
NUMBER_INJURED  COUNT_MC_INJURED          0.86
NUMBER_KILLED   PARTY_DRUG_PHYSICAL_E     0.36
                COUNT_MC_INJURED          0.30
NUMBER_INJURED  PARTY_AGE                 0.28
                COUNT_MC_KILLED           0.24
NUMBER_KILLED   NUMBER_INJURED            0.23
NUMBER_INJURED  PARTY_DRUG_PHYSICAL_-     0.23
                NUMBER_KILLED             0.23
                PARTY_SOBRIETY_G          0.22
                PARTY_SEX_-               0.22
                PARTY_DRUG_PHYSICAL_G     0.22
NUMBER_KILLED   PARTY_DRUG_PHYSICAL_-     0.20
                PARTY_SOBRIETY_A          0.18
                ALCOHOL_INVOLVED_Y        0.16
NUMBER_INJURED  PARTY_SOBRIETY_A          0.16
NUMBER_KILLED   PARTY_SOBRIETY_B          0.14
                OAF_VIOL_CAT_-            0.13
NUMBER_INJURED  PARTY_SAFETY_EQUIP_2_W    0.10
             

In [33]:
c_driv_d.to_pickle('c_driv_d.pkl') #all accidents including motorcycle drivers

In [34]:
cycle_atfault.to_pickle('cycle_atfault.pkl') #accidents where motorcycle drivers were at fault