In [1]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
import folium
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the pickled crash records. Unpickling can execute arbitrary code, so
# this should only ever be pointed at a file produced by this project.
with open('data/crashes.pkl', 'rb') as pickle_file:
    crashes = pickle.load(pickle_file)

# Column overview: non-null counts and dtypes.
crashes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301934 entries, 0 to 526552
Data columns (total 27 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CRASH_RECORD_ID          301934 non-null  object 
 1   CRASH_DATE               301934 non-null  object 
 2   POSTED_SPEED_LIMIT       301934 non-null  int64  
 3   TRAFFIC_CONTROL_DEVICE   294593 non-null  object 
 4   DEVICE_CONDITION         288120 non-null  object 
 5   WEATHER_CONDITION        295279 non-null  object 
 6   LIGHTING_CONDITION       298364 non-null  object 
 7   FIRST_CRASH_TYPE         301934 non-null  object 
 8   TRAFFICWAY_TYPE          291675 non-null  object 
 9   ALIGNMENT                301934 non-null  object 
 10  ROADWAY_SURFACE_COND     289533 non-null  object 
 11  ROAD_DEFECT              263625 non-null  object 
 12  CRASH_TYPE               301934 non-null  object 
 13  INTERSECTION_RELATED_I   85281 non-null   object 
 14  HIT_

In [3]:
# Share of missing values per column, expressed as a percentage.
crashes.isna().mean().mul(100)

CRASH_RECORD_ID             0.000000
CRASH_DATE                  0.000000
POSTED_SPEED_LIMIT          0.000000
TRAFFIC_CONTROL_DEVICE      2.431326
DEVICE_CONDITION            4.575172
WEATHER_CONDITION           2.204124
LIGHTING_CONDITION          1.182378
FIRST_CRASH_TYPE            0.000000
TRAFFICWAY_TYPE             3.397762
ALIGNMENT                   0.000000
ROADWAY_SURFACE_COND        4.107189
ROAD_DEFECT                12.687872
CRASH_TYPE                  0.000000
INTERSECTION_RELATED_I     71.755086
HIT_AND_RUN_I              77.434141
DAMAGE                      0.000000
PRIM_CONTRIBUTORY_CAUSE     0.000000
WORK_ZONE_I                99.235263
NUM_UNITS                   0.000000
MOST_SEVERE_INJURY          0.088099
INJURIES_TOTAL              0.087105
INJURIES_FATAL              0.087105
CRASH_HOUR                  0.000000
CRASH_DAY_OF_WEEK           0.000000
CRASH_MONTH                 0.000000
LATITUDE                    0.618347
LONGITUDE                   0.618347
d

In [5]:
# Column groups; each group is routed to its own imputer further down.

# Flag columns.
bin_fields = [
    'INTERSECTION_RELATED_I',
    'HIT_AND_RUN_I',
    'WORK_ZONE_I',
]

# Categorical columns.
# NOTE(review): DEVICE_CONDITION also has missing values in the summary above
# but is not listed here - confirm that leaving it out is intentional.
cat_fields = [
    'TRAFFIC_CONTROL_DEVICE',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'FIRST_CRASH_TYPE',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    'CRASH_TYPE',
    'DAMAGE',
    'MOST_SEVERE_INJURY',
]

# Numeric columns.
num_fields = [
    'POSTED_SPEED_LIMIT',
    'NUM_UNITS',
    'INJURIES_TOTAL',
    'INJURIES_FATAL',
]

In [6]:
# Flag columns: a missing entry is filled with the constant 'N'.
bin_imputer = SimpleImputer(
    strategy='constant',
    fill_value='N',
)

# Categorical and numeric columns are both filled with the column's most
# frequent value.
# NOTE(review): for the numeric columns, confirm that the mode is preferred
# over e.g. the median.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='most_frequent')

In [7]:
# Route each column group to its imputer. Columns that belong to no group are
# dropped from the output, since ColumnTransformer's `remainder` defaults to
# 'drop'.
transformer = ColumnTransformer(
    transformers=[
        ('binary', bin_imputer, bin_fields),
        ('categorical', cat_imputer, cat_fields),
        ('numeric', num_imputer, num_fields),
    ],
)

In [8]:
# Fit the imputers on the crash data and keep the imputed array under a name,
# so it stays reachable without going through the kernel's Out[...] history.
crashes_imputed = transformer.fit_transform(crashes)

# Still display the array as the cell output.
crashes_imputed

array([['N', 'N', 'N', ..., 2.0, 0.0, 0.0],
       ['Y', 'N', 'N', ..., 2.0, 0.0, 0.0],
       ['N', 'N', 'N', ..., 3.0, 0.0, 0.0],
       ...,
       ['N', 'N', 'N', ..., 2.0, 3.0, 0.0],
       ['Y', 'N', 'N', ..., 2.0, 0.0, 0.0],
       ['N', 'Y', 'N', ..., 3.0, 0.0, 0.0]], dtype=object)

In [11]:
# Confirm that imputation left no missing values in any output column.
# The transformer fitted in the previous cell is applied directly instead of
# reading Out[8]: the Out cache only exists for the kernel session that
# produced it, so that lookup raises KeyError after a restart or as soon as
# the execution counts shift.
pd.DataFrame(transformer.transform(crashes)).isnull().mean()

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
dtype: float64

In [12]:
def prim_contributory_cause(val):
    """Collapse a raw PRIM_CONTRIBUTORY_CAUSE value into a coarse category.

    Membership is tested, in this order, against the collections ``driving``,
    ``environment`` and ``disregarding_signs``, which are looked up as globals
    and must be defined before this function is called.

    Parameters
    ----------
    val
        The raw cause value to classify.

    Returns
    -------
    str or float
        ``'driving'``, ``'environment'`` or ``'disregarding_signs'`` when
        ``val`` belongs to the matching collection, otherwise ``np.nan``.
    """
    if val in driving:
        return 'driving'
    if val in environment:
        return 'environment'
    if val in disregarding_signs:
        return 'disregarding_signs'
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0 and
    # raises AttributeError there.
    return np.nan

In [13]:
# Per-column converters handed to pd.read_csv: every raw
# PRIM_CONTRIBUTORY_CAUSE value goes through prim_contributory_cause while
# the file is parsed.
converters = dict(PRIM_CONTRIBUTORY_CAUSE=prim_contributory_cause)

In [15]:
# Re-read the raw CSV so PRIM_CONTRIBUTORY_CAUSE is bucketed by the converter
# during parsing, then drop the columns listed in CRASHES_DROP.
# `infer_datetime_format` is intentionally not passed: it only affects columns
# named in `parse_dates` (none are, so CRASH_DATE stays an object column) and
# it has been deprecated since pandas 2.0.
crashes = pd.read_csv(
    'data/chicago_crashes.csv',
    na_values=na_values,
    converters=converters,
    true_values=['Y'],
    false_values=['N'],
).drop(columns=CRASHES_DROP)

# Rows whose cause did not map to any bucket (the converter returned NaN).
# They are set aside as an independent copy before being removed below.
cause_missing = crashes['PRIM_CONTRIBUTORY_CAUSE'].isna()
crashes_no_cause = crashes.loc[cause_missing, :].copy()

# Keep only rows with a bucketed cause. Rebinding the name instead of using
# inplace=True avoids mutating a frame in place.
crashes = crashes.dropna(subset=['PRIM_CONTRIBUTORY_CAUSE'])
crashes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300400 entries, 0 to 526552
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CRASH_RECORD_ID          300400 non-null  object 
 1   CRASH_DATE               300400 non-null  object 
 2   POSTED_SPEED_LIMIT       300400 non-null  int64  
 3   TRAFFIC_CONTROL_DEVICE   293146 non-null  object 
 4   DEVICE_CONDITION         286686 non-null  object 
 5   WEATHER_CONDITION        293806 non-null  object 
 6   LIGHTING_CONDITION       296856 non-null  object 
 7   FIRST_CRASH_TYPE         300400 non-null  object 
 8   TRAFFICWAY_TYPE          290205 non-null  object 
 9   LANE_CNT                 121671 non-null  float64
 10  ALIGNMENT                300400 non-null  object 
 11  ROADWAY_SURFACE_COND     288156 non-null  object 
 12  ROAD_DEFECT              262479 non-null  object 
 13  CRASH_TYPE               300400 non-null  object 
 14  INTE

In [16]:
# Row count per bucketed cause; dropna=False would list any remaining NaN as
# its own entry.
crashes['PRIM_CONTRIBUTORY_CAUSE'].value_counts(dropna=False)

driving               262312
environment            20345
disregarding_signs     17743
Name: PRIM_CONTRIBUTORY_CAUSE, dtype: int64