# __Data Cleaning and Preparation__

### Phase 3 Project - Chicago Traffic Crash Classification

#### Author: Ian Sharff

In [1]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Column-name groups for the crash table.
# NOTE(review): the list names suggest binary-indicator, categorical and
# numeric columns respectively; none of the three lists is referenced in the
# cells visible here — confirm how downstream code consumes them.
BIN_FIELDS = [
    'INTERSECTION_RELATED_I',
    'HIT_AND_RUN_I',
    'WORK_ZONE_I',
]

CAT_FIELDS = [
    # Road, control-device and environment descriptors
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'FIRST_CRASH_TYPE',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    # Crash outcome descriptors
    'CRASH_TYPE',
    'DAMAGE',
    'MOST_SEVERE_INJURY',
    # Time-of-crash components
    'CRASH_HOUR',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
    'CRASH_YEAR',
]

NUM_FIELDS = [
    'POSTED_SPEED_LIMIT',
    'NUM_UNITS',
    'INJURIES_TOTAL',
    'INJURIES_FATAL',
]

In [3]:
# Load the raw crash table from the local pickle file.
# Unpickling can execute arbitrary code, so this file must come from a trusted source.
crashes = pd.read_pickle('data/raw/crashes.pkl')

In [4]:
# Work on an independent (deep) copy so that edits to `cleaned` leave the
# raw `crashes` frame untouched.
cleaned = crashes.copy(deep=True)

In [5]:
# Tally every TRAFFIC_CONTROL_DEVICE value; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'TRAFFIC_CONTROL_DEVICE'].value_counts(dropna=False)

NO CONTROLS                 154056
TRAFFIC SIGNAL               99534
STOP SIGN/FLASHER            35976
NaN                           7259
LANE USE MARKING               916
YIELD                          543
OTHER REG. SIGN                398
RAILROAD CROSSING GATE         230
PEDESTRIAN CROSSING SIGN       177
POLICE/FLAGMAN                 144
SCHOOL ZONE                    112
DELINEATORS                    110
FLASHING CONTROL SIGNAL         98
OTHER RAILROAD CROSSING         88
RR CROSSING SIGN                25
NO PASSING                      22
BICYCLE CROSSING SIGN           11
Name: TRAFFIC_CONTROL_DEVICE, dtype: int64

In [6]:
# Collapse TRAFFIC_CONTROL_DEVICE into a binary flag:
#   0 -> 'NO CONTROLS', 1 -> any other recorded value.
# Missing values are skipped by na_action='ignore' and therefore stay NaN.
# The mapping reads from the untouched `crashes` frame rather than `cleaned`,
# so re-running this cell gives the same result. Mapping the already-encoded
# 0/1 values a second time would turn every non-missing row into 1.
cleaned['TRAFFIC_CONTROL_DEVICE'] = crashes['TRAFFIC_CONTROL_DEVICE'].map(
    lambda device: 0 if device == 'NO CONTROLS' else 1,
    na_action='ignore',
)

In [7]:
# Re-tally the column after the recode; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'TRAFFIC_CONTROL_DEVICE'].value_counts(dropna=False)

0.0    154056
1.0    138752
NaN      7259
Name: TRAFFIC_CONTROL_DEVICE, dtype: int64

In [8]:
# Tally every DEVICE_CONDITION value; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'DEVICE_CONDITION'].value_counts(dropna=False)

NO CONTROLS                 157056
FUNCTIONING PROPERLY        126170
NaN                          13741
FUNCTIONING IMPROPERLY        1863
NOT FUNCTIONING               1026
WORN REFLECTIVE MATERIAL       165
MISSING                         46
Name: DEVICE_CONDITION, dtype: int64

In [9]:
# Collapse DEVICE_CONDITION into a binary flag:
#   0 -> 'FUNCTIONING IMPROPERLY', 'NOT FUNCTIONING' or 'MISSING'
#   1 -> any other recorded value
# Missing values are skipped by na_action='ignore' and therefore stay NaN.
# The mapping reads from the untouched `crashes` frame rather than `cleaned`,
# so re-running this cell gives the same result. Mapping the already-encoded
# 0/1 values a second time would turn every non-missing row into 1.
cleaned['DEVICE_CONDITION'] = crashes['DEVICE_CONDITION'].map(
    lambda condition: 0 if condition in ('FUNCTIONING IMPROPERLY', 'NOT FUNCTIONING', 'MISSING') else 1,
    na_action='ignore',
)

In [10]:
# Re-tally the column after the recode; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'DEVICE_CONDITION'].value_counts(dropna=False)

1.0    283391
NaN     13741
0.0      2935
Name: DEVICE_CONDITION, dtype: int64

In [11]:
# Tally every WEATHER_CONDITION value; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'WEATHER_CONDITION'].value_counts(dropna=False)

CLEAR                       240146
RAIN                         28882
SNOW                         12746
CLOUDY/OVERCAST              10098
NaN                           6633
FOG/SMOKE/HAZE                 532
SLEET/HAIL                     477
FREEZING RAIN/DRIZZLE          374
BLOWING SNOW                   104
SEVERE CROSS WIND GATE          74
BLOWING SAND, SOIL, DIRT         1
Name: WEATHER_CONDITION, dtype: int64

In [12]:
# Collapse WEATHER_CONDITION into a binary flag:
#   0 -> 'CLEAR' or 'CLOUDY/OVERCAST'
#   1 -> any other recorded value
# Missing values are skipped by na_action='ignore' and therefore stay NaN.
# The mapping reads from the untouched `crashes` frame rather than `cleaned`,
# so re-running this cell gives the same result. Mapping the already-encoded
# 0/1 values a second time would turn every non-missing row into 1.
cleaned['WEATHER_CONDITION'] = crashes['WEATHER_CONDITION'].map(
    lambda weather: 0 if weather in ('CLEAR', 'CLOUDY/OVERCAST') else 1,
    na_action='ignore',
)

In [13]:
# Re-tally the column after the recode; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'WEATHER_CONDITION'].value_counts(dropna=False)

0.0    250244
1.0     43190
NaN      6633
Name: WEATHER_CONDITION, dtype: int64

In [14]:
# Tally every LIGHTING_CONDITION value; dropna=False keeps NaN as its own row.
cleaned.loc[:, 'LIGHTING_CONDITION'].value_counts(dropna=False)

DAYLIGHT                  201106
DARKNESS, LIGHTED ROAD     67523
DARKNESS                   13401
DUSK                        9404
DAWN                        5078
NaN                         3555
Name: LIGHTING_CONDITION, dtype: int64

In [16]:
# Serialize the `cleaned` frame to disk with the standard-library pickler.
# NOTE(review): the execution counter jumps from In [14] to In [16] right
# before this cell — confirm that no transformation from a removed cell is
# expected to be present in the saved file.
with open('data/cleaned/crashes_cleaned.pkl', mode='wb') as pickle_file:
    pickle.dump(cleaned, pickle_file)