### Pre-Processing

In [1]:
### Importing packages for data analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Lift the row cap on displayed frames/series (None means "no limit").
pd.options.display.max_rows = None

In [2]:
# Columns removed right after loading; the order is kept as originally listed.
columns_to_drop = [
    'HOSPITAL', 'EMS_AGENCY', 'BAC_RESULT VALUE',
    'CELL_PHONE_USE', 'DATE_POLICE_NOTIFIED', 'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I', 'LATITUDE', 'LONGITUDE', 'LOCATION',
    'CMRC_VEH_I', 'MAKE', 'MODEL', 'LIC_PLATE_STATE',
    'VEHICLE_USE', 'TOWED_I', 'FIRE_I',
    'EXCEED_SPEED_LIMIT_I', 'TOWED_BY', 'TOWED_TO',
    'AREA_00_I', 'AREA_01_I', 'AREA_02_I', 'AREA_03_I', 'AREA_04_I',
    'AREA_05_I', 'AREA_06_I', 'AREA_07_I', 'AREA_08_I', 'AREA_09_I',
    'AREA_10_I', 'AREA_11_I', 'AREA_12_I', 'AREA_99_I',
    'CMV_ID', 'USDOT_NO', 'CCMC_NO', 'ILCC_NO', 'COMMERCIAL_SRC',
    'GVWR', 'CARRIER_NAME',
    'HAZMAT_PRESENT_I', 'HAZMAT_REPORT_I', 'MCS_VIO_CAUSE_CRASH_I',
    'TRAILER1_WIDTH', 'TRAILER1_LENGTH', 'TOTAL_VEHICLE_LENGTH',
    'AXLE_CNT', 'VEHICLE_CONFIG', 'CARGO_BODY_TYPE', 'LOAD_TYPE',
    'HAZMAT_OUT_OF_SERVICE_I', 'MCS_OUT_OF_SERVICE_I', 'HAZMAT_CLASS',
    'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
    'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
    'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN',
    'CARRIER_STATE', 'CARRIER_CITY', 'MCS_REPORT_I',
    'HAZMAT_VIO_CAUSE_CRASH_I',
    'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
]

# Read the CSV, then discard the columns listed above.
df = pd.read_csv('bikers.csv', low_memory=False)
df = df.drop(columns_to_drop, axis='columns')

### Encoding Categorical Variables

In [3]:
### Encoding INJURY CLASSIFICATION (target) as INJURY_BAD binary
# 1 = incapacitating or fatal, 0 = the three lesser classifications.
# Rows with any other (or missing) classification are left untouched.
df.loc[
    df['INJURY_CLASSIFICATION'].isin(['INCAPACITATING INJURY', 'FATAL']),
    'INJURY_BAD',
] = 1
df.loc[
    df['INJURY_CLASSIFICATION'].isin(
        ['REPORTED, NOT EVIDENT', 'NO INDICATION OF INJURY', 'NONINCAPACITATING INJURY']
    ),
    'INJURY_BAD',
] = 0


### Encoding SAFETY EQUIPMENT (feature) as HELMET USED binary
# 1 = a helmet was used, 0 = no helmet / no equipment present.
# Rows with any other (or missing) SAFETY_EQUIPMENT value are left untouched.
df.loc[
    df['SAFETY_EQUIPMENT'].isin(
        ['HELMET USED', 'BICYCLE HELMET (PEDACYCLIST INVOLVED ONLY)']
    ),
    'HELMET_USED',
] = 1
df.loc[
    df['SAFETY_EQUIPMENT'].isin(['HELMET NOT USED', 'NONE PRESENT']),
    'HELMET_USED',
] = 0

### Encoding PEDPEDAL VISIBILITY as PROTECTIVE CLOTHING binary
# 1 = reflective material, another light source, or contrasting clothing;
# 0 = no contrasting clothing. Other (or missing) values are left untouched.
df.loc[
    df['PEDPEDAL_VISIBILITY'].isin(
        ['REFLECTIVE MATERIAL', 'OTHER LIGHT SOURCE USED', 'CONTRASTING CLOTHING']
    ),
    'PROTECTIVE_CLOTHING',
] = 1
df.loc[
    df['PEDPEDAL_VISIBILITY'].isin(['NO CONTRASTING CLOTHING']),
    'PROTECTIVE_CLOTHING',
] = 0

### Encoding POSTED_SPEED_LIMIT as SPEED_RATING w/ ordinal values with 30 as the threshold
def speed_rating(row):
    """Label a row's posted speed limit relative to the threshold of 30.

    Reads row['POSTED_SPEED_LIMIT'] and returns 'Below Average Speed',
    'Average Speed' or 'Above Average Speed'. A value that compares as
    neither below, equal to, nor above 30 (e.g. NaN) gives 'Not Available'.
    """
    limit = row['POSTED_SPEED_LIMIT']
    if limit == 30:
        return 'Average Speed'
    if limit < 30:
        return 'Below Average Speed'
    if limit > 30:
        return 'Above Average Speed'
    # Reached only when every comparison above is False.
    return 'Not Available'

# Row-wise pass: speed_rating is called once per row of df.
df['SPEED_RATING'] = df.apply(speed_rating, axis='columns')

### Encoding LIGHTING_CONDITION as binary values (DAYTIME vs. NIGHTTIME)
# 1 only where the condition is exactly 'DAYLIGHT'; every other value
# (including a missing one) becomes 0.
df['DAYTIME'] = df['LIGHTING_CONDITION'].eq('DAYLIGHT').astype('int64')

### Encoding WEATHER_CONDITION as categorical variables
def weather_rating(row):
    """Bucket a row's WEATHER_CONDITION into a coarse weather category.

    Reads row['WEATHER_CONDITION'] and returns:
      - 'Clear Weather'   for 'CLEAR' or 'CLOUDY/OVERCAST'
      - 'Not Available'   for 'UNKNOWN' or a missing / non-string value
      - 'Adverse Weather' for every other reported condition
    """
    condition = row['WEATHER_CONDITION']
    # A missing value (NaN/None) is not equal to 'UNKNOWN', so without this
    # guard it would fall through and be counted as adverse weather.
    if not isinstance(condition, str) or condition == 'UNKNOWN':
        return 'Not Available'
    if condition in ('CLEAR', 'CLOUDY/OVERCAST'):
        return 'Clear Weather'
    return 'Adverse Weather'

# Row-wise pass: weather_rating is called once per row of df.
df['WEATHER_CAT'] = df.apply(weather_rating, axis='columns')

### Encoding PEDPEDAL_LOCATION as Bike Lane binary
def bike_lane(row):
    """Return 1 when row['PEDPEDAL_LOCATION'] is 'BIKEWAY' or 'BIKE LANE', else 0."""
    location = row['PEDPEDAL_LOCATION']
    return 1 if location in ('BIKEWAY', 'BIKE LANE') else 0

# Row-wise pass: bike_lane is called once per row of df.
df['BIKE_LANE'] = df.apply(bike_lane, axis='columns')

### Encoding Primary and Secondary Cause Classifier

# Each list holds the contributory-cause labels that make up one category.
cause_obstruction = [
    'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST',
    'ROAD CONSTRUCTION/MAINTENANCE',
    'RELATED TO BUS STOP',
    'OBSTRUCTED CROSSWALKS',
    'ANIMAL',
]
cause_not_applicable = [
    'UNABLE TO DETERMINE',
    'NOT APPLICABLE',
]
cause_reckless = [
    'EQUIPMENT - VEHICLE CONDITION',
    'PHYSICAL CONDITION OF DRIVER',
    'IMPROPER BACKING',
    'TEXTING',
    'DISTRACTION - FROM INSIDE VEHICLE',
    'IMPROPER OVERTAKING/PASSING',
    'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)',
    'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)',
    'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
    'CELL PHONE USE OTHER THAN TEXTING',
    'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)',
]
cause_skill = [
    'IMPROPER LANE USAGE',
    'DISREGARDING TRAFFIC SIGNALS',
    'DISREGARDING ROAD MARKINGS',
    'FAILING TO YIELD RIGHT-OF-WAY',
    'BICYCLE ADVANCING LEGALLY ON RED LIGHT',
    'IMPROPER TURNING/NO SIGNAL',
    'TURNING RIGHT ON RED',
    'FOLLOWING TOO CLOSELY',
    'DISREGARDING STOP SIGN',
    'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE',
    'FAILING TO REDUCE SPEED TO AVOID CRASH',
    'DISREGARDING OTHER TRAFFIC SIGNS',
    'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT',
    'DISREGARDING YIELD SIGN',
]
cause_speed = [
    'EXCEEDING SAFE SPEED FOR CONDITIONS',
    'EXCEEDING AUTHORIZED SPEED LIMIT',
]
cause_visibility = [
    'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
    'ROAD ENGINEERING/SURFACE/MARKING DEFECTS',
    'WEATHER',
    'DISTRACTION - FROM OUTSIDE VEHICLE',
]

def obstruction(row):
    """Return 1 if the primary or secondary contributory cause is in cause_obstruction, else 0."""
    cause_columns = ('PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE')
    # Lazy: the secondary cause is only looked at when the primary does not match.
    return int(any(row[column] in cause_obstruction for column in cause_columns))
def reckless(row):
    """Return 1 if the primary or secondary contributory cause is in cause_reckless, else 0."""
    cause_columns = ('PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE')
    # Lazy: the secondary cause is only looked at when the primary does not match.
    return int(any(row[column] in cause_reckless for column in cause_columns))
def skill(row):
    """Return 1 if the primary or secondary contributory cause is in cause_skill, else 0."""
    cause_columns = ('PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE')
    # Lazy: the secondary cause is only looked at when the primary does not match.
    return int(any(row[column] in cause_skill for column in cause_columns))
def speed(row):
    """Return 1 if the primary or secondary contributory cause is in cause_speed, else 0."""
    cause_columns = ('PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE')
    # Lazy: the secondary cause is only looked at when the primary does not match.
    return int(any(row[column] in cause_speed for column in cause_columns))
def visibility(row):
    """Return 1 if the primary or secondary contributory cause is in cause_visibility, else 0."""
    cause_columns = ('PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE')
    # Lazy: the secondary cause is only looked at when the primary does not match.
    return int(any(row[column] in cause_visibility for column in cause_columns))
def unknown(row):
    """Return 1 if the primary or secondary contributory cause is in cause_not_applicable, else 0.

    Note: the name `unknown` is defined again further down for the MANEUVER
    bins, so this version is only bound until that later definition runs.
    """
    cause_columns = ('PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE')
    # Lazy: the secondary cause is only looked at when the primary does not match.
    return int(any(row[column] in cause_not_applicable for column in cause_columns))

def _cause_flag(frame, cause_list):
    """Return a 0/1 int64 Series: 1 where the primary or secondary cause is in cause_list."""
    in_primary = frame['PRIM_CONTRIBUTORY_CAUSE'].isin(cause_list)
    in_secondary = frame['SEC_CONTRIBUTORY_CAUSE'].isin(cause_list)
    return (in_primary | in_secondary).astype('int64')

# One 0/1 column per cause category. Membership is tested column-wise with
# Series.isin instead of calling a Python function for every row; the values
# are the same as the row-wise helpers (obstruction, reckless, ...) produce.
# Working directly from the category lists also means CAUSE_UNKNOWN does not
# depend on the helper name `unknown`, which is redefined later for MANEUVER.
df['CAUSE_OBSTRUCTION'] = _cause_flag(df, cause_obstruction)
df['CAUSE_RECKLESS'] = _cause_flag(df, cause_reckless)
df['CAUSE_SKILL'] = _cause_flag(df, cause_skill)
df['CAUSE_SPEED'] = _cause_flag(df, cause_speed)
df['CAUSE_VISIBILITY'] = _cause_flag(df, cause_visibility)
df['CAUSE_UNKNOWN'] = _cause_flag(df, cause_not_applicable)


### Binning the MANEUVER column

# Each list holds the MANEUVER labels that fall into one bin.
man_straight = [
    'STRAIGHT AHEAD',
]
man_turn = [
    'TURNING LEFT',
    'TURNING RIGHT',
    'U-TURN',
    'SLOW/STOP - RIGHT TURN',
    'SLOW/STOP - LEFT TURN',
    'MERGING',
    'AVOIDING VEHICLES/OBJECTS',
    'TURNING ON RED',
    'NEGOTIATING A CURVE',
]
man_traffic = [
    'PASSING/OVERTAKING',
    'ENTER FROM DRIVE/ALLEY',
    'ENTERING TRAFFIC LANE FROM PARKING',
    'LEAVING TRAFFIC LANE TO PARK',
    'CHANGING LANES',
    'STARTING IN TRAFFIC',
    'BACKING',
    'SLOW/STOP - LOAD/UNLOAD',
    'DRIVING WRONG WAY',
    'SKIDDING/CONTROL LOSS',
]
man_stationary = [
    'PARKED',
    'SLOW/STOP IN TRAFFIC',
    'PARKED IN TRAFFIC LANE',
]
man_unknown = [
    'UNKNOWN/NA',
    'OTHER',
    'DRIVERLESS',
    'DISABLED',
    'DIVERGING',
]

def straight(row):
    """Return 1 when row['MANEUVER'] is listed in man_straight, else 0."""
    return 1 if row['MANEUVER'] in man_straight else 0

def turn(row):
    """Return 1 when row['MANEUVER'] is listed in man_turn, else 0."""
    return 1 if row['MANEUVER'] in man_turn else 0

def traffic(row):
    """Return 1 when row['MANEUVER'] is listed in man_traffic, else 0."""
    return 1 if row['MANEUVER'] in man_traffic else 0

def stationary(row):
    """Return 1 when row['MANEUVER'] is listed in man_stationary, else 0."""
    return 1 if row['MANEUVER'] in man_stationary else 0

def unknown(row):
    """Return 1 when row['MANEUVER'] is listed in man_unknown, else 0.

    Note: this rebinds the name `unknown`, which earlier in the notebook
    refers to the contributory-cause helper of the same name.
    """
    return 1 if row['MANEUVER'] in man_unknown else 0

def _maneuver_flag(frame, maneuvers):
    """Return a 0/1 int64 Series: 1 where MANEUVER is one of `maneuvers`."""
    return frame['MANEUVER'].isin(maneuvers).astype('int64')

# One 0/1 column per maneuver bin. Membership is tested column-wise with
# Series.isin instead of calling a Python function for every row; the values
# are the same as the row-wise helpers (straight, turn, ...) produce.
df['MAN_STRAIGHT'] = _maneuver_flag(df, man_straight)
df['MAN_TURN'] = _maneuver_flag(df, man_turn)
df['MAN_TRAFFIC'] = _maneuver_flag(df, man_traffic)
df['MAN_STATIONARY'] = _maneuver_flag(df, man_stationary)
df['MAN_UNKNOWN'] = _maneuver_flag(df, man_unknown)

### Dropping the source columns now that their encoded replacements exist

# 'Unnamed: 0' is dropped here too (presumably a saved index column; confirm against the CSV).
replaced_columns = [
    'INJURY_CLASSIFICATION',
    'SAFETY_EQUIPMENT',
    'PEDPEDAL_VISIBILITY',
    'POSTED_SPEED_LIMIT',
    'LIGHTING_CONDITION',
    'WEATHER_CONDITION',
    'PEDPEDAL_LOCATION',
    'Unnamed: 0',
    'MANEUVER',
]
df = df.drop(replaced_columns, axis='columns')