### Pre-Processing

In [1]:
### Importing packages for data analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', None)

In [2]:
# Columns removed from the raw file immediately after loading.
columns_to_drop = [
    'HOSPITAL',
    'EMS_AGENCY',
    'BAC_RESULT VALUE',
    'CELL_PHONE_USE',
    'DATE_POLICE_NOTIFIED',
    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'LATITUDE',
    'LONGITUDE',
    'LOCATION',
    'CMRC_VEH_I',
    'MAKE',
    'MODEL',
    'LIC_PLATE_STATE',
    'VEHICLE_USE',
    'TOWED_I',
    'FIRE_I',
    'EXCEED_SPEED_LIMIT_I',
    'TOWED_BY',
    'TOWED_TO',
    'AREA_00_I',
    'AREA_01_I',
    'AREA_02_I',
    'AREA_03_I',
    'AREA_04_I',
    'AREA_05_I',
    'AREA_06_I',
    'AREA_07_I',
    'AREA_08_I',
    'AREA_09_I',
    'AREA_10_I',
    'AREA_11_I',
    'AREA_12_I',
    'AREA_99_I',
    'CMV_ID',
    'USDOT_NO',
    'CCMC_NO',
    'ILCC_NO',
    'COMMERCIAL_SRC',
    'GVWR',
    'CARRIER_NAME',
    'HAZMAT_PRESENT_I',
    'HAZMAT_REPORT_I',
    'MCS_VIO_CAUSE_CRASH_I',
    'TRAILER1_WIDTH',
    'TRAILER1_LENGTH',
    'TOTAL_VEHICLE_LENGTH',
    'AXLE_CNT',
    'VEHICLE_CONFIG',
    'CARGO_BODY_TYPE',
    'LOAD_TYPE',
    'HAZMAT_OUT_OF_SERVICE_I',
    'MCS_OUT_OF_SERVICE_I',
    'HAZMAT_CLASS',
    'INJURIES_TOTAL',
    'INJURIES_FATAL',
    'INJURIES_INCAPACITATING',
    'INJURIES_NON_INCAPACITATING',
    'INJURIES_REPORTED_NOT_EVIDENT',
    'INJURIES_NO_INDICATION',
    'INJURIES_UNKNOWN',
    'CARRIER_STATE',
    'CARRIER_CITY',
    'MCS_REPORT_I',
    'HAZMAT_VIO_CAUSE_CRASH_I',
]

# Load the raw data and discard the unused columns in one step.
# drop() raises KeyError if any listed column is absent from the file.
df = pd.read_csv('bikers.csv', low_memory=False).drop(columns=columns_to_drop)

### Encoding Categorical Variables

In [14]:
### Binary encodings: (source column, new column, [(source label, code), ...]).
### Rows whose source label is not listed are never assigned a code.
BINARY_ENCODINGS = [
    # INJURY_CLASSIFICATION (target) -> INJURY_BAD
    ('INJURY_CLASSIFICATION', 'INJURY_BAD', [
        ('INCAPACITATING INJURY', 1),
        ('FATAL', 1),
        ('REPORTED, NOT EVIDENT', 0),
        ('NO INDICATION OF INJURY', 0),
        ('NONINCAPACITATING INJURY', 0),
    ]),
    # SAFETY_EQUIPMENT (feature) -> HELMET_USED
    ('SAFETY_EQUIPMENT', 'HELMET_USED', [
        ('HELMET USED', 1),
        ('BICYCLE HELMET (PEDACYCLIST INVOLVED ONLY)', 1),
        ('HELMET NOT USED', 0),
        ('NONE PRESENT', 0),
    ]),
    # PEDPEDAL_VISIBILITY -> PROTECTIVE_CLOTHING
    ('PEDPEDAL_VISIBILITY', 'PROTECTIVE_CLOTHING', [
        ('REFLECTIVE MATERIAL', 1),
        ('OTHER LIGHT SOURCE USED', 1),
        ('CONTRASTING CLOTHING', 1),
        ('NO CONTRASTING CLOTHING', 0),
    ]),
]

# Apply every (label -> code) rule with a masked assignment, in table order.
for source_column, encoded_column, label_codes in BINARY_ENCODINGS:
    for label, code in label_codes:
        df.loc[df[source_column] == label, encoded_column] = code

### Encoding POSTED_SPEED_LIMIT as SPEED_RATING: ordinal label relative to a threshold of 30

def speed_rating(row):
    """Label a record's posted speed limit relative to 30.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'POSTED_SPEED_LIMIT'.

    Returns
    -------
    str
        'Below Average Speed', 'Average Speed' or 'Above Average Speed'
        depending on how the limit compares to 30; 'Not Available' when
        none of the comparisons holds (e.g. the value is NaN).
    """
    limit = row['POSTED_SPEED_LIMIT']
    if limit == 30:
        return 'Average Speed'
    if limit < 30:
        return 'Below Average Speed'
    if limit > 30:
        return 'Above Average Speed'
    # Every comparison was False, which is what NaN produces.
    return 'Not Available'

# Row-wise: label every record with its speed rating.
df['SPEED_RATING'] = df.apply(speed_rating, axis=1)

### Encoding LIGHTING_CONDITION as binary values (DAYTIME vs. NIGHTTIME)

# Start with every record flagged as daytime, then clear the flag wherever the
# lighting condition is anything other than 'DAYLIGHT' (missing values compare
# unequal to 'DAYLIGHT', so they are flagged 0 as well).
df['DAYTIME'] = 1
# The column label is essential: `df.loc[mask] = 0` without it would overwrite
# EVERY column of the matching rows with 0, not just DAYTIME.
df.loc[df['LIGHTING_CONDITION'] != 'DAYLIGHT', 'DAYTIME'] = 0

### Encoding WEATHER_CONDITION as categorical variables

def weather_rating(row):
    """Collapse a record's WEATHER_CONDITION into a three-level category.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'WEATHER_CONDITION'.

    Returns
    -------
    str
        'Clear Weather'   for 'CLEAR' or 'CLOUDY/OVERCAST';
        'Not Available'   for 'UNKNOWN' or a missing value (NaN / None);
        'Adverse Weather' for every other recorded condition.
    """
    weather = row['WEATHER_CONDITION']
    # A missing observation is not evidence of bad weather: NaN compares
    # unequal to 'UNKNOWN', so it must be caught explicitly or it would be
    # counted as adverse.
    if pd.isna(weather) or weather == 'UNKNOWN':
        return 'Not Available'
    if weather in ('CLEAR', 'CLOUDY/OVERCAST'):
        return 'Clear Weather'
    return 'Adverse Weather'

# Row-wise: assign each record its weather category.
df['WEATHER_CAT'] = df.apply(weather_rating, axis=1)

### Encoding PEDPEDAL_LOCATION as Bike Lane binary

def bike_lane(row):
    """Flag whether a record's PEDPEDAL_LOCATION is a bikeway or bike lane.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'PEDPEDAL_LOCATION'.

    Returns
    -------
    int
        1 when the location is 'BIKEWAY' or 'BIKE LANE', otherwise 0
        (including when the value is missing).
    """
    location = row['PEDPEDAL_LOCATION']
    return 1 if location in ('BIKEWAY', 'BIKE LANE') else 0

# Row-wise: set the bike-lane indicator for every record.
df['BIKE_LANE'] = df.apply(bike_lane, axis=1)



### Dropping the original source columns now that their encoded replacements have been created

