In [None]:
# Traffic prediction ML project
# Imports
import datetime

import numpy as np
import pandas as pd
import sklearn as sc


In [None]:
'''
Feature engineering notes:
    - Time and date
    - Combine Vehicle continuing direction and vehicle going direction. Oncoming/incoming or same direction
    - The weather and surface cond columns may help each other...
    - Modify Light so that we replace 'Dark -- Unknown Lighting' with 'Dark'
    
'''

# Data Exploration

In [None]:
# Load the raw crash records into `data`. The strings 'Unknown', 'None' and
# 'UNKNOWN' are read as missing values (NaN) so later cells can impute or drop them.
data = pd.read_csv("traffic.csv", na_values=['Unknown', 'None', 'UNKNOWN'])

In [None]:
# Exploration notes. Only the last expression of a cell is rendered by Jupyter,
# so the value_counts() calls below were inspected one at a time.
# data.shape - 172,105 rows, 43 columns


data['Collision Type'].value_counts()
# 12% "other", 717 unknown. We may want to use similar-case imputation.

data['Non-Motorist Substance Abuse'].value_counts()
# Only ~200 data points actually recorded substance abuse, so this column should be dropped.

data['Driver Distracted By'].value_counts()
# One fifth of the dataset is missing and over half of the drivers were not distracted.
# Open questions: is distracted driving correlated with severity? Could we train on the
# subset that has this field to reveal more interesting patterns? Alternatives: impute
# (e.g. label half of the missing rows as distracted and half as not), or drop the column.

data['Drivers License State'].value_counts()
# Is there a correlation between driving out of state and being at fault? Are drivers
# closer to home truly more likely to get in an accident? Outliers - FM, NS, MP, SK, etc.

data['Vehicle Movement'].value_counts()
# We need to get creative with our encoding, e.g. moving quickly, moving at a slower rate,
# or maneuvering (parking, passing, turning, etc.).

# This is our class label. We need to one-hot encode (OHE) the different categories.
data['Injury Severity'].value_counts()
# The majority of the dataset isn't injured, fortunately.
# We will keep just these 5 classes:
    # No injury
    # Possible
    # Minor
    # Serious
    # Fatal :(

# "Driver at Fault" was dropped because we don't think that the insurance/civil claim
# really determines how bad the accident is. An accident is an accident.

# data['Off-Road Description'].unique() # Determined that the values in this column are not important
# data['Driverless Vehicle'].unique() # No driverless vehicles, so the column is unneeded
# data['Parked Vehicle'].value_counts() # 2,600 parked vehicles were in an accident. Might be interesting to see if there's a correlation between that and the severity of the accident.
# data['Related Non-Motorist'].value_counts() - Only about 5000 populated fields; likely too few to make a discernible impact on the dataset.

# TODO: look into what the cross street and route type mean.
# DROP Report Number, Local Case Number, Off-Road Description, Road Name, Cross-Street Name, Driverless Vehicle, Municipality (many missing values), Related Non-Motorist, Non-Motorist Substance Abuse, Person ID, Circumstance (??), Vehicle ID, Vehicle Make, Vehicle Model, Equipment Problems, Location (as it combines Lat & Long)

# Data Cleaning
## Part 1: Removing unneeded columns

In [None]:
# Derive the "Vehicle Dir Changed" and "Vehicle Multiple Impacts" flags before
# their source columns are dropped. A row is 'No' only when the two source
# values compare equal; every other row is 'Yes'.
# NOTE(review): missing values never compare equal, so rows where either side
# is NaN (including both) are labelled 'Yes' -- confirm that this is intended.
direction_differs = data['Vehicle Continuing Dir'].ne(data['Vehicle Going Dir'])
impact_differs = data['Vehicle First Impact Location'].ne(data['Vehicle Second Impact Location'])
data['Vehicle Dir Changed'] = np.where(direction_differs, 'Yes', 'No')
data['Vehicle Multiple Impacts'] = np.where(impact_differs, 'Yes', 'No')

# Identifiers, free-text fields, sparsely populated columns, and the raw
# columns that were folded into the two flags above.
columns_to_drop = [
    'Agency Name',
    'Report Number',
    'Local Case Number',
    'Off-Road Description',
    'Road Name',
    'Cross-Street Name',
    'Driverless Vehicle',
    'Municipality',
    'Non-Motorist Substance Abuse',
    'Person ID',
    'Circumstance',
    'Vehicle ID',
    'Vehicle Make',
    'Vehicle Model',
    'Equipment Problems',
    'Location',
    'Vehicle Continuing Dir',
    'Vehicle Going Dir',
    'Route Type',
    'Vehicle First Impact Location',
    'Vehicle Second Impact Location',
    'Related Non-Motorist',
    'Drivers License State',
    'Driver at Fault',
]
data = data.drop(columns=columns_to_drop)

## Part 2: Feature engineering
### Dealing with the Date/Time column
We want to extract patterns from the date and time columns.
It would be difficult to train on the raw column as a whole, but splitting it into
parts (such as the day of the week and the hour of the day) exposes useful patterns.

In [None]:
# --- Light: fill gaps from the weather, then from the hour of the crash ------
# Rain and fog are treated as dark conditions when 'Light' was not recorded.
low_visibility = data['Weather'].isin(['RAINING', 'FOGGY'])
data.loc[low_visibility & data['Light'].isna(), 'Light'] = 'DARK'

# Parse the timestamp once for the whole column instead of splitting strings
# row by row. pd.to_datetime reads month-first dates by default and honours an
# AM/PM marker if the strings carry one.
crash_time = pd.to_datetime(data['Crash Date/Time'])
crash_hour = crash_time.dt.hour

# Hour-of-day buckets (conditions are checked in order):
#   0-4 and 23 -> DARK, 5-8 -> DAWN, 9-15 -> LIGHT, 16-22 -> DUSK
light_by_hour = pd.Series(
    np.select(
        [
            (crash_hour < 5) | (crash_hour > 22),
            crash_hour <= 8,
            crash_hour <= 15,
        ],
        ['DARK', 'DAWN', 'LIGHT'],
        default='DUSK',
    ),
    index=data.index,
).where(crash_hour.notna())  # no timestamp -> leave 'Light' missing

# Assign the whole column back: writing through data['Light'][i] is chained
# assignment and is not guaranteed to modify `data`.
data['Light'] = data['Light'].fillna(light_by_hour)

# --- Day of week: one indicator column per weekday ---------------------------
# get_dummies ignores its `columns` argument for a Series, so the day names are
# attached through a categorical instead. This also guarantees that all seven
# columns exist, in calendar order, with string names.
WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday = crash_time.dt.day_name().astype(pd.CategoricalDtype(categories=WEEKDAY_NAMES))
weekday_dummies = pd.get_dummies(weekday)
weekday_dummies.columns = WEEKDAY_NAMES
data = pd.concat([data.drop(columns=['Crash Date/Time']), weekday_dummies], axis=1)

# --- Surface Condition: infer missing values from the weather ----------------
surface_missing = data['Surface Condition'].isna()
data.loc[surface_missing & (data['Weather'] == 'CLEAR'), 'Surface Condition'] = 'DRY'
data.loc[surface_missing & (data['Weather'] == 'RAINING'), 'Surface Condition'] = 'WET'

# 'Weather' has now served its purpose for imputing 'Light' and 'Surface Condition'.
data = data.drop(columns=['Weather'])


### Map ACRS Report Type to numerical codes

In [None]:
# Replace the text labels with integer codes. Any value that is not a key of
# the lookup table becomes NaN.
report_type_codes = {'Property Damage Crash': 0, 'Injury Crash': 1, 'Fatal Crash': 2}
yes_no_codes = {'No': 0, 'Yes': 1}

data['ACRS Report Type'] = data['ACRS Report Type'].map(report_type_codes)
for flag_column in ('Vehicle Dir Changed', 'Vehicle Multiple Impacts'):
    data[flag_column] = data[flag_column].map(yes_no_codes)

# Show the class balance of the encoded report type.
data['ACRS Report Type'].value_counts()

### Reducing variation in the features for the Substance Abuse column
#### Group into the following classes
    * Alcohol
    * Illicit Drug
    * Medication
    * None Detected

In [None]:
# We are assuming that since the officer did not record any alcohol or drugs,
# the driver was not under the influence: missing values become 'NONE DETECTED'.

# Collapse the detailed labels into four groups:
# ALCOHOL, ILLICIT DRUG, MEDICATION, NONE DETECTED.
SUBSTANCE_GROUPS = {
    'ALCOHOL CONTRIBUTED': 'ALCOHOL',
    'ALCOHOL PRESENT': 'ALCOHOL',
    'ILLEGAL DRUG PRESENT': 'ILLICIT DRUG',
    'ILLEGAL DRUG CONTRIBUTED': 'ILLICIT DRUG',
    'COMBINATION PRESENT': 'ILLICIT DRUG',
    'COMBINATION CONTRIBUTED': 'ILLICIT DRUG',
    'COMBINED SUBSTANCE PRESENT': 'ILLICIT DRUG',
    'MEDICATION PRESENT': 'MEDICATION',
    'MEDICATION CONTRIBUTED': 'MEDICATION',
    'OTHER': 'NONE DETECTED',
}

# Assign the result back to the column. Calling fillna/replace with
# inplace=True on data[...] operates on a column selection and is not
# guaranteed to update `data` itself.
data['Driver Substance Abuse'] = (
    data['Driver Substance Abuse']
    .fillna('NONE DETECTED')
    .replace(SUBSTANCE_GROUPS)
)

### Reducing variation on Traffic Control

In [None]:
# Collapse the detailed traffic-control labels into 'SIGN' or 'TRAFFIC SIGNAL';
# labels not listed here are left unchanged.
TRAFFIC_CONTROL_GROUPS = {
    'STOP SIGN': 'SIGN',
    'FLASHING TRAFFIC SIGNAL': 'SIGN',
    'YIELD SIGN': 'SIGN',
    'OTHER': 'SIGN',
    'PERSON': 'TRAFFIC SIGNAL',
    'WARNING SIGN': 'SIGN',
    'RAILWAY CROSSING DEVICE': 'SIGN',
    'SCHOOL ZONE SIGN DEVICE': 'SIGN',
}

# Assign the result back to the column. replace(..., inplace=True) on data[...]
# operates on a column selection and is not guaranteed to update `data` itself.
data['Traffic Control'] = data['Traffic Control'].replace(TRAFFIC_CONTROL_GROUPS)

# Only the last expression of the cell is rendered.
data['Traffic Control'].value_counts()
data['Cross-Street Type'].value_counts()
# data['Traffic Control'].isnull().value_counts()




### Missing values on Collision Type

In [None]:
# Missing collision types are filed under 'OTHER'; the detailed labels are then
# collapsed into SIDESWIPE, TURNING and HEAD ON. Labels not listed here are
# left unchanged.
COLLISION_GROUPS = {
    'SAME DIRECTION SIDESWIPE': 'SIDESWIPE',
    'OPPOSITE DIRECTION SIDESWIPE': 'SIDESWIPE',
    'SAME DIRECTION RIGHT TURN': 'TURNING',
    'SAME DIRECTION LEFT TURN': 'TURNING',
    'ANGLE MEETS LEFT TURN': 'TURNING',
    'ANGLE MEETS RIGHT TURN': 'TURNING',
    'SAME DIR REND LEFT TURN': 'TURNING',
    'SAME DIR REND RIGHT TURN': 'TURNING',
    'SAME DIR BOTH LEFT TURN': 'TURNING',
    'OPPOSITE DIR BOTH LEFT TURN': 'HEAD ON',
    'HEAD ON LEFT TURN': 'HEAD ON',
    'ANGLE MEETS LEFT HEAD ON': 'HEAD ON',
}

# Assign the result back to the column. fillna/replace with inplace=True on
# data[...] operates on a column selection and is not guaranteed to update
# `data` itself.
data['Collision Type'] = (
    data['Collision Type']
    .fillna('OTHER')
    .replace(COLLISION_GROUPS)
)

data['Collision Type'].value_counts()

In [None]:
# Rows without a recorded 'Vehicle Movement' are discarded rather than imputed;
# per the authors' note they make up only about 0.02% of the dataset.
data = data.dropna(subset=['Vehicle Movement'])


### Vehicle body type

In [None]:
data['Vehicle Body Type'].value_counts()
# Candidate groupings for a later encoding step:
#   - car / station wagon / police (non-emergency)
#   - van / truck
#   - bus / heavy truck
#   - emergency
#   - motorcycle
#   - non-road worthy
# For now, rows with no recorded body type are simply removed.
data = data.dropna(subset=['Vehicle Body Type'])



### Vehicle Damage Extent

In [None]:
# Inspect how many rows lack a damage extent and how the rest are distributed,
# then remove the rows where it is missing.
data['Vehicle Damage Extent'].isna().value_counts()
data['Vehicle Damage Extent'].value_counts()
data = data.dropna(subset=['Vehicle Damage Extent'])

### Driver Distracted By

In [None]:
# Label missing 'Driver Distracted By' entries as 'AT FAULT' for the flagged rows.
# The selection and the write happen in a single .loc call: filling a
# chained-indexing result in place only changes a temporary copy and never
# reaches `data`.
# NOTE(review): 'Driver Subs' is not created in any cell shown here, and
# 'Driver at Fault' has already been dropped -- confirm which column this
# condition is meant to test before relying on this step.
needs_label = (data['Driver Subs'] == 'Yes') & data['Driver Distracted By'].isna()
data.loc[needs_label, 'Driver Distracted By'] = 'AT FAULT'

data['Driver Distracted By'].value_counts()



In [None]:
# 1 when the 'Vehicle Movement' text contains 'SAME', otherwise 0.
data['same_dir'] = [int('SAME' in movement) for movement in data['Vehicle Movement']]

### Results of the data cleaning

In [None]:
# Report the frame's dimensions after the cleaning steps above.
frame_shape = data.shape
print("before the purge: ", frame_shape)

# A blanket data.dropna(inplace=True), with an "after the purge" shape printout,
# was tried here and is currently disabled.

# Fraction of missing values per column (the mean of the boolean null mask).
nulls = data.isna()
nulls.mean()

In [None]:
# cross street, traffic control, 

In [None]:
# Distribution of the grouped substance-abuse labels. The cleaned column is
# named 'Driver Substance Abuse'; no 'Substance Abuse' column is created in the
# cells shown here.
data['Driver Substance Abuse'].value_counts()

# Machine learning
## Part 1: Traditional models

### Neural Network

In [32]:
from sklearn.neural_network import MLPClassifier



NameError: name 'hidden' is not defined

## Part 2: Ensemble methods