In [444]:
# Traffic prediction ML project
# Imports
import pandas as pd
import numpy as np
import sklearn as sc


In [445]:
'''
Feature engineering notes:
    - Time and date
    - Combine Vehicle continuing direction and vehicle going direction. Oncoming/incoming or same direction
    - The weather and surface cond columns may help each other...
    - Modify Light so that we replace 'Dark -- Unknown Lighting' with 'Dark'
    
'''

"\nFeature engineering notes:\n    - Time and date\n    - Combine Vehicle continuing direction and vehicle going direction. Oncoming/incoming or same direction\n    - The weather and surface cond columns may help each other...\n    - Modify Light so that we replace 'Dark -- Unknown Lighting' with 'Dark'\n    \n"

# Data Exploration

In [446]:
data = pd.read_csv("traffic.csv", na_values=['Unknown', 'None', 'UNKNOWN'])

  data = pd.read_csv("traffic.csv", na_values=['Unknown', 'None', 'UNKNOWN'])


In [447]:
# data.shape - 172,105 rows, 43 columns


data['Collision Type'].value_counts()
# 12% other, 717 unknown. We may want to use similar-case imputation

data['Non-Motorist Substance Abuse'].value_counts()
# There are only ~200 data points that actually had substance abuse, so this should be dropped.

data['Driver Distracted By'].value_counts()
# 1/5th of the dataset is missing, over half has drivers that were not distracted. Maybe there's a correlation between distracted driving and severity? Maybe we can train a subset of the data on this to reveal more interesting patterns. Or, we could inject and say that half of the missing is distracted and the other half is not distracted. Or, maybe we should just drop it.

data['Drivers License State'].value_counts()
# correlation between driving out of state lines and being at fault? are drivers closer to home truly more likely to get in an accident? Outliers - FM, NS, MP, SK, etc.

data['Vehicle Movement'].value_counts()
# We need to get creative with our encoding, such as moving quickly, moving at a slower rate, or manuvering (parking, passing, turning, etc)

# This is our class label. We need to OHC different categories.
data['Injury Severity'].value_counts()
# the majority of the dataset isn't injured, fortunately.
# we will just keep the 5 predictors
    # No injury
    # Possible
    # Minor
    # Serious
    # Fatal :(
    
# Driver at fault was dropped because we don't think that the insurance/civil claim really determines how bad the accident is.
# An accident is an accident.
    
# data['Off-Road Description'].unique() # Determine that the values in this column is not important
# data['Driverless Vehicle'].unique() # No driverless vehicles, so column is unneeded
# data['Parked Vehicle'].value_counts() # 2,600 parked vehicles got an in accident. Might be interesting to see if there's a correlation between that and the severity of the accident.
# data['Related Non-Motorist'].value_counts() - There's only about 5000 fields here, it may be too little to make a discernible impact on the dataset.

# Look into what the cross street and route type means
# DROP Report Number, Local Case Number, Off-road description, Road name, Cross-street name, Driverless Vehicle, Municipality (many missing values),  Related Non-Motorist, Non-Motorist Substance Abuse, Person ID, Circumstance (??), Vehicle ID, Vehicle Make, Vehicle Model, Equiptment Problems, Location (as it combines Lat & Long)


Injury Severity
NO APPARENT INJURY          141185
POSSIBLE INJURY              17482
SUSPECTED MINOR INJURY       11870
SUSPECTED SERIOUS INJURY      1415
FATAL INJURY                   153
Name: count, dtype: int64

# Data Cleaning
## Part 1: Removing unneeded rows

In [448]:
# Feature engineer new columns "Vehicle Dir Changed" and "Vehicle Multiple Impacts"
data['Vehicle Dir Changed'] = np.where(data['Vehicle Continuing Dir'] == data['Vehicle Going Dir'], 'No', 'Yes')
data['Vehicle Multiple Impacts'] = np.where(data['Vehicle First Impact Location'] == data['Vehicle Second Impact Location'], 'No', 'Yes')
#Drop unnecessary data columns
data = data.drop(columns = ['Agency Name', 'Report Number', 'Local Case Number', 'Off-Road Description', 'Road Name', 'Cross-Street Name', 'Driverless Vehicle', 'Municipality', 'Non-Motorist Substance Abuse', 'Person ID', 'Circumstance', 'Vehicle ID', 'Vehicle Make', 'Vehicle Model', 'Equipment Problems', 'Location','Driverless Vehicle', 'Vehicle Continuing Dir', 'Vehicle Going Dir', 'Route Type', 'Vehicle First Impact Location', 'Vehicle Second Impact Location', 'Related Non-Motorist', 'Drivers License State'], axis = 1)

## Part 2: Feature engineering
### Dealing with the Date/Time column
We want to extrapolate out patterns from the data and time columns.
It would be difficult to train on that column as a whole, but separately it 
can extract out useful patterns.

In [449]:
data.loc[data['Weather'] == 'RAINING', 'Light'] = data.loc[data['Weather'] == 'RAINING', 'Light'].fillna('DARK')
data.loc[data['Weather'] == 'FOGGY', 'Light'] = data.loc[data['Weather'] == 'FOGGY', 'Light'].fillna('DARK')

import datetime
new_weekdays = []
pos = 0
count = 0
for i, date in data['Crash Date/Time'].items() : 
    date_parsed = date.split(' ')
    time = date_parsed[0].split('/')
    obj = datetime.datetime(int(time[2]), int(time[0]), int(time[1]))
    new_weekdays.append(obj.weekday())
    hour = int(date_parsed[1].split(':')[0])

    if pd.isnull(data['Light'][i]) :
        if (hour < 5 or hour > 22) : 
            data['Light'][i] = 'DARK'
        elif (hour >= 5 and hour <= 8): 
            data['Light'][i] = 'DAWN' 
        elif (hour > 8 and hour <= 15) : 
            data['Light'][i] = 'LIGHT'
        elif (hour > 15 and hour <= 22) : 
            data['Light'][i] = 'DUSK'
            
data['Crash Date/Time'] = new_weekdays
df_encoded = pd.get_dummies(data['Crash Date/Time'], columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
data = pd.concat([data, df_encoded] , axis=1)
data = data.drop(columns=['Crash Date/Time'])

# This reduces surface cond. NA/s by about 16,000
data.loc[data['Weather'] == 'CLEAR', 'Surface Condition'] = data.loc[data['Weather'] == 'CLEAR', 'Surface Condition'].fillna('DRY')
data.loc[data['Weather'] == 'RAINING', 'Surface Condition'] = data.loc[data['Weather'] == 'RAINING', 'Surface Condition'].fillna('WET')
# It should be safe to drop the rest. 
data.dropna(subset=['Surface Condition'], inplace=True)


data['Surface Condition'].value_counts()
data = data.drop(columns=['Weather'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Light'][i] = 'DARK'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Light'][i] = 'DARK'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Light'][i] = 'LIGHT'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Light'][i] = 'LIGHT'
A value is trying to be set on a copy of a slice from a DataFr

### Map out ACRS Report Type to numerical 

In [450]:
data['ACRS Report Type'] = data['ACRS Report Type'].map({'Property Damage Crash': 0, 'Injury Crash': 1, 'Fatal Crash': 2})
data['Vehicle Dir Changed'] = data['Vehicle Dir Changed'].map({'No': 0, 'Yes': 1})
data['Vehicle Multiple Impacts'] = data['Vehicle Multiple Impacts'].map({'No': 0, 'Yes': 1})


data['ACRS Report Type'].value_counts()

ACRS Report Type
0    104989
1     61097
2       423
Name: count, dtype: int64

### Reducing varation on the features for the Substance Abuse column
#### Group into the following classes
    * Alcohol
    * Illicit Drug
    * Medication
    * None Present

In [451]:
# We are assuming that since the officer did not record any alcohol or drugs, 
# that the driver was not under the influence.

data['Driver Substance Abuse'].fillna('NONE DETECTED', inplace=True)
data['Driver Substance Abuse'].replace('ALCOHOL CONTRIBUTED', 'ALCOHOL', inplace=True)
data['Driver Substance Abuse'].replace('ALCOHOL PRESENT', 'ALCOHOL', inplace=True)
data['Driver Substance Abuse'].replace('ILLEGAL DRUG PRESENT', 'ILLICIT DRUG', inplace=True)
data['Driver Substance Abuse'].replace('ILLEGAL DRUG CONTRIBUTED', 'ILLICIT DRUG', inplace=True)
data['Driver Substance Abuse'].replace('COMBINATION PRESENT', 'ILLICIT DRUG', inplace=True)
data['Driver Substance Abuse'].replace('COMBINATION CONTRIBUTED', 'ILLICIT DRUG', inplace=True)
data['Driver Substance Abuse'].replace('COMBINED SUBSTANCE PRESENT', 'ILLICIT DRUG', inplace=True)
data['Driver Substance Abuse'].replace('MEDICATION PRESENT', 'MEDICATION', inplace=True)
data['Driver Substance Abuse'].replace('MEDICATION CONTRIBUTED', 'MEDICATION', inplace=True)
data['Driver Substance Abuse'].replace('OTHER', 'NONE DETECTED', inplace=True)

### Reducing variation on Traffic Control

In [452]:
data['Traffic Control'].replace('STOP SIGN', 'SIGN', inplace=True)
data['Traffic Control'].replace('FLASHING TRAFFIC SIGNAL', 'SIGN', inplace=True)
data['Traffic Control'].replace('YIELD SIGN', 'SIGN', inplace=True)
data['Traffic Control'].replace('OTHER', 'SIGN', inplace=True)
data['Traffic Control'].replace('PERSON', 'TRAFFIC SIGNAL', inplace=True)
data['Traffic Control'].replace('WARNING SIGN', 'SIGN', inplace=True)
data['Traffic Control'].replace('RAILWAY CROSSING DEVICE', 'SIGN', inplace=True)
data['Traffic Control'].replace('SCHOOL ZONE SIGN DEVICE', 'SIGN', inplace=True)

data['Traffic Control'].value_counts()
data['Cross-Street Type'].value_counts()
# data['Traffic Control'].isnull().value_counts()




Cross-Street Type
County                  84483
Maryland (State)        23992
Municipality            18935
Ramp                     5844
Other Public Roadway     3770
US (State)               1436
Government               1168
Interstate (State)        673
Service Road               87
Name: count, dtype: int64

### Missing values on Collision Type

In [453]:
data['Collision Type'].fillna('OTHER', inplace=True)
data['Collision Type'].replace('SAME DIRECTION SIDESWIPE', 'SIDESWIPE', inplace=True)
data['Collision Type'].replace('OPPOSITE DIRECTION SIDESWIPE', 'SIDESWIPE', inplace=True)
data['Collision Type'].replace('SAME DIRECTION RIGHT TURN', 'TURNING', inplace=True)
data['Collision Type'].replace('ANGLE MEETS LEFT TURN', 'TURNING', inplace=True)
data['Collision Type'].replace('ANGLE MEETS RIGHT TURN', 'TURNING', inplace=True)
data['Collision Type'].replace('SAME DIR REND LEFT TURN', 'TURNING', inplace=True)
data['Collision Type'].replace('SAME DIR REND RIGHT TURN', 'TURNING', inplace=True)
data['Collision Type'].replace('SAME DIR BOTH LEFT TURN', 'TURNING', inplace=True)
data['Collision Type'].replace('OPPOSITE DIR BOTH LEFT TURN', 'HEAD ON', inplace=True)
data['Collision Type'].replace('HEAD ON LEFT TURN', 'HEAD ON', inplace=True)
data['Collision Type'].replace('ANGLE MEETS LEFT HEAD ON', 'HEAD ON', inplace=True)
data['Collision Type'].replace('SAME DIRECTION LEFT TURN', 'TURNING', inplace=True)

data['Collision Type'].value_counts()

Collision Type
SAME DIR REAR END          54915
STRAIGHT MOVEMENT ANGLE    29698
SIDESWIPE                  18461
OTHER                      18219
HEAD ON                    17420
SINGLE VEHICLE             15057
TURNING                    12739
Name: count, dtype: int64

In [454]:
# data['Vehicle Movement'].value_counts()
# We've decided to drop out the missing values here given that it's only 0.02% of the database
data.dropna(subset=['Vehicle Movement'], inplace=True)


### Vehicle body type

In [455]:
data['Vehicle Body Type'].fillna('OTHER', inplace=True)

data['Vehicle Body Type'].replace('PASSENGER CAR', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('FARM VEHICLE', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('LIMOUSINE', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('(SPORT) UTILITY VEHICLE', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('POLICE VEHICLE/NON EMERGENCY', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('STATION WAGON', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('PICKUP TRUCK', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('VAN', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('OTHER BUS', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('TRUCK TRACTOR', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('FIRE VEHICLE/NON EMERGENCY', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('AMBULANCE/NON EMERGENCY', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('TRANSIT BUS', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('CROSS COUNTRY BUS', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('SCHOOL BUS', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('RECREATIONAL VEHICLE', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('(SPORT) UTILITY VEHICLE', 'CAR/SUV/NON-EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('OTHER LIGHT TRUCKS (10,000LBS (4,536KG) OR LESS)', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('CARGO VAN/LIGHT TRUCK 2 AXLES (OVER 10,000LBS (4,536 KG))', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('MEDIUM/HEAVY TRUCKS 3 AXLES (OVER 10,000LBS (4,536KG))', 'VAN/TRUCK', inplace=True)
data['Vehicle Body Type'].replace('AMBULANCE/EMERGENCY', 'EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('FIRE VEHICLE/EMERGENCY', 'EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('POLICE VEHICLE/EMERGENCY', 'EMERGENCY', inplace=True)
data['Vehicle Body Type'].replace('MOPED', 'MOTORCYCLE', inplace=True)
data['Vehicle Body Type'].replace('ALL TERRAIN VEHICLE (ATV)', 'MOTORCYCLE', inplace=True)
data['Vehicle Body Type'].replace('SNOWMOBILE', 'MOTORCYCLE', inplace=True)
data['Vehicle Body Type'].replace('LOW SPEED VEHICLE', 'MOTORCYCLE', inplace=True)
data['Vehicle Body Type'].replace('AUTOCYCLE', 'MOTORCYCLE', inplace=True)


data['Vehicle Body Type'].isna().value_counts()

# car/station wagon/ police (non emergency)
# van/truck
# bus/heavy truck
# emergency
# motorcycle
# non-road worthy

Vehicle Body Type
False    163785
Name: count, dtype: int64

### Vehicle Damage Extent

In [456]:
data['Vehicle Damage Extent'].isna().value_counts()
data['Vehicle Damage Extent'].value_counts()
data.dropna(subset=['Vehicle Damage Extent'], inplace=True)

data['Driver Distracted By'].isna().value_counts()

Driver Distracted By
False    133830
True      24749
Name: count, dtype: int64

### Driver Distracted By

In [457]:
data.loc[(data['Driver Substance Abuse'] != 'NONE DETECTED') & (data['Driver Distracted By'].isna()), 'Driver Distracted By'] = 'substance abuse'

# This could be very problamatic. It might not be a great indicator of distraction if the driver was at fault (i.e. road rage)
# Do the ML and see if it's good or not!!
data.loc[(data['Driver At Fault'] == 'Yes') & (data['Driver Distracted By'].isna()), 'Driver Distracted By'] = 'AT FAULT'

data.dropna(subset=['Driver Distracted By'], inplace=True)

data = data.drop(columns=['Driver At Fault'])

# After we've engineered the features into one, we can drop the distraction column.
data['Driver Distracted By'].value_counts()



Driver Distracted By
NOT DISTRACTED                                       102918
LOOKED BUT DID NOT SEE                                20544
AT FAULT                                              19414
INATTENTIVE OR LOST IN THOUGHT                         4059
OTHER DISTRACTION                                      3050
substance abuse                                        2004
DISTRACTED BY OUTSIDE PERSON OBJECT OR EVENT            939
BY OTHER OCCUPANTS                                      394
OTHER CELLULAR PHONE RELATED                            360
OTHER ELECTRONIC DEVICE (NAVIGATIONAL PALM PILOT)       317
TALKING OR LISTENING TO CELLULAR PHONE                  256
BY MOVING OBJECT IN VEHICLE                             207
EATING OR DRINKING                                      192
NO DRIVER PRESENT                                       187
ADJUSTING AUDIO AND OR CLIMATE CONTROLS                 130
USING OTHER DEVICE CONTROLS INTEGRAL TO VEHICLE          88
USING DEVICE OBJECT

In [458]:
data['same_dir'] = data['Vehicle Movement'].apply(lambda x: 1 if 'SAME' in x else 0)

In [459]:
# import requests

# main_url = 'https://geocoding.geo.census.gov/geocoder/geographies/coordinates?'
# benchmark = 'Public_Ar_Census2020'
# format = 'JSON'
# vintage = 'Census2010_Census2020'
# data_sub = data[0:20]
# populations = []
# for i in data_sub.iterrows() :
#     test_x = i[1]['Longitude']
#     test_y = i[1]['Latitude']
#     response = requests.get(f"https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x={test_x}&y={test_y}&benchmark={benchmark}&format={format}&vintage={vintage}")
#     json = response.json()
#     # print(json['result']['geographies']['County Subdivisions'][0]['POP100'])
#     populations.append(json['result']['geographies']['County Subdivisions'][0]['POP100'])



# # print (json.values()['geographies']['Census Designated Places']) 

# # ['geographies']['Census Designated Places']

### Results of the data cleaning

In [460]:
print("hello world: ", data.shape)
# data.dropna(subset=['Vehicle Movement'], inplace=True)

# data.dropna(inplace=True)
# print("after the purge: ", data.shape)

pd.set_option('display.max_colwidth', None)

nulls = data.isnull()
nulls.mean(axis=0)

# data[['Longitude', 'Latitude']]

hello world:  (155248, 27)


ACRS Report Type            0.000000
Cross-Street Type           0.151577
Collision Type              0.000000
Surface Condition           0.000000
Light                       0.000000
Traffic Control             0.138772
Driver Substance Abuse      0.000000
Injury Severity             0.000000
Driver Distracted By        0.000000
Vehicle Damage Extent       0.000000
Vehicle Body Type           0.000000
Vehicle Movement            0.000000
Speed Limit                 0.000000
Parked Vehicle              0.000000
Vehicle Year                0.000000
Latitude                    0.000000
Longitude                   0.000000
Vehicle Dir Changed         0.000000
Vehicle Multiple Impacts    0.000000
0                           0.000000
1                           0.000000
2                           0.000000
3                           0.000000
4                           0.000000
5                           0.000000
6                           0.000000
same_dir                    0.000000
d

In [461]:
# cross street, traffic control, 

In [462]:
data['Substance Abuse'].value_counts()

KeyError: 'Substance Abuse'

# Machine learning
## Part 1: Traditional models

### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier



NameError: name 'hidden' is not defined

## Part 2: Ensemble methods