In [102]:
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gmplot as gp
from datetime import datetime

In [103]:
# Load the traffic-violation records from the CSV export. The separate
# 'Date Of Stop' and 'Time Of Stop' columns are parsed together into a single
# datetime column (named 'Date Of Stop_Time Of Stop' by pandas).
# NOTE(review): recent pandas releases deprecate both the nested-list form of
# parse_dates and infer_datetime_format -- confirm the target pandas version.
violation_df = pd.read_csv(
    'Traffic_Violations-api.csv',
    parse_dates=[['Date Of Stop', 'Time Of Stop']],
    infer_datetime_format=True,
)


In [104]:
# Summarise the freshly loaded frame: row count, column dtypes and per-column
# non-null counts (used to spot which columns contain missing values).
violation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384236 entries, 0 to 1384235
Data columns (total 34 columns):
Date Of Stop_Time Of Stop    1384236 non-null datetime64[ns]
Agency                       1384236 non-null object
SubAgency                    1384226 non-null object
Description                  1384227 non-null object
Location                     1384234 non-null object
Latitude                     1283334 non-null float64
Longitude                    1283334 non-null float64
Accident                     1384236 non-null object
Belts                        1384236 non-null object
Personal Injury              1384236 non-null object
Property Damage              1384236 non-null object
Fatal                        1384236 non-null object
Commercial License           1384236 non-null object
HAZMAT                       1384236 non-null object
Commercial Vehicle           1384236 non-null object
Alcohol                      1384236 non-null object
Work Zone                    

In [105]:
# Index the raw frame by the combined stop timestamp so that every frame
# derived below carries a DatetimeIndex. This intentionally mutates
# violation_df in place, exactly as the original cell did.
violation_df.index = violation_df['Date Of Stop_Time Of Stop']

# Drop every row that has a NaN in any column, then remove the unnecessary
# and duplicate columns from the data.
viol_df_clean = (
    violation_df
    .dropna(how='any')
    .drop(['Agency', 'Driver State', 'State', 'Geolocation',
           'Driver City', 'Charge', 'Location'],
          axis=1)
)

# For each column: the explicit values that are collapsed into one catch-all
# label, turning the column into a (mostly) two-valued variable. Values that
# are not listed here are left untouched.
category_merges = {
    'Race': (
        ['BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'ASIAN'],
        'Non-WHITE',
    ),
    'VehicleType': (
        ['28 - Other', '05 - Light Duty Truck',
         '10 - Transit Bus', '03 - Station Wagon',
         '01 - Motorcycle', '20 - Commercial Rig',
         '06 - Heavy Duty Truck', '08 - Recreational Vehicle',
         '04 - Limousine', '07 - Truck/Road Tractor',
         '19 - Moped', '09 - Farm Vehicle', '25 - Utility Trailer',
         '21 - Tandem Trailer', '26 - Boat Trailer', '12 - School Bus',
         '16 - Fire(Non-Emerg)', '27 - Farm Equipment',
         '11 - Cross Country Bus', '29 - Unknown', '18 - Police Vehicle',
         '13 - Ambulance(Emerg)', '18 - Police(Non-Emerg)', '13 - Ambulance',
         '22 - Mobile Home', '24 - Camper', '23 - Travel/Home Trailer',
         '15 - Fire(Emerg)', '14 - Ambulance(Non-Emerg)',
         '15 - Fire Vehicle', '17 - Police(Emerg)'],
        'Non-AUTO',
    ),
    'DL State': (
        ['VA', 'WV', 'DC', 'ME', 'NC', 'OH', 'CA', 'FL', 'XX', 'AZ',
         'NY', 'PA', 'LA', 'WA', 'MA', 'GA', 'IL', 'AR', 'VI', 'OK',
         'NJ', 'CT', 'HI', 'SC', 'KY', 'TX', 'DE', 'UT', 'NV', 'MO',
         'MI', 'AL', 'ON', 'OR', 'ND', 'US', 'TN', 'CO', 'RI', 'NH',
         'IA', 'AK', 'IN', 'PR', 'ID', 'NM', 'KS', 'WI', 'MS', 'AB',
         'VT', 'MB', 'NE', 'MT', 'MN', 'SD', 'SK', 'IT', 'QC', 'WY',
         'PE', 'GU', 'MH', 'AS', 'NB', 'BC', 'NS', 'PQ', 'NF'],
        'Out-ofSTATE',
    ),
}

# Assign the replaced columns back explicitly instead of calling
# replace(inplace=True) on a `.loc[:, col]` selection. That selection may be a
# temporary copy, in which case the in-place edit never reaches
# viol_df_clean (this is always the case under pandas Copy-on-Write).
# Re-assigning existing columns keeps their original position in the frame.
viol_df_clean = viol_df_clean.assign(**{
    column: viol_df_clean[column].replace(to_replace=values, value=label)
    for column, (values, label) in category_merges.items()
})

# This selects the columns that are categorical
df_categorical = viol_df_clean[['Accident', 'Belts', 'Personal Injury', 'Property Damage', 'Fatal',
                                'Commercial License', 'HAZMAT', 'Commercial Vehicle', 'Alcohol',
                                'Work Zone', 'VehicleType', 'Violation Type', 'Article', 'Contributed To Accident',
                                'Race', 'Gender', 'DL State']]

# Shallow copy of the categorical selection. Note that a shallow copy keeps
# the same index (and shares the underlying data) as df_categorical; it only
# gives the dummy-encoding step its own DataFrame object to work from.
df_cat = df_categorical.copy(deep=False)

# Translate the categorical data into binary dummy columns; drop_first=True
# drops the first level of every column to avoid a redundant indicator.
df_cat_dumm = pd.get_dummies(df_cat, drop_first=True)

# Numerical portion of the data
df_numerical = viol_df_clean[['Latitude', 'Longitude', 'Year']]

# These columns can't be converted into binary dummies, but are kept as-is
df_none = viol_df_clean[['Description', 'Make', 'Model', 'Color']]

# Combine all the pieces column-wise; they all share viol_df_clean's index
df_traffic = pd.concat([df_cat_dumm, df_numerical, df_none], axis=1)


In [106]:
# Inspect the combined frame: index range, resulting dummy/numeric/text
# columns, their dtypes and non-null counts after cleaning.
df_traffic.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1202144 entries, 2017-08-29 10:19:00 to 2018-09-27 08:29:00
Data columns (total 24 columns):
Belts_Yes                         1202144 non-null uint8
Personal Injury_Yes               1202144 non-null uint8
Property Damage_Yes               1202144 non-null uint8
Fatal_Yes                         1202144 non-null uint8
Commercial License_Yes            1202144 non-null uint8
HAZMAT_Yes                        1202144 non-null uint8
Commercial Vehicle_Yes            1202144 non-null uint8
Alcohol_Yes                       1202144 non-null uint8
Work Zone_Yes                     1202144 non-null uint8
VehicleType_Non-AUTO              1202144 non-null uint8
Article_Transportation Article    1202144 non-null uint8
Contributed To Accident_Yes       1202144 non-null uint8
Race_WHITE                        1202144 non-null uint8
Gender_M                          1202144 non-null uint8
Gender_U                          1202144 non-null uint8