In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the crash records from the CSV in the working directory and preview the first rows
data = pd.read_csv(filepath_or_buffer='traffic-crashes.csv')
data.head(n=5)

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,23a79931ef555d54118f64dc9be2cf2dbf59636ce253f7...,,09/05/2023 07:05:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DUSK,ANGLE,"FIVE POINT, OR MORE",...,2.0,0.0,2.0,0.0,19,3,9,,,
1,2675c13fd0f474d730a5b780968b3cafc7c12d7adb661f...,,09/22/2023 06:45:00 PM,50,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,DIVIDED - W/MEDIAN BARRIER,...,0.0,0.0,2.0,0.0,18,6,9,,,
2,5f54a59fcb087b12ae5b1acff96a3caf4f2d37e79f8db4...,,07/29/2023 02:45:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,0.0,1.0,0.0,14,7,7,41.85412,-87.665902,POINT (-87.665902342962 41.854120262952)
3,7ebf015016f83d09b321afd671a836d6b148330535d5df...,,08/09/2023 11:00:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,NOT DIVIDED,...,0.0,0.0,2.0,0.0,23,4,8,,,
4,6c1659069e9c6285a650e70d6f9b574ed5f64c12888479...,,08/18/2023 12:50:00 PM,15,OTHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,OTHER,...,1.0,0.0,1.0,0.0,12,6,8,,,


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Share of missing values per column
sorted_na = data.isna().sum() / len(data)
# Columns that are missing in more than 60% of rows, most-missing first
to_drop = sorted_na.loc[sorted_na.gt(0.6)].sort_values(ascending=False).index
data.drop(columns=to_drop, inplace=True)

In [5]:
# Show which columns exceeded the 60% missing-value threshold and were dropped
to_drop

Index(['WORKERS_PRESENT_I', 'DOORING_I', 'WORK_ZONE_TYPE', 'WORK_ZONE_I',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'NOT_RIGHT_OF_WAY_I',
       'CRASH_DATE_EST_I', 'INTERSECTION_RELATED_I', 'LANE_CNT',
       'HIT_AND_RUN_I'],
      dtype='object')

In [6]:
# Count of distinct values in each remaining column, to gauge cardinality
data.nunique()

CRASH_RECORD_ID                  794956
CRASH_DATE                       521033
POSTED_SPEED_LIMIT                   45
TRAFFIC_CONTROL_DEVICE               19
DEVICE_CONDITION                      8
WEATHER_CONDITION                    12
LIGHTING_CONDITION                    6
FIRST_CRASH_TYPE                     18
TRAFFICWAY_TYPE                      20
ALIGNMENT                             6
ROADWAY_SURFACE_COND                  7
ROAD_DEFECT                           7
REPORT_TYPE                           3
CRASH_TYPE                            2
DAMAGE                                3
DATE_POLICE_NOTIFIED             603015
PRIM_CONTRIBUTORY_CAUSE              40
SEC_CONTRIBUTORY_CAUSE               40
STREET_NO                         11694
STREET_DIRECTION                      4
STREET_NAME                        1632
BEAT_OF_OCCURRENCE                  276
NUM_UNITS                            17
MOST_SEVERE_INJURY                    5
INJURIES_TOTAL                       20


#### Dealing with variables
1. drop the id
1. drop location
1. bin lat and long into quantile bands (far south, south, central, north, far north / far west, west, central, east, far east)
1. figure out date later

In [7]:
# Record id, raw location string, and the date-part columns that are
# re-derived from CRASH_DATE further down
to_drop = [
    'CRASH_RECORD_ID',
    'LOCATION',
    'CRASH_HOUR',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
]
data.drop(columns=to_drop, inplace=True)

parsing date

In [8]:
# Parse the crash timestamp strings (e.g. "09/05/2023 07:05:00 PM") with a
# single vectorized call. Invoking pd.to_datetime once per row through .apply
# produces the same datetime column but is far slower on a frame this large.
data['CRASH_DATE'] = pd.to_datetime(data['CRASH_DATE'], format="%m/%d/%Y %I:%M:%S %p")

In [9]:
# function to assign the time of day using a time as input

import datetime
def get_TOD(time):
    """Return the time-of-day label for a ``datetime.time`` value.

    Labels by clock time:
        before 04:00      -> 'Night'
        04:00 up to 10:00 -> 'Morning'
        10:00 up to 15:00 -> 'Noon'
        15:00 up to 19:00 -> 'Evening'
        19:00 and later   -> 'Night'

    Each upper bound is exclusive.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order
    cutoffs = (
        (datetime.time(4), 'Night'),
        (datetime.time(10), 'Morning'),
        (datetime.time(15), 'Noon'),
        (datetime.time(19), 'Evening'),
    )
    for upper_bound, label in cutoffs:
        if time < upper_bound:
            return label
    # Anything from 19:00 onwards wraps back around to night
    return 'Night'

In [10]:
# Time-of-day label of the crash, derived from the clock time via get_TOD
# (get_TOD works on one datetime.time value, so it is applied element-wise)
data['CRASH_TIME_OF_DAY'] = data['CRASH_DATE'].apply(lambda ts: get_TOD(ts.time()))
# Month name of the crash, taken from the vectorized .dt accessor rather than
# a per-row Python lambda
data['CRASH_MONTH'] = data['CRASH_DATE'].dt.month_name()
# Weekday name of the crash, likewise vectorized
data['CRASH_DAY_OF_WEEK'] = data['CRASH_DATE'].dt.day_name()

Converting column names to lowercase because I feel like I am yelling a lot XD

In [11]:
# Lower-case every column label with the Index string accessor; no need to
# round-trip the labels through a Series, an apply, and two comprehensions
data.columns = data.columns.str.lower()

In [12]:
# 1 when at least one injury was recorded for the crash, otherwise 0.
# Rows whose injuries_total is missing get 0: NaN >= 1 is False, so an unknown
# count is not reported as an injury. (A `0 if x < 1 else 1` test would flag
# those rows as 1, because NaN < 1 is also False.)
data['injury_flag'] = (data['injuries_total'] >= 1).astype(int)

binning latitudes and longitudes

In [13]:
# Replace the raw coordinates with five equal-frequency (quintile) bands.
# Labels are listed from the lowest-valued bin to the highest-valued bin.
latitude_bands = ['Far South', 'South', 'Central', 'North', 'Far North']
longitude_bands = ['Far West', 'West', 'Central', 'East', 'Far East']
data['latitude'] = pd.qcut(data['latitude'], 5, labels=latitude_bands)
data['longitude'] = pd.qcut(data['longitude'], 5, labels=longitude_bands)

because the injury-count columns are highly correlated with `injury_flag` (which was derived from `injuries_total`), we will drop every column whose name contains `injuries`

In [14]:
# Collect every column whose name contains 'injuries', then remove them in place
injuries_columns = [column for column in data.columns if 'injuries' in column]
data.drop(columns=injuries_columns, inplace=True)

In [16]:
# Inspect the columns that are still stored with the generic object dtype
data.select_dtypes(include=object)

Unnamed: 0,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,report_type,...,damage,date_police_notified,prim_contributory_cause,sec_contributory_cause,street_direction,street_name,most_severe_injury,crash_time_of_day,crash_month,crash_day_of_week
0,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DUSK,ANGLE,"FIVE POINT, OR MORE",STRAIGHT AND LEVEL,DRY,NO DEFECTS,ON SCENE,...,"OVER $1,500",09/05/2023 07:05:00 PM,UNABLE TO DETERMINE,NOT APPLICABLE,S,WENTWORTH AVE,INCAPACITATING INJURY,Night,September,Tuesday
1,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,DIVIDED - W/MEDIAN BARRIER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,ON SCENE,...,"OVER $1,500",09/22/2023 06:50:00 PM,FOLLOWING TOO CLOSELY,FOLLOWING TOO CLOSELY,S,CHICAGO SKYWAY OB,NO INDICATION OF INJURY,Evening,September,Friday
2,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,NO DEFECTS,ON SCENE,...,"OVER $1,500",07/29/2023 02:45:00 PM,FAILING TO REDUCE SPEED TO AVOID CRASH,"OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELE...",S,ASHLAND AVE,NO INDICATION OF INJURY,Noon,July,Saturday
3,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,ON SCENE,...,"OVER $1,500",08/09/2023 11:40:00 PM,FAILING TO YIELD RIGHT-OF-WAY,NOT APPLICABLE,W,BALMORAL AVE,NO INDICATION OF INJURY,Night,August,Wednesday
4,OTHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,OTHER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,ON SCENE,...,"OVER $1,500",08/18/2023 12:55:00 PM,FOLLOWING TOO CLOSELY,DISTRACTION - FROM INSIDE VEHICLE,W,OHARE ST,NONINCAPACITATING INJURY,Noon,August,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794951,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NOT ON SCENE (DESK REPORT),...,"$501 - $1,500",11/06/2023 02:12:00 PM,IMPROPER BACKING,UNABLE TO DETERMINE,W,BELDEN AVE,NO INDICATION OF INJURY,Morning,November,Monday
794952,NO CONTROLS,NO CONTROLS,UNKNOWN,UNKNOWN,SIDESWIPE SAME DIRECTION,OTHER,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NOT ON SCENE (DESK REPORT),...,"$501 - $1,500",11/07/2023 07:14:00 PM,UNABLE TO DETERMINE,NOT APPLICABLE,S,STONY ISLAND AVE,NO INDICATION OF INJURY,Evening,October,Tuesday
794953,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NOT ON SCENE (DESK REPORT),...,"OVER $1,500",11/07/2023 04:50:00 PM,IMPROPER BACKING,IMPROPER BACKING,E,104TH ST,NO INDICATION OF INJURY,Night,November,Saturday
794954,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,ON SCENE,...,"OVER $1,500",11/06/2023 06:40:00 PM,UNABLE TO DETERMINE,NOT APPLICABLE,W,63RD ST,NO INDICATION OF INJURY,Evening,November,Monday
