# Preliminary Modeling

In [19]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from raw_to_transformed_data import get_sql_data

## Importing data

In [3]:
query_crashes = """
    SELECT *
    FROM crashes;
    """
dbname = "chi-traffic-accidents"

In [4]:
df_crashes = get_sql_data(dbname, query_crashes)

In [9]:
drop_cols = ['crash_record_id', 'crash_date', 'report_type', 'prim_contributory_cause', 'intersection_related_i', 'hit_and_run_i', 'lane_cnt', 'has_injuries']
df_crashes = df_crashes.drop(columns=drop_cols)

df_crashes = df_crashes.rename(columns={"crash_day_of_week": "crash_day"})


In [42]:
df_crashes["street_direction"][df_crashes["street_direction"].isna()] = df_crashes["street_direction"].mode()[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crashes["street_direction"][df_crashes["street_direction"].isna()] = df_crashes["street_direction"].mode()[0]


'W'

In [22]:
y = df_crashes.pop("injuries_total")

In [24]:
y[:5]

0    0.0
1    1.0
2    0.0
3    1.0
4    2.0
Name: injuries_total, dtype: float64

In [25]:
X = df_crashes.copy()

In [27]:
X[:5]

Unnamed: 0,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,street_direction,num_units,crash_hour,crash_day,crash_month
0,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,S,2,3,Sunday,August
1,35,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,N,2,2,Sunday,August
2,25,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,S,2,2,Sunday,August
3,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,OTHER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,S,2,2,Sunday,August
4,30,NO CONTROLS,NO CONTROLS,CLEAR,DUSK,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,W,3,2,Sunday,August


# Transforming data

In [21]:
encoder=OneHotEncoder(drop=None, sparse=True)

In [28]:
onehot_crashes = encoder.fit_transform(X)

In [29]:
onehot_crashes

<526569x208 sparse matrix of type '<class 'numpy.float64'>'
	with 7898535 stored elements in Compressed Sparse Row format>

In [33]:
encoder.categories_

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 14, 15, 18, 20, 22,
        23, 24, 25, 26, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 45, 49,
        50, 55, 60, 63, 65, 70, 99]),
 array(['BICYCLE CROSSING SIGN', 'DELINEATORS', 'FLASHING CONTROL SIGNAL',
        'LANE USE MARKING', 'NO CONTROLS', 'NO PASSING', 'OTHER',
        'PEDESTRIAN CROSSING SIGN', 'POLICE/FLAGMAN',
        'RAILROAD CROSSING GATE', 'RR CROSSING SIGN', 'SCHOOL ZONE',
        'STOP SIGN/FLASHER', 'TRAFFIC SIGNAL', 'UNKNOWN', 'YIELD'],
       dtype=object),
 array(['FUNCTIONING IMPROPERLY', 'FUNCTIONING PROPERLY', 'MISSING',
        'NO CONTROLS', 'NOT FUNCTIONING', 'OTHER', 'UNKNOWN',
        'WORN REFLECTIVE MATERIAL'], dtype=object),
 array(['BLOWING SAND, SOIL, DIRT', 'BLOWING SNOW', 'CLEAR',
        'CLOUDY/OVERCAST', 'FOG/SMOKE/HAZE', 'FREEZING RAIN/DRIZZLE',
        'OTHER', 'RAIN', 'SEVERE CROSS WIND GATE', 'SLEET/HAIL', 'SNOW',
        'UNKNOWN'], dtype=object),
 array(['DARKNESS', 'DARKNESS, LIG

In [34]:
onehot_crashes.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
df_crashes.head()

Unnamed: 0,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,street_direction,num_units,injuries_total,crash_hour,crash_day,crash_month
0,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,S,2,0.0,3,Sunday,August
1,35,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,N,2,1.0,2,Sunday,August
2,25,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,S,2,0.0,2,Sunday,August
3,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,OTHER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,S,2,1.0,2,Sunday,August
4,30,NO CONTROLS,NO CONTROLS,CLEAR,DUSK,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,W,3,2.0,2,Sunday,August


In [35]:
df_crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526569 entries, 0 to 526568
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   posted_speed_limit      526569 non-null  int64 
 1   traffic_control_device  526569 non-null  object
 2   device_condition        526569 non-null  object
 3   weather_condition       526569 non-null  object
 4   lighting_condition      526569 non-null  object
 5   first_crash_type        526569 non-null  object
 6   trafficway_type         526569 non-null  object
 7   alignment               526569 non-null  object
 8   roadway_surface_cond    526569 non-null  object
 9   road_defect             526569 non-null  object
 10  street_direction        526566 non-null  object
 11  num_units               526569 non-null  int64 
 12  crash_hour              526569 non-null  int64 
 13  crash_day               526569 non-null  object
 14  crash_month             526569 non-n