# Ensemble Model

In [1]:
from typing import Tuple, Union

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder

from raw_to_transformed_data import get_sql_data

np.set_printoptions(suppress=True)

In [2]:
# Type alias: any of the regressor classes compared in this notebook.
ModelRegressor = Union[
    LinearRegression,
    RandomForestRegressor,
    GradientBoostingRegressor,
]

In [3]:
def cv_regression_model(
        model: ModelRegressor, X: pd.DataFrame,
        y: Union[pd.Series, pd.DataFrame],
        scoring: str="neg_mean_squared_error",
        cv: int=5) -> Tuple[float, np.ndarray]:
    """Cross-validate a regression model and report its RMSE.

    Parameters
    ----------
    model : estimator passed unchanged to ``cross_val_score``.
    X : feature matrix.
    y : target values.
    scoring : scorer name forwarded to ``cross_val_score``. The scores are
        negated and square-rooted, so the result is only a true RMSE when
        the scorer is a negated squared error such as the default
        ``"neg_mean_squared_error"``.
    cv : cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    (avg_rmse, fold_rmses) : the mean RMSE across folds and the array of
        per-fold RMSE values.
    """
    # With the default scorer, cross_val_score returns *negated* MSE values.
    neg_mses = cross_val_score(
        model, X, y, scoring=scoring, cv=cv)
    # Flip the sign and take the root once, then reuse for both outputs.
    fold_rmses = np.sqrt(-neg_mses)
    avg_rmse = np.mean(fold_rmses)
    return avg_rmse, fold_rmses

In [4]:
# Load the joined crashes table from the project database.
# get_sql_data is a project helper; later cells treat its result as a DataFrame.
dbname = "chi-traffic-accidents"
query_crashes = """
SELECT *
FROM crashes_joined;
"""
df_crashes = get_sql_data(dbname, query_crashes)

In [11]:
# Display the crashes frame for inspection.
# NOTE(review): this cell's execution count (11) is higher than that of the
# cells below it, and its saved output already shows the renamed `crash_day`
# column and none of the dropped columns, i.e. it was run after the
# transformation cells. On a fresh Restart & Run All the output will differ.
df_crashes

Unnamed: 0,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,street_direction,num_units,crash_hour,crash_day,crash_month,num_bikes_involved,num_pedestrians_involved,num_partially_ejected,num_extricated
0,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,S,2,3,Sunday,August,0.0,0.0,0.0,0.0
1,35,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,N,2,2,Sunday,August,0.0,0.0,0.0,0.0
2,25,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,S,2,2,Sunday,August,0.0,0.0,0.0,0.0
3,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,OTHER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,S,2,2,Sunday,August,0.0,0.0,0.0,0.0
4,30,NO CONTROLS,NO CONTROLS,CLEAR,DUSK,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,W,3,2,Sunday,August,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526564,30,NO CONTROLS,NO CONTROLS,RAIN,DAYLIGHT,SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,WET,UNKNOWN,S,2,19,Wednesday,June,0.0,0.0,0.0,0.0
526565,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,WET,NO DEFECTS,N,2,19,Monday,February,0.0,0.0,0.0,0.0
526566,30,YIELD,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,NO DEFECTS,S,2,7,Tuesday,January,0.0,0.0,0.0,0.0
526567,30,NO CONTROLS,NO CONTROLS,CLEAR,DUSK,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN BARRIER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,W,2,18,Saturday,January,0.0,0.0,0.0,0.0


In [5]:
# Prepare df_crashes for the preliminary model:
#   1. remove the columns listed in drop_cols,
#   2. rename crash_day_of_week -> crash_day,
#   3. fill missing street_direction values with the column's mode.
drop_cols = [
    'crash_record_id',
    'crash_date',
    'report_type',
    'prim_contributory_cause',
    'intersection_related_i',
    'hit_and_run_i',
    'lane_cnt',
    'has_injuries',
]

df_crashes = (
    df_crashes
    .drop(columns=drop_cols)
    .rename(columns={"crash_day_of_week": "crash_day"})
    .assign(
        # mode() returns a Series; its first entry is used as the fill value
        street_direction=lambda frame: frame["street_direction"].fillna(
            frame["street_direction"].mode()[0]))
)

In [6]:
# Create X (features) and y (target).
# Select/drop instead of DataFrame.pop so df_crashes is left untouched and
# this cell can be re-run without raising KeyError on the missing column.
target_col = "injuries_total"
y = df_crashes[target_col].copy()
X = df_crashes.drop(columns=[target_col])

In [7]:
# One-hot encode the categorical features and rebuild X as
# [numeric columns | one-hot columns].
# NOTE: this cell overwrites X, so re-create X from df_crashes before
# executing it a second time.
numeric_cols = ["posted_speed_limit", "num_units", "crash_hour", 
    'num_bikes_involved', 'num_extricated', 'num_partially_ejected', 
    'num_pedestrians_involved']
# Every column that is not listed as numeric is treated as categorical.
category_cols = X.columns.difference(numeric_cols)
# The defaults already keep every level and return a sparse matrix. The
# explicit `sparse=` keyword is omitted because newer scikit-learn releases
# renamed it to `sparse_output`.
encoder = OneHotEncoder()
onehot_crashes = encoder.fit_transform(X[category_cols])
# Column names follow "<source column>_<lower-cased level>"; str() guards
# against category levels that are not strings.
matrix_cols = [
    col + "_" + str(level).lower()
    for col, levels in zip(category_cols, encoder.categories_)
    for level in levels]
# Reuse X's index so the column-wise concat aligns row-for-row even when X
# does not carry a default RangeIndex.
onehot_frame = pd.DataFrame(
    onehot_crashes.toarray(), columns=matrix_cols, index=X.index)
X = pd.concat([X[numeric_cols], onehot_frame], axis=1)

In [8]:
# Hold out a test set (default split proportions). A fixed random_state makes
# the split, and every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# Baseline: a dummy regressor using the "mean" strategy, scored by RMSE on
# the held-out test set.
model_dum = DummyRegressor(strategy="mean").fit(X_train, y_train)
y_pred = model_dum.predict(X_test)
rmse_dum = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"RMSE dum: {rmse_dum:.4f}")

RMSE dum: 0.5541


In [10]:
# Candidate regression models to evaluate.
# The tree ensembles are stochastic, so random_state is pinned to make their
# cross-validation scores reproducible from run to run.
model_lr = LinearRegression()
model_rf = RandomForestRegressor(
    n_estimators=50,
    max_features="sqrt",
    random_state=42)
model_gb = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42)
models_all = [model_lr, model_rf, model_gb]

In [None]:
# Cross-validate each candidate model on the training split, collecting the
# average RMSE and the per-fold RMSE values as we go.
scores, score_lists = [], []
for model in models_all:
    cv_result = cv_regression_model(model, X_train, y_train)
    score, lst = cv_result
    scores += [score]
    score_lists += [lst]
    # Report the average first, then the per-fold values on their own line.
    print(f"RMSE for {model}: {score:.4f}", lst, sep="\n")