## Two naive models

In [117]:
import pandas as pd
import numpy as np

In [118]:
from pathlib import Path

# Location of the input data, relative to the notebook's working directory.
DATA_FOLDER = Path("./data")
# CSV file read into the `crashes` frame.
AVG_CRASHES_DATASET = DATA_FOLDER.joinpath("crashes_avg.csv")

In [119]:
# Load the full dataset; every split and category list below derives from it.
crashes = pd.read_csv(filepath_or_buffer=AVG_CRASHES_DATASET)

### Baseline model

This model always predicts the mean of the training target, i.e. the average number of crashes per road per hour. (This description is not entirely accurate: see the final comment section in `nyc_accidents.ipynb`.)

In [120]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split_frames = train_test_split(crashes, random_state=42, test_size=0.2)
crashes_train, crashes_test = split_frames

In [121]:
from sklearn.base import BaseEstimator, RegressorMixin


class BaselineModel(BaseEstimator, RegressorMixin):
    """Constant regressor that predicts the mean of the training targets.

    The features passed to ``fit`` and ``predict`` are never inspected; only
    the number of samples given to ``predict`` matters.

    Attributes
    ----------
    pred : float
        Value returned for every sample. It is 0 until ``fit`` is called,
        after which it holds the mean of the training targets.
    """

    def __init__(self):
        # Placeholder value: an unfitted model predicts 0 for every sample.
        self.pred = 0

    def fit(self, X, y):
        """Store the mean of ``y`` as the constant prediction.

        Parameters
        ----------
        X : any
            Ignored; present to satisfy the estimator API.
        y : array-like
            Training targets (pandas Series, NumPy array, list, ...).

        Returns
        -------
        BaselineModel
            The fitted estimator (``self``).
        """
        # np.asarray handles pandas Series, lists and arrays uniformly.
        self.pred = np.mean(np.asarray(y))
        return self

    def predict(self, X):
        """Return the stored constant once per sample in ``X``.

        Parameters
        ----------
        X : array-like
            Only its number of samples is used. Objects exposing ``shape``
            (DataFrames, arrays, sparse matrices) use ``shape[0]``; anything
            else falls back to ``len(X)``.

        Returns
        -------
        numpy.ndarray
            1-D float array of length ``n_samples`` filled with ``self.pred``.
        """
        # Prefer shape[0] (sparse matrices do not support len()); fall back to
        # len() so that plain Python sequences are accepted as well.
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        return np.full(n_samples, self.pred, dtype=float)

In [122]:
# Target column and remaining feature columns of the training split.
y = crashes_train["crashes"]
X = crashes_train.drop("crashes", axis="columns")
baseline = BaselineModel()

In [123]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated negative MSE of the constant baseline.
baseline_scores = cross_val_score(
    estimator=baseline,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [124]:
# Average negative MSE over the folds (closer to 0 is better).
np.mean(baseline_scores)

-0.0014339547702729436

In [125]:
import pickle

# cross_val_score only fits clones of the estimator, so `baseline` itself is
# still unfitted (pred == 0) at this point. Fit it on the training split so the
# saved model actually predicts the training mean.
baseline.fit(X, y)

# open() does not create missing directories.
Path("./models").mkdir(parents=True, exist_ok=True)

with open("./models/baseline.sav", "wb") as f:
    pickle.dump(baseline, f)

### A more involved model 

In [126]:
# Dtypes and non-null counts per column, to see which columns contain missing values.
crashes_train.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1161216 entries, 791419 to 121958
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   rw_type     1161216 non-null  int64  
 1   bike_lane   188920 non-null   float64
 2   shape_leng  1161216 non-null  float64
 3   trafdir     1161135 non-null  object 
 4   st_width    1161216 non-null  float64
 5   postvz_sl   1161216 non-null  float64
 6   humps       1161216 non-null  float64
 7   hour        1142505 non-null  float64
 8   crashes     1161216 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 88.6+ MB


In [127]:
# Peek at the first rows of the training split.
crashes_train.head(n=5)

Unnamed: 0,rw_type,bike_lane,shape_leng,trafdir,st_width,postvz_sl,humps,hour,crashes
791419,1,,322.553731,TF,52.0,25.0,0.0,15.0,0.021127
307584,1,,167.160607,TW,30.0,25.0,0.0,16.0,0.014085
403161,1,,241.090685,FT,50.0,25.0,0.0,18.0,0.098592
664856,1,,270.99118,TW,30.0,25.0,0.0,13.0,0.028169
1031546,1,,535.658697,FT,32.0,25.0,0.0,19.0,0.035211


In [128]:
# Build a new frame with the gaps filled instead of assigning columns onto the
# train_test_split result, which pandas may flag with SettingWithCopyWarning.
# Missing hours become 16 and missing hump counts become 0.
crashes_train = crashes_train.assign(
    hour=crashes_train["hour"].fillna(16),
    humps=crashes_train["humps"].fillna(0),
)

In [129]:
# One-hot encode the categorical columns; all other columns are kept as they are.
categorical_columns = ["rw_type", "bike_lane", "trafdir"]
crashes_train_oh = pd.get_dummies(data=crashes_train, columns=categorical_columns)

In [130]:
# Peek at the first rows of the one-hot-encoded frame.
crashes_train_oh.head(n=5)

Unnamed: 0,shape_leng,st_width,postvz_sl,humps,hour,crashes,rw_type_1,rw_type_2,rw_type_3,rw_type_4,...,bike_lane_3.0,bike_lane_4.0,bike_lane_5.0,bike_lane_6.0,bike_lane_8.0,bike_lane_9.0,trafdir_FT,trafdir_NV,trafdir_TF,trafdir_TW
791419,322.553731,52.0,25.0,0.0,15.0,0.021127,True,False,False,False,...,False,False,False,False,False,False,False,False,True,False
307584,167.160607,30.0,25.0,0.0,16.0,0.014085,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
403161,241.090685,50.0,25.0,0.0,18.0,0.098592,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
664856,270.99118,30.0,25.0,0.0,13.0,0.028169,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1031546,535.658697,32.0,25.0,0.0,19.0,0.035211,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [131]:
# Separate the target from the one-hot-encoded features.
y = crashes_train_oh["crashes"]
X = crashes_train_oh.drop("crashes", axis="columns")

In [132]:
# (rows, columns) of the one-hot-encoded feature frame.
X.shape

(1161216, 30)

In [133]:
# Check dtypes and non-null counts of the features after filling and encoding.
X.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1161216 entries, 791419 to 121958
Data columns (total 30 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   shape_leng     1161216 non-null  float64
 1   st_width       1161216 non-null  float64
 2   postvz_sl      1161216 non-null  float64
 3   humps          1161216 non-null  float64
 4   hour           1161216 non-null  float64
 5   rw_type_1      1161216 non-null  bool   
 6   rw_type_2      1161216 non-null  bool   
 7   rw_type_3      1161216 non-null  bool   
 8   rw_type_4      1161216 non-null  bool   
 9   rw_type_5      1161216 non-null  bool   
 10  rw_type_6      1161216 non-null  bool   
 11  rw_type_7      1161216 non-null  bool   
 12  rw_type_8      1161216 non-null  bool   
 13  rw_type_9      1161216 non-null  bool   
 14  rw_type_10     1161216 non-null  bool   
 15  rw_type_12     1161216 non-null  bool   
 16  rw_type_13     1161216 non-null  bool   
 17  rw_type_1

In [134]:
# How often each hump count occurs among the training rows.
X["humps"].value_counts()

humps
0.0    1108347
1.0      34128
2.0      15101
3.0       2244
4.0        912
9.0        139
5.0        122
8.0        114
6.0        109
Name: count, dtype: int64

In [135]:
from sklearn.preprocessing import StandardScaler

# Learn per-column mean/scale on X and apply them in a single call.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [136]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression with scikit-learn's default settings.
linear_model = LinearRegression()

In [137]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Scale inside each cross-validation fold: the scaler is fitted on the fold's
# training rows only, so no statistics from the validation rows leak into
# training (X_std was standardised using every row, validation rows included).
scores = cross_val_score(
    make_pipeline(StandardScaler(), linear_model),
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [139]:
# Average negative MSE of the linear model over the folds.
scores.mean()

-0.001303085436349189

## Full model

In [140]:
# Resolve column positions against the feature columns only (target removed):
# the column transformer is applied to frames from which "crashes" has been
# dropped, so positions must refer to that layout.
feature_columns = crashes.columns.drop("crashes")

one_hot_encoding_cols = [
    feature_columns.get_loc(name) for name in ("rw_type", "bike_lane", "trafdir")
]
bike_lane_ix = feature_columns.get_loc("bike_lane")
hour_ix = feature_columns.get_loc("hour")
traf_dir_ix = feature_columns.get_loc("trafdir")
humps_ix = feature_columns.get_loc("humps")

# Known traffic-direction categories, in order of first appearance. NaN is
# removed explicitly: unique() lists values in order of appearance, so slicing
# off the last element is only correct when NaN happens to come last.
trafs = list(crashes.trafdir.dropna().unique())

In [141]:
# Distinct roadway-type codes in ascending order.
types = sorted(crashes.rw_type.unique())

In [142]:
# Known bike-lane codes with NaN removed, in ascending order.
# - dropna(): unique() lists values in order of appearance, so dropping the
#   last element only removes NaN when NaN happens to come last.
# - sorted(): OneHotEncoder rejects unsorted numeric category lists.
bike_lanes = sorted(crashes.bike_lane.dropna().unique())

In [143]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Constant imputers: missing hours become 16, missing hump counts become 0.
hour_imputer = SimpleImputer(
    missing_values=np.nan, strategy="constant", fill_value=16
)
humps_imputer = SimpleImputer(
    missing_values=np.nan, strategy="constant", fill_value=0
)

# One-hot encoders with fixed category lists; values outside the lists
# are ignored rather than raising.
type_and_bike_encoder = OneHotEncoder(
    categories=[types, bike_lanes],
    handle_unknown="ignore",
    sparse_output=False,
)
trafdir_encoder = OneHotEncoder(
    categories=[trafs],
    handle_unknown="ignore",
    sparse_output=False,
)

# Columns are addressed by position; columns not listed are passed through.
col_transf = ColumnTransformer(
    transformers=[
        ("hour_imp", hour_imputer, [hour_ix]),
        ("humps_imp", humps_imputer, [humps_ix]),
        ("one_hot_enc_bike", type_and_bike_encoder, one_hot_encoding_cols[:-1]),
        ("one_hot_enc_traf", trafdir_encoder, one_hot_encoding_cols[-1:]),
    ],
    remainder="passthrough",
)

In [144]:
# Work on an independent copy so `crashes` itself stays untouched.
crashes_cp = crashes.copy(deep=True)

In [145]:
# (rows, columns) of the full dataset copy, target column included.
crashes_cp.shape

(1451521, 9)

In [146]:
# Try the column transformer on the features of the full dataset.
features_all = crashes_cp.drop(columns=["crashes"])
test_df = col_transf.fit_transform(features_all)

In [147]:
# (rows, columns) of the transformed feature matrix.
test_df.shape

(1451521, 30)

In [148]:
# Preprocessing -> standardisation -> linear regression, as one estimator.
# Step names are kept exactly as they are (including the spelling of
# "col_transfomer"), since they identify the steps of the saved pipeline.
pipeline_steps = [
    ("col_transfomer", col_transf),
    ("scaler", StandardScaler()),
    ("lin_model", LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [149]:
# Split the raw `crashes` frame again with the same parameters as the first
# split, so the pipeline receives unprocessed rows and does the imputation
# and encoding itself.
crashes_train, crashes_test = train_test_split(
    crashes, random_state=42, test_size=0.2
)
y = crashes_train["crashes"]
X = crashes_train.drop("crashes", axis="columns")

pipeline.fit(X, y)

In [151]:
# Persist the fitted pipeline (preprocessing and model together).
with open("./models/mwe.sav", mode="wb") as model_file:
    pickle.dump(pipeline, model_file)