# NYC Motor Vehicle Crashes
## Data preprocessing

In [1]:
import re

import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport

from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Display every column when a DataFrame is rendered (None removes the column limit).
pd.set_option("display.max_columns", None)

In [3]:
# Size tag of the extract to load; it is also part of the input file name.
file_size = 10000
filename = f"data/data_{file_size}_out.csv"

# Explicit dtypes for every column read from the CSV.
# Dates, times and zip codes are loaded as plain strings at this stage.
# NOTE: the keys must match the CSV header exactly. The vehicle type columns
# are named "vehicle_type_code_<n>" (with an underscore before the digit).
dtypes = {
    "crash_date": str,
    "crash_time": str,
    "borough": str,
    "zip_code": str,
    "latitude": float,
    "longitude": float,
    "on_street_name": str,
    "off_street_name": str,
    "number_of_persons_injured": int,
    "number_of_persons_killed": int,
    "number_of_pedestrians_injured": int,
    "number_of_pedestrians_killed": int,
    "number_of_cyclist_injured": int,
    "number_of_cyclist_killed": int,
    "number_of_motorist_injured": int,
    "number_of_motorist_killed": int,
    "contributing_factor_vehicle_1": str,
    "contributing_factor_vehicle_2": str,
    "contributing_factor_vehicle_3": str,
    "contributing_factor_vehicle_4": str,
    "contributing_factor_vehicle_5": str,
    "vehicle_type_code_1": str,
    "vehicle_type_code_2": str,
    "vehicle_type_code_3": str,
    "vehicle_type_code_4": str,
    "vehicle_type_code_5": str,
}

nyc_mvc = pd.read_csv(filename, dtype=dtypes)

In [4]:
# Show the dtype pandas assigned to each loaded column.
nyc_mvc.dtypes

crash_date                        object
crash_time                        object
borough                           object
zip_code                          object
latitude                         float64
longitude                        float64
on_street_name                    object
off_street_name                   object
number_of_persons_injured          int64
number_of_persons_killed           int64
number_of_pedestrians_injured      int64
number_of_pedestrians_killed       int64
number_of_cyclist_injured          int64
number_of_cyclist_killed           int64
number_of_motorist_injured         int64
number_of_motorist_killed          int64
contributing_factor_vehicle_1     object
contributing_factor_vehicle_2     object
contributing_factor_vehicle_3     object
contributing_factor_vehicle_4     object
contributing_factor_vehicle_5     object
vehicle_type_code_1               object
vehicle_type_code_2               object
vehicle_type_code_3               object
vehicle_type_cod

### Features slicing

In [5]:
# Parse the date and time strings once; the parsed Series are reused below
# to derive the calendar/clock features without a row-by-row apply.
crash_date = pd.to_datetime(nyc_mvc["crash_date"], format="%Y-%m-%d")
crash_time = pd.to_datetime(nyc_mvc["crash_time"], format="%H:%M:%S")

# Keep the same column contents as before: a datetime column and a column of
# datetime.time objects.
nyc_mvc["crash_date"] = crash_date
nyc_mvc["crash_time"] = crash_time.dt.time

# Date -> year / month / day / day_of_week / week, Time -> hour / minute.
# day_of_week uses .dt.dayofweek (Monday=0 ... Sunday=6); it previously held
# the day of the month, which made it a duplicate of "day".
# The int64 cast keeps the dtype the row-wise version produced; it raises if
# any date or time is missing.
date_time_parts = pd.DataFrame(
    {
        "year": crash_date.dt.year,
        "month": crash_date.dt.month,
        "day": crash_date.dt.day,
        "day_of_week": crash_date.dt.dayofweek,
        "week": crash_date.dt.isocalendar().week,
        "hour": crash_time.dt.hour,
        "minute": crash_time.dt.minute,
    }
).astype("int64")

nyc_mvc = pd.concat([nyc_mvc, date_time_parts], axis=1)

In [6]:
# Binary casualty indicators: for each category, 1.0 when the matching
# "number_of_<category>" count is greater than zero, 0.0 otherwise.
# The order below is the order in which the new columns are appended.
casualty_flags = [
    "persons_killed",
    "persons_injured",
    "pedestrians_killed",
    "pedestrians_injured",
    "cyclist_killed",
    "cyclist_injured",
    "motorist_killed",
    "motorist_injured",
]

for flag in casualty_flags:
    # Direct boolean comparison instead of ".loc[...] = 1" followed by a chained
    # in-place fillna: the column is always created, even when no row matches,
    # and no in-place update through attribute access is needed.
    # Cast to float to keep the dtype the flags had before.
    nyc_mvc[flag] = (nyc_mvc[f"number_of_{flag}"] > 0).astype(float)

### Dummies

In [7]:
# Categorical columns to one-hot encode.
categorical_columns = (
    ["borough"]
    + [f"contributing_factor_vehicle_{i}" for i in range(1, 6)]
    + [f"vehicle_type_code_{i}" for i in range(1, 6)]
)

# With an empty prefix the indicator columns are named after the category value
# alone, so the same value coming from different source columns yields columns
# with the same name.
indicators = pd.get_dummies(
    nyc_mvc[categorical_columns], prefix="", prefix_sep="", dtype="uint8"
)

# Collapse same-named indicator columns into one by taking the element-wise max
# (1 if the value appears in any of the source columns).
# DataFrame.max(level=...) is no longer available in pandas 2.x, so group the
# transposed frame by label instead; sort=False keeps first-appearance order.
dummies = indicators.T.groupby(level=0, sort=False).max().T

nyc_mvc = pd.concat([nyc_mvc, dummies], axis=1)


## Splitting

In [8]:
# Drop the columns that are not kept as features, grouped by kind.

# Categorical columns that were expanded into dummy columns.
encoded_sources = (
    ["borough"]
    + [f"contributing_factor_vehicle_{n}" for n in range(1, 6)]
    + [f"vehicle_type_code_{n}" for n in range(1, 6)]
)

# Dummy columns named "Unknown" / "Unspecified"
# (presumably placeholder category values - confirm against the data).
placeholder_dummies = ["Unknown", "Unspecified"]

# Raw casualty counts.
raw_counts = [
    f"number_of_{group}_{outcome}"
    for group in ("persons", "pedestrians", "cyclist", "motorist")
    for outcome in ("injured", "killed")
]

# Original date and time columns.
raw_timestamps = ["crash_date", "crash_time"]

nyc_mvc.drop(
    columns=encoded_sources + placeholder_dummies + raw_counts + raw_timestamps,
    inplace=True,
)

In [9]:
# Target: the persons_killed flag as integers.
target = nyc_mvc["persons_killed"].astype(int)

# Remove every casualty flag (the target included) from the feature frame.
outcome_flags = [
    f"{group}_{outcome}"
    for group in ("persons", "pedestrians", "cyclist", "motorist")
    for outcome in ("killed", "injured")
]
nyc_mvc.drop(columns=outcome_flags, inplace=True)

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    nyc_mvc,
    target,
    test_size=0.2,
    random_state=41,
)

# Report the shape of each split, one per line.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(7204, 79)
(1802, 79)
(7204,)
(1802,)


In [11]:
# Build a profiling report of the feature frame and write it out as HTML.
report_path = f"profiling/profile_report_{file_size}_final.html"
prof = ProfileReport(nyc_mvc)
prof.to_file(output_file=report_path)

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=93.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Feature standardization

In [12]:
# Columns to standardize; every other column is passed through unchanged.
# NOTE(review): zip_code is loaded with a str dtype - confirm the scaler's
# implicit numeric conversion is intended for it.
scaled_columns = [
    "zip_code", "latitude", "longitude",
    "year", "month", "day", "week", "day_of_week",
    "hour", "minute",
]

ct = ColumnTransformer(
    [("std1", StandardScaler(), scaled_columns)],
    remainder="passthrough",
)

# Fit on the training set only, then apply the same mean / standard deviation
# to the test set.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [13]:
# Display the transformed training array returned by the ColumnTransformer.
X_train

array([[0.7654518286896713, -1.303392249981017, -1.3056336332641936, ...,
        0, 0, 0],
       [0.7664235155858917, -0.5999027815107898, 0.06629377487406665,
        ..., 0, 0, 0],
       [0.6156177092924978, 1.3715817215708983, 0.04878058425554455, ...,
        0, 0, 0],
       ...,
       [0.8056796661931926, -0.4522891597562777, 1.3946936071627607, ...,
        0, 0, 0],
       [-1.4167626028419253, -0.3885822282622054, -1.0299833100918168,
        ..., 0, 0, 0],
       [-1.4167626028419253, -0.014109777284855851, 1.2292182796935591,
        ..., 0, 0, 0]], dtype=object)

In [14]:
# Display the transformed test array returned by the ColumnTransformer.
X_test

array([[0.721920255739001, 0.1459344652489398, 2.565693638767626, ..., 0,
        0, 0],
       [0.5324413109760384, 0.7948009174099853, -0.37530619134608373,
        ..., 0, 0, 0],
       [-1.4167626028419253, 1.3619838480268862, 0.5019343915091126, ...,
        0, 0, 0],
       ...,
       [-1.4167626028419253, 0.5028606168954345, 1.4753272556354244, ...,
        0, 0, 0],
       [0.7936307486800607, 0.22012375676561333, 0.3619504859404423, ...,
        0, 0, 0],
       [0.7637027922764748, -1.1784645111487462, -1.1098872422884782,
        ..., 0, 0, 0]], dtype=object)