# __Model Building and Evaluation__

### Phase 3 Project - Chicago Traffic Crash Classification

#### Author: Ian Sharff

> This notebook will most likely take up to two hours to run. It fits each model and adds the result to `fitted_models/` as a PKL file to be used in the main notebook, `chicago_crashes.ipynb`.

In [None]:
# Data manipulation and storage
import pandas as pd
import numpy as np
import pickle
import os

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('images/presentation.mplstyle')

# Preprocessing and Pipelines
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_union

# Models using SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Models employed
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Model validation and metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# Custom model evaluation and fitting classes
from models import ModelEvaluator, TrainTestSplit

In [None]:
# Feature groups consumed by the preprocessing steps further down.
# Edit these lists to add or remove model features.
BIN_FIELDS = [
    'INTERSECTION_RELATED_I',
    'HIT_AND_RUN_I',
    'WORK_ZONE_I',
]

CAT_FIELDS = [
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'FIRST_CRASH_TYPE',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    'CRASH_TYPE',
    'DAMAGE',
    'MOST_SEVERE_INJURY',
    'CRASH_HOUR',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
    'CRASH_YEAR',
]

NUM_FIELDS = [
    'POSTED_SPEED_LIMIT',
    'NUM_UNITS',
    'INJURIES_TOTAL',
    'INJURIES_FATAL',
]

# Load the pickled lookup that is later passed to Series.map() to bin the
# PRIM_CONTRIBUTORY_CAUSE target values.
with open('data/binned_causes.pkl', 'rb') as causes_file:
    BINNED_CAUSES = pickle.load(causes_file)


In [None]:
# Load the prepared crash data from its pickle and print pandas' summary
# of it (columns, dtypes, non-null counts) as a quick sanity check.
crashes = pd.read_pickle('data/crashes.pkl')
crashes.info()

In [None]:
# Target: the primary contributory cause, mapped through BINNED_CAUSES.
y = crashes['PRIM_CONTRIBUTORY_CAUSE'].map(BINNED_CAUSES)

# Predictors: every remaining column.
X = crashes.drop(columns='PRIM_CONTRIBUTORY_CAUSE')

# TrainTestSplit is the project helper from models.py; the attributes used
# below (y_train, y_test) come from it.
splits = TrainTestSplit(X, y)

In [None]:
# Relative class frequencies of the target in the training split, a blank
# line, then the same for the test split.
print(
    splits.y_train.value_counts(normalize=True),
    splits.y_test.value_counts(normalize=True),
    sep='\n\n',
)

In [None]:
# Baseline: the 'stratified' dummy strategy predicts labels at random
# according to the class distribution seen in training.
dummy = ModelEvaluator(splits,
                       DummyClassifier(strategy='stratified'),
                       'Dummy Classifier')

dummy.run_model()

In [None]:
# Presumably reports train and test classification metrics for the baseline;
# the method is defined on ModelEvaluator in models.py (not shown) — confirm there.
dummy.train_test_classification_reports()

In [None]:
# Presumably displays confusion matrices for the baseline; implemented by
# ModelEvaluator in models.py (not shown) — confirm there.
dummy.confusion_matrices()

In [None]:
# Save the fitted baseline under fitted_models/ for use by the main notebook.
dummy.pickle_fitted_model('fitted_models/dummy_classifier.pkl')

## Decision Tree Classifier

In [None]:
# Per-group imputation: missing binary flags become False; categorical and
# numeric gaps are filled with each column's most frequent value.
bin_imputer = SimpleImputer(strategy='constant', fill_value=False)
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='most_frequent')

# Only the listed fields are passed on; the output columns follow the order
# binary, categorical, numeric.
imputers = ColumnTransformer(transformers=[
    ('binary', bin_imputer, BIN_FIELDS),
    ('categorical', cat_imputer, CAT_FIELDS),
    ('numeric', num_imputer, NUM_FIELDS),
])

# Categories not seen during fit are ignored at transform time, not raised.
# NOTE(review): the encoder receives every imputed column, so NUM_FIELDS are
# one-hot encoded as categories too — confirm this is intended.
ohe = OneHotEncoder(handle_unknown='ignore')

# Tree depth is capped at 4.
decisiontree_model = DecisionTreeClassifier(max_depth=4)

decision_tree_pipeline = Pipeline(steps=[
    ('imputers', imputers),
    ('ohe', ohe),
    ('model', decisiontree_model),
])

In [None]:
# Wrap the plain decision-tree pipeline in the project's evaluator helper.
decision_tree = ModelEvaluator(splits,
                               decision_tree_pipeline,
                               'Simple Decision Tree')

In [None]:
# Run the evaluator for the simple decision tree (presumably fits the
# pipeline on the training split — see ModelEvaluator in models.py).
decision_tree.run_model()

In [None]:
# Presumably reports train and test classification metrics for the simple
# tree; defined on ModelEvaluator in models.py (not shown) — confirm there.
decision_tree.train_test_classification_reports()

In [None]:
# Presumably displays confusion matrices for the simple tree; implemented by
# ModelEvaluator in models.py (not shown) — confirm there.
decision_tree.confusion_matrices()

In [None]:
# Save the fitted simple tree under fitted_models/ for use by the main notebook.
decision_tree.pickle_fitted_model('fitted_models/decisiontree_simple.pkl')

In [None]:
# Read the simple-tree pickle back in to check that it loads.
with open('fitted_models/decisiontree_simple.pkl', 'rb') as model_file:
    decision_tree_fitted = pickle.load(model_file)

In [None]:
# Print the reloaded object to inspect what was stored in the pickle.
print(decision_tree_fitted)

## Simple Decision Tree Classifier with SMOTE

In [None]:
# Imported here so the cell is self-contained.
from sklearn.base import clone

smote = SMOTE()

# Pipeline.fit() fits its steps in place. Passing the same `imputers`, `ohe`
# and `decisiontree_model` instances that already sit inside
# `decision_tree_pipeline` would therefore refit that pipeline's tree on the
# SMOTE-resampled data as soon as this pipeline is fitted. clone() gives
# unfitted copies with identical parameters, so the two pipelines stay
# independent.
smote_pipeline = ImbPipeline(steps=[
    ('imputers', clone(imputers)),
    ('encoder', clone(ohe)),
    # Oversampling sits after encoding and before the model.
    ('smote', smote),
    ('model', clone(decisiontree_model))
])

In [None]:
# Wrap the SMOTE + decision-tree pipeline in the project's evaluator helper.
decision_tree_smote = ModelEvaluator(splits,
                                     smote_pipeline,
                                     'SMOTE Decision Tree')

In [None]:
# Run the evaluator for the SMOTE decision tree (presumably fits the
# pipeline on the training split — see ModelEvaluator in models.py).
decision_tree_smote.run_model()

In [None]:
# Presumably reports train and test classification metrics for the SMOTE
# tree; defined on ModelEvaluator in models.py (not shown) — confirm there.
decision_tree_smote.train_test_classification_reports()

In [None]:
# Presumably displays confusion matrices for the SMOTE tree; implemented by
# ModelEvaluator in models.py (not shown) — confirm there.
decision_tree_smote.confusion_matrices()

In [None]:
# Save the fitted SMOTE tree under fitted_models/ for use by the main notebook.
decision_tree_smote.pickle_fitted_model('fitted_models/decisiontree_smote.pkl')

## Gradient Boosting Classifier

In [None]:
# Boosted-tree classifier: 100 estimators with a learning rate of 0.3.
gboost_model = GradientBoostingClassifier(learning_rate=0.3, n_estimators=100)

# A new oversampler instance for this pipeline.
smote = SMOTE()

# Same preprocessing as the tree pipelines: impute, one-hot encode, then
# oversample before the model.
gradient_boost_pipeline = ImbPipeline(steps=[
    ('imputers', imputers),
    ('encoder', ohe),
    ('smote', smote),
    ('model', gboost_model),
])

In [None]:
# Wrap the SMOTE + gradient-boosting pipeline in the project's evaluator helper.
gradient_boost = ModelEvaluator(splits,
                                gradient_boost_pipeline,
                                "Gradient Boosting with SMOTE")

In [None]:
# Run the evaluator for the gradient-boosting pipeline (presumably fits it
# on the training split — see ModelEvaluator in models.py).
gradient_boost.run_model()

In [None]:
# Presumably reports train and test classification metrics for gradient
# boosting; defined on ModelEvaluator in models.py (not shown) — confirm there.
gradient_boost.train_test_classification_reports()

In [None]:
# Presumably displays confusion matrices for gradient boosting; implemented
# by ModelEvaluator in models.py (not shown) — confirm there.
gradient_boost.confusion_matrices()

In [None]:
# Save the fitted gradient-boosting model under fitted_models/ for use by the main notebook.
gradient_boost.pickle_fitted_model('fitted_models/gradient_boost_smote.pkl')

In [None]:
# Get most important features
# NOTE(review): `_fitted_model` is a private attribute of ModelEvaluator;
# switch to a public accessor if models.py provides one.
gboost_steps = gradient_boost._fitted_model.named_steps

feat_importances = gboost_steps['model'].feature_importances_

# The encoder is fitted on the ColumnTransformer output, whose columns follow
# the transformer order: binary, then categorical, then numeric fields.
encoder = gboost_steps['encoder']
input_features = BIN_FIELDS + CAT_FIELDS + NUM_FIELDS

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back only on
# older installations.
if hasattr(encoder, 'get_feature_names_out'):
    feat_names = encoder.get_feature_names_out(input_features)
else:
    feat_names = encoder.get_feature_names(input_features)

# Five encoded features with the highest importance, largest first.
pd.Series(feat_importances, index=feat_names).sort_values(ascending=False).head(5)