# 1. Initialization

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

# Apply the FiveThirtyEight matplotlib style to the figures drawn in this notebook
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [2]:
# Load the cleaned crash dataset. keep_default_na=False disables pandas'
# built-in NaN markers, so values such as '' or 'NA' stay as literal strings.
crash_data_clean = pd.read_csv(
    'Crash_Analysis_System_CAS_data_clean.csv',
    keep_default_na=False,
)

# 2. Preprocessing

Talk about why no feature scaling, transformation, or normalization is needed.
Explain why one-hot encoding is used rather than label encoding, why ordinal labels are not appropriate, and why the categorical features need to be binarized.

Explore and choose the right metric

In [3]:
def parse_type(dtype):
    """Translate a dtype label from the features catalog into a dtype for pandas.

    Parameters
    ----------
    dtype : str
        Label to translate. Only 'int' and 'float' are mapped.

    Returns
    -------
    type or str
        ``np.int8`` for 'int', ``np.float64`` for 'float'; any other value is
        returned unchanged.
    """
    if dtype == 'int':
        # NOTE(review): int8 only holds -128..127 — confirm every 'int' column fits.
        return np.int8
    if dtype == 'float':
        # The np.float alias (plain builtin float) was removed in NumPy 1.24;
        # np.float64 yields the same float64 columns and works on every version.
        return np.float64
    return dtype

# Load the tab-separated table that describes every feature
features_catalog = pd.read_table('features_description.tsv')
# Build a {feature name: dtype} mapping suitable for DataFrame.astype
features_dtypes = (
    features_catalog
    .set_index('feature_name')['pandas_dtype']
    .apply(parse_type)
    .to_dict()
)
# Drop entries for columns that are no longer present in the clean dataframe
features_dtypes = {
    name: dtype
    for name, dtype in features_dtypes.items()
    if name in crash_data_clean.columns
}

In [4]:
# Cast every column to the dtype declared for it in the features catalog.
# copy=False asks pandas not to copy data when it does not have to.
crash_data_clean = crash_data_clean.astype(
    features_dtypes,
    copy=False,
)

In [5]:
# Replace the -1 sentinel in speedLimit with 999; all other values are kept.
# NOTE(review): presumably -1 marks an unknown limit and the swap keeps the
# column non-negative (MultinomialNB rejects negative features) — confirm.
crash_data_clean['speedLimit'] = crash_data_clean['speedLimit'].apply(
    lambda limit: limit if limit != -1 else 999
)

In [6]:
# Names of every feature the catalog marks as categorical
categorical_features = features_catalog.loc[
    features_catalog['feature_type'] == 'categorical', 'feature_name'
].tolist()
# crashSeverity is the prediction target, so it must not be one-hot encoded
categorical_features.remove('crashSeverity')
# One-hot encode the remaining categorical columns
crash_data_ohe = pd.get_dummies(
    crash_data_clean,
    columns=categorical_features,
)

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, make_scorer

def print_results(true, pred, betta=1):
    """Print accuracy and weighted precision, recall, F1 and F-beta scores.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``true``.
    betta : float, default 1
        Beta used for the F-beta score; values below 1 weight precision more
        heavily than recall.

    Returns
    -------
    None
        The scores are written to stdout only.
    """
    print('Accuracy score: ', format(accuracy_score(true, pred)))
    # These three metrics share the same call shape: weighted average across classes.
    for label, metric in (('Precision', precision_score),
                          ('Recall', recall_score),
                          ('F1', f1_score)):
        print('%s score: ' % label, format(metric(true, pred, average='weighted')))
    # `beta` is keyword-only in current scikit-learn; passing it positionally
    # raises a TypeError there.
    print('F betta score with betta=%.2f: ' % betta,
          format(fbeta_score(true, pred, beta=betta, average='weighted')))

In [8]:
# Feature matrix: every column except the target
X = crash_data_ohe.drop(columns='crashSeverity')
# Target vector: the crash severity label
y = crash_data_ohe['crashSeverity']

In [9]:
# Split into train and test sets; the fixed seed makes the split repeatable.
# NOTE(review): test_size=0.6 holds out 60% of the rows and trains on only
# 40% — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.6,
    random_state=42,
)

# 3. Benchmark

<span style="color:red">USE K-FOLD</span>

also try:

* Naive Bayes
* sklearn.ensemble.GradientBoostingClassifier
* XGBoost
* LGBM
* Random Forest

In [11]:
# Baseline: multinomial Naive Bayes with default hyper-parameters
clf_multi = MultinomialNB()

# fit() returns the estimator itself, so training and prediction can be chained
predictions = clf_multi.fit(X_train, y_train).predict(X_test)
# betta=0.5 makes the F-beta score favour precision over recall
print_results(y_test, predictions, betta=0.5)

Accuracy score:  0.8777929097331574
Precision score:  0.9670136987250405
Recall score:  0.8777929097331574
F1 score:  0.9157455566035368
F betta score with betta=0.50:  0.9455543788431858


One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, each word is treated as a feature and there are thousands of different words. Also, it performs well even in the presence of irrelevant features and is relatively unaffected by them. The other major advantage is its relative simplicity. Naive Bayes works well right out of the box and tuning its parameters is rarely ever necessary, except usually in cases where the distribution of the data is known.
It rarely ever overfits the data. Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm!

Congratulations! You have successfully designed a model that can efficiently predict the severity of a crash!

Thank you for learning with us!

HOW TO DO ROC/AUC?
HOW TO IMPROVE THESE METRICS?

Feature selection is not a must-do task for RF algorithm

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters
clf = RandomForestClassifier(
    n_jobs=-1,        # train the trees on all available CPU cores
    random_state=42,  # fixed seed so the results are repeatable
)

In [13]:
# Train the forest, then predict the test set (fit() returns the estimator itself)
predictions = clf.fit(X_train, y_train).predict(X_test)

In [14]:
# Mean accuracy of the fitted random forest on the held-out test set
clf.score(X_test, y_test)

0.99769917436268463

In [15]:
# Full metric report for the random forest predictions (betta defaults to 1)
print_results(y_test, predictions)

Accuracy score:  0.9976991743626846
Precision score:  0.9976951594931124
Recall score:  0.9976991743626846
F1 score:  0.9976779446393178
F betta score with betta=1.00:  0.9976779446393178


In [None]:
# Hyper-parameter grid explored for the random forest
parameters = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini'],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [1000, 5000, 10000]
}

# Prioritize precision over recall: weighted F-beta with beta=0.5
scorer = make_scorer(fbeta_score, beta=.5, average='weighted')
# `scoring` is keyword-only in current scikit-learn; passing the scorer as the
# third positional argument raises a TypeError there.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, verbose=2)
# Run the search on the training split only
grid_fit = grid_obj.fit(X_train, y_train)

In [None]:
# Best estimator found by the grid search
best_clf = grid_fit.best_estimator_

# Refit the untuned forest and predict the test set with both models
predictions = clf.fit(X_train, y_train).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the scores before and after tuning
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(
    accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(
    fbeta_score(y_test, predictions, beta=0.5, average='weighted')))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(
    accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(
    fbeta_score(y_test, best_predictions, beta=0.5, average='weighted')))

In [None]:
# Feature importances of the estimator selected by the grid search
best_clf.feature_importances_

In [None]:
# Feature importances of the untuned random forest, for comparison
clf.feature_importances_