# 1. Initialization

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Load the cleaned crash dataset. keep_default_na=False stops pandas from
# converting its default NA marker strings (e.g. "NA", "None", "") into NaN,
# so such values are kept as literal strings.
crash_data_clean = pd.read_csv('Crash_Analysis_System_CAS_data_clean.csv', keep_default_na=False)

# 2. Preprocessing

Discuss why feature scaling, transformation, and normalization are not needed here.
Explain why one-hot encoding is used rather than label encoding: the categorical features have no ordinal meaning, so they need to be binarized.

Explore and choose the right metric

In [3]:
def parse_type(dtype):
    """Translate a dtype label from the features catalogue into a pandas dtype.

    Parameters
    ----------
    dtype : str
        Label read from the ``pandas_dtype`` column of the catalogue.

    Returns
    -------
    type or str
        ``np.int8`` for ``'int'``, the builtin ``float`` for ``'float'``;
        any other label is returned unchanged so pandas can interpret it.
    """
    if dtype == 'int':
        # NOTE(review): int8 only holds -128..127 - confirm every column
        # tagged 'int' in the catalogue fits that range.
        return np.int8
    if dtype == 'float':
        # `np.float` was only an alias of the builtin `float`; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
        return float
    return dtype

# Load the tab-separated catalogue that describes every feature
features_catalog = pd.read_table('features_description.tsv')
# Build the {column name: dtype} mapping for DataFrame.astype, keeping only
# the features that are still present in the cleaned dataframe
features_dtypes = {
    feature: parse_type(declared_dtype)
    for feature, declared_dtype
    in features_catalog.set_index('feature_name')['pandas_dtype'].items()
    if feature in crash_data_clean.columns
}

In [4]:
# Cast each column to the dtype declared in the features catalogue.
# copy=False lets pandas skip copying columns whose dtype already matches.
crash_data_clean = crash_data_clean.astype(features_dtypes, copy=False)

In [5]:
# Replace the -1 value in speedLimit with 999, leaving every other value as is.
# NOTE(review): presumably -1 encodes an unknown speed limit and is recoded
# so the feature is non-negative - TODO confirm against the cleaning notebook.
crash_data_clean['speedLimit'] = crash_data_clean['speedLimit'].apply(lambda x: 999 if x == -1 else x)

In [6]:
# Names of every feature the catalogue marks as categorical
categorical_features = features_catalog.loc[
    features_catalog['feature_type'] == 'categorical', 'feature_name'
].tolist()
# crashSeverity is the prediction target, so it must not be one-hot encoded
categorical_features.remove('crashSeverity')
# Expand each remaining categorical column into one indicator column per level
crash_data_ohe = pd.get_dummies(crash_data_clean, columns=categorical_features)

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             fbeta_score, make_scorer, classification_report)

def print_results(true, pred, betta=1):
    """Print weighted classification metrics followed by a per-class report.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    pred : array-like
        Predicted class labels, aligned with ``true``.
    betta : float, default 1
        Beta of the F-beta score: values below 1 weight precision more,
        values above 1 weight recall more.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('Accuracy score: ', format(accuracy_score(true, pred)))
    print('Precision score: ', format(precision_score(true, pred, average='weighted')))
    print('Recall score: ', format(recall_score(true, pred, average='weighted')))
    print('F1 score: ', format(f1_score(true, pred, average='weighted')))
    # `beta` is keyword-only in current scikit-learn; passing it by name works
    # on old and new versions alike.
    print('F betta score with betta=%.2f: ' % betta,
          format(fbeta_score(true, pred, beta=betta, average='weighted')))
    # Report on the arguments that were passed in, not on the notebook globals
    # `y_test` / `predictions`.
    print('\n', classification_report(true, pred))

In [21]:
# Remove the three casualty-count columns from the encoded frame (in place).
# NOTE(review): presumably dropped because they would leak crashSeverity - confirm.
crash_data_ohe.drop(
    columns=['fatalCount', 'seriousInjuryCount', 'minorInjuryCount'],
    inplace=True,
)

In [22]:
# Feature matrix: every column except the target
X = crash_data_ohe.drop(columns=['crashSeverity'])
# Target vector: the crash severity label
y = crash_data_ohe.loc[:, 'crashSeverity']

In [28]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. No `stratify` argument is passed, so class
# proportions in the two parts are left to chance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. Benchmark

<span style="color:red">USE K-FOLD</span>

also try:

* Naive Bayes
* sklearn.ensemble.GradientBoostingClassifier
* XGBoost
* LGBM
* Random Forest

In [41]:
# Baseline: multinomial Naive Bayes with default hyper-parameters
# (fit returns the estimator itself, so construction and training chain).
clf_multi = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out rows
predictions = clf_multi.predict(X_test)

In [50]:
# Score the Naive Bayes predictions; betta=1.5 makes the F-beta score weight
# recall more heavily than precision.
print_results(y_test, predictions, betta=1.5)

Accuracy score:  0.6308358513852832
Precision score:  0.6907373501708025
Recall score:  0.6308358513852832
F1 score:  0.6425445046329308
F betta score with betta=1.50:  0.634944596397731

              precision    recall  f1-score   support

          F       0.03      0.49      0.06      1728
          M       0.53      0.20      0.30     42522
          N       0.79      0.81      0.80    133170
          S       0.14      0.13      0.14     10556

avg / total       0.69      0.63      0.64    187976



One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, every one-hot-encoded category level becomes its own feature, so the feature matrix is wide. Also, it performs well even in the presence of irrelevant features and is relatively unaffected by them. The other major advantage it has is its relative simplicity. Naive Bayes works well right out of the box and tuning its parameters is rarely ever necessary, except usually in cases where the distribution of the data is known.
It rarely ever overfits the data. Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm!

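TODO: rewrite this conclusion. The original text ("Congratulations! You have successfully designed a model that can efficiently predict if an SMS message is spam or not!") was copied from an SMS spam tutorial and does not describe the crash-severity model built in this notebook.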

Thank you for learning with us!

HOW TO DO ROC/AUC?
HOW TO IMPROVE THESE METRICS?

Feature selection is not a mandatory step for the Random Forest algorithm.

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random forest with default hyper-parameters, using every CPU core;
# the fixed seed keeps the fitted trees reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, y_train)

# Overwrites the earlier `predictions` with the forest's test-set labels
predictions = clf.predict(X_test)

In [39]:
# Score the random-forest predictions; betta=.5 makes the F-beta score weight
# precision more heavily than recall.
print_results(y_test, predictions, betta=.5)

Accuracy score:  0.7150327701408691
Precision score:  0.6700715007746912
Recall score:  0.7150327701408691
F1 score:  0.6847577049402805
F betta score with betta=0.50:  0.6731883705029672

              precision    recall  f1-score   support

          F       0.12      0.03      0.05      1728
          M       0.45      0.33      0.38     42522
          N       0.78      0.90      0.84    133170
          S       0.26      0.08      0.13     10556

avg / total       0.67      0.72      0.68    187976



In [52]:
# Hyper-parameter grid for the random forest: 4 * 1 * 4 * 3 = 48 candidates
parameters = {
    'n_estimators': [10, 50, 100, 150],
    'criterion': ['gini'],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [1000, 5000, 10000]
}

# Prioritize Precision over Recall
scorer = make_scorer(fbeta_score, beta=.5, average='weighted')
# `scoring` is keyword-only in current scikit-learn, so pass it by name
# (also accepted by older versions). GridSearchCV clones `clf`, so the
# already-fitted forest is only used as a template.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, verbose=3)
grid_fit = grid_obj.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10, score=0.5870419102299391, total=   3.0s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10, score=0.5985292727171957, total=   2.9s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.0s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10, score=0.6013075486673535, total=   2.7s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50, score=0.536305891666105, total=   5.1s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50, score=0.5363126745792993, total=   5.6s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50, score=0.5363234816116889, total=   4.8s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100, score=0.536305891666105, total=   7.2s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100, score=0.5363126745792993, total=   8.0s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100, score=0.5363234816116889, total=   8.0s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=150, score=0.536305891666105, total=  10.8s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=150, score=0.5363126745792993, total=  12.5s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=150, score=0.5363234816116889, total=  10.8s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10, score=0.5823666025225265, total=   2.7s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10, score=0.5985066780414395, total=   2.9s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10, score=0.6018394020076566, total=   2.8s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50, score=0.536305891666105, total=   4.9s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50, score=0.5363126745792993, total=   5.0s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50, score=0.5363234816116889, total=   5.4s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100, score=0.536305891666105, total=   7.6s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100, score=0.5363126745792993, total=   7.2s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100, score=0.5363234816116889, total=   7.4s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=150, score=0.536305891666105, total=  11.8s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=150, score=0.5363126745792993, total=  10.8s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=150, score=0.5363234816116889, total=  11.3s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10, score=0.5980365438289004, total=   3.0s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10, score=0.6018407902849742, total=   2.9s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10, score=0.6045408305381198, total=   3.1s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50, score=0.536305891666105, total=   6.4s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50, score=0.5363126745792993, total=   5.0s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50, score=0.5363234816116889, total=   5.0s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100, score=0.536305891666105, total=   9.2s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100, score=0.5363126745792993, total=   8.4s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100, score=0.5363234816116889, total=   8.7s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=150, score=0.536305891666105, total=  12.3s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=150, score=0.5363126745792993, total=  14.5s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=150, score=0.5363234816116889, total=  11.8s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10, score=0.6044715980506414, total=   3.2s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10, score=0.603785175928484, total=   3.1s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10, score=0.605516591564175, total=   3.3s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50, score=0.5363121210808818, total=   6.6s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50, score=0.536430782490342, total=   6.5s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50, score=0.537698597210614, total=   6.8s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100, score=0.5363090063554042, total=  11.9s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100, score=0.5365746768322709, total=  10.5s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100, score=0.5374324044691504, total=  10.6s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=150, score=0.5363836079275043, total=  13.8s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=150, score=0.5363872850025695, total=  14.3s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=150, score=0.5369055272134742, total=  14.7s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10, score=0.6046106598116089, total=   3.4s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10, score=0.5999942400487784, total=   3.0s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10, score=0.6062959846484067, total=   3.0s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50, score=0.5365450715322001, total=   6.9s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50, score=0.5366139988021746, total=   6.9s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50, score=0.5371972609994188, total=   6.2s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100, score=0.5370726304638114, total=  10.0s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100, score=0.5374308203390076, total=  10.6s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100, score=0.5372813047184078, total=  11.0s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=150, score=0.5368055685123198, total=  15.2s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=150, score=0.5367266686336892, total=  15.2s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=150, score=0.5368302588941255, total=  14.0s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10, score=0.590353059433513, total=   3.3s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10, score=0.6037399613197448, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10, score=0.6059855685588343, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50, score=0.5364271009564905, total=   6.5s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50, score=0.5371385025389865, total=   7.1s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50, score=0.5367838881306409, total=   6.4s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100, score=0.5366226224874057, total=   9.3s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100, score=0.537477635332212, total=  10.3s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100, score=0.5367817926187233, total=  10.7s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=150, score=0.5365047303785907, total=  14.5s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=150, score=0.5367328076457396, total=  14.6s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=150, score=0.5365161379744409, total=  13.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10, score=0.604947024285798, total=   4.1s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10, score=0.6080398713573341, total=   3.0s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10, score=0.608252181710733, total=   3.1s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50, score=0.5399813342220078, total=   6.7s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50, score=0.5566612050741976, total=   7.6s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50, score=0.5766263822451644, total=   7.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100, score=0.5412536385521726, total=  12.5s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100, score=0.5727226781108177, total=  12.6s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100, score=0.5717077391464297, total=  13.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=150, score=0.5504884985311462, total=  18.1s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=150, score=0.5514556294228835, total=  16.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=150, score=0.5594703803221073, total=  18.3s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10, score=0.6070653029330222, total=   3.8s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10, score=0.6120189985886758, total=   3.5s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10, score=0.6142404161740469, total=   3.6s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50, score=0.5414539769530603, total=   6.8s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50, score=0.5499426468312056, total=   8.7s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50, score=0.5683795546640807, total=   7.9s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100, score=0.5830143612908906, total=  13.0s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100, score=0.5471702920452145, total=  12.4s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100, score=0.5928721198798805, total=  13.5s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=150, score=0.5534889802504288, total=  17.5s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=150, score=0.540073080498898, total=  16.8s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=150, score=0.5649914691602415, total=  19.9s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10, score=0.6069616957649365, total=   3.6s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10, score=0.6059156096587774, total=   3.2s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10, score=0.6078586237027079, total=   3.2s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50, score=0.5438191846997584, total=   7.3s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50, score=0.5471140316845812, total=   7.5s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50, score=0.5746330285583018, total=   7.8s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100, score=0.549574973930051, total=  13.2s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100, score=0.5460735642172633, total=  12.3s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100, score=0.5963565945762902, total=  11.6s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=150, score=0.5424621032293642, total=  18.0s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=150, score=0.5398342078869889, total=  16.3s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=150, score=0.5456867537108675, total=  16.3s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10, score=0.6345738619507261, total=   3.6s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10, score=0.6376832248353876, total=   3.6s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10, score=0.6347928088796322, total=   3.3s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50, score=0.6358316716944526, total=   9.7s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50, score=0.6299898902424231, total=   8.4s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50, score=0.629304448058678, total=   8.5s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=100, score=0.6373288783942658, total=  14.1s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=100, score=0.6238383944705193, total=  15.6s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=100, score=0.6358991487653194, total=  14.8s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=150, score=0.6416326820049789, total=  20.3s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=150, score=0.6336052528183372, total=  22.5s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=150, score=0.6272344100622207, total=  22.6s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10, score=0.6380299796296747, total=   3.8s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10, score=0.6334748944896362, total=   3.8s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10, score=0.6307388508730661, total=   3.6s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50, score=0.6370287120784448, total=   8.1s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50, score=0.6224675630880077, total=   8.7s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50, score=0.632308862430298, total=   8.2s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100, score=0.6360675372023298, total=  13.9s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100, score=0.6095512486198078, total=  15.1s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100, score=0.6272404954581746, total=  14.7s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=150, score=0.6352287810800203, total=  20.7s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=150, score=0.6200905594924585, total=  19.1s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=150, score=0.6264341862542823, total=  21.9s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10, score=0.6257116451191219, total=   4.4s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10, score=0.6302424003204636, total=   3.2s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10, score=0.6120126738118395, total=   3.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50, score=0.6366525227626026, total=   8.0s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50, score=0.6244502509145442, total=   8.2s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50, score=0.6095384487086449, total=   8.4s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100, score=0.6394633859607232, total=  13.6s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100, score=0.6048969054097578, total=  14.3s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100, score=0.6103457644348496, total=  14.0s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=150, score=0.640796907957643, total=  19.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=150, score=0.6074556019438168, total=  20.6s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=150 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=150, score=0.6184576679318341, total=  20.2s


[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 28.6min finished


In [54]:
# Take the refit estimator exposed by the search object and score it on the
# held-out test split (grid_fit is defined in an earlier cell).
best_clf = grid_fit.best_estimator_
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores: the baseline `predictions` (computed in an
# earlier cell) first, then the tuned model's predictions. Each row carries the
# section header, the two message templates, and the predictions to evaluate.
report_sections = (
    ("Unoptimized model\n------",
     "Accuracy score on testing data: {:.4f}",
     "F-score on testing data: {:.4f}",
     predictions),
    ("\nOptimized Model\n------",
     "Final accuracy score on the testing data: {:.4f}",
     "Final F-score on the testing data: {:.4f}",
     best_predictions),
)

for section_header, accuracy_template, fscore_template, section_predictions in report_sections:
    print(section_header)
    print(accuracy_template.format(accuracy_score(y_test, section_predictions)))
    # Weighted F-beta with beta = 1.5, the same setting for both models
    print(fscore_template.format(
        fbeta_score(y_test, section_predictions, beta = 1.5, average='weighted')))

Unoptimized model
------
Accuracy score on testing data: 0.7150
F-score on testing data: 0.6950

Optimized Model
------
Final accuracy score on the testing data: 0.7240
Final F-score on the testing data: 0.6588


  'precision', 'predicted', average, warn_for)
