# 1. Initialization

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Load the cleaned crash dataset. keep_default_na=False stops pandas from
# turning its default NA markers (e.g. empty strings, 'NA') into NaN.
crash_data_clean = pd.read_csv(
    'Crash_Analysis_System_CAS_data_clean.csv',
    keep_default_na=False,
)

# 2. Preprocessing

Talk about why there is no need for feature scaling, transformation, or normalization.
Explain why one-hot encoding is used instead of label encoding: why ordinal labels are not appropriate, and why the categorical features need to be binarized.

Explore and choose the right metric

In [3]:
def parse_type(dtype):
    """Translate a dtype label from the features catalog into a pandas dtype.

    Parameters
    ----------
    dtype : str
        Label taken from the ``pandas_dtype`` column of the features catalog.

    Returns
    -------
    type or str
        ``np.int8`` for ``'int'``, the builtin ``float`` for ``'float'``;
        any other value is returned unchanged.
    """
    if dtype == 'int':
        # NOTE(review): int8 only holds -128..127 - confirm every 'int'
        # feature in the catalog fits in that range.
        return np.int8
    if dtype == 'float':
        # np.float was merely an alias of the builtin float; it was deprecated
        # in NumPy 1.20 and removed in 1.24, so use the builtin directly.
        return float
    return dtype

# Load the catalog describing every feature (name, type, pandas dtype)
features_catalog = pd.read_table('features_description.tsv')
# Build the {feature name: dtype} mapping used to cast the dataframe, keeping
# only the features that are still present in the clean version of the data
features_dtypes = {
    feature: parse_type(raw_dtype)
    for feature, raw_dtype in zip(features_catalog['feature_name'],
                                  features_catalog['pandas_dtype'])
    if feature in crash_data_clean.columns
}

In [4]:
# Cast each column to the dtype declared for it in the features catalog.
crash_data_clean = crash_data_clean.astype(
    dtype=features_dtypes,
    copy=False,
)

In [5]:
# Replace the -1 sentinel in speedLimit with 999; every other value is kept.
crash_data_clean['speedLimit'] = crash_data_clean['speedLimit'].apply(
    lambda limit: limit if not (limit == -1) else 999
)

In [6]:
# Names of all features flagged as categorical in the catalog
categorical_features = features_catalog.loc[
    features_catalog['feature_type'] == 'categorical', 'feature_name'
].tolist()
# The target must not be one-hot encoded
categorical_features.remove('crashSeverity')
# One-hot encode the remaining categorical features
crash_data_ohe = pd.get_dummies(crash_data_clean, columns=categorical_features)

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, make_scorer

def print_results(pred, true, betta=1):
    """Print accuracy and weighted precision / recall / F1 / F-beta scores.

    Note the argument order: predictions come first, ground truth second.

    Parameters
    ----------
    pred : array-like
        Labels predicted by the model.
    true : array-like
        Ground-truth labels.
    betta : float, default 1
        The ``beta`` passed to ``fbeta_score``.
    """
    print('Accuracy score: ', accuracy_score(true, pred))
    print('Precision score: ', precision_score(true, pred, average='weighted'))
    print('Recall score: ', recall_score(true, pred, average='weighted'))
    print('F1 score: ', f1_score(true, pred, average='weighted'))
    # beta must be passed by keyword: it is keyword-only in current
    # scikit-learn releases and a positional value raises TypeError.
    print('F betta score with betta=%.2f: ' % betta,
          fbeta_score(true, pred, beta=betta, average='weighted'))

In [8]:
# Features: every column except the target
X = crash_data_ohe.drop(labels=['crashSeverity'], axis='columns')
# Target: the crash severity label
y = crash_data_ohe['crashSeverity']

In [9]:
# NOTE(review): test_size=0.6 holds out 60% of the rows for testing and trains
# on the remaining 40% - confirm this split is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.6,
    random_state=42,
)

# 3. Benchmark

<span style="color:red">USE K-FOLD</span>

also try:

* Naive Bayes
* sklearn.ensemble.GradientBoostingClassifier
* XGBoost
* LGBM
* Bagging
* Random Forest

In [17]:
# Naive Bayes benchmark: fit on the training split, score on the test split.
# NOTE(review): the recorded output of this cell reports betta=0.50 while this
# call uses the default betta=1, so the stored output looks stale.
clf_multi = MultinomialNB().fit(X_train, y_train)

predictions = clf_multi.predict(X_test)
print_results(pred=predictions, true=y_test)

Accuracy score:  0.8777929097331574
Precision score:  0.9670136987250405
Recall score:  0.8777929097331574
F1 score:  0.9157455566035368
F betta score with betta=0.50:  0.9455543788431858


One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, each word is treated as a feature and there are thousands of different words. Also, it performs well even with the presence of irrelevant features and is relatively unaffected by them. The other major advantage it has is its relative simplicity. Naive Bayes works well right out of the box and tuning its parameters is rarely ever necessary, except usually in cases where the distribution of the data is known.
It rarely ever overfits the data. Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm!

Congratulations! You have successfully designed a model that can efficiently predict if an SMS message is spam or not!

Thank you for learning with us!

HOW TO DO ROC/AUC?
HOW TO IMPROVE THESE METRICS?

Feature selection is not a must-do task for RF algorithm

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; seeded for reproducibility and
# using all available cores (n_jobs=-1).
clf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)

In [39]:
# Train on the training split, then predict the held-out test split
predictions = clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Mean accuracy of the fitted classifier on the test split
clf.score(X=X_test, y=y_test)

In [40]:
# Report the metrics with the default betta (1)
print_results(pred=predictions, true=y_test)

Accuracy score:  0.7238078265310465


  'precision', 'predicted', average, warn_for)


Precision score:  0.788243065957839
Recall score:  0.7238078265310465


  'precision', 'predicted', average, warn_for)


F1 score:  0.620909307817506
F betta score with betta=1.00:  0.620909307817506


In [22]:
# Same report with betta=0.3
print_results(pred=predictions, true=y_test, betta=.3)

Accuracy score:  0.9938688981572116
Precision score:  0.993865255955091
Recall score:  0.9938688981572116
F1 score:  0.9936996212249498
F betta score with betta=0.30:  0.9938080634629914


In [23]:
# Same report with betta=0.01
print_results(pred=predictions, true=y_test, betta=.01)

Accuracy score:  0.9938688981572116
Precision score:  0.993865255955091
Recall score:  0.9938688981572116
F1 score:  0.9936996212249498
F betta score with betta=0.01:  0.993865178381653


In [42]:
# Hyper-parameter grid explored for the random forest
parameters = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini'],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [1000, 5000, 10000]
}

# Prioritize Precision over Recall
scorer = make_scorer(fbeta_score, beta=.5, average='weighted')
# scoring must be passed by keyword: it is keyword-only in current
# scikit-learn releases and a positional value raises TypeError.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10, total=   1.5s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50, total=   2.5s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50, total=   2.6s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=50, total=   2.7s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100, total=   4.1s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100, total=   3.9s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=100, total=   4.0s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=200, total=   6.2s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=200, total=   6.2s
[CV] criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=1000, n_estimators=200, total=   6.4s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10, total=   1.5s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=10, total=   1.5s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50, total=   2.5s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50, total=   2.6s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=50, total=   2.7s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100, total=   4.0s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100, total=   3.8s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=100, total=   4.0s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=200, total=   7.3s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=200, total=   6.9s
[CV] criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=5000, n_estimators=200, total=   6.2s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10, total=   1.5s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=10, total=   1.5s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50, total=   2.5s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50, total=   2.5s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=50, total=   2.5s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100, total=   3.6s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100, total=   3.7s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=100, total=   3.7s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=200, total=   6.0s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=200, total=   6.0s
[CV] criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=3, min_samples_split=10000, n_estimators=200, total=   6.1s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50, total=   2.8s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=50, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100, total=   4.6s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100, total=   4.6s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=100, total=   4.6s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=200, total=   7.7s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=200, total=   7.6s
[CV] criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=1000, n_estimators=200, total=   7.7s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=50, total=   2.9s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100, total=   4.4s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100, total=   4.3s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=100, total=   4.6s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=200, total=   7.6s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=200, total=   8.0s
[CV] criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=5000, n_estimators=200, total=   8.2s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50, total=   2.8s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50, total=   2.8s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=50, total=   2.8s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100, total=   4.8s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100, total=   4.4s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=100, total=   4.6s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=200, total=   8.2s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=200, total=   7.3s
[CV] criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=4, min_samples_split=10000, n_estimators=200, total=   7.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10, total=   1.8s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10, total=   1.8s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=10, total=   1.8s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50, total=   3.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50, total=   3.6s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=50 
[CV]  cr

  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100, total=   5.5s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=100, total=   5.5s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=200 
[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=200, total=   9.4s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=200, total=  10.0s
[CV] criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=1000, n_estimators=200, total=  10.1s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50, total=   3.1s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50, total=   3.3s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=50, total=   3.4s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100, total=   5.6s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100, total=   5.1s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=100, total=   5.1s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=200, total=   8.8s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=200, total=   9.0s
[CV] criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=5000, n_estimators=200, total=   8.9s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10, total=   1.6s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50, total=   3.1s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50, total=   3.1s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=50, total=   3.3s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100, total=   5.5s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100, total=   5.4s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=100, total=   5.4s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=200, total=   8.5s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=200, total=   9.3s
[CV] criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=5, min_samples_split=10000, n_estimators=200, total=   9.2s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10, total=   1.9s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10, total=   1.9s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10 
[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=10, total=   1.7s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50 
[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50, total=   3.7s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50 
[CV]  criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50, total=   3.9s
[CV] criterion=gini, max_depth=6, min_samples_split=1000, n_estimators=50 
[CV]  cr

  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=10, total=   1.9s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50, total=   3.5s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50, total=   3.9s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=50, total=   3.6s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100, total=   5.8s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100, total=   6.2s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=100, total=   6.1s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=200 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=200, total=  10.2s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=200, total=  10.4s
[CV] criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=200 
[CV]  criterion=gini, max_depth=6, min_samples_split=5000, n_estimators=200, total=  10.5s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10, total=   1.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10 
[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10, total=   1.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=10, total=   1.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50, total=   3.6s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50 
[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50, total=   3.4s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=50, total=   3.4s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100, total=   5.7s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100, total=   5.7s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=100, total=   5.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=200 
[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=200, total=  10.8s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=200, total=  13.4s
[CV] criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=200 


  'precision', 'predicted', average, warn_for)


[CV]  criterion=gini, max_depth=6, min_samples_split=10000, n_estimators=200, total=  11.3s


[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 14.2min finished


In [45]:
# Estimator selected by the grid search
best_clf = grid_fit.best_estimator_

# Baseline: refit the untuned classifier on the training split, then
# predict the held-out split with both the baseline and the tuned model
baseline_model = clf.fit(X_train, y_train)
predictions = baseline_model.predict(X_test)
best_predictions = best_clf.predict(X_test)

# Before-and-after report: accuracy and weighted F-beta (beta = 0.5) on the
# test split, one (header, accuracy line, F-score line, predictions) entry per model
report_layout = (
    ("Unoptimized model\n------",
     "Accuracy score on testing data: {:.4f}",
     "F-score on testing data: {:.4f}",
     predictions),
    ("\nOptimized Model\n------",
     "Final accuracy score on the testing data: {:.4f}",
     "Final F-score on the testing data: {:.4f}",
     best_predictions),
)
for header, accuracy_line, fscore_line, y_hat in report_layout:
    print(header)
    print(accuracy_line.format(accuracy_score(y_test, y_hat)))
    print(fscore_line.format(fbeta_score(y_test, y_hat, beta = 0.5, average='weighted')))

Unoptimized model
------
Accuracy score on testing data: 0.9977
F-score on testing data: 0.9977

Optimized Model
------
Final accuracy score on the testing data: 0.8148
Final F-score on the testing data: 0.7849


In [48]:
# Display the feature importances of best_clf, the estimator selected by the grid search.
# NOTE(review): entries are presumably ordered like the columns of X_train — confirm before mapping to feature names.
best_clf.feature_importances_

array([  1.42746000e-02,   2.13208733e-01,   4.45237137e-01,
         8.53520685e-04,   1.84887388e-02,   0.00000000e+00,
         0.00000000e+00,   2.88326861e-04,   8.17423594e-06,
         0.00000000e+00,   1.15999470e-03,   7.71466343e-06,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         8.58443916e-06,   1.01737149e-05,   6.74736488e-03,
         9.35962151e-05,   2.06503786e-04,   0.00000000e+00,
         0.00000000e+00,   2.48923581e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   3.39804239e-03,
         0.00000000e+00,   0.00000000e+00,   1.50008994e-02,
         1.30864159e-04,   1.59164004e-02,   1.38686388e-02,
         4.55141225e-02,   0.00000000e+00,   0.00000000e+00,
         1.04655847e-04,   0.00000000e+00,   1.20250295e-04,
         0.00000000e+00,   2.93796645e-06,   6.10729899e-02,
         1.65533978e-04,   2.52881282e-06,   0.00000000e+00,
         3.25641327e-05,   0.00000000e+00,   0.00000000e+00,
         2.97637083e-06,

In [49]:
# Display the feature importances of clf, the untuned classifier that was refit on the training split
# for the before/after comparison, so they can be compared with those of best_clf.
# NOTE(review): entries are presumably ordered like the columns of X_train — confirm before mapping to feature names.
clf.feature_importances_

array([  2.30352674e-02,   1.48421941e-01,   5.73142044e-01,
         4.76578041e-03,   4.70844331e-03,   3.26610361e-05,
         3.19495705e-04,   1.34730655e-03,   1.44309986e-04,
         1.16275592e-03,   2.53370490e-03,   7.32671073e-04,
         4.94296537e-04,   4.63309175e-04,   4.03580541e-05,
         4.87240546e-04,   7.30509071e-04,   5.77027030e-03,
         3.22440344e-04,   1.59371408e-03,   6.28821930e-05,
         5.94970672e-05,   4.73305505e-04,   3.70341481e-04,
         7.57490131e-04,   4.70371981e-05,   1.70094583e-03,
         4.22935319e-04,   3.44624046e-04,   4.56989865e-03,
         5.24353002e-04,   1.05808605e-02,   3.48342005e-03,
         1.51156644e-02,   1.85959605e-04,   7.89808553e-05,
         2.26778644e-03,   3.68128921e-04,   1.89990345e-03,
         0.00000000e+00,   2.87172464e-03,   1.23733475e-02,
         8.07102101e-04,   6.95915856e-04,   9.68780860e-04,
         1.07454169e-04,   6.21274738e-05,   1.45462712e-04,
         6.10239859e-05,