# Model Training
I will attempt to train various models with the cleaned data here.

In [48]:
# Standard Imports
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score

In [18]:
# Load the data
X_train = pd.read_csv('../data/X_train.csv')
y_train = pd.read_csv('../data/y_train.csv').squeeze()
X_test = pd.read_csv('../data/X_test.csv')
y_test = pd.read_csv('../data/y_test.csv').squeeze()

In [60]:
def print_stats(clf):
    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_test)
    y_probs = clf.predict_proba(X_test)
    return f'Score: {clf.score(X_test, y_test)}', f'AUC_ROC: {roc_auc_score(y_test, y_preds)}'

## Originally Planned Models

### Logistic Regression

In [76]:
from sklearn.linear_model import LogisticRegression

In [77]:
# clf = LogisticRegression(max_iter=100000, solver='lbfgs') # Raises ConvergenceWarning
# print_stats(clf)

In [78]:
clf = LogisticRegression(max_iter=100000, solver='liblinear')
print_stats(clf)

('Score: 0.8695652173913043', 'AUC_ROC: 0.5708893606982779')

In [79]:
clf = LogisticRegression(max_iter=100000, solver='saga')
print_stats(clf)

('Score: 0.8532608695652174', 'AUC_ROC: 0.5')

### Gradient Boosting

In [80]:
from sklearn.ensemble import GradientBoostingClassifier

In [81]:
clf = GradientBoostingClassifier()
print_stats(clf)

('Score: 0.8777173913043478', 'AUC_ROC: 0.6446685539042227')

### Random Forests

In [82]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
print_stats(clf)

('Score: 0.8777173913043478', 'AUC_ROC: 0.6216678461901393')

### Neural Network

In [83]:
from sklearn.neural_network import MLPClassifier

In [84]:
clf = MLPClassifier()
print_stats(clf)

('Score: 0.8478260869565217', 'AUC_ROC: 0.5198159943382873')

## Additional Exploratory Models

### Histogram-based Gradient Boosting Classification Tree

In [88]:
from  sklearn.ensemble import HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier()
print_stats(clf)

('Score: 0.8722826086956522', 'AUC_ROC: 0.6261500353857041')

### Naive Bayes

#### GaussianNB

In [94]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
print_stats(clf)

('Score: 0.7717391304347826', 'AUC_ROC: 0.7052370842179759')

#### MultinomialNB

In [95]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
print_stats(clf)

('Score: 0.5244565217391305', 'AUC_ROC: 0.5296650153338052')

#### BernoulliNB

In [98]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
print_stats(clf)

('Score: 0.875', 'AUC_ROC: 0.6430761972163246')

### AdaBoostClassifier

In [112]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(algorithm='SAMME')
print_stats(clf)

('Score: 0.8722826086956522', 'AUC_ROC: 0.6184831328143431')

### BaggingClassifier

In [106]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier()
print_stats(clf)

('Score: 0.845108695652174', 'AUC_ROC: 0.5948926633640009')

### StackingClassifier
I will try using multiple of the higher-scoring estimator algorithms here.

In [111]:
from sklearn.ensemble import StackingClassifier
estimators = [
    ('rf', RandomForestClassifier()),
    ('gnb', GaussianNB()),
    ('hgb', HistGradientBoostingClassifier()),
    ('ab', AdaBoostClassifier(algorithm='SAMME'))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=GaussianNB()
)
print_stats(clf)

('Score: 0.8559782608695652', 'AUC_ROC: 0.7085987261146498')

Wow! It looks like we achieved both of our goal metrics without any hyperparameter tuning! Let's see if we can improve this score using the same StackingClassifier method.

## Tuning Hyperparameters

In [114]:
from sklearn.model_selection import GridSearchCV

### Random Forest Optimization

In [218]:
# Define a grid of hyperparameters
rf_grid = {'n_estimators': list(range(10, 200, 20)),
           # 'max_depth': [None] + list(range(1, 10, 3)), # Default is best
           # 'max_features': (['sqrt'] + list(range(1, 20, 4))), # Default is best
           'min_samples_split': list(range(2, 20, 4)),
           'min_samples_leaf': list(range(1, 10, 3))}
rf = RandomForestClassifier()

# Find the best parameters
gs_clf = GridSearchCV(estimator=rf,
                      param_grid=rf_grid,
                      n_jobs=-1,
                      cv=50, # Higher value due to randomized nature of this algorithm
                      verbose=3,
                      scoring='roc_auc',
                      return_train_score=True)
gs_clf.fit(X_train, y_train)
print(gs_clf.best_params_)
y_preds = gs_clf.predict(X_test)
y_probs = gs_clf.predict_proba(X_test)

# Set the estimator to the best parameters
rf = RandomForestClassifier(**gs_clf.best_params_)

# Show one result
print_stats(rf)

Fitting 50 folds for each of 150 candidates, totalling 7500 fits
{'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 110}


('Score: 0.7848549186128804', 'AUC_ROC: 0.5938900684123615')

### Gaussian Naive-Bayes Optimization

In [176]:
gnb = GaussianNB()
gnb.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [248]:
# Define a grid of hyperparameters
gnb_grid = {'var_smoothing': np.linspace(1e-09, 1e-25, 15)}
gnb = GaussianNB()

# Find the best parameters
gs_clf = GridSearchCV(estimator=gnb,
                      param_grid=gnb_grid,
                      n_jobs=-1,
                      cv=5,
                      verbose=3,
                      scoring='roc_auc',
                      return_train_score=True)
gs_clf.fit(X_train, y_train)
print(gs_clf.best_params_)

# Set the estimator to the best parameters
gnb = GaussianNB(**gs_clf.best_params_)

# Show one result
print_stats(gnb)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'var_smoothing': 1e-25}


('Score: 0.7771739130434783', 'AUC_ROC: 0.7160887001651334')

### HistGradientBoostingClassifier Optimization

In [245]:
hgb = HistGradientBoostingClassifier()
hgb.get_params()

{'categorical_features': 'warn',
 'class_weight': None,
 'early_stopping': 'auto',
 'interaction_cst': None,
 'l2_regularization': 0.0,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_bins': 255,
 'max_depth': None,
 'max_features': 1.0,
 'max_iter': 100,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'monotonic_cst': None,
 'n_iter_no_change': 10,
 'random_state': None,
 'scoring': 'loss',
 'tol': 1e-07,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [252]:
# Define a grid of hyperparameters
hgb_grid = {
    'learning_rate': np.linspace(0.1, 1, 4),
    'max_iter': list(range(40, 161, 20)),
    'max_leaf_nodes': [None] + list(range(2, 50, 10)),
    'max_features': np.linspace(0.5, 1, 3),
    'l2_regularization': np.linspace(0, 20, 4)
}
hgb = HistGradientBoostingClassifier()

# Find the best parameters
gs_clf = GridSearchCV(estimator=hgb,
                      param_grid=hgb_grid,
                      n_jobs=-1,
                      cv=3,
                      verbose=3,
                      scoring='roc_auc',
                      return_train_score=True)
gs_clf.fit(X_train, y_train)
print(gs_clf.best_params_)

# Set the estimator to the best parameters
hgb = HistGradientBoostingClassifier(**gs_clf.best_params_)

# Show one result
print_stats(hgb)

Fitting 3 folds for each of 2016 candidates, totalling 6048 fits
{'l2_regularization': 0.0, 'learning_rate': 0.7, 'max_features': 0.5, 'max_iter': 100, 'max_leaf_nodes': 2}


('Score: 0.8885869565217391', 'AUC_ROC: 0.6817055909412597')

### AdaBoostClassifier Optimization

In [253]:
ab = AdaBoostClassifier(algorithm='SAMME')
ab.get_params()

{'algorithm': 'SAMME',
 'estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [254]:
# Define a grid of hyperparameters
ab_grid = {
    'learning_rate': np.linspace(0.1, 1.0, 5),
    'n_estimators': list(range(40, 161, 5))
}
ab = AdaBoostClassifier(algorithm='SAMME')

# Find the best parameters
gs_clf = GridSearchCV(estimator=ab,
                      param_grid=ab_grid,
                      n_jobs=-1,
                      cv=30, # Due to randomization
                      verbose=3,
                      scoring='roc_auc',
                      return_train_score=True)
gs_clf.fit(X_train, y_train)
print(gs_clf.best_params_)

# Set the estimator to the best parameters
ab = AdaBoostClassifier(algorithm='SAMME', **gs_clf.best_params_)

# Show one result
print_stats(ab)

Fitting 30 folds for each of 125 candidates, totalling 3750 fits
{'learning_rate': 1.0, 'n_estimators': 110}


('Score: 0.8804347826086957', 'AUC_ROC: 0.6539278131634819')

### Putting it all together

In [262]:
estimators = [
    ('rf', rf),
    ('gnb', gnb),
    ('hgb', hgb),
    ('ab', ab)
]
clf = StackingClassifier(estimators=estimators, 
                         final_estimator=GaussianNB())
print_stats(clf)

('Score: 0.8396739130434783', 'AUC_ROC: 0.7143783911299836')

In [263]:
print_stats(clf)

('Score: 0.8505434782608695', 'AUC_ROC: 0.728414720452937')

In [264]:
print_stats(clf)

('Score: 0.8369565217391305', 'AUC_ROC: 0.7204529370134466')

In [265]:
print_stats(clf)

('Score: 0.842391304347826', 'AUC_ROC: 0.7159707478178817')

In [266]:
print_stats(clf)

('Score: 0.8478260869565217', 'AUC_ROC: 0.726822363765039')

The model is consistently scoring within our goal. I count this as a success.

## Saving the model

In [267]:
import pickle

pickle.dump(clf, open('../models/stacking_classifier_optimized.pkl', 'wb'))

In [268]:
test_imported_clf = pickle.load(open('../models/stacking_classifier_optimized.pkl', 'rb'))

In [269]:
type(test_imported_clf)

sklearn.ensemble._stacking.StackingClassifier

In [270]:
print_stats(test_imported_clf)

('Score: 0.8478260869565217', 'AUC_ROC: 0.726822363765039')