# Bank Classification Project - Modeling

In [None]:
# Optional persistence of the prepared dataset: uncomment the first line below
# to export `bank_data` to disk, or the second to reload it in a fresh kernel.
# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code embedded in an untrusted file.
import pickle

# pickle.dump(bank_data, open('data/bank_data.pkl', 'wb'))
# bank_data = pickle.load(open('data/bank_data.pkl', 'rb'))

In [None]:
# Baseline model: the accuracy obtained by always predicting the majority class.
# `most_common` is kept for inspection; it is not used in the computation below.
most_common = bank_data.y.mode()

# value_counts() sorts by frequency (descending), so the first row is the
# majority class of the hold-out labels. Positional access (.iloc) keeps this
# correct whatever the label encoding is (0/1, 'no'/'yes', ...); a plain
# `count[0]` would look up the class *labelled* 0, which is only the majority
# class by coincidence.
count = pd.Series(y_test).value_counts()
baseline = count.iloc[0] / len(y_test)
print('Baseline accuracy: %.3f' % (baseline))

In [None]:
# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score

# Candidate classifiers, compared with 10-fold cross-validation on the
# standardized training set. The tree-based models draw random numbers while
# fitting, so they are seeded to make the comparison reproducible across
# kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forests": RandomForestClassifier(random_state=0),
    "k-Nearest Neighbors": KNeighborsClassifier()
}

# Parallel lists: results[i] holds the fold scores of the model named names[i].
results = []
names = []

for model_name, model in models.items():
    cv_scores = cross_val_score(estimator=model,
                                X=X_train_std,
                                y=y_train,
                                cv=10,
                                n_jobs=1)

    results.append(cv_scores)
    names.append(model_name)

    print(model_name)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

In [None]:
# Box plot of the per-fold CV accuracies, one box per algorithm
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

I'm going to focus on tuning the Logistic Regression classifier.

In [None]:
# cross_val_score returns an array with one score per fold; plotting the
# distribution of those scores gives a reasonable estimate of how stable the
# classifier's performance is.
#
# The standardized features are used so this estimate is comparable with the
# model-comparison cell, which also cross-validates on X_train_std.
cv_scores = cross_val_score(LogisticRegression(), X_train_std, y_train, cv=10)

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(cv_scores, kde=True, stat='density')
plt.title('Average score: {}'.format(np.mean(cv_scores)))

In [None]:
# Use the learning curve to check for overfitting
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

# Scaling + L2-regularized logistic regression bundled into one estimator, so
# the scaler is re-fit inside every cross-validation fold.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# Train/validation accuracy at ten training-set sizes (10% ... 100%), 10-fold CV.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

# Collapse the fold axis: one mean and one spread per training-set size.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# (mean, std, line style) for each curve; drawn in this order.
curves = [
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5,
          label='validation accuracy')),
]

for curve_mean, curve_std, line_style in curves:
    plt.plot(train_sizes, curve_mean, **line_style)
    # Shaded band: mean +/- one standard deviation across folds.
    plt.fill_between(train_sizes,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_style['color'])

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('./images/learning_curve.png', dpi=300)
plt.show()

In [None]:
# Examine hyperparameters
from sklearn.model_selection import validation_curve

# Inverse regularization strengths to sweep for the pipeline's 'clf' step.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(estimator=pipe_lr,
                                             X=X_train,
                                             y=y_train,
                                             param_name='clf__C',
                                             param_range=param_range,
                                             cv=10)

# Collapse the fold axis: one mean and one spread per value of C.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# (mean, std, line style) for each curve; drawn in this order.
curves = [
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5,
          label='validation accuracy')),
]

for curve_mean, curve_std, line_style in curves:
    plt.plot(param_range, curve_mean, **line_style)
    # Shaded band: mean +/- one standard deviation across folds.
    plt.fill_between(param_range,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_style['color'])

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
plt.savefig('./images/validation_curve.png', dpi=300)
plt.show()

In [None]:
# Grid search with a stratified KFold
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Candidate values of C for the pipeline's 'clf' step.
parameter_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Ten stratified folds, shared by every candidate in the search.
cross_validation = StratifiedKFold(n_splits=10)

gs = GridSearchCV(estimator=pipe_lr,
                  param_grid=parameter_grid,
                  cv=cross_validation)
gs.fit(X_train, y_train)

# Report the best mean CV score and the setting that produced it.
for template, value in (('Best score: {}', gs.best_score_),
                        ('Best parameters: {}', gs.best_params_)):
    print(template.format(value))

In [None]:
# Evaluate the tuned pipeline on the hold-out set.
#
# With the default refit=True, GridSearchCV has already refit the best
# pipeline on the full training data it was given (X_train), and that pipeline
# carries its own StandardScaler. It is therefore scored on the raw features:
# refitting it on X_train_std would standardize the data a second time and
# overwrite, in place, the fitted model that `gs` holds.
# NOTE(review): assumes X_test is the unscaled hold-out matrix that X_test_std
# was derived from -- confirm against the train/test split cell.
clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

# Fit the search on the standardized training data, then tabulate the
# hold-out predictions against the true labels.
# NOTE(review): this repeats the whole grid search, and the pipeline has its
# own scaling step, so pre-standardized inputs are presumably scaled twice --
# consider predicting with the already-fitted search on the raw features.
y_pred = gs.fit(X_train_std, y_train).predict(X_test_std)
confmat = confusion_matrix(y_test, y_pred)
print(confmat)

In [None]:
# Render the confusion matrix as a small annotated heat map
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

# Write each cell's count at its centre (x = column, y = row).
for (row, col), n_samples in np.ndenumerate(confmat):
    ax.text(x=col, y=row, s=n_samples, va='center', ha='center')

ax.set_xlabel('predicted label')
ax.set_ylabel('true label')

fig.tight_layout()
fig.savefig('./images/confusion_matrix.png', dpi=300)
plt.show()

In [None]:
# Precision, recall and F1
from sklearn.metrics import precision_score, recall_score, f1_score

# Print each hold-out metric with three decimals, in a fixed order.
for template, scorer in (('Precision: %.3f', precision_score),
                         ('Recall: %.3f', recall_score),
                         ('F1: %.3f', f1_score)):
    print(template % scorer(y_true=y_test, y_pred=y_pred))

In [None]:
# Plot ROC curve
from sklearn.metrics import roc_curve, auc

# Signed distances to the decision boundary for the hold-out samples; the ROC
# curve is traced by sweeping a threshold over these scores.
y_score = LogisticRegression().fit(X_train_std, y_train).decision_function(X_test_std)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='Random Guessing')
# A perfect classifier goes straight up to TPR = 1 at FPR = 0 and then across:
# (0, 0) -> (0, 1) -> (1, 1). Plotting only the point (1, 1) draws nothing.
plt.plot([0, 0, 1], [0, 1, 1], color='black', lw=lw, linestyle=':', label='Perfect Performance')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# AUC vs. Accuracy
from sklearn.metrics import roc_auc_score, accuracy_score

# Fit the classifier once and reuse it for both the hard labels and the
# positive-class probabilities (the original fitted two identical models).
lr = LogisticRegression().fit(X_train_std, y_train)
y_labels = lr.predict(X_test_std)
y_probas = lr.predict_proba(X_test_std)[:, 1]  # column 1 = probability of lr.classes_[1]

print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_probas))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_labels))

To push accuracy above 90%, I will most likely need more feature engineering or more powerful, heavily optimized algorithms such as XGBoost or deep learning. The pipeline is flexible enough to handle new data from additional campaigns once that data set is ready. The out-of-sample test accuracy was extremely close to the cross-validation accuracy scores, which indicates that the model does not overfit and generalizes well to unseen data.

## Written Analysis

In [None]:
## See Repository README

## Additional Analysis

- Precision-Recall Curve
- Feature Engineering
- Voting Ensembles
- XGBoost
- Deep Learning