In [None]:
# part 2 - classification
# 1. feature engineering
# 2. variable selection
# 3. training and validation
# 4. testing

In [None]:
# Load the cleaned dataset into `df`; the feature-engineering cells below add columns to this frame.
# NOTE(review): absolute path - presumably environment-specific; confirm before sharing the notebook.
df = pd.read_excel("/data/data_cleaned.xlsx")

***

In [None]:
# Feature: BILL_AMT_TOTAL
# Total billed over the last 6 months = row-wise sum of BILL_AMT1 .. BILL_AMT6.
bill_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
df['BILL_AMT_TOTAL'] = df[bill_cols].sum(axis=1)

# Preview the first row to confirm the new column is there.
df.head(1)

In [None]:
# Feature: PAY_AMT_TOTAL
# Total paid over the last 6 months = row-wise sum of PAY_AMT1 .. PAY_AMT6.
pay_cols = [f'PAY_AMT{month}' for month in range(1, 7)]
df['PAY_AMT_TOTAL'] = df[pay_cols].sum(axis=1)

# Preview the first row to confirm the new column is there.
df.head(1)

In [None]:
# Feature: OUTS_AMT_TOTAL
# Outstanding amount over the last 6 months = total billed minus total paid.
billed_total = df[[f'BILL_AMT{month}' for month in range(1, 7)]].sum(axis=1)
paid_total = df[[f'PAY_AMT{month}' for month in range(1, 7)]].sum(axis=1)
df['OUTS_AMT_TOTAL'] = billed_total.sub(paid_total)

# Preview the first row to confirm the new column is there.
df.head(1)

In [None]:
# Feature: PAST_DEFAULTS
# Number of months (out of the last 6) whose payment amount is exactly 0.
# The whole block is compared to 0 in one vectorised step and the True cells are
# counted per row; this gives the same integers as a per-row Python function
# without calling Python once per row.
pay_cols = [f'PAY_AMT{month}' for month in range(1, 7)]
df['PAST_DEFAULTS'] = df[pay_cols].eq(0).sum(axis=1)

# Preview the first row to confirm the new column is there.
df.head(1)

In [None]:
# Feature: EXCEED_LIMIT_BAL
# Number of months (out of the last 6) in which the bill was strictly greater
# than the customer's credit limit.
# `gt(..., axis=0)` aligns LIMIT_BAL with the rows, so each BILL_AMTn cell is
# compared with the limit of its own row; summing the booleans counts the months.
bill_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
df['EXCEED_LIMIT_BAL'] = df[bill_cols].gt(df['LIMIT_BAL'], axis=0).sum(axis=1)

# Preview the first row to confirm the new column is there.
df.head(1)

In [None]:
# Cast the coded demographic columns (SEX, EDUCATION, MARRIAGE) to `category`.
# `astype` returns a new frame, so `df` itself is left as it was.
dtype = dict.fromkeys(['SEX', 'EDUCATION', 'MARRIAGE'], 'category')
df2 = df.astype(dtype)

In [None]:
# One-hot encoding: pandas expands the category-typed columns of df2 into
# indicator columns named <column>_<level>; the other columns pass through.
df3 = pd.get_dummies(data=df2)
df3.head(n=1)

In [None]:
# List the encoded column names; the `feat` list in the next section selects from these.
df3.columns

***

In [None]:
# Model inputs: credit limit, age, the engineered totals/counts, and the
# one-hot indicator columns for SEX, EDUCATION and MARRIAGE.
feat = ['LIMIT_BAL', 'AGE', 
       'BILL_AMT_TOTAL', 'PAY_AMT_TOTAL', 'OUTS_AMT_TOTAL', 'PAST_DEFAULTS', 'EXCEED_LIMIT_BAL',
       'SEX_1', 'SEX_2',
       'EDUCATION_1', 'EDUCATION_2', 'EDUCATION_3', 'EDUCATION_4',
       'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2', 'MARRIAGE_3']

# Feature matrix and target.
X = df3[feat]
y = df3['RESPONSE']

# Balance the classes with SMOTE. The fixed seed makes the synthetic rows, and
# therefore every split and score derived from them, reproducible across reruns.
# NOTE(review): oversampling happens BEFORE the split, so synthetic rows built
# from rows that end up in the test set can land in the training set and make
# test metrics optimistic. Resampling only the training split would avoid this,
# but the later feature-selection cells re-use this balanced X / y and would
# have to change together with this one.
X, y = SMOTE(random_state=42).fit_resample(X, y)

# 70 / 30 train-test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(X_train.shape)
print(X_test.shape)

In [None]:
# Data scaling
# Columns with a numeric dtype are rescaled to the [0, 1] range.
num_col = X_train.select_dtypes(include=np.number).columns.tolist()

# Work on explicit copies so the column assignments below write to frames this
# cell owns (assigning into the split outputs directly can raise pandas'
# SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Learn min/max from the training split ONLY, then apply that same mapping to
# the test split. Re-fitting on the test split would put the two splits on
# different scales and use test-set statistics in preprocessing.
scaler = MinMaxScaler()
X_train[num_col] = scaler.fit_transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

***

In [None]:
# Supervised learning model - logistic regression (all features)
# instantiate model
model = LogisticRegression(random_state=42)

# Search space, written as a list of grids so that only solver/penalty pairs
# that LogisticRegression accepts are ever sampled:
#   - lbfgs / newton-cg / sag support l2 only,
#   - liblinear supports l1 and l2,
#   - saga supports l1, l2 and elasticnet (elasticnet also needs l1_ratio).
# A single flat grid would spend much of the search budget on combinations that
# fail to fit. The unpenalised option is left out because its spelling differs
# between scikit-learn versions ('none' vs None); the largest C values are
# close to unpenalised anyway.
C_grid = np.logspace(-4, 4, 20)
max_iter_grid = [100, 1000, 2500, 5000]
hyperparameters = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2'],
     'C': C_grid, 'max_iter': max_iter_grid},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': C_grid, 'max_iter': max_iter_grid},
    {'solver': ['saga'], 'penalty': ['l1', 'l2'],
     'C': C_grid, 'max_iter': max_iter_grid},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': C_grid, 'max_iter': max_iter_grid},
]

# define scoring metrics
metrics = 'accuracy'

# Randomised search with 5-fold CV; seeded so the sampled candidates (and the
# selected model) are the same on every run.
lr1 = RandomizedSearchCV(model, hyperparameters, scoring=metrics, cv=5, refit='accuracy',
                         verbose=1, n_jobs=-1, random_state=42)
lr1.fit(X_train, y_train)

# get best model (already refit on the whole training split)
best_lr1 = lr1.best_estimator_

# generate predictions on the training split
y_pred_lr1 = best_lr1.predict(X_train)
y_pred_proba_lr1 = best_lr1.predict_proba(X_train)[:, 1]

# metrics - the first two are in-sample (training split); the CV figure is the
# mean cross-validated accuracy of the selected candidate.
print("Accuracy:", accuracy_score(y_train, y_pred_lr1))
print("ROC AUC:", roc_auc_score(y_train, y_pred_proba_lr1))
print("CV Accuracy:", lr1.best_score_)

In [None]:
# Feature selection - logistic regression
# Rank every feature with recursive feature elimination: with
# n_features_to_select=1, RFE drops one feature per step, so each feature gets
# its own rank.
selector = RFE(best_lr1, n_features_to_select = 1)
selector = selector.fit(X_train, y_train)

# `ranking_[j]` is the RANK of feature j (1 = most important), not an index
# into `feat`. Sorting the feature positions by their rank yields the feature
# names ordered from most to least important.
lr_feature_ranks = [feat[j] for j in np.argsort(selector.ranking_)]
lr_feature_ranks

In [None]:
# Supervised learning model - random forest (all features)
# instantiate model
model = RandomForestClassifier(random_state=42)

# define hyperparameters
# max_features: 'auto' meant the same as 'sqrt' for classifiers and newer
# scikit-learn releases reject it, so the grid offers 'sqrt' and 'log2' instead
# (two genuinely different settings, accepted by all versions).
hyperparameters = {'n_estimators'     : [350, 400, 450, 500],
                   'max_features'     : ['sqrt', 'log2'],
                   'max_depth'        : [5, 10, 15, 25, 30],
                   'min_samples_split': [2, 5, 10, 15, 100],
                   'min_samples_leaf' : [1, 2, 5, 10, 15]
                  }

# define scoring metrics
metrics = 'accuracy'

# Randomised search with 5-fold CV; seeded so the sampled candidates (and the
# selected model) are the same on every run.
rf1 = RandomizedSearchCV(model, hyperparameters, scoring=metrics, cv=5, refit='accuracy',
                         verbose=1, n_jobs=-1, random_state=42)
rf1.fit(X_train, y_train)

# get best model (already refit on the whole training split)
best_rf1 = rf1.best_estimator_

# generate predictions on the training split
y_pred_rf1 = best_rf1.predict(X_train)
y_pred_proba_rf1 = best_rf1.predict_proba(X_train)[:, 1]

# metrics - the first two are in-sample (training split); the CV figure is the
# mean cross-validated accuracy of the selected candidate.
print("Accuracy:", accuracy_score(y_train, y_pred_rf1))
print("ROC AUC:", roc_auc_score(y_train, y_pred_proba_rf1))
print("CV Accuracy:", rf1.best_score_)

In [None]:
# Feature selection - random forest
# Order the features by the fitted forest's importances, least important first
# (so the most important bar is drawn at the top of the horizontal chart).
importances = best_rf1.feature_importances_
indices = np.argsort(importances)
rf_feature_ranks = [feat[position] for position in indices]

# Horizontal bar chart of the importances.
bar_positions = list(range(len(indices)))
fig, ax = plt.subplots(figsize=(12, 10))
ax.set_title('Feature Importances')
ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(rf_feature_ranks)
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
# Supervised learning model - xgboost (all features)
# instantiate the classifier
model = xgb.XGBClassifier(random_state=42)

# create a dictionary of hyperparameters to tune 
hyperparameters = {'n_estimators' : [200, 300, 400, 500],
                   'max_depth'    : [2, 4, 5, 10, 15, 25],
                   'learning_rate': [0.01, 0.05, 0.1, 1]
                  }

# define scoring metrics
metrics = 'accuracy'

# Randomised search with 5-fold CV; seeded so the sampled candidates (and the
# selected model) are the same on every run.
xgb1 = RandomizedSearchCV(model, hyperparameters, scoring=metrics, cv=5, refit='accuracy',
                          verbose=1, n_jobs=-1, random_state=42)
xgb1.fit(X_train, y_train)

# get best model (already refit on the whole training split)
best_xgb1 = xgb1.best_estimator_

# generate predictions on the training split
y_pred_xgb1 = best_xgb1.predict(X_train)
y_pred_proba_xgb1 = best_xgb1.predict_proba(X_train)[:, 1]

# metrics - the first two are in-sample (training split); the CV figure is the
# mean cross-validated accuracy of the selected candidate.
print("Accuracy:", accuracy_score(y_train, y_pred_xgb1))
print("ROC AUC:", roc_auc_score(y_train, y_pred_proba_xgb1))
print("CV Accuracy:", xgb1.best_score_)

In [None]:
# Feature selection - xgboost
# Importance of each feature as reported by the booster (importance_type='weight').
feature_important = xgb1.best_estimator_.get_booster().get_score(importance_type='weight')
keys = [*feature_important]
values = [*feature_important.values()]

# One row per feature, kept to at most len(feat) rows and sorted ascending so
# the highest score is drawn at the top of the horizontal chart.
score_table = pd.DataFrame(data=values, index=keys, columns=["score"])
xgb_feature_ranks = (
    score_table
    .nlargest(len(feat), columns='score')
    .sort_values(by="score", ascending=True)
)

fig, ax = plt.subplots(figsize=(12, 10))
ax.set_title('Feature Importances')
ax.barh(xgb_feature_ranks.index, xgb_feature_ranks['score'], color='b', align='center')
ax.set_xlabel('Relative Importance')
plt.show()

***

In [None]:
# Feature selection - logistic regression: keep the first 10 ranked features.
# SMOTE is seeded so the resampling is reproducible, and the resampled target is
# stored under its own name so the shared `y` is not overwritten by this cell
# (other cells read `y` too, so overwriting it makes results depend on run order).
lr_X, lr_y = SMOTE(random_state=42).fit_resample(X[lr_feature_ranks[:10]], y)
lr_X_train, lr_X_test, lr_y_train, lr_y_test = train_test_split(lr_X, lr_y, test_size=0.30, random_state=42)

In [None]:
# Feature selection - random forest: rf_feature_ranks is sorted least -> most
# important, so reverse it and keep the 10 most important features.
# SMOTE is seeded so the resampling is reproducible, and the resampled target is
# stored under its own name so the shared `y` is not overwritten by this cell
# (other cells read `y` too, so overwriting it makes results depend on run order).
rf_X, rf_y = SMOTE(random_state=42).fit_resample(X[rf_feature_ranks[::-1][:10]], y)
rf_X_train, rf_X_test, rf_y_train, rf_y_test = train_test_split(rf_X, rf_y, test_size=0.30, random_state=42)

In [None]:
# Feature selection - xgboost: xgb_feature_ranks is sorted by ascending score,
# so reverse its index and keep the 7 highest-scoring features.
# SMOTE is seeded so the resampling is reproducible, and the resampled target is
# stored under its own name so the shared `y` is not overwritten by this cell
# (other cells read `y` too, so overwriting it makes results depend on run order).
xgb_X, xgb_y = SMOTE(random_state=42).fit_resample(X[xgb_feature_ranks.index[::-1][:7]], y)
xgb_X_train, xgb_X_test, xgb_y_train, xgb_y_test = train_test_split(xgb_X, xgb_y, test_size=0.30, random_state=42)

***

In [None]:
# Model tuning - retrain logistic regression on its selected features
model = LogisticRegression(random_state=42)

# Search space, written as a list of grids so that only solver/penalty pairs
# that LogisticRegression accepts are ever sampled:
#   - lbfgs / newton-cg / sag support l2 only,
#   - liblinear supports l1 and l2,
#   - saga supports l1, l2 and elasticnet (elasticnet also needs l1_ratio).
# A single flat grid would spend much of the search budget on combinations that
# fail to fit. The unpenalised option is left out because its spelling differs
# between scikit-learn versions ('none' vs None); the largest C values are
# close to unpenalised anyway.
C_grid = np.logspace(-4, 4, 20)
max_iter_grid = [100, 1000, 2500, 5000]
hyperparameters = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2'],
     'C': C_grid, 'max_iter': max_iter_grid},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': C_grid, 'max_iter': max_iter_grid},
    {'solver': ['saga'], 'penalty': ['l1', 'l2'],
     'C': C_grid, 'max_iter': max_iter_grid},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': C_grid, 'max_iter': max_iter_grid},
]

# define scoring metrics
metrics = 'accuracy'

# Randomised search with 5-fold CV; seeded so the sampled candidates (and the
# selected model) are the same on every run.
lr2 = RandomizedSearchCV(model, hyperparameters, scoring=metrics, cv=5, refit='accuracy',
                         verbose=1, n_jobs=-1, random_state=42)
lr2.fit(lr_X_train, lr_y_train)

# get best model (already refit on the whole training split)
best_lr2 = lr2.best_estimator_

# generate predictions on the training split
y_pred_lr2 = best_lr2.predict(lr_X_train)
y_pred_proba_lr2 = best_lr2.predict_proba(lr_X_train)[:, 1]

# metrics - everything except "CV Accuracy" is in-sample (training split);
# "CV Accuracy" is the mean cross-validated accuracy of the selected candidate.
print("Logistic Regression Classifier Metrics with Hyperparameter Tuning:")
print("Best Parameters:", lr2.best_params_)
print("CV Accuracy:", lr2.best_score_)
print("Accuracy:", accuracy_score(lr_y_train, y_pred_lr2))
print("Precision:", precision_score(lr_y_train, y_pred_lr2))
print("Recall:", recall_score(lr_y_train, y_pred_lr2))
print("F1:", f1_score(lr_y_train, y_pred_lr2))
print("ROC AUC:", roc_auc_score(lr_y_train, y_pred_proba_lr2))
print("\nClassification Report:\n", classification_report(lr_y_train, y_pred_lr2))

In [None]:
# Model tuning - retrain random forest on its selected features
# instantiate model
model = RandomForestClassifier(random_state=42)

# define hyperparameters
# max_features: 'auto' meant the same as 'sqrt' for classifiers and newer
# scikit-learn releases reject it, so the grid offers 'sqrt' and 'log2' instead
# (two genuinely different settings, accepted by all versions).
hyperparameters = {'n_estimators'     : [350, 400, 450, 500],
                   'max_features'     : ['sqrt', 'log2'],
                   'max_depth'        : [5, 10, 15, 25, 30],
                   'min_samples_split': [2, 5, 10, 15, 100],
                   'min_samples_leaf' : [1, 2, 5, 10, 15]
                  }

# define scoring metrics
metrics = 'accuracy'

# Randomised search with 5-fold CV; seeded so the sampled candidates (and the
# selected model) are the same on every run.
rf2 = RandomizedSearchCV(model, hyperparameters, scoring=metrics, cv=5, refit='accuracy',
                         verbose=1, n_jobs=-1, random_state=42)
rf2.fit(rf_X_train, rf_y_train)

# get best model (already refit on the whole training split)
best_rf2 = rf2.best_estimator_

# generate predictions on the training split
y_pred_rf2 = best_rf2.predict(rf_X_train)
y_pred_proba_rf2 = best_rf2.predict_proba(rf_X_train)[:, 1]

# metrics - everything except "CV Accuracy" is in-sample (training split);
# "CV Accuracy" is the mean cross-validated accuracy of the selected candidate.
print("Random Forest Classifier Metrics with Hyperparameter Tuning:")
print("Best Parameters:", rf2.best_params_)
print("CV Accuracy:", rf2.best_score_)
print("Accuracy:", accuracy_score(rf_y_train, y_pred_rf2))
print("Precision:", precision_score(rf_y_train, y_pred_rf2))
print("Recall:", recall_score(rf_y_train, y_pred_rf2))
print("F1:", f1_score(rf_y_train, y_pred_rf2))
print("ROC AUC:", roc_auc_score(rf_y_train, y_pred_proba_rf2))
print("\nClassification Report:\n", classification_report(rf_y_train, y_pred_rf2))

In [None]:
# Model tuning - retrain xgboost on its selected features
# instantiate the classifier
model = xgb.XGBClassifier(random_state=42)

# create a dictionary of hyperparameters to tune 
hyperparameters = {'n_estimators' : [200, 300, 400, 500],
                   'max_depth'    : [2, 4, 5, 10, 15, 25],
                   'learning_rate': [0.01, 0.05, 0.1, 1]
                  }

# define scoring metrics
metrics = 'accuracy'

# Randomised search with 5-fold CV; seeded so the sampled candidates (and the
# selected model) are the same on every run.
xgb2 = RandomizedSearchCV(model, hyperparameters, scoring=metrics, cv=5, refit='accuracy',
                          verbose=1, n_jobs=-1, random_state=42)
xgb2.fit(xgb_X_train, xgb_y_train)

# get best model (already refit on the whole training split)
best_xgb2 = xgb2.best_estimator_

# generate predictions on the training split
y_pred_xgb2 = best_xgb2.predict(xgb_X_train)
y_pred_proba_xgb2 = best_xgb2.predict_proba(xgb_X_train)[:, 1]

# metrics - everything except "CV Accuracy" is in-sample (training split);
# "CV Accuracy" is the mean cross-validated accuracy of the selected candidate.
print("XGBoost Classifier Metrics with Hyperparameter Tuning:")
print("Best Parameters:", xgb2.best_params_)
print("CV Accuracy:", xgb2.best_score_)
print("Accuracy:", accuracy_score(xgb_y_train, y_pred_xgb2))
print("Precision:", precision_score(xgb_y_train, y_pred_xgb2))
print("Recall:", recall_score(xgb_y_train, y_pred_xgb2))
print("F1:", f1_score(xgb_y_train, y_pred_xgb2))
print("ROC AUC:", roc_auc_score(xgb_y_train, y_pred_proba_xgb2))
print("\nClassification Report:\n", classification_report(xgb_y_train, y_pred_xgb2))

In [None]:
# Score comparison across the three tuned models (row order: logistic
# regression, random forest, xgboost).
# Accuracy / Precision / Recall / F1 / ROC AUC are computed from predictions on
# each model's own TRAINING split, so they favour models that memorise the data.
# "CV Accuracy" is the mean cross-validated accuracy from each randomised
# search and is the fairer figure for choosing between the models.
scores_dict = {"Accuracy" : [accuracy_score(lr_y_train, y_pred_lr2),accuracy_score(rf_y_train, y_pred_rf2),accuracy_score(xgb_y_train, y_pred_xgb2)], 
               "Precision": [precision_score(lr_y_train, y_pred_lr2),precision_score(rf_y_train, y_pred_rf2),precision_score(xgb_y_train, y_pred_xgb2)], 
               "Recall"   : [recall_score(lr_y_train, y_pred_lr2),recall_score(rf_y_train, y_pred_rf2),recall_score(xgb_y_train, y_pred_xgb2)], 
               "F1"       : [f1_score(lr_y_train, y_pred_lr2),f1_score(rf_y_train, y_pred_rf2),f1_score(xgb_y_train, y_pred_xgb2)], 
               "ROC AUC"  : [roc_auc_score(lr_y_train, y_pred_proba_lr2),roc_auc_score(rf_y_train, y_pred_proba_rf2),roc_auc_score(xgb_y_train, y_pred_proba_xgb2)],
               "CV Accuracy": [lr2.best_score_, rf2.best_score_, xgb2.best_score_]
              }

scores_df = pd.DataFrame(data = scores_dict, 
                         index = ["logistic regression", "random forest", "xgboost"])
scores_df

***

In [None]:
# Final model: the tuned xgboost estimator, fitted once more on its training split.
final_classifier = xgb2.best_estimator_
final_classifier.fit(xgb_X_train, xgb_y_train)

In [None]:
# Predict on the test split: hard class labels, and the probability of class 1.
y_pred = final_classifier.predict(xgb_X_test)
y_pred_proba = final_classifier.predict_proba(xgb_X_test)[:, 1]

In [None]:
# Final classifier metrics on the test split.
print("Final Classifier Metrics:")
print("Accuracy:", accuracy_score(xgb_y_test, y_pred))
print("Precision:", precision_score(xgb_y_test, y_pred))
print("Recall:", recall_score(xgb_y_test, y_pred))
print("F1 Score:", f1_score(xgb_y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(xgb_y_test, y_pred_proba))

# target_names label the TRUE classes (rows of the report), in sorted label order.
# This notebook treats the probability of class 1 as the default probability
# (see the DEFAULT_PROBA column), so the rows are named accordingly.
target_names = ['No default', 'Default']
print("Classification Report:\n",classification_report(xgb_y_test, y_pred, target_names=target_names))

In [None]:
# Final classifier metrics - confusion matrix
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing them in this order keeps the printed
# counts oriented the same way as the normalised plot below.
cm = confusion_matrix(xgb_y_test, y_pred)
cmd = ConfusionMatrixDisplay.from_predictions(xgb_y_test, y_pred, normalize="true", values_format=".0%")

print(cm)
plt.show()

In [None]:
# ROC curve of the final classifier on the test split.
# roc_curve returns the false-positive rate, true-positive rate and thresholds.
fpr, tpr, thresholds = roc_curve(xgb_y_test, y_pred_proba)

# Plot styling.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Area under the curve, printed above the figure.
roc_area = auc(fpr, tpr)
print('AUC: {}'.format(roc_area))

lw = 2
tick_marks = [step / 20.0 for step in range(21)]

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([-0.05, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_yticks(tick_marks)
ax.set_xticks(tick_marks)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Explain the final model with eli5's feature-weight table.
# Feature names are passed directly from the training frame's columns; a
# DictVectorizer that was never fitted has no feature names to offer.
from eli5 import show_weights
show_weights(final_classifier, feature_names=list(xgb_X_train.columns))

***

In [None]:
# Generate predictions for the original (non-SMOTE) customers.
# The model's columns were selected from the one-hot encoded frame (df3).
# df2 is the frame BEFORE encoding and has no indicator columns such as SEX_1,
# so indexing df2 raises a KeyError whenever a selected feature is an indicator.
# df3 was derived from df2 without reordering rows, so results line up with df2.
scoring_frame = df3[xgb_X_test.columns]
y_pred_proba_fin = final_classifier.predict_proba(scoring_frame)[:, 1]
y_pred_fin = final_classifier.predict(scoring_frame)

In [None]:
# New frame: df2 plus the predicted probability of class 1 for every row.
# `assign` returns a copy, so df2 itself is not modified.
df4 = df2.assign(DEFAULT_PROBA=y_pred_proba_fin)
df4.head()

In [None]:
# The 10 rows with the highest predicted default probability; preview the first 5.
churn_top_10 = df4.nlargest(n=10, columns='DEFAULT_PROBA')
churn_top_10.head(n=5)

In [None]:
# Top losses
# Expected loss per customer = total billed amount x predicted default probability.
# The column is added with `assign` on the column subset, which builds a new
# frame instead of writing into a slice (avoids pandas' SettingWithCopyWarning).
churn_top_10 = (
    churn_top_10[['ID', 'BILL_AMT_TOTAL', 'DEFAULT_PROBA']]
    .assign(EXPECTED_LOSS=lambda top: top['BILL_AMT_TOTAL'] * top['DEFAULT_PROBA'])
)
churn_top_10.sort_values(by='EXPECTED_LOSS', ascending=False)