# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Support Vector Machine
---

# 1. Importing packages & libraries

In [7]:
# import pyforest
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, fbeta_score, roc_auc_score, roc_curve, auc
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_validate, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

# 2. Reading file & tidying up columns for train test split

In [4]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

y_test = df_test[["attrition_flag"]]
x_test = df_test.drop("attrition_flag", axis=1)

# 3. Model performance before Hyperparameter Tuning

### Creating function to evaluate model

In [9]:
def cv_evaluate_model(rf_clf):
    smote_sampler = SMOTE(random_state=2021)

    scale_features = x_train.columns[1:]

    scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

    pipeline = Pipeline(steps = [['scaler', scaler],
                                 ['smote', smote_sampler],
                                 ['classifier', rf_clf]])

    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

    scoring = {"recall": 'recall',
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    recall = [ val for val in scores['test_recall'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]
    recall.append( sum(recall) / len(recall) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3','Fold 4','Fold 5', 'Average'],
                            index=['Recall','Fbeta2','AUC'])
    return score_df


### Evaluating base model

In [18]:
base_svc = SVC(random_state=2021)

results_base = cv_evaluate_model(base_svc)
display(results_base)

# retrieves average performance
results_base_avg = results_base['Average'] 
print("---- Average Results -----")
print()
display(results_base_avg)


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Average
Recall,0.850575,0.8,0.819231,0.784615,0.827586,0.816401
Fbeta2,0.764463,0.724739,0.754784,0.709812,0.752613,0.741282
AUC,0.856905,0.83125,0.851527,0.821352,0.849408,0.842088


---- Average Results -----



Recall    0.816401
Fbeta2    0.741282
AUC       0.842088
Name: Average, dtype: float64

# 4. Hyperparameter Tuning with GridSearchCV

## 4.1 First GridSearchCV

### Creating parameter grid to be used for GridSearchCV

In [24]:
pipeline = Pipeline([["scaler", MinMaxScaler()],
                     ["smote", SMOTE(random_state = 2021)],
                     ["model", SVC(random_state = 2021)]])

In [25]:
param_grid =  { 'model__C': [0.1, 1, 10, 100, 1000],  
               'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
               'model__kernel': ['rbf','linear','sigmoid']  }

In [30]:
total_combi = 1
for param, value in param_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Total combinations across 5-folds:', total_combi*5)

model__C [0.1, 1, 10, 100, 1000]
model__gamma [1, 0.1, 0.01, 0.001, 0.0001]
model__kernel ['rbf', 'linear', 'sigmoid']
-----------------
Total combinations: 75
Total combinations across 5-folds: 375


### Running First GridSearchCV to get best parameters

In [29]:
svc_gridsearch1 = GridSearchCV(pipeline, param_grid, scoring = 'recall', cv = 5, n_jobs = -1, verbose = 1)
svc_gridsearch1.fit(x_train, np.ravel(y_train))
best_params = svc_gridsearch1.best_params_
print("Best Parameters: ", best_params)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
Best Parameters:  {'model__C': 1000, 'model__gamma': 0.1, 'model__kernel': 'rbf'}


### Model Evaluation with best parameters from GridSearchCV 1


In [35]:
svc1 = SVC(C=1000, gamma=0.1, kernel='rbf')
svc1_results = cv_evaluate_model(svc1)
display(svc1_results)

# retrieves average performance
svc1_results_avg = svc1_results['Average'] 
print("---- Average Results -----")
print()
display(svc1_results_avg)


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Average
Recall,0.827586,0.8,0.826923,0.8,0.835249,0.817952
Fbeta2,0.75,0.723227,0.761331,0.718728,0.756944,0.742046
AUC,0.847617,0.830147,0.855741,0.826838,0.852135,0.842496


---- Average Results -----



Recall    0.817952
Fbeta2    0.742046
AUC       0.842496
Name: Average, dtype: float64

## 4.2 Second GridSearchCV

### Creating parameter grid to be used for GridSearchCV 2

From GridSearchCV 1, we've obtained the best parameters which are: C = 1000, gamma = 0.1, kernel = rbf. We can further narrow the search space and run a second GridSearchCV to obtain the best parameters. 


In [39]:
param_grid =  { 'model__C': [100, 1000, 10000],  
               'model__gamma': [1, 0.1, 0.01], 
               'model__kernel': ['rbf']  }

In [40]:
total_combi = 1
for param, value in param_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Total combinations across 5-folds:', total_combi*5)

model__C [100, 1000, 10000]
model__gamma [1, 0.1, 0.01]
model__kernel ['rbf']
-----------------
Total combinations: 9
Total combinations across 5-folds: 45


### Running Second GridSearchCV to get best parameters

In [41]:
svm_gridsearch2 = GridSearchCV(pipeline, param_grid, scoring = 'recall', cv = 5, n_jobs = -1, verbose = 1)
svm_gridsearch2.fit(x_train, np.ravel(y_train))
best_params = svm_gridsearch2.best_params_
print("Best Parameters: ", best_params)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters:  {'model__C': 1000, 'model__gamma': 0.1, 'model__kernel': 'rbf'}


### Model Evaluation

In [42]:
svc2 = SVC(C=1000, gamma=0.1, kernel='rbf')
svc2_results = cv_evaluate_model(svc2)
display(svc2_results)

# retrieves average performance
svc2_results_avg = svc2_results['Average'] 
print("---- Average Results -----")
print()
display(svc2_results_avg)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Average
Recall,0.827586,0.8,0.826923,0.8,0.835249,0.817952
Fbeta2,0.75,0.723227,0.761331,0.718728,0.756944,0.742046
AUC,0.847617,0.830147,0.855741,0.826838,0.852135,0.842496


---- Average Results -----



Recall    0.817952
Fbeta2    0.742046
AUC       0.842496
Name: Average, dtype: float64