# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Support Vector Machine
---

# 1. Importing packages & libraries

In [16]:
# import pyforest
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, fbeta_score, roc_auc_score, roc_curve, auc
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.svm import SVC

# 2. Reading the train & test files

In [5]:
# Load the train and test sets from their CSV files (paths are relative to the notebook).
df_train, df_test = map(pd.read_csv, ("../data/train.csv", "../data/test.csv"))

# 3. Separating features and target (train & test sets)

In [6]:
# Features: every column except the target. Target: kept as a single-column DataFrame.
x_train = df_train.drop(columns="attrition_flag")
y_train = df_train.loc[:, ["attrition_flag"]]

x_test = df_test.drop(columns="attrition_flag")
y_test = df_test.loc[:, ["attrition_flag"]]

# 4. Hyperparameter Tuning with GridSearchCV

### Creating Pipeline for GridSearchCV (min-max scaling, SMOTE, SVM classifier)

In [34]:
# imblearn Pipeline: scale features to [0, 1], oversample the minority class with SMOTE,
# then fit the SVM. Bundling the steps lets GridSearchCV re-run all of them per CV fold.
pipeline = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("smote", SMOTE(random_state=2021)),
        ("model", SVC(random_state=2021)),
    ]
)

### Creating parameter grid to be used for GridSearchCV

In [35]:
# Search space for the "model" step of the pipeline (keys use the <step>__<param> convention).
# NOTE(review): per the scikit-learn SVC docs, gamma applies to 'rbf'/'poly'/'sigmoid' only,
# so the 'linear' kernel is presumably re-fitted identically for every gamma value.
param_grid = dict(
    model__C=[0.1, 1, 10, 100, 1000],
    model__gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    model__kernel=['rbf', 'linear', 'sigmoid'],
)

In [36]:
# Print each hyperparameter with its candidate values and count the size of the full grid
# (the product of the number of candidates per hyperparameter).
total_combi = 1
for name in param_grid:
    candidates = param_grid[name]
    print(name, candidates)
    total_combi = total_combi * len(candidates)

print('-----------------')
print('Total combinations:', total_combi)

model__C [0.1, 1, 10, 100, 1000]
model__gamma [1, 0.1, 0.01, 0.001, 0.0001]
model__kernel ['rbf', 'linear', 'sigmoid']
-----------------
Total combinations: 75


### Running GridSearchCV to get best parameters

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, ranking candidates by recall
# and using all available CPU cores (n_jobs=-1).
svm_gridsearch = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# y_train is a single-column DataFrame; flatten it to the 1-D array the estimator expects.
svm_gridsearch.fit(x_train, np.ravel(y_train))

### Getting Best Parameters from GridSearchCV

In [None]:
# Hyperparameter combination selected by the grid search (the search was scored on recall).
best_params = svm_gridsearch.best_params_
print(best_params)

# 5. Model Evaluation

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test set.
svm = svm_gridsearch.best_estimator_

# Hard class predictions are what the threshold-based metrics (recall, F2) need.
y_pred = svm.predict(x_test)

# ROC AUC is a ranking metric: it must be computed from continuous scores, not from the
# thresholded 0/1 predictions (which collapse the ROC curve to a single operating point).
# SVC exposes these scores through decision_function (signed distance to the hyperplane).
y_score = svm.decision_function(x_test)

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
print(f"AUC Score: {roc_auc_score(y_test, y_score)}")
print()

# Misc (to be removed later)
---

In [None]:
# svm = SVC(random_state=2021)

# # Applying SMOTE for oversampling
# oversampler = SMOTE(random_state=2021)
# x_train, y_train = oversampler.fit_resample(x_train, y_train)

# svm.fit(x_train, np.ravel(y_train))
# y_pred = svm.predict(x_test) 

# print(classification_report(y_test, y_pred))

# print(f"------------------------ TEST SCORES -----------------------")
# print(f"Recall: {recall_score(y_test, y_pred)}")
# print(f"Precision: {precision_score(y_test, y_pred)}")
# print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
# print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
# print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
# print()

In [None]:
# for i in range(4):
#     # Separate data into test and training sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)# Train an SVC model using a different kernel
#     svclassifier = getClassifier(i) 
#     svclassifier.fit(X_train, y_train)# Make prediction
#     y_pred = svclassifier.predict(X_test)# Evaluate our model
#     print("Evaluation:", kernals[i], "kernel")
#     print(classification_report(y_test,y_pred))

In [None]:
# def run_models(train_x, y_train, test_x, y_test, variation):

#     x_train = train_x.copy()
#     x_test = test_x.copy()
    
#     if variation == "correlation":
#         x_train.drop( ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'], axis=1, inplace=True)
#         x_test.drop( ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'], axis=1, inplace=True)
        
#     if variation == "keep_yellow_only":
#         x_train.drop( ['total_trans_count', 'months_on_book', 'card_category', 'education_level', 'income_category', 'married', 'single','divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
#         x_test.drop( ['total_trans_count', 'months_on_book', 'card_category', 'education_level', 'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
    
#     if variation == "keep_yellow_blue":
#         x_train.drop( ['total_trans_count', 'months_on_book', 'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
#         x_test.drop( ['total_trans_count', 'months_on_book', 'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
    
#     # need to explore more on various SVM & hyperparameter tuning (WIP)
#     svm = SVC(random_state=2021)
    
#     oversampler = SMOTE(random_state=2021)
#     x_train, y_train = oversampler.fit_resample(x_train, y_train)

#     svm.fit(x_train, np.ravel(y_train))
#     y_pred = svm.predict(x_test)       
    
#     print(f"-------------------------TEST SCORES for {variation}-----------------------")
#     print(f"Recall: {recall_score(y_test, y_pred)}")
#     print(f"Precision: {precision_score(y_test, y_pred)}")
#     print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
#     print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
#     print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
#     print()

# variations = ['correlation', 'keep_yellow_only', 'keep_yellow_blue']

# for variation in variations:
#     run_models(x_train, y_train, x_test, y_test, variation)

In [None]:
# print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

In [None]:
# cm = confusion_matrix(y_test, y_pred, labels=svm.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = svm.classes_)
# disp.plot()

In [None]:
# false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
# print("AUC:", auc(false_positive_rate, true_positive_rate))
# print("Accuracy:", accuracy_score(y_test, y_pred.round()))
# print("Precision:", precision_score(y_test, y_pred.round()))
# print("Recall:", recall_score(y_test, y_pred.round(), average='macro'))
# print("f1_score:", f1_score(y_test, y_pred, average='macro'))

## PCA on Train Set

### Analysis of Explained Variance Ratio w.r.t Number of Components

In [None]:
# pca_test = PCA(n)
# pca_test.fit(x_train)
# sns.set(style='whitegrid')
# plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')
# evr = pca_test.explained_variance_ratio_
# cvr = np.cumsum(pca_test.explained_variance_ratio_)
# pca_df = pd.DataFrame()
# pca_df['Cumulative Variance Ratio'] = cvr
# pca_df['Explained Variance Ratio'] = evr
# display(pca_df)

# # decision boundary of at least 90% of cumulative explained variance
# plt.axhline(color='r', y=0.9)

# # ideal number of components -> 4 components 
# plt.axvline(color='r', linewidth=4, linestyle='--', x=4)