# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Support Vector Machine
---

# 1. Importing packages & libraries

In [1]:
# import pyforest
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, fbeta_score, roc_auc_score
from sklearn.metrics import roc_curve, auc

from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.svm import SVC

# 2. Reading file & tidying up columns

In [2]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# 3. Train test split

In [3]:
y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

y_test = df_test[["attrition_flag"]]
x_test = df_test.drop("attrition_flag", axis=1)

In [7]:
def run_models(train_x, y_train, test_x, y_test, variation):

    x_train = train_x.copy()
    x_test = test_x.copy()
    
    if variation == "correlation":
        x_train.drop( ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'], axis=1, inplace=True)
        x_test.drop( ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'], axis=1, inplace=True)
        
    if variation == "keep_yellow_only":
        x_train.drop( ['total_trans_count', 'months_on_book', 'card_category', 'education_level', 'income_category', 'married', 'single','divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
        x_test.drop( ['total_trans_count', 'months_on_book', 'card_category', 'education_level', 'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
    
    if variation == "keep_yellow_blue":
        x_train.drop( ['total_trans_count', 'months_on_book', 'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
        x_test.drop( ['total_trans_count', 'months_on_book', 'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
    
    # need to explore more on various SVM & hyperparameter tuning (WIP)
    svm = SVC(random_state=2021)
    
    oversampler = SMOTE(random_state=2021)
    x_train, y_train = oversampler.fit_resample(x_train, y_train)

    svm.fit(x_train, np.ravel(y_train))
    y_pred = svm.predict(x_test)       
    
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
    print()

In [8]:
variations = ['correlation', 'keep_yellow_only', 'keep_yellow_blue']

for variation in variations:
    run_models(x_train, y_train, x_test, y_test, variation)

-------------------------TEST SCORES for correlation-----------------------
Recall: 0.72
Precision: 0.5064935064935064
F2-Score: 0.6640181611804766
Accuracy score: 0.8425468904244817
AUC Score: 0.7929805996472663

-------------------------TEST SCORES for keep_yellow_only-----------------------
Recall: 0.7815384615384615
Precision: 0.5759637188208617
F2-Score: 0.7294658242389432
Accuracy score: 0.8726554787759131
AUC Score: 0.8358015646904536

-------------------------TEST SCORES for keep_yellow_blue-----------------------
Recall: 0.72
Precision: 0.5064935064935064
F2-Score: 0.6640181611804766
Accuracy score: 0.8425468904244817
AUC Score: 0.7929805996472663



# 6. Model Evaluation

In [None]:
# print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

In [None]:
# cm = confusion_matrix(y_test, y_pred, labels=svm.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = svm.classes_)
# disp.plot()

In [None]:
# false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
# print("AUC:", auc(false_positive_rate, true_positive_rate))
# print("Accuracy:", accuracy_score(y_test, y_pred.round()))
# print("Precision:", precision_score(y_test, y_pred.round()))
# print("Recall:", recall_score(y_test, y_pred.round(), average='macro'))
# print("f1_score:", f1_score(y_test, y_pred, average='macro'))

## PCA on Train Set

### Analysis of Explained Variance Ratio w.r.t Number of Components

In [None]:
# pca_test = PCA(n)
# pca_test.fit(x_train)
# sns.set(style='whitegrid')
# plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')
# evr = pca_test.explained_variance_ratio_
# cvr = np.cumsum(pca_test.explained_variance_ratio_)
# pca_df = pd.DataFrame()
# pca_df['Cumulative Variance Ratio'] = cvr
# pca_df['Explained Variance Ratio'] = evr
# display(pca_df)

# # decision boundary of at least 90% of cumulative explained variance
# plt.axhline(color='r', y=0.9)

# # ideal number of components -> 4 components 
# plt.axvline(color='r', linewidth=4, linestyle='--', x=4)