# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Feature Selection variation
---

# 1. Importing packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the train and test splits from CSV files under ../data.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# The target is selected with double brackets, so y_* are single-column
# DataFrames rather than Series.
y_train, y_test = df_train[["attrition_flag"]], df_test[["attrition_flag"]]

# Features are every remaining column; df_train / df_test are left untouched.
x_train = df_train.drop(columns="attrition_flag")
x_test = df_test.drop(columns="attrition_flag")

# 2. Feature Selection test

In [3]:
# Columns removed from the feature matrices for each feature-selection variation.
# A variation that is not listed here (e.g. 'base') keeps every column.
_DROPPED_COLUMNS_BY_VARIATION = {
    'correlation': [
        'avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book',
    ],
    'keep_yellow_only': [
        'total_trans_count', 'months_on_book', 'card_category', 'education_level',
        'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age',
    ],
    'keep_yellow_blue': [
        'total_trans_count', 'months_on_book', 'income_category', 'married', 'single', 'divorced',
        'avg_open_to_buy', 'customer_age',
    ],
}


def run_models(x, y, xt, yt, variation, model):
    """Fit `model` on one feature-selection variation and print its test scores.

    The columns listed for `variation` are dropped from copies of `x` and `xt`
    (the inputs themselves are never modified), the training data is
    oversampled with SMOTE, `model` is fitted on the oversampled data and then
    scored on the test data.

    Args:
        x: Training features (DataFrame).
        y: Training target; must expose `.values` after resampling
            (DataFrame or Series).
        xt: Test features (DataFrame) with the same columns as `x`.
        yt: Test target used as ground truth for every printed metric.
        variation: Name of the feature-selection variation. Names without an
            entry in `_DROPPED_COLUMNS_BY_VARIATION` keep all columns.
        model: Estimator exposing `fit(X, y)` and `predict(X)`. It is fitted
            in place.

    Returns:
        None. Recall, precision, F2, accuracy and AUC are printed to stdout.

    Raises:
        KeyError: If a column that `variation` drops is missing from `x` or `xt`.
    """
    dropped = _DROPPED_COLUMNS_BY_VARIATION.get(variation, [])

    # Fail early with an actionable message instead of the bare pandas error,
    # so a column-name mismatch with the CSV is easy to diagnose.
    for frame_name, frame in (("x", x), ("xt", xt)):
        missing = [col for col in dropped if col not in frame.columns]
        if missing:
            raise KeyError(
                f"Variation '{variation}' drops columns that are missing from {frame_name}: "
                f"{missing}. Available columns: {list(frame.columns)}"
            )

    # DataFrame.drop returns a new frame, so the caller's data stays intact.
    x_train = x.drop(columns=dropped)
    x_test = xt.drop(columns=dropped)

    # Oversample the training data only; the test data is left as-is.
    oversampler = SMOTE(random_state=2021)
    x_resampled, y_resampled = oversampler.fit_resample(x_train, y)

    # ravel() flattens a single-column target into the 1-D array passed to fit.
    model.fit(x_resampled, y_resampled.values.ravel())
    y_pred = model.predict(x_test)

    # Score against the `yt` argument (not a notebook-level global) so the
    # function evaluates whichever test set the caller actually passed in.
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(yt, y_pred)}")
    print(f"Precision: {precision_score(yt, y_pred)}")
    print(f"F2-Score: {fbeta_score(yt, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(yt, y_pred)}")
    # NOTE(review): AUC is computed from the hard class predictions returned by
    # model.predict, not from probabilities/decision scores. Kept as-is so the
    # reported numbers stay comparable; consider scoring with predict_proba.
    print(f"AUC Score: {roc_auc_score(yt, y_pred)}")
    print()

In [4]:
# Compare every feature-selection variation using logistic regression.
# LogisticRegression is imported in the notebook's top import cell.
variations = ['base', 'correlation', 'keep_yellow_only', 'keep_yellow_blue']

for variation in variations:
    # A fresh estimator per variation, so no fitted state carries over.
    log_reg = LogisticRegression(solver='lbfgs', max_iter=500, random_state=2021)
    run_models(x_train, y_train, x_test, y_test, variation, log_reg)

-------------------------TEST SCORES for base-----------------------
Recall: 0.6923076923076923
Precision: 0.41589648798521256
F2-Score: 0.6110809342748506
Accuracy score: 0.7946692991115498
AUC Score: 0.7532673088228644



KeyError: "['avg_open_to_buy' 'customer_age' 'total_trans_count' 'months_on_book'] not found in axis"

In [None]:
# Repeat the feature-selection comparison with a random forest.
variations = ['base', 'correlation', 'keep_yellow_only', 'keep_yellow_blue']

for variation in variations:
    # A fresh estimator per variation, so no fitted state carries over.
    forest = RandomForestClassifier(random_state=2021)
    run_models(x_train, y_train, x_test, y_test, variation, forest)