# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Logistic Regression
---

# 1. Importing packages and libraries

In [77]:
import pandas as pd

from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold


import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides scikit-learn's
# convergence and failed-fit warnings raised during the grid searches further
# down — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# 2. Reading file and tidying up columns

In [78]:
# Read the train and test CSV files.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Separate the target from the features. The target is selected with a list,
# so it stays a single-column DataFrame rather than a Series.
y_train, x_train = df_train[["attrition_flag"]], df_train.drop(columns="attrition_flag")
y_test, x_test = df_test[["attrition_flag"]], df_test.drop(columns="attrition_flag")

In [79]:
def print_combis(param_grid):
    '''
    Show every hyper-parameter with its candidate values, then the grid size.

    param_grid: mapping of parameter name -> sequence of candidate values.

    The grid size is the product of the number of candidates per parameter
    (1 for an empty mapping). Everything is printed; nothing is returned.
    '''
    grid_size = 1
    for name, candidates in param_grid.items():
        print(name, candidates)
        grid_size = grid_size * len(candidates)

    print('-----------------')
    print('Total combinations:', grid_size)

def print_scores(y_test, y_pred, y_score=None):
    '''
    Print evaluation metrics (recall, F2-score and ROC AUC) for a set of predictions.

    y_test  : true labels.
    y_pred  : predicted class labels; used for recall and the F2-score.
    y_score : optional continuous scores for the positive class (for example
              predict_proba(...)[:, 1] or decision_function output), used for
              the AUC. When omitted, the AUC is computed from y_pred exactly
              as before, which reduces the ROC curve to a single threshold.

    Nothing is returned; the metrics are written to stdout.
    '''
    # ROC AUC is a ranking metric, so prefer continuous scores when the caller
    # supplies them and fall back to the hard labels otherwise.
    auc_input = y_pred if y_score is None else y_score

    print("-------------------------TEST SCORES-----------------------")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
    print(f"AUC Score: {roc_auc_score(y_test, auc_input)}")

# 3. Hyperparameter tuning with GridSearchCV

In [80]:
# Modelling pipeline: min-max scaling -> SMOTE oversampling -> logistic regression.
# Only the columns from the second one onward are handed to the scaler.
# NOTE(review): the ColumnTransformer keeps its default `remainder`, so the
# first column of x_train presumably never reaches the classifier — confirm
# that this is intended.
pipeline = Pipeline(
    steps=[
        (
            'scaler',
            ColumnTransformer(
                transformers=[('scaler', MinMaxScaler(), x_train.columns[1:])]
            ),
        ),
        ('smote', SMOTE(random_state=2021)),
        ('classifier', LogisticRegression(random_state=2021)),
    ]
)

# List the parameter names that a grid search can address.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'smote', 'classifier', 'scaler__n_jobs', 'scaler__remainder', 'scaler__sparse_threshold', 'scaler__transformer_weights', 'scaler__transformers', 'scaler__verbose', 'scaler__verbose_feature_names_out', 'scaler__scaler', 'scaler__scaler__clip', 'scaler__scaler__copy', 'scaler__scaler__feature_range', 'smote__k_neighbors', 'smote__n_jobs', 'smote__random_state', 'smote__sampling_strategy', 'classifier__C', 'classifier__class_weight', 'classifier__dual', 'classifier__fit_intercept', 'classifier__intercept_scaling', 'classifier__l1_ratio', 'classifier__max_iter', 'classifier__multi_class', 'classifier__n_jobs', 'classifier__penalty', 'classifier__random_state', 'classifier__solver', 'classifier__tol', 'classifier__verbose', 'classifier__warm_start'])

## 3.1 Gridsearch 1

In [81]:
# First, coarse search space. The `classifier__` prefix routes each entry to
# the pipeline step named 'classifier'.
# NOTE(review): the grid is a full cross product, so it contains penalty/solver
# pairings that the scikit-learn LogisticRegression docs list as unsupported —
# confirm how those candidates are scored before relying on the count below.
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__fit_intercept': [True, False],
    'classifier__max_iter': list(range(100, 1001, 100)),
    'classifier__multi_class': ['auto', 'ovr', 'multinomial'],
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

print_combis(param_grid)

classifier__C [0.001, 0.01, 0.1, 1, 10, 100, 1000]
classifier__fit_intercept [True, False]
classifier__max_iter [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
classifier__multi_class ['auto', 'ovr', 'multinomial']
classifier__penalty ['l1', 'l2', 'elasticnet', 'none']
classifier__solver ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
-----------------
Total combinations: 8400


In [82]:
# Exhaustive search over param_grid. Candidates are ranked by recall under a
# shuffled, stratified 5-fold split with a fixed seed, and the winner is refit
# afterwards (refit=True).
logreg_gridsearch = GridSearchCV(
    pipeline,
    param_grid,
    scoring='recall',
    refit=True,
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2021),
)

In [83]:
# Run the search on the training data. y_train is a single-column DataFrame,
# so it is flattened to a 1-D array before being passed as the target.
logreg_gridsearch.fit(x_train, y_train.values.ravel())

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2021, shuffle=True),
             estimator=Pipeline(steps=[('scaler',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         MinMaxScaler(),
                                                                         Index(['dependent_count', 'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'total_amt_change_q4_q1', 'total_trans_amt', '...
             param_grid={'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'classifier__fit_intercept': [True, False],
                         'classifier__max_iter': [100, 200, 300, 400, 500, 600,
                                                  700, 800, 900, 1000],
                         'classifier__multi_class': ['auto', 'ovr',
                                                  

In [84]:
# Parameter combination the search selected (it was scored on recall).
best_params = logreg_gridsearch.best_params_
print(best_params)

{'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}


In [85]:
# Take the refit best pipeline from the first search and predict on the test set.
logreg_clf = logreg_gridsearch.best_estimator_
y_pred = logreg_clf.predict(x_test)

# The predictions passed here are class labels, so the AUC that print_scores
# reports is computed from labels rather than from probabilities.
print_scores(y_test, y_pred)

-------------------------TEST SCORES-----------------------
Recall: 0.7107692307692308
F2-Score: 0.630802839978154
AUC Score: 0.7672011938678606


## 3.2 Gridsearch 2

Previous best params: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}

In [86]:
# Second search space: C moves up from the previous best value (1) and
# max_iter is sampled around the previous best (100); the categorical options
# are unchanged.
# NOTE(review): as a full cross product this still contains penalty/solver
# pairings that the scikit-learn LogisticRegression docs list as unsupported —
# confirm how those candidates are scored.
param_grid2 = {
    'classifier__C': list(range(1, 6)),
    'classifier__fit_intercept': [True, False],
    'classifier__max_iter': list(range(70, 131, 10)),
    'classifier__multi_class': ['auto', 'ovr', 'multinomial'],
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

print_combis(param_grid2)

classifier__C [1, 2, 3, 4, 5]
classifier__fit_intercept [True, False]
classifier__max_iter [70, 80, 90, 100, 110, 120, 130]
classifier__multi_class ['auto', 'ovr', 'multinomial']
classifier__penalty ['l1', 'l2', 'elasticnet', 'none']
classifier__solver ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
-----------------
Total combinations: 4200


In [87]:
# Same search setup as the first round, now over param_grid2: recall scoring,
# shuffled stratified 5-fold CV with a fixed seed, winner refit afterwards.
logreg_gridsearch2 = GridSearchCV(
    pipeline,
    param_grid2,
    scoring='recall',
    refit=True,
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2021),
)

In [88]:
# Run the second search on the training data; the single-column target frame
# is flattened to a 1-D array first.
logreg_gridsearch2.fit(x_train, y_train.values.ravel())

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2021, shuffle=True),
             estimator=Pipeline(steps=[('scaler',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         MinMaxScaler(),
                                                                         Index(['dependent_count', 'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'total_amt_change_q4_q1', 'total_trans_amt', '...
                                        LogisticRegression(random_state=2021))]),
             n_jobs=-1,
             param_grid={'classifier__C': [1, 2, 3, 4, 5],
                         'classifier__fit_intercept': [True, False],
                         'classifier__max_iter': [70, 80, 90, 100, 110, 120,
                                                  130],
                         'classifier__multi_class': 

In [89]:
# Parameter combination selected by the second search (scored on recall).
best_params2 = logreg_gridsearch2.best_params_
print(best_params2)

{'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__max_iter': 70, 'classifier__multi_class': 'auto', 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}


In [90]:
# Refit best pipeline from the second search.
logreg_clf2 = logreg_gridsearch2.best_estimator_

# Class-label predictions for the test set.
y_pred2 = logreg_clf2.predict(x_test)

# Report recall, F2 and AUC; the AUC is computed from the class labels above.
print_scores(y_test, y_pred2)

-------------------------TEST SCORES-----------------------
Recall: 0.7107692307692308
F2-Score: 0.630802839978154
AUC Score: 0.7672011938678606


## 3.3 Gridsearch 3

Previous best params: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__max_iter': 70, 'classifier__multi_class': 'auto', 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}

In [91]:
# Third search space: C is sampled at and below the previous best value (1)
# and max_iter at and below the previous best (70); the categorical options
# are unchanged.
# NOTE(review): the low max_iter values may stop solvers before convergence,
# and the cross product still contains penalty/solver pairings that the
# scikit-learn LogisticRegression docs list as unsupported — confirm both.
param_grid3 = {
    'classifier__C': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__fit_intercept': [True, False],
    'classifier__max_iter': list(range(10, 71, 10)),
    'classifier__multi_class': ['auto', 'ovr', 'multinomial'],
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

print_combis(param_grid3)

classifier__C [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
classifier__fit_intercept [True, False]
classifier__max_iter [10, 20, 30, 40, 50, 60, 70]
classifier__multi_class ['auto', 'ovr', 'multinomial']
classifier__penalty ['l1', 'l2', 'elasticnet', 'none']
classifier__solver ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
-----------------
Total combinations: 5040


In [92]:
# Same search setup as the earlier rounds, now over param_grid3: recall
# scoring, shuffled stratified 5-fold CV with a fixed seed, winner refit.
logreg_gridsearch3 = GridSearchCV(
    pipeline,
    param_grid3,
    scoring='recall',
    refit=True,
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2021),
)

In [93]:
# Run the third search on the training data; the single-column target frame
# is flattened to a 1-D array first.
logreg_gridsearch3.fit(x_train, y_train.values.ravel())

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2021, shuffle=True),
             estimator=Pipeline(steps=[('scaler',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         MinMaxScaler(),
                                                                         Index(['dependent_count', 'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'total_amt_change_q4_q1', 'total_trans_amt', '...
                                        LogisticRegression(random_state=2021))]),
             n_jobs=-1,
             param_grid={'classifier__C': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                         'classifier__fit_intercept': [True, False],
                         'classifier__max_iter': [10, 20, 30, 40, 50, 60, 70],
                         'classifier__multi_class': ['auto', 'ovr',
                       

In [94]:
# Parameter combination selected by the third search (scored on recall).
best_params3 = logreg_gridsearch3.best_params_
print(best_params3)

{'classifier__C': 0.5, 'classifier__fit_intercept': True, 'classifier__max_iter': 10, 'classifier__multi_class': 'auto', 'classifier__penalty': 'l2', 'classifier__solver': 'sag'}


In [95]:
# Refit best pipeline from the third search.
logreg_clf3 = logreg_gridsearch3.best_estimator_

# Class-label predictions for the test set.
y_pred3 = logreg_clf3.predict(x_test)

# Report recall, F2 and AUC; the AUC is computed from the class labels above.
print_scores(y_test, y_pred3)

-------------------------TEST SCORES-----------------------
Recall: 0.7169230769230769
F2-Score: 0.6328082563824009
AUC Score: 0.76792655903767


# 4. Results

**Obtained results:**

| Metric            | GridSearch1 | GridSearch2 | GridSearch3 |
|-------------------|:-----------:|:-----------:|:-----------:|
|     Recall        |      0.7107692307692308      |      0.7107692307692308      |     0.7169230769230769      |  
|     F2-Score      |      0.630802839978154      |      0.630802839978154      |     0.6328082563824009      |
|     AUC Score     |     0.7672011938678606      |      0.7672011938678606      |     0.76792655903767      |