# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Logistic Regression
---

# 1. Importing packages and libraries

In [1]:
import pandas as pd

from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# from sklearn.compose import ColumnTransformer

# 2. Reading file and tidying up columns

In [2]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

y_test = df_test[["attrition_flag"]]
x_test = df_test.drop("attrition_flag", axis=1)

# 3. Hyper paramter tuning with GridSearchCV

## 3.1 Gridsearch 1

In [3]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('smote', SMOTE(random_state = 2021)), 
    ('classifier', LogisticRegression(random_state=2021))
])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'smote', 'classifier', 'scaler__clip', 'scaler__copy', 'scaler__feature_range', 'smote__k_neighbors', 'smote__n_jobs', 'smote__random_state', 'smote__sampling_strategy', 'classifier__C', 'classifier__class_weight', 'classifier__dual', 'classifier__fit_intercept', 'classifier__intercept_scaling', 'classifier__l1_ratio', 'classifier__max_iter', 'classifier__multi_class', 'classifier__n_jobs', 'classifier__penalty', 'classifier__random_state', 'classifier__solver', 'classifier__tol', 'classifier__verbose', 'classifier__warm_start'])

In [4]:
param_grid = {
    'classifier__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__fit_intercept' : [True, False],
    'classifier__max_iter' : [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'classifier__multi_class' : ['auto', 'ovr', 'multinomial'],
    'classifier__penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

total_combi = 1
for param, value in param_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__C [0.001, 0.01, 0.1, 1, 10, 100, 1000]
classifier__fit_intercept [True, False]
classifier__max_iter [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
classifier__multi_class ['auto', 'ovr', 'multinomial']
classifier__penalty ['l1', 'l2', 'elasticnet', 'none']
classifier__solver ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
-----------------
Total combinations: 8400


In [5]:
logreg_gridsearch = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    scoring = 'recall',
    cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021),
    refit = True,
    n_jobs = -1
)

In [6]:
logreg_gridsearch.fit(x_train, y_train.values.ravel())

20300 fits failed out of a total of 42000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2100 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/joshuawong/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/joshuawong/miniforge3/envs/tensorflow/lib/python3.9/site-packages/imblearn/pipeline.py", line 266, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/Users/joshuawong/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.d

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2021, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('smote', SMOTE(random_state=2021)),
                                       ('classifier',
                                        LogisticRegression(random_state=2021))]),
             n_jobs=-1,
             param_grid={'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'classifier__fit_intercept': [True, False],
                         'classifier__max_iter': [100, 200, 300, 400, 500, 600,
                                                  700, 800, 900, 1000],
                         'classifier__multi_class': ['auto', 'ovr',
                                                     'multinomial'],
                         'classifier__penalty': ['l1', 'l2', 'elasticnet',
                                                 'none'],
                         'classifier__solver': [

In [7]:
best_params = logreg_gridsearch.best_params_
print(best_params)

{'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}


In [8]:
logreg_clf = logreg_gridsearch.best_estimator_

y_pred = logreg_clf.predict(x_test)

print(f"-------------------------TEST SCORES-----------------------")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
print()

-------------------------TEST SCORES-----------------------
Recall: 0.7076923076923077
F2-Score: 0.6304824561403508
AUC Score: 0.7674264007597342



## 3.2 Gridsearch 2