# Combine Cross Validation with Balancing Method
Data : bankloan.csv
- Build a logistics regression model
    - Target : default
    - Features : employ, debtinc, creddebt, othdebt
- Random state 2020, ratio 80% : 20%
- Modeling evaluate by f1 score and use stratified 5-fold CV:
    - Penalized logistic regression
    - Logistic regression with SMOTE
- Which method is better?

> ## Library and Data

In [1]:
!pip install imblearn



In [18]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report, f1_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
bankloan = pd.read_csv(r'C:\Users\user\Documents\Data Science\MODUL 3\What is Classification_\bankloan.csv')
bankloan

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


> ## Data Splitting

In [4]:
x = bankloan[['employ', 'debtinc', 'creddebt', 'othdebt']]
y = bankloan['default']

In [5]:
x_trainval, x_test, y_trainval, y_test = train_test_split(x,
                                                         y,
                                                         stratify = y,
                                                         test_size = 0.2,
                                                         random_state = 2020)

> ## Penalized Logistic Regression

In [6]:
model = LogisticRegression(class_weight = 'balanced')
skf = StratifiedKFold(n_splits = 5)
model_cv = cross_val_score(model, x_trainval, y_trainval, cv = skf, scoring = 'f1')

In [14]:
print(model_cv)
print(model_cv.mean())
print(model_cv.std())

[0.63888889 0.58823529 0.61728395 0.58064516 0.60526316]
0.6060632905617759
0.020822978090423456


> ## Logistic Regression with SMOTE

In [8]:
smote = SMOTE()
model = LogisticRegression()

estimator = Pipeline([('balancing',smote),('clf',model)])

skf = StratifiedKFold(n_splits = 5) 

In [12]:
model_smote_cv = cross_val_score(estimator, x_trainval, y_trainval, cv = skf, scoring = 'f1')

In [15]:
print(model_smote_cv)
print(model_smote_cv.mean())
print(model_smote_cv.std())

[0.65714286 0.61538462 0.625      0.6031746  0.61333333]
0.6228070818070818
0.01851503979634336


The better model is the penalized model

> ## Final Model Performance

In [17]:
smote = SMOTE()
model =  LogisticRegression()

estimator = Pipeline([('balancing',smote),('clf',model)])

estimator.fit(x_trainval,y_trainval)
y_pred = estimator.predict(x_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.71      0.80       103
           1       0.50      0.81      0.62        37

    accuracy                           0.74       140
   macro avg       0.71      0.76      0.71       140
weighted avg       0.80      0.74      0.75       140



_______
# Hyperparameter Tuning
Data : bankloan.csv
- Build a logistics regression model
    - Target : default
    - Features : employ, debtinc, creddebt, othdebt
- Random state 2020, ratio 80% : 20%
- Modeling evaluate by f1 score and use stratified 5-fold CV:
- Logistic Regression with SMOTE optimize the k-neighbour, optimize c solver
- Compare the result before and after

> ## Model

In [43]:
# model
smote = SMOTE()
model = LogisticRegression()

estimator = Pipeline([('balancing', smote),('clf', model)])

# evaluation method
skf = StratifiedKFold(n_splits = 5) 

# hyperparameter
param = {
    'balancing__k_neighbors':[2,5,10,15,20],
    'clf__C':[100, 10, 1, 0.1, 0.01, 0.001],
    'clf__solver':['lbfgs', 'liblinear', 'newton-cg']
}

# hyperparameter tuning
grid_search = GridSearchCV(estimator, param_grid = param, cv = skf, scoring = 'f1', n_jobs = -1)

In [44]:
grid_search.fit(x_trainval, y_trainval)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('balancing', SMOTE()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'balancing__k_neighbors': [2, 5, 10, 15, 20],
                         'clf__C': [100, 10, 1, 0.1, 0.01, 0.001],
                         'clf__solver': ['lbfgs', 'liblinear', 'newton-cg']},
             scoring='f1')

In [33]:
print('best score :', grid_search.best_score_)
print('best param :', grid_search.best_params_)

best score : 0.6186600102406554
best param : {'balancing__k_neighbors': 20, 'clf__C': 0.1, 'clf__solver': 'lbfgs'}


In [35]:
tuning_result = pd.DataFrame(grid_search.cv_results_)

tuning_result[tuning_result['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balancing__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
81,0.043884,0.025939,0.006382,0.002325,20,0.1,lbfgs,"{'balancing__k_neighbors': 20, 'clf__C': 0.1, ...",0.638889,0.603175,0.625,0.612903,0.613333,0.61866,0.012253,1


In [37]:
tuning_result[(tuning_result['param_balancing__k_neighbors'] == 10) & (tuning_result['param_clf__C'] == 1) & (tuning_result['param_clf__solver'] == 'lbfgs')]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balancing__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
42,0.045878,0.023746,0.00758,0.003064,10,1,lbfgs,"{'balancing__k_neighbors': 10, 'clf__C': 1, 'c...",0.628571,0.5625,0.6,0.645161,0.605263,0.608299,0.028091,37


> ## Comparison (Before & After)

In [38]:
smote = SMOTE()
model = LogisticRegression()

estimator = Pipeline([('balancing', smote),('clf', model)])

In [41]:
estimator.fit(x_trainval, y_trainval)
y_pred = estimator.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.70      0.79       103
           1       0.48      0.78      0.60        37

    accuracy                           0.72       140
   macro avg       0.69      0.74      0.69       140
weighted avg       0.79      0.72      0.74       140



In [42]:
estimator = grid_search.best_estimator_

estimator.fit(x_trainval, y_trainval)
y_pred = estimator.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.71      0.81       103
           1       0.52      0.89      0.66        37

    accuracy                           0.76       140
   macro avg       0.74      0.80      0.74       140
weighted avg       0.84      0.76      0.77       140

