In [1]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import make_pipeline

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('../datasets/BankChurners.csv')
list = ['Attrition_Flag', 'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Contacts_Count_12_mon', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Months_on_book']
data = data[list]
np.random.seed(42)
random.seed(42)

In [3]:
object_columns = data.select_dtypes('object').columns
for i in object_columns:
    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']



## Under Sampling

In [4]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i == OneSidedSelection:
        sampling = i(random_state=42)
    else:
        sampling = i()
        
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    xgb = XGBClassifier(random_state=42)
    xgb.fit(x_train_ss, y_train)
    pred = xgb.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("XGB Model's score by Under sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

XGB Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.977153,0.981754,0.972595,0.9615
TomekLinks,0.976009,0.981754,0.970332,0.959526
NeighbourhoodCleaningRule,0.974919,0.972337,0.977515,0.958045
EditedNearestNeighbours,0.971768,0.962331,0.981393,0.95311
AllKNN,0.971174,0.961742,0.980792,0.952122
RepeatedEditedNearestNeighbours,0.968647,0.954679,0.98303,0.948174
NearMiss,0.918851,0.866392,0.978073,0.871668


## Over Sampling

In [5]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

        xgb = XGBClassifier(random_state=42)
        xgb.fit(x_train_ss, y_train)
        pred = xgb.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("XGB Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

XGB Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTE(0.3),0.970632,0.962919,0.978469,0.951135
SMOTE(0.4),0.959419,0.932313,0.988147,0.93386
ADASYN(0.3),0.959153,0.932902,0.986924,0.933366
BorderlineSMOTE(0.3),0.953865,0.918776,0.991741,0.925469
BorderlineSMOTE(0.4),0.93818,0.888758,0.993421,0.901777
SMOTE(0.6),0.936725,0.888758,0.990164,0.899309
SMOTE(0.5),0.927518,0.869923,0.99328,0.885982
BorderlineSMOTE(0.5),0.924238,0.865215,0.991903,0.881046
SMOTE(0.7),0.912752,0.840494,0.998601,0.865252
ADASYN(0.4),0.903951,0.828134,0.99505,0.852419


## Combine Sampling

In [6]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    xgb = XGBClassifier(random_state=42)
    xgb.fit(x_train_ss, y_train)
    pred = xgb.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("XGB Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

XGB Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTEENN,0.916375,0.848146,0.996542,0.870188
SMOTETomek,0.871981,0.77575,0.995468,0.808983


## Pipeline을 통한 Combine Sampling

In [7]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

xgb = XGBClassifier()

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:
            
            if under == OneSidedSelection:
                under_ = under(random_state=42)
            else:
                under_ = under()

            over_ = over(sampling_strategy=strategy, random_state=42)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, xgb)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("XGB Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

XGB Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomekLinks(0.3),0.970614,0.962331,0.979042,0.951135
SMOTEOneSidedSelection(0.3),0.968815,0.959976,0.977818,0.948174
SMOTEEditedNearestNeighbours(0.3),0.967974,0.951736,0.984775,0.947187
SMOTENeighbourhoodCleaningRule(0.3),0.966895,0.954091,0.980048,0.945212
SMOTEAllKNN(0.3),0.963877,0.942319,0.986445,0.94077
SMOTEOneSidedSelection(0.4),0.963008,0.942319,0.984625,0.939289
SMOTETomekLinks(0.4),0.962451,0.942908,0.982822,0.938302
BorderlineSMOTENeighbourhoodCleaningRule(0.3),0.962094,0.941142,0.984,0.937808
BorderlineSMOTETomekLinks(0.3),0.961979,0.938199,0.986997,0.937808
ADASYNOneSidedSelection(0.3),0.961643,0.937022,0.987593,0.937315


## Under, Over, Combine Sampling과 Pipeline을 통한 Combine Sampling 전체 결과

In [8]:
sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i in [OneSidedSelection, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]:
        sampling = i(random_state=42)
    else:
        sampling = i()

    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    xgb = XGBClassifier(random_state=42)
    xgb.fit(x_train_ss, y_train)
    pred = xgb.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[i.__name__] = [f1, recall, precision, acc]

print("XGB Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

XGB Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.977153,0.981754,0.972595,0.9615
TomekLinks,0.976009,0.981754,0.970332,0.959526
NeighbourhoodCleaningRule,0.974919,0.972337,0.977515,0.958045
EditedNearestNeighbours,0.971768,0.962331,0.981393,0.95311
AllKNN,0.971174,0.961742,0.980792,0.952122
SMOTETomekLinks(0.3),0.970614,0.962331,0.979042,0.951135
SMOTEOneSidedSelection(0.3),0.968815,0.959976,0.977818,0.948174
RepeatedEditedNearestNeighbours,0.968647,0.954679,0.98303,0.948174
SMOTEEditedNearestNeighbours(0.3),0.967974,0.951736,0.984775,0.947187
SMOTENeighbourhoodCleaningRule(0.3),0.966895,0.954091,0.980048,0.945212


## No Sampling

In [9]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
NO_Sampling = pd.DataFrame(columns=columns)

x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.fit_transform(x_test)


xgb = XGBClassifier(random_state=42)
xgb.fit(x_train, y_train)
pred = xgb.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

NO_Sampling.loc['No Sampling'] = [f1, recall, precision, acc]

print("XGB Model's score by sampling")
NO_Sampling.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
NO_Sampling.head(10)

XGB Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
No Sampling,0.97481,0.9794,0.970262,0.957552
