In [1]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.svm import SVC

import random
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv('../datasets/BankChurners_change.csv', encoding='CP949')
data = data.iloc[:,2:23]
np.random.seed(42)
random.seed(42)

In [3]:
object_columns = data.select_dtypes('object').columns

for i in object_columns:

    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']

category : [0 1]
classes : ['F' 'M']

category : [0 1 2 3 4 5 6]
classes : ['College' 'Doctorate' 'Graduate' 'High School' 'Post-Graduate'
 'Uneducated' 'Unknown']

category : [0 1 2 3]
classes : ['Divorced' 'Married' 'Single' 'Unknown']

category : [0 1 2 3 4 5]
classes : ['$120K +' '$40K - $60K' '$60K - $80K' '$80K - $120K' 'Less than $40K'
 'Unknown']

category : [0 1 2 3]
classes : ['Blue' 'Gold' 'Platinum' 'Silver']

category : [0 1 2 3 4]
classes : ['20대' '30대' '40대' '50대' '60대 이상']



## XGBClassifier
##### 설명 참고 : https://wooono.tistory.com/97

In [4]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(random_state=42)

xgb_param_grid = {'n_estimators' : [100, 200],
                'learning_rate' : [0.01, 0.05, 0.1],
                'max_depth' : [3, 5, 7],
                'gamma' : [0, 1, 2]}

xgb_grid = GridSearchCV(xgb, param_grid=xgb_param_grid, scoring='f1', verbose=0, n_jobs=1)
xgb_grid.fit(x_train, y_train)

print(f'best f1 : {xgb_grid.best_score_}')
print('best param : ', xgb_grid.best_params_)

## 참고 : https://cjh34544.tistory.com/m/4
## http://aispiration.com/model/model-python-xgboost-hyper.html

best f1 : 0.9843149900006646
best param :  {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}


## Logistic

In [5]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

lr = LogisticRegression(random_state=42)

lr_param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10],
                'penalty' : ['l1', 'l2']}

lr_grid = GridSearchCV(lr, param_grid=lr_param_grid, scoring='f1', verbose=0, n_jobs=1)
lr_grid.fit(x_train, y_train)

print(f'best f1 : {lr_grid.best_score_}')
print('best param : ', lr_grid.best_params_)

# 참고 : https://wikidocs.net/16594


best f1 : 0.9450316395291931
best param :  {'C': 0.1, 'penalty': 'l2'}


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.94486866        nan 0.94476582]


## RandomForest

In [6]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

rf = RandomForestClassifier(random_state=42)

rf_param_grid = {'n_estimators' : [100, 200],
                'max_depth' : [3, 5, 7],
                'min_samples_leaf' : [8, 12, 16],
                'min_samples_split' : [8, 16, 20]}

rf_grid = GridSearchCV(rf, param_grid=rf_param_grid, scoring='f1', verbose=0, n_jobs=1)
rf_grid.fit(x_train, y_train)

print(f'best f1 : {rf_grid.best_score_}')
print('best param : ', rf_grid.best_params_)

## 참고 : https://techblog-history-younghunjo1.tistory.com/102
## https://jaaamj.tistory.com/35


best f1 : 0.9627504235424356
best param :  {'max_depth': 7, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 200}


In [8]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

print(f'best f1 : {rf_grid.best_score_}')
print('best param : ', rf_grid.best_params_)

## 참고 : https://techblog-history-younghunjo1.tistory.com/102
## https://jaaamj.tistory.com/35


best f1 : 0.977865212656934
best param :  {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


## SVM

In [10]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

svc = SVC(random_state=42)

svc_param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10],
                'gamma' : [0.001, 0.01, 0.1, 1, 10]}

svc_grid = GridSearchCV(svc, param_grid=svc_param_grid, scoring='f1', verbose=0, n_jobs=1)
svc_grid.fit(x_train, y_train)

print(f'best f1 : {svc_grid.best_score_}')
print('best param : ', svc_grid.best_params_)

best f1 : 0.9611383936246669
best param :  {'C': 10, 'gamma': 0.01}


### CatBoostClassifier는 파라미터 조정이 성능에 크게 영향을 미치지 않는다는 말이 많아 일단 생략함
##### https://velog.io/@jus6886/Catboost
##### https://undeadkwandoll.tistory.com/61
##### https://www.kci.go.kr/kciportal/ci/sereArticleSearch/ciSereArtiView.kci?sereArticleSearchBean.artiId=ART002698429
#### CatBoost 설명
##### https://dailyheumsi.tistory.com/136
##### https://techblog-history-younghunjo1.tistory.com/199
##### https://heeya-stupidbutstudying.tistory.com/43?category=950711


In [11]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

cat = CatBoostClassifier(random_state=42, verbose=0)
cat.fit(x_train, y_train)
pred = cat.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

print(f1)

0.9830210772833725


#### lightGBM은 10000개 이하의 데이터에 overfitting하기 쉬워서 사용 x
##### https://nurilee.com/2020/04/03/lightgbm-definition-parameter-tuning/
##### https://mac-user-guide.tistory.com/79

## Voting
##### 코드 참고 : https://eunki.tistory.com/60
##### https://nonmeyet.tistory.com/entry/Python-Voting-Classifiers%EB%8B%A4%EC%88%98%EA%B2%B0-%EB%B6%84%EB%A5%98%EC%9D%98-%EC%A0%95%EC%9D%98%EC%99%80-%EA%B5%AC%ED%98%84


##### 모델 4개 사용한 hard voting

In [12]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=0, learning_rate=0.1, max_depth=7, n_estimators=200)
lr = LogisticRegression(C=0.1, penalty='l2')
rf = RandomForestClassifier(verbose=0)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('lr', lr), ('rf', rf), ('cat', cat)], weights=[2, 1, 2, 2], voting='hard')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9649555774925962 recall : 0.985285462036492, precision : 0.9732558139534884, f1 : 0.9792336940625913


#### 모델 4개 사용한 soft voting

In [13]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=1, learning_rate=0.1, max_depth=5, n_estimators=200)
lr = LogisticRegression(C=0.1, penalty='l2')
rf = RandomForestClassifier(verbose=0)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('lr', lr), ('rf', rf), ('cat', cat)], weights=[2, 1, 2, 2], voting='soft')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9684106614017769 recall : 0.9882283696291937, precision : 0.9744631456761462, f1 : 0.9812974868497955


#### 모델 3개 사용한 hard voting

In [14]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=1, learning_rate=0.1, max_depth=5, n_estimators=200)
lr = LogisticRegression(C=0.1, penalty='l2')
rf = RandomForestClassifier(verbose=0)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('rf', rf), ('cat', cat)], weights=[1, 1, 1], voting='hard')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)
roc = roc_auc_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9693978282329714 recall : 0.9888169511477339, precision : 0.9750435287289612, f1 : 0.9818819403857393


#### 모델 3개 사용한 soft voting

In [15]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=1, learning_rate=0.1, max_depth=5, n_estimators=200)
rf = RandomForestClassifier(verbose=0)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('rf', rf), ('cat', cat)], weights=[1, 1, 1], voting='soft')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)
roc = roc_auc_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9689042448173741 recall : 0.987051206592113, precision : 0.9761350407450524, f1 : 0.9815627743634768


## 모델 2개 사용한 hard voting

In [18]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=1, learning_rate=0.1, max_depth=5, n_estimators=200)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat)], voting='hard')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9684106614017769 recall : 0.9811653914067098, precision : 0.9811653914067098, f1 : 0.9811653914067098


## 모델 2개 사용한 soft voting

In [19]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=1, learning_rate=0.1, max_depth=5, n_estimators=200)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat)], voting='soft')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9684106614017769 recall : 0.985285462036492, precision : 0.9772329246935202, f1 : 0.981242672919109


## 모델 2개 사용한 soft voting (성능이 제일 좋은 catboost에 가중치를  조금 더 줬을 경우, 가중치 3이상은 결과 변화 x)

In [22]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

tl = TomekLinks()
x_train, y_train = tl.fit_resample(x_train, y_train)

xgb = XGBClassifier(gamma=1, learning_rate=0.1, max_depth=5, n_estimators=200)
cat = CatBoostClassifier(verbose=0)

model = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat)], weights=[1, 2], voting='soft')
model.fit(x_train, y_train)
pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

print('accuracy : {0} recall : {1}, precision : {2}, f1 : {3}'.format(acc, recall, precision, f1))

accuracy : 0.9679170779861797 recall : 0.9846968805179518, precision : 0.9772196261682243, f1 : 0.9809440046907066


#### 평가지표로 f1 score를 쓰는 이유
##### https://towardsdatascience.com/read-this-before-using-roc-auc-as-a-metric-c84c2d5af621
##### https://stackoverflow.com/questions/44172162/f1-score-vs-roc-auc
##### https://neptune.ai/blog/f1-score-accuracy-roc-auc-pr-auc

#### Boosting Model 비교
##### https://medium.com/@divyagera2402/boosting-algorithms-adaboost-gradient-boosting-xgb-light-gbm-and-catboost-e7d2dbc4e4ca
##### http://dmqm.korea.ac.kr/activity/seminar/323
##### https://hyunlee103.tistory.com/25
##### https://neptune.ai/blog/when-to-choose-catboost-over-xgboost-or-lightgbm




#### 라벨인코딩 vs 원핫인코딩
##### https://wyatt37.tistory.com/11
##### https://hye-z.tistory.com/16?category=501972


#### 한 것요약
##### 원핫인코딩은 차원을 늘려 과적합되기 쉽다. 물론 규제로 어느정도 커버할 수 있으나 트리계열에서는 해당 변수가 아예 제외되는 문제점을 가진다. 차원의 저주 등등
##### catboost는 파라미터 수정을 하지 않아도 효과가 나쁘지 않다
##### lightgbm은 데이터의 수가 너무 적어 사용할 수 없다
##### ROC가 F1보다 불균형 데이터 셋에 대해 관대한? 경향이 있어서 불균형 데이터 셋에는 F1을 평가지표로 사용한다.
