# C.7.16 앙상블 보팅 모델

In [None]:
# 아래 두 줄 코딩 즉 xgboost와 lightgbm 설치 구문 중 하나라도 주피터 노트북에서 설치에러 발생시에는
# 구글 코랩에서 이 ipynb 파일을 열고 실행할 것을 추천함

In [None]:
# 그 때 2014DC2.csv 등의 데이터 파일을 구글 드라이브에 업로딩하고 이를 구글 코랩에서 불러오기 위해서는 
# 책 본문 4.7.6의 텐서플로 케라스 절의 초반 안내 사항 준수 필요

In [1]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.1-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.1
Note: you may need to restart the kernel to use updated packages.


# 트리 계열 보팅 앙상블

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('2014DC2.csv')
df.shape

(12417, 75)

In [2]:
data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 변수만 data 데이터프레임에 저장
target = df['EBizSystem2']                # 타겟변수만 target 데이터프레임에 저장

# 50:50 Data partition
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.3임에 주의 

# interval 변수의 null value를 평균(mean)으로 impute. 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean')  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의

# 6개 트리 계열 모델 앙상블

In [4]:
# Hard Voting for all 6 tree-based models

import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier


clf_tree = DecisionTreeClassifier(criterion="gini", max_depth=4, random_state=0) 
clf_rf = RandomForestClassifier(n_estimators=200, max_depth=19, random_state=0) 
clf_bg = BaggingClassifier(DecisionTreeClassifier(max_depth=9,random_state=0),
                           n_estimators=200,random_state=0) 
clf_gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                                    random_state=0)
clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('tree', clf_tree),('rf', clf_rf),('bg', clf_bg),
                                          ('gb', clf_gb),('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='hard') 
                                       # voting='hard'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_tree, clf_rf, clf_bg, clf_gb, clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

DecisionTreeClassifier 0.68626
RandomForestClassifier 0.72041
BaggingClassifier 0.71477
GradientBoostingClassifier 0.72298
XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72653


In [11]:
# Soft Voting for all 6 tree-based models

import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier


clf_tree = DecisionTreeClassifier(criterion="gini", max_depth=4, random_state=0) 
clf_rf = RandomForestClassifier(n_estimators=200, max_depth=19, random_state=0) 
clf_bg = BaggingClassifier(DecisionTreeClassifier(max_depth=9,random_state=0),
                           n_estimators=200,random_state=0) 
clf_gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                                    random_state=0)
clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('tree', clf_tree),('rf', clf_rf),('bg', clf_bg),
                                          ('gb', clf_gb),('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='soft') 
                                       # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_tree, clf_rf, clf_bg, clf_gb, clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

DecisionTreeClassifier 0.68626
RandomForestClassifier 0.72041
BaggingClassifier 0.71477
GradientBoostingClassifier 0.72298
XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72910


In [12]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.78854


# 정확도 상위 4개 모델 앙상블

In [13]:
# Hard Voting
# 상위 4개 모델만 앙상블 

clf_rf = RandomForestClassifier(n_estimators=200, max_depth=19, random_state=0) 
clf_gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                                    random_state=0)
clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('gb', clf_gb),('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='hard') 
                              # voting='hard'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_rf, clf_gb, clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

RandomForestClassifier 0.72041
GradientBoostingClassifier 0.72298
XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72814


In [14]:
# Soft Voting
# 상위 4개 모델만 앙상블 

clf_rf = RandomForestClassifier(n_estimators=200, max_depth=19, random_state=0) 
clf_gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                                    random_state=0)
clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('gb', clf_gb),('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='soft') 
                              # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_rf, clf_gb, clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

RandomForestClassifier 0.72041
GradientBoostingClassifier 0.72298
XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72669


In [15]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79003


# 정확도 상위 3개 모델 앙상블

In [7]:
# Hard Voting
# Accuracy 0.72 이상인 3개 모델만 앙상블 

clf_gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                                    random_state=0)
clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('gb', clf_gb),('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='hard') 
                              # voting='hard'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_gb, clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

GradientBoostingClassifier 0.72298
XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72814


In [9]:
# Soft Voting
# Accuracy 0.72 이상인 3개 모델만 앙상블 

clf_gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                                    random_state=0)
clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('gb', clf_gb),('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='soft') 
                              # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_gb, clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

GradientBoostingClassifier 0.72298
XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72669


In [10]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79003


# 정확도 상위 2개 모델 앙상블

In [16]:
# 하드 보팅
# 정확도가 가장 높은 2개 모델만 앙상블

clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='hard') 
                              # voting='hard'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72588


In [17]:
# 소프트 보팅
# 정확도가 가장 높은 2개 모델만 앙상블

clf_xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10, 
                        min_child_weight=4, n_estimators=200, subsample=0.8, random_state=0)
clf_lgb = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=10,
                         min_child_weight=2, n_estimators=200, subsample=0.4, random_state=0)
clf_voting = VotingClassifier(estimators=[('xgb',clf_xgb),('lgb',clf_lgb)],
                              n_jobs=-1, voting='soft') 
                              # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_xgb, clf_lgb, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))

XGBClassifier 0.72604
LGBMClassifier 0.72878
VotingClassifier 0.72765


In [18]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.78926


# 로지스틱 회귀 모델에 쓰인 데이터셋 활용 앙상블

In [76]:
import pandas as pd
import numpy as np
df = pd.read_csv('2014DC2_dummy_indicator_friendly.csv')   
df.shape

(12417, 193)

In [77]:
# Imputation indicator가 생성됨에 주의

data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 변수만 data 데이터프레임에 저장
target = df['EBizSystem2']                # 타겟변수만 target 데이터프레임에 저장

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의 

# interval 변수의 null value를 평균(mean)으로 impute 및 add_indicator 포함 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean', add_indicator=True)  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 

X_train2 shape: (6208, 214)
X_test2 shape: (6209, 214)


# 6개 모델 모두 사용한 앙상블

In [40]:
from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 

In [22]:
# 하드 보팅
# 로지스틱 회귀, 릿지, 라소, 신경망, KNN, SVM 모델을 앙상블

import time
start = time.time()

from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 

clf_lr = LogisticRegression(solver='saga',penalty='none',max_iter=10000,random_state=0)
clf_rg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=10000,random_state=0)
clf_ls = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=0)
clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=0.1,max_iter=1000,
                        hidden_layer_sizes = (100, 100), random_state=0)
clf_knn = KNeighborsClassifier(n_neighbors=39)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0)

clf_voting = VotingClassifier(estimators=[('lr', clf_lr),('rg',clf_rg),('ls', clf_ls),('mlp', clf_mlp),
                                          ('knn', clf_knn),('svm', clf_svm)],
                              n_jobs=-1, voting='hard') # voting='hard'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_lr, clf_rg, clf_ls, clf_mlp, clf_knn, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

LogisticRegression 0.72975
LogisticRegression 0.73120
LogisticRegression 0.73265
MLPClassifier 0.73104
KNeighborsClassifier 0.68385
SVC 0.72991
VotingClassifier 0.73152
Runtime of the program is 438.55333948135376


In [23]:
# 소프트 보팅
# 로지스틱 회귀, 릿지, 라소, 신경망, KNN, SVM 모델을 앙상블

import time
start = time.time()

from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 

clf_lr = LogisticRegression(solver='saga',penalty='none',max_iter=10000,random_state=0)
clf_rg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=10000,random_state=0)
clf_ls = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=0)
clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=0.1,max_iter=1000,
                        hidden_layer_sizes = (100, 100), random_state=0)
clf_knn = KNeighborsClassifier(n_neighbors=39)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0, probability=True) # probability=True 구문 추가

clf_voting = VotingClassifier(estimators=[('lr', clf_lr),('rg',clf_rg),('ls', clf_ls),('mlp', clf_mlp),
                                          ('knn', clf_knn),('svm', clf_svm)],
                              n_jobs=-1, voting='soft') # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_lr, clf_rg, clf_ls, clf_mlp, clf_knn, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

LogisticRegression 0.72975
LogisticRegression 0.73120
LogisticRegression 0.73265
MLPClassifier 0.73104
KNeighborsClassifier 0.68385
SVC 0.72991
VotingClassifier 0.73313
Runtime of the program is 403.1785202026367


In [24]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79181


# 5개 모델 모두 사용한 앙상블 (현저히 성능이 떨어지는 KNN 제외)

In [25]:
# 소프트 보팅
# KNN 모델을 제외한 5개 모델, 즉 로지스틱 회귀, 릿지, 라소, 신경망, SVM 모델을 앙상블

clf_lr = LogisticRegression(solver='saga',penalty='none',max_iter=10000,random_state=0)
clf_rg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=10000,random_state=0)
clf_ls = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=0)
clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=0.1,max_iter=1000,
                        hidden_layer_sizes = (100, 100), random_state=0)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0, probability=True) # probability=True 구문 추가

clf_voting = VotingClassifier(estimators=[('lr', clf_lr),('rg',clf_rg),('ls', clf_ls),('mlp', clf_mlp),
                                          ('svm', clf_svm)],
                              n_jobs=-1, voting='soft') # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_lr, clf_rg, clf_ls, clf_mlp, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

LogisticRegression 0.72975
LogisticRegression 0.73120
LogisticRegression 0.73265
MLPClassifier 0.73104
SVC 0.72991
VotingClassifier 0.73458
Runtime of the program is 867.9558868408203


In [26]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79209


# 상위 4개 모델을 사용한 앙상블

In [42]:
# 소프트 보팅
# 상위 4개 모델, 즉 릿지, 라소, 신경망, SVM 모델을 앙상블

clf_rg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=10000,random_state=0)
clf_ls = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=0)
clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=0.1,max_iter=1000,
                        hidden_layer_sizes = (100, 100), random_state=0)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0, probability=True) # probability=True 구문 추가

clf_voting = VotingClassifier(estimators=[('rg',clf_rg),('ls', clf_ls),('mlp', clf_mlp),
                                          ('svm', clf_svm)],
                              n_jobs=-1, voting='soft') # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_rg, clf_ls, clf_mlp, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

LogisticRegression 0.73120
LogisticRegression 0.73265
MLPClassifier 0.73104
SVC 0.72991
VotingClassifier 0.73361
Runtime of the program is 4587.861080169678


In [43]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79216


# 상위 3개 모델을 사용한 앙상블

In [44]:
# 소프트 보팅
# 상위 3개 모델, 즉 릿지, 라소, 신경망 모델을 앙상블

clf_rg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=10000,random_state=0)
clf_ls = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=0)
clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=0.1,max_iter=1000,
                        hidden_layer_sizes = (100, 100), random_state=0)

clf_voting = VotingClassifier(estimators=[('rg',clf_rg),('ls', clf_ls),('mlp', clf_mlp)],
                              n_jobs=-1, voting='soft') # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_rg, clf_ls, clf_mlp, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

LogisticRegression 0.73120
LogisticRegression 0.73265
MLPClassifier 0.73104
VotingClassifier 0.73377
Runtime of the program is 4786.462224721909


In [45]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79238


# 상위 2개 모델을 사용한 앙상블

In [111]:
# 소프트 보팅
# 상위 2개 모델, 즉 릿지와 라소 모델을 앙상블

clf_rg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=10000,random_state=0)
clf_ls = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=0)

clf_voting = VotingClassifier(estimators=[('rg', clf_rg),('ls', clf_ls)],
                              n_jobs=-1, voting='soft') # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_rg, clf_ls, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

LogisticRegression 0.73120
LogisticRegression 0.73265
VotingClassifier 0.73603
Runtime of the program is 5797.99852180481


In [112]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.79220


# 라소 모델에 의해 생성된 데이터셋 활용 앙상블

In [50]:
import pandas as pd
import numpy as np
df = pd.read_csv('Lasso_select_ERP.csv')   
df.shape

(12417, 73)

In [51]:
# SimpleInputer의 add_indicator 옵션을 제거한 버전

data = df.drop(['EBizSystem2'], axis=1) # 타겟변수를 제외한 변수만 data 데이터프레임에 저장
target = df['EBizSystem2']                # 타겟변수만 target 데이터프레임에 저장

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의 

# interval 변수의 null value를 평균(mean)으로 impute 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean')  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 

X_train2 shape: (6208, 72)
X_test2 shape: (6209, 72)


# 라소 변수 선택 후 후속 3개 모델 모두 사용한 앙상블

In [52]:
# 하드 보팅 
# 신경망, KNN, SVM 모델을 사용한 앙상블

import time
start = time.time()

from sklearn.ensemble import VotingClassifier 
from sklearn.neural_network import MLPClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 

clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=1,max_iter=1000,
                        random_state=0)
clf_knn = KNeighborsClassifier(n_neighbors=35)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0)

clf_voting = VotingClassifier(estimators=[('mlp', clf_mlp),('knn', clf_knn),
                                          ('svm', clf_svm)], n_jobs=-1, voting='hard') 
                                                                        # voting='hard'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_mlp, clf_knn, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

MLPClassifier 0.72411
KNeighborsClassifier 0.68578
SVC 0.73297
VotingClassifier 0.72701
Runtime of the program is 53.437153339385986


In [None]:
# predict_proba is not available when voting='hard'

In [53]:
# 소프트 보팅 
# 신경망, KNN, SVM 모델을 사용한 앙상블

import time
start = time.time()

from sklearn.ensemble import VotingClassifier 
from sklearn.neural_network import MLPClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 

clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=1, max_iter=1000,
                        random_state=0)
clf_knn = KNeighborsClassifier(n_neighbors=35)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0, probability=True) # probability=True 에 주의

clf_voting = VotingClassifier(estimators=[('mlp', clf_mlp),('knn', clf_knn),
                                          ('svm', clf_svm)], n_jobs=-1, voting='soft')
                                                                        # voting='soft'에 주의
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_mlp, clf_knn, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

MLPClassifier 0.72411
KNeighborsClassifier 0.68578
SVC 0.73297
VotingClassifier 0.72492
Runtime of the program is 64.52883410453796


In [54]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.78267


# 라소 변수 선택 후 후속 2개 모델만 모두 사용한 앙상블

In [55]:
# 하드 보팅 
# KNN을 제외한 신경망, SVM 모델을 사용한 앙상블

import time
start = time.time()

from sklearn.ensemble import VotingClassifier 
from sklearn.neural_network import MLPClassifier 
from sklearn.svm import SVC 

clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=1,max_iter=1000,
                        random_state=0)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0)

clf_voting = VotingClassifier(estimators=[('mlp', clf_mlp),('svm', clf_svm)], 
                              n_jobs=-1, voting='hard') 
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_mlp, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

MLPClassifier 0.72411
SVC 0.73297
VotingClassifier 0.72765
Runtime of the program is 46.67180562019348


In [None]:
# predict_proba is not available when voting='hard'

In [57]:
# 소프트 보팅 
# KNN을 제외한 신경망, SVM 모델을 사용한 앙상블

import time
start = time.time()

from sklearn.ensemble import VotingClassifier 
from sklearn.neural_network import MLPClassifier 
from sklearn.svm import SVC 

clf_mlp = MLPClassifier(activation='logistic',solver='sgd', alpha=1,max_iter=1000,
                        random_state=0)
clf_svm = SVC(kernel='linear', C=0.01, random_state=0, probability=True) # probability=True 에 주의
clf_voting = VotingClassifier(estimators=[('mlp', clf_mlp),('svm', clf_svm)],
                              n_jobs=-1, voting='soft') 
clf_voting.fit(X_train2, y_train)

from sklearn.metrics import accuracy_score 
for clf in(clf_mlp, clf_svm, clf_voting):
    clf.fit(X_train2, y_train)
    y_pred = clf.predict(X_test2)
    print (clf.__class__.__name__, "{:.5f}".format(accuracy_score(y_test, y_pred)))
    
end = time.time()
print(f"Runtime of the program is {end - start}")

MLPClassifier 0.72411
SVC 0.73297
VotingClassifier 0.73007
Runtime of the program is 95.35595488548279


In [58]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, clf_voting.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

ROC AUC on test set:0.78758


In [5]:
# 학습된 Classifier로 테스트 데이터셋 자료이용해서 타겟변수 예측값 생성.
pred = clf_voting.predict(X_test2)  

In [6]:
y_test_arr = y_test.to_numpy()
y_test_arr

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [7]:
df_comparison = pd.DataFrame({'y_test': y_test_arr, 'pred': pred})
df_comparison.head(20)

Unnamed: 0,y_test,pred
0,0,0
1,1,0
2,0,1
3,0,0
4,1,0
5,0,0
6,1,1
7,1,1
8,1,1
9,0,1


In [11]:
# 기존 인덱스를 삭제하고 새로운 인덱스로 리셋
X_test.reset_index(drop=True).head(3) 

Unnamed: 0,Asset2,Asset7,Asset7_ind,Asset9,Asset9_ind,B2B_purchase1,B2B_purchase1_ind,Capital1,CapitalRatio1,Compensation1_4.0,...,TAssetC1,TAssetC2,TAssetC3,TAssetC4,TAssetC5,TAssetC6,TradeMark1,emp6,emp66,emp66_ind
0,7.006695,7.682482,0,4.110874,0,9.164296,0,7.273093,0.0,0,...,3.555348,1.94591,0.0,0.0,0.0,0.0,1.386294,2.639057,0.0,0
1,6.638568,,1,5.010635,0,,1,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.791759,2.564949,,1
2,9.76967,5.899897,0,,1,9.040145,0,9.378732,4.615121,0,...,4.043051,0.0,0.0,0.0,0.0,3.044522,0.0,3.044522,0.0,0


In [9]:
dfu = pd.concat([X_test.reset_index(drop=True), df_comparison], axis=1)
dfu.head(5)

Unnamed: 0,Asset2,Asset7,Asset7_ind,Asset9,Asset9_ind,B2B_purchase1,B2B_purchase1_ind,Capital1,CapitalRatio1,Compensation1_4.0,...,TAssetC3,TAssetC4,TAssetC5,TAssetC6,TradeMark1,emp6,emp66,emp66_ind,y_test,pred
0,7.006695,7.682482,0,4.110874,0,9.164296,0,7.273093,0.0,0,...,0.0,0.0,0.0,0.0,1.386294,2.639057,0.0,0,0,0
1,6.638568,,1,5.010635,0,,1,0.0,0.0,0,...,0.0,0.0,0.0,0.0,1.791759,2.564949,,1,1,0
2,9.76967,5.899897,0,,1,9.040145,0,9.378732,4.615121,0,...,0.0,0.0,0.0,3.044522,0.0,3.044522,0.0,0,0,1
3,8.128585,7.344719,0,1.94591,0,,1,8.058644,0.0,0,...,0.0,0.0,0.0,3.332205,0.0,2.833213,,1,0,0
4,8.307953,5.446737,0,,1,8.371474,0,8.652074,0.0,0,...,0.0,0.0,0.0,0.0,0.0,1.098612,0.0,0,1,0
