# Voting

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()

y = cancer['target']
X = cancer['data']

x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, stratify = y, random_state = 0
)

x_tr.shape, x_te.shape

((426, 30), (143, 30))

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

In [5]:
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                        ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                        ('dt3', dt3), ('dt5', dt5)], voting = 'soft')

In [6]:
names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']

for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    print(f'{name} Train Accuracy : {train_score:.2f}%')
    print(f'{name} Test Accuracy : {test_score:.2f}%')    
    print()

hard Train Accuracy : 98.12%
hard Test Accuracy : 95.10%

soft Train Accuracy : 99.53%
soft Test Accuracy : 95.80%

knn1 Train Accuracy : 94.60%
knn1 Test Accuracy : 91.61%

knn2 Train Accuracy : 95.77%
knn2 Test Accuracy : 91.61%

lr Train Accuracy : 96.71%
lr Test Accuracy : 93.71%

dt3 Train Accuracy : 97.65%
dt3 Test Accuracy : 91.61%

dt5 Train Accuracy : 100.00%
dt5 Test Accuracy : 90.91%



# Bagging

- tree계열들은 max_depth 파라미터를 무조건 가지고 있음
- 앙상블 종류 4가지도 max_depth 가지고있음
- 5, 4, 3 정도로 줄여서 성능비교..?

In [7]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(0.9976525821596244, 0.951048951048951)

In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=4).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(0.9976525821596244, 0.9370629370629371)

In [9]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=3).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(0.9788732394366197, 0.9230769230769231)

In [14]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=2).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)
print('2,3,4는 test점수 차이가 별로 없음')

2,3,4는 test점수 차이가 별로 없음


# Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(1.0, 0.958041958041958)

# Stacking

In [16]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
              ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression()) # 최종모델(메타모델)

model.fit(x_tr, y_tr).score(x_te, y_te)

0.958041958041958

In [35]:
from sklearn.datasets import load_digits

from sklearn.model_selection import train_test_split
digits = load_digits()

y = digits['target']
X = digits['data']

x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, stratify = y, random_state = 0, test_size=0.3
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# # K-NN
# knn = KNeighborsClassifier(n_neighbors=3).fit(x_tr, y_tr)
# print('K-NN', knn.score(x_tr, y_tr), knn.score(x_te, y_te))

# # Logistic
# lr = LogisticRegression(max_iter=10000).fit(x_tr, y_tr)
# print('Logistic', lr.score(x_tr, y_tr), lr.score(x_te, y_te))

# # Decision
# dt = DecisionTreeClassifier(max_depth=5).fit(x_tr,y_tr)
# print('Decision', lr.score(x_tr, y_tr), dt.score(x_te, y_te))


K-NN 0.9920445505171042 0.9851851851851852
Logistic 1.0 0.9666666666666667
Decision 1.0 0.7203703703703703


In [54]:
# voting
from sklearn.ensemble import VotingClassifier

# knn, lr, dt
knn3 = KNeighborsClassifier(n_neighbors=5).fit(x_tr, y_tr)
knn5 = KNeighborsClassifier(n_neighbors=3).fit(x_tr, y_tr)
lr = LogisticRegression(max_iter=10000).fit(x_tr, y_tr)
dt3 = DecisionTreeClassifier(max_depth=3).fit(x_tr, y_tr)
dt5 = DecisionTreeClassifier(max_depth=5).fit(x_tr, y_tr)

hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                        ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                        ('dt3', dt3), ('dt5', dt5)], voting = 'soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']

# for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
#     model.fit(x_tr, y_tr)
#     name = names[idx]
#     train_score = model.score(x_tr, y_tr) * 100
#     test_score = model.score(x_te, y_te) * 100
#     print(f'{name} Train Accuracy : {train_score:.2f}%')
#     print(f'{name} Test Accuracy : {test_score:.2f}%')    
#     print()

In [43]:
# Bagging
from sklearn.ensemble import RandomForestClassifier
rf3 = RandomForestClassifier(max_depth=3).fit(x_tr, y_tr)
rf5 = RandomForestClassifier(max_depth=5).fit(x_tr, y_tr)
# model.score(x_tr, y_tr), model.score(x_te, y_te)

In [44]:
# Boosting
from sklearn.ensemble import GradientBoostingClassifier

gb1 = GradientBoostingClassifier(max_depth=1).fit(x_tr, y_tr)
gb3 = GradientBoostingClassifier(max_depth=3).fit(x_tr, y_tr)

# model.score(x_tr, y_tr), model.score(x_te, y_te)

In [46]:
# Stacking
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
              ('gb', GradientBoostingClassifier())]

st = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression()).fit(x_tr, y_tr) # 최종모델(메타모델)

# model.score(x_tr, y_tr), model.score(x_te, y_te)



In [47]:
# 모델 성능 비교
model = [knn3,knn5,lr,dt3,dt5,hard,soft,rf3,rf5,gb1,gb3,st]
names = ['KNN-3','KNN-5','LogisticRegression','DecisionTree-3','DecisionTree-5','Ensemble-voting(hard)',
        'Ensemble-voting(soft)','Ensemble-bagging(randomforest-3)','Ensemble-bagging(randomforest-5)',
         'Ensemble-boosting(GradientBoosting-1)','Ensemble-boosting(GradientBoosting-3)','Ensemble-stacking(rf,gb)']

In [55]:
bs=0
for x,y in zip(model,names):
    print(f'{y}: {x.score(x_te,y_te)*100}') 
    if bs < x.score(x_te,y_te):
        bs = x.score(x_te,y_te)
        bn = y

print('------------------------------------------------------')
print(f'가장 좋은 모델은: {bn}')

TypeError: 'DecisionTreeClassifier' object is not iterable