<a href="https://colab.research.google.com/github/jellyho/AI_Learning_jellyho/blob/main/9_Ensemble_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

다수결 투표를 이용한 앙상블 학습

다수결 투표 클래스

In [18]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
  """Ensemble classifier that combines several classifiers by (weighted) voting.

  Parameters
  ----------
  classifiers : list
    Estimators to combine. They are not fitted in place: ``fit`` clones each
    one and fits the clone.
  vote : str, default='classlabel'
    If 'probability', ``predict`` picks the class with the highest (weighted)
    average predicted probability. Any other value makes ``predict`` take a
    (weighted) majority vote over the predicted class labels.
  weights : sequence of numbers or None, default=None
    One weight per classifier, forwarded to ``np.bincount`` / ``np.average``.
    ``None`` gives every classifier the same weight.
  """

  def __init__(self, classifiers, vote='classlabel', weights=None):
    self.classifiers = classifiers
    # Name -> estimator mapping, used by get_params to build 'name__param' keys.
    self.named_classifiers = dict(_name_estimators(classifiers))
    self.vote = vote
    self.weights = weights

  def fit(self, X, y):
    """Fit a clone of every classifier on X and the label-encoded y.

    Returns
    -------
    self
    """
    # Encode labels as integers 0..n_classes-1 so that np.bincount / np.argmax
    # in predict can operate on the individual predictions.
    self.labelenc_ = LabelEncoder()
    self.labelenc_.fit(y)
    self.classes_ = self.labelenc_.classes_

    encoded_y = self.labelenc_.transform(y)
    self.classifiers_ = [clone(clf).fit(X, encoded_y) for clf in self.classifiers]

    return self

  def predict(self, X):
    """Predict class labels for X, decoded back to the original label values."""
    if self.vote == 'probability':
      maj_vote = np.argmax(self.predict_proba(X), axis=1)

    else:
      # Stack the per-classifier predictions and transpose, so that each row
      # holds the votes cast for one sample (assuming 1-D predict outputs).
      predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
      maj_vote = np.apply_along_axis(
          lambda x: np.argmax(np.bincount(x, weights=self.weights)),
          axis=1,
          arr=predictions)

    maj_vote = self.labelenc_.inverse_transform(maj_vote)
    return maj_vote

  def decision_function(self, X):
    """Return the same output as ``predict(X)``."""
    return self.predict(X)

  def predict_proba(self, X):
    """Return the (weighted) average of the classifiers' predicted probabilities.

    The average is taken over the classifier axis (axis 0) of the stacked
    ``predict_proba`` outputs.
    """
    probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
    avg_proba = np.average(probas, axis=0, weights=self.weights)
    # The original computed avg_proba but never returned it, so
    # vote='probability' could not work.
    return avg_proba

  def get_params(self, deep=True):
    """Return parameters; with deep=True, expose the member classifiers' parameters.

    With ``deep=False`` this defers to ``BaseEstimator.get_params``. Otherwise
    the result maps each classifier name to its estimator and each
    ``'<name>__<param>'`` key to that estimator's parameter value.
    """
    if not deep:
      return super(MajorityVoteClassifier, self).get_params(deep=False)

    out = self.named_classifiers.copy()
    # Plain dict.items() replaces six.iteritems, so the class does not depend
    # on the sklearn.externals.six module.
    for name, step in self.named_classifiers.items():
      for key, value in step.get_params(deep=True).items():
        out['%s__%s' % (name, key)] = value
    return out

붓꽃 데이터로 각 분류기별 스코어 확인

In [26]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load iris and keep only the samples from index 50 onward, restricted to the
# feature columns at index 1 and 2.
iris = datasets.load_iris()
X = iris.data[50:, [1, 2]]
y = iris.target[50:]

# Re-encode the remaining target values with a LabelEncoder.
le = LabelEncoder()
y = le.fit_transform(y)

# Stratified 50/50 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
    stratify=y,
)

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Three base classifiers with fixed hyperparameters.
clf1 = LogisticRegression(
    penalty='l2',
    C=0.001,
    solver='liblinear',
    random_state=10,
)
clf2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=1,
    random_state=10,
)
clf3 = KNeighborsClassifier(
    n_neighbors=1,
    metric='minkowski',
    p=2,
)

# Logistic regression and KNN are each preceded by a StandardScaler step;
# the decision tree is used without one.
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

clf_labels = ['Logistic regression', 'Descision Tree', 'KNN']

# Print mean and standard deviation of the 10-fold cross-validation scores
# on the training split for each classifier.
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
  scores = cross_val_score(
      estimator=clf,
      X=X_train,
      y=y_train,
      cv=10,
  )
  print(scores.mean(), scores.std(), label)

0.8400000000000001 0.19595917942265423 Logistic regression
0.86 0.2009975124224178 Descision Tree
0.8399999999999999 0.14966629547095767 KNN


앙상블 분류기를 이용했을 때... 정확도가 높아진 것을 볼 수 있다.

In [25]:
# Combine the three base estimators into one majority-vote ensemble, using
# the constructor defaults for vote and weights.
en_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
clf_labels = [
    'Logistic regression',
    'Descision Tree',
    'KNN',
    'Majority voting',
]

# Score the individual estimators and the ensemble with the same 10-fold
# cross-validation on the training split.
all_clf = [pipe1, clf2, pipe3, en_clf]
for clf, label in zip(all_clf, clf_labels):
  scores = cross_val_score(
      estimator=clf,
      X=X_train,
      y=y_train,
      cv=10,
  )
  print(scores.mean(), scores.std(), label)

0.8400000000000001 0.19595917942265423 Logistic regression
0.86 0.2009975124224178 Descision Tree
0.8399999999999999 0.14966629547095767 KNN
0.8800000000000001 0.18330302779823357 Majority voting


매개변수를 튜닝하려면 각 객체의 매개변수에 접근해야 한다.