## Voting

In [2]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline

In [3]:
from sklearn import datasets
# Load the scikit-learn handwritten-digits dataset.
dataset = datasets.load_digits()
# Feature matrix and labels; the feature matrix is (1797, 64):
# number of samples x size of each sample.
x_data, y_data = dataset.data, dataset.target

# Two-stage split, roughly 7:2:1 (train : validation : test).
# First hold out 180 samples for testing, then take 360 of the remaining
# samples for validation.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x_data, y_data, test_size=180, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=360, random_state=42
)

# Show the shape of each split, one per line: train, validation, test.
print(x_train.shape, x_val.shape, x_test.shape, sep="\n")

(1257, 64)
(360, 64)
(180, 64)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

# Three different kinds of base classifier, all seeded with the same value.
random_forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# NOTE(review): tol=20 is a very loose stopping tolerance combined with a low
# iteration cap -- presumably chosen to keep training fast; confirm intent.
svm_clf = LinearSVC(
    max_iter=100,
    tol=20,
    random_state=42,
)
mlp_clf = MLPClassifier(random_state=42)

In [5]:
# Base classifiers, in the order their results are reported below.
estimators = [
    random_forest_clf,
    svm_clf,
    mlp_clf,
]

# Fit each base classifier on the training split, announcing it first.
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(x_train, y_train)

# Validation accuracy of each base classifier, in the order of `estimators`.
[clf.score(x_val, y_val) for clf in estimators]

Training the RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Training the LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=100,
          multi_class='ovr', penalty='l2', random_state=42, tol=20, verbose=0)
Training the MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=

[0.9638888888888889, 0.8805555555555555, 0.9555555555555556]

In [6]:
from sklearn.ensemble import VotingClassifier

# Pair each base classifier with a name, giving a list of (name, estimator)
# tuples for the voting ensemble.
named_estimators = list(
    zip(
        ("random_forest_clf", "svm_clf", "mlp_clf"),
        (random_forest_clf, svm_clf, mlp_clf),
    )
)

# No `voting` argument is given, so the ensemble uses its default mode
# ("hard" is the default value).
voting_clf = VotingClassifier(estimators=named_estimators)
voting_clf.fit(x_train, y_train)

# Show the fitted sub-estimators, then the ensemble's validation accuracy.
print(voting_clf.estimators_)
print(voting_clf.score(x_val, y_val))

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=100,
          multi_class='ovr', penalty='l2', random_state=42, tol=20, verbose=0), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_i

In [7]:
# Soft voting combines predicted class probabilities, so keep only the fitted
# sub-estimators that expose `predict_proba`. This removes LinearSVC, which
# returns predicted labels only, not probabilities.
# Filtering by capability instead of `del voting_clf.estimators_[1]` keeps the
# cell idempotent: running it again does not remove a second model.
voting_clf.estimators_ = [
    fitted for fitted in voting_clf.estimators_ if hasattr(fitted, "predict_proba")
]
voting_clf.voting = "soft"

# Validation accuracy of the soft-voting ensemble.
voting_clf.score(x_val, y_val)

0.9611111111111111

In [8]:
# Accuracy of the voting ensemble (as configured by the preceding cells) on
# the held-out test split.
print(voting_clf.score(X=x_test, y=y_test))

0.9777777777777777


# Stacking

In [12]:
# Blender training features: one row per validation sample, one column per
# base classifier.
x_val_predictions = np.empty(
    shape=(len(x_val), len(estimators)),
    dtype=np.float32,
)

# Column `index` receives the labels predicted by the index-th classifier.
for index, estimator in enumerate(estimators):
    x_val_predictions[:, index] = estimator.predict(x_val)

x_val_predictions  # the collected base-model predictions

array([[1., 1., 1.],
       [4., 9., 7.],
       [3., 3., 3.],
       ...,
       [9., 9., 9.],
       [4., 4., 4.],
       [6., 6., 6.]], dtype=float32)

In [13]:
# Blender: a random forest trained on the base classifiers' validation
# predictions. oob_score=True makes the out-of-bag accuracy available after
# fitting.
rnd_forest_blender = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=42,
)
rnd_forest_blender.fit(x_val_predictions, y_val)

# Out-of-bag score of the fitted blender.
rnd_forest_blender.oob_score_

0.9638888888888889

In [15]:
# Blender test features: one row per test sample, one column per base
# classifier.
x_test_predictions = np.empty((len(x_test), len(estimators)), dtype=np.float32)

# The loop target must be `estimator` (singular). Binding it to `estimators`
# overwrites the classifier list and leaves the body calling whatever
# `estimator` an earlier cell's loop left behind, so every column would hold
# the predictions of that one model.
# NOTE: an accuracy recorded before this fix was computed from such
# single-model columns; re-run the cell to refresh it.
for index, estimator in enumerate(estimators):
    x_test_predictions[:, index] = estimator.predict(x_test)

# Final blended prediction and its accuracy on the test split.
y_pred = rnd_forest_blender.predict(x_test_predictions)
accuracy_score(y_test, y_pred)

0.9777777777777777