In [2]:
# mnist data load
import numpy as np
from sklearn.datasets import fetch_openml

# Download (or load from the local scikit-learn cache) version 1 of the
# 'mnist_784' dataset from OpenML.
mnist = fetch_openml(name='mnist_784', version=1)
# Convert the targets to unsigned 8-bit integers and store them back on the bunch.
mnist.target = mnist.target.astype(dtype=np.uint8)

In [4]:
from sklearn.model_selection import train_test_split
# First hold out 10,000 samples as the test set, then split another 10,000
# off the remainder as the validation set. Both splits use the same seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42
)
# Report the size of each split, in the order: train, test, validation.
for split in (X_train, X_test, X_val):
    print(len(split))

50000
10000
10000


In [8]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

# Base learners for the ensembles built later in the notebook.
# Every estimator gets a fixed random_state so that the fitted models, and
# therefore all scores computed from them, are reproducible after a kernel
# restart (the blender defined further down is seeded with 42 as well).
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
ext_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
# NOTE(review): max_iter=100 with the very loose tol=20 presumably trades
# accuracy for training speed -- confirm this is intentional.
lsv_clf = LinearSVC(max_iter=100, tol=20, random_state=42)


In [9]:
# Fit each base learner on the training split, in the same order as before:
# random forest, extra-trees, linear SVM.
base_learners = (rnd_clf, ext_clf, lsv_clf)
for learner in base_learners:
    learner.fit(X_train, y_train)

# Mean accuracy of each fitted learner on the validation split.
rnd_score, ext_score, lsv_score = (
    learner.score(X_val, y_val) for learner in base_learners
)

In [10]:
# One validation score per line: random forest, extra-trees, linear SVM.
print(rnd_score, ext_score, lsv_score, sep="\n")


0.9678
0.9713
0.8747


In [17]:
# Train a voting-based ensemble on top of the three base classifiers.
from sklearn.ensemble import VotingClassifier
# (name, estimator) pairs in the order: random forest, extra-trees, linear SVM.
estimator_list = list(zip(
    ("random_forest_clf", "extra_trees_clf", "svm_clf"),
    (rnd_clf, ext_clf, lsv_clf),
))
# No voting mode is passed, so VotingClassifier's default is used here.
voting_clf = VotingClassifier(estimators=estimator_list).fit(X_train, y_train)
# Accuracy of the ensemble on the validation split.
voting_score = voting_clf.score(X_val, y_val)
print(voting_score)

0.9693


In [18]:
# Switch the already-fitted ensemble to soft voting.
# Soft voting averages class probabilities, which LinearSVC does not expose
# (it has no predict_proba), so the fitted SVM is removed first.
# Filtering by type instead of `del voting_clf.estimators_[2]` avoids relying
# on the estimator's position and keeps the cell idempotent: running it a
# second time is a no-op rather than an IndexError.
voting_clf.estimators_ = [
    fitted for fitted in voting_clf.estimators_
    if not isinstance(fitted, LinearSVC)
]

voting_clf.voting = "soft"
# Validation accuracy of the two remaining tree ensembles under soft voting.
voting_score = voting_clf.score(X_val, y_val)
print(voting_score)

0.9726


In [19]:
# Accuracy of the voting ensemble (in its current state) on the held-out test set.

voting_clf.score(X_test, y_test)

0.968

In [20]:
# Test-set accuracy of each fitted estimator that is still part of the
# voting ensemble, for comparison with the ensemble's own score.
[
    fitted_model.score(X_test, y_test)
    for fitted_model in voting_clf.estimators_
]

[0.9656, 0.9679]

In [22]:
##########################
#examples 8
##########################

In [25]:
# Collect the base models' predictions on the validation split.
estimators = [rnd_clf, ext_clf, lsv_clf]
# Result: a float32 matrix with one row per validation sample and one column
# per estimator, columns in the same order as `estimators`.
X_val_predictions = np.column_stack(
    [model.predict(X_val) for model in estimators]
).astype(np.float32)

In [26]:
# Inspect the prediction matrix: one row per validation sample, one column per base estimator.
X_val_predictions

array([[5., 5., 5.],
       [8., 8., 8.],
       [2., 2., 2.],
       ...,
       [7., 7., 7.],
       [6., 6., 6.],
       [7., 7., 7.]], dtype=float32)

In [27]:
# Blender (meta-learner): a seeded random forest trained on the base models'
# validation predictions. oob_score=True makes the fitted model expose
# `oob_score_`, which the next cell reads.
rnd_forest_blender = RandomForestClassifier(
    random_state=42,
    oob_score=True,
    n_estimators=200,
)
rnd_forest_blender.fit(X_val_predictions, y_val)


RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)

In [28]:
# Evaluate the blender via its out-of-bag score (available because it was built with oob_score=True).
rnd_forest_blender.oob_score_

0.9687

In [29]:
# Evaluate the stacked ensemble on the test set: first gather the base
# models' test-set predictions into a float32 matrix with one row per test
# sample and one column per estimator (same column order as `estimators`).
X_test_predictions = np.column_stack(
    [model.predict(X_test) for model in estimators]
).astype(np.float32)

In [30]:
# Final predictions: the blender maps the base models' test-set predictions to a class label.
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
# Accuracy of the blended predictions against the true test labels.
accuracy_score(y_test, y_pred)


0.9658