<a href="https://colab.research.google.com/github/hyogyeong/ESAA/blob/main/%EC%95%99%EC%83%81%EB%B8%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1-a. MNIST 데이터를 훈련, 검증, 테스트 데이터로 나눈다.

In [4]:
from sklearn.datasets import fetch_openml

In [6]:
import numpy as np
import pandas as pd

import os, struct
import requests, hashlib, gzip

def getMd5(data) :    
    hash = hashlib.new("md5")
    hash.update(data)
    return hash.hexdigest()

def fileDownload(url) :
    filename = getMd5(os.urandom(16))
    res = requests.get(url)
    res.raw.decode_content = True

    f = open(filename, 'wb')
    f.write(res.content)
    f.close()

    os.makedirs(filename + "_", exist_ok=True)

    with open(filename +"_/" + filename, "wb") as out_f, gzip.GzipFile(filename) as zip_f:
        out_f.write(zip_f.read())
    
    return filename

def removeFile(filename) : 
    os.remove(filename + "_/" + filename)
    os.rmdir(filename + "_")
    os.remove(filename)

def download_mnist(method):
    base_url = "https://ossci-datasets.s3.amazonaws.com/mnist/"
    download_link = {
        "train" : ('train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz'),
        "test" : ('t10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz')
    
    }
    images_path, labels_path = download_link[method]

    label_file_name = fileDownload(base_url + labels_path)

    with open(label_file_name + "_/" + label_file_name,'rb') as lbpath:
        magic, n = struct.unpack('>II',lbpath.read(8))
        labels = np.fromfile(lbpath, dtype=np.uint8)
    
    removeFile(label_file_name)

    image_file_name = fileDownload(base_url + images_path)

    with open(image_file_name + "_/" + image_file_name,'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII',imgpath.read(16))
        print(f"count of row = {num}, count of column = {rows * cols}")
        images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), rows * cols)
    
    removeFile(image_file_name)

    return images, labels


if __name__ == '__main__':
    X_train, y_train = download_mnist('train')

    x_df = pd.DataFrame(X_train)
    x_df.to_csv("train_dataset.csv", index=False)

    y_df = pd.DataFrame(y_train)
    y_df.to_csv("train_label.csv", index=False)

    X_test, y_test = download_mnist('test')

    x_df = pd.DataFrame(X_test)
    x_df.to_csv("test_dataset.csv", index=False)

    y_df = pd.DataFrame(y_test)
    y_df.to_csv("test_label.csv", index=False)

count of row = 60000, count of column = 784
count of row = 10000, count of column = 784


In [7]:
mnist = fetch_openml('mnist_784')



In [8]:
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(mnist.data, mnist.target, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=10000, random_state=42)

1-b. 랜덤 포레스트 분류기, 엑스트라 트리 분류기, SVM 분류기, MLP 분류기를 훈련시킨다. (n_estimators=100, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf =  ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [11]:
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
  print("Training the", estimator)
  estimator.fit(X_train, y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(max_iter=100, random_state=42, tol=20)
Training the MLPClassifier(random_state=42)


In [12]:
[estimator.score(X_val, y_val) for estimator in estimators] # 선형 SVM이 다른 분류기보다 성능이 많이 떨어진다.

[0.9692, 0.9715, 0.859, 0.9639]

1-c. 이들을 직접 투표 분류기를 사용하는 앙상블로 연결한다.

In [13]:
from sklearn.ensemble import VotingClassifier

In [14]:
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

In [15]:
voting_clf = VotingClassifier(named_estimators)

In [16]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(random_state=42)),
                             ('extra_trees_clf',
                              ExtraTreesClassifier(random_state=42)),
                             ('svm_clf',
                              LinearSVC(max_iter=100, random_state=42, tol=20)),
                             ('mlp_clf', MLPClassifier(random_state=42))])

In [17]:
voting_clf.score(X_val, y_val)

0.9708

In [26]:
[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]

[0.0, 0.0]

1-d. 다른 분류기보다 성능이 많이 떨어졌던 선형 SVM 제거 후 VotingClassifier를 다시 평가

In [20]:
voting_clf.set_params(svm_clf=None)
del voting_clf.estimators_[2]

In [21]:
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [22]:
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42)]

In [23]:
del voting_clf.estimators_[2]

IndexError: ignored

In [27]:
voting_clf.score(X_val, y_val)

0.9713

1-d. 간접 투표 분류기를 사용하여 앙상블로 연결.

In [28]:
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9719

In [29]:
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9648

In [31]:
[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

[0.0, 0.0]