### 集成学习

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

##### 1. 读取数据

In [3]:
# Both splits are stored as zip-compressed CSV files under ./data.
train_csv_path = "./data/mnist_train.zip"
test_csv_path = "./data/mnist_test.zip"

mnist_train = pd.read_csv(train_csv_path, compression='zip')
mnist_test = pd.read_csv(test_csv_path, compression='zip')

##### 2. 数据分析

In [4]:
# Print the schema / memory summary of each split, training split first.
for split_frame in (mnist_train, mnist_test):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 359.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 59.9 MB


##### 3. 构建数据集

In [5]:
# Targets: the 'label' column of each split.
y_train = mnist_train['label']
y_test = mnist_test['label']

# Features: every remaining column of each split.
X_train = mnist_train.drop(columns='label')
X_test = mnist_test.drop(columns='label')

##### 4. 构建模型

In [6]:
# Decision tree
# fit() returns the estimator itself, so construction and training are chained.
dtc_clf = DecisionTreeClassifier(random_state=2021, min_samples_leaf=9).fit(X_train, y_train)

# Mean accuracy on each split.
dtc_train_acc = dtc_clf.score(X_train, y_train)
dtc_test_acc = dtc_clf.score(X_test, y_test)
print('DecisionTree train score: ', dtc_train_acc)
print('DecisionTree test score: ', dtc_test_acc)

# Test accuracy recomputed from explicit predictions; shown as the cell output.
y_pred = dtc_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

DecisionTree train score:  0.92625
DecisionTree test score:  0.8785


0.8785

In [7]:
# Random forest
# random_state is fixed (same seed as the decision tree cell) so that the
# bootstrap samples and feature sub-sampling -- and therefore the reported
# scores -- are reproducible across kernel restarts.
rfc_clf = RandomForestClassifier(n_estimators=10, random_state=2021)
rfc_clf.fit(X_train, y_train)

# Mean accuracy on each split.
print('RandomForestClassifier train score: ', rfc_clf.score(X_train, y_train))
print('RandomForestClassifier test score: ', rfc_clf.score(X_test, y_test))

# Test accuracy recomputed from explicit predictions; shown as the cell output.
y_pred = rfc_clf.predict(X_test)
accuracy_score(y_test, y_pred)

RandomForestClassifier train score:  0.9991166666666667
RandomForestClassifier test score:  0.9498


0.9498

In [8]:
# GBDT (gradient-boosted decision trees)
# random_state is fixed (same seed as the decision tree cell) so that the
# reported scores are reproducible across kernel restarts.
gbdt_clf = GradientBoostingClassifier(n_estimators=30, random_state=2021)
gbdt_clf.fit(X_train, y_train)

# Mean accuracy on each split.
print('GradientBoostingClassifier train score: ', gbdt_clf.score(X_train, y_train))
print('GradientBoostingClassifier test score: ', gbdt_clf.score(X_test, y_test))

# Test accuracy recomputed from explicit predictions; shown as the cell output.
y_pred = gbdt_clf.predict(X_test)
accuracy_score(y_test, y_pred)

GradientBoostingClassifier train score:  0.9074333333333333
GradientBoostingClassifier test score:  0.9057


0.9057

In [9]:
# Bagging
# Base learner: decision tree.
# Each of the 10 trees is trained on a random 50% sample of the rows and on
# all features. random_state is fixed (same seed as the decision tree cell)
# so the sampling, and therefore the scores, are reproducible.
bc = BaggingClassifier(
    DecisionTreeClassifier(),
    max_samples=0.5,
    max_features=1.0,
    n_estimators=10,
    random_state=2021)
bc.fit(X_train, y_train)

print('BaggingClassifier train score: ', bc.score(X_train, y_train))
print('BaggingClassifier test score: ', bc.score(X_test, y_test))
y_pred = bc.predict(X_test)
print('基于决策树的bagging acc: ', accuracy_score(y_test, y_pred))

# Base learner: logistic regression.
# Same sampling scheme with 20 estimators, seeded for reproducibility.
# NOTE(review): the recorded run of this cell still logged
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings with max_iter=500; the
# warning text suggests raising max_iter or scaling the input features.
bgc = BaggingClassifier(
    LogisticRegression(max_iter=500),
    max_samples=0.5,
    max_features=1.0,
    n_estimators=20,
    random_state=2021)
bgc.fit(X_train, y_train)

print('BaggingClassifier train score: ', bgc.score(X_train, y_train))
print('BaggingClassifier test score: ', bgc.score(X_test, y_test))
y_pred = bgc.predict(X_test)
print('基于逻辑回归的bagging acc: ', accuracy_score(y_test, y_pred))

BaggingClassifier train score:  0.9882833333333333
BaggingClassifier test score:  0.9429
基于决策树的bagging acc:  0.9429
BaggingClassifier train score:  0.9426
BaggingClassifier test score:  0.9245
基于逻辑回归的bagging acc:  0.9245


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [None]:
# AdaBoost
# Base learner: decision tree.
# random_state is fixed (same seed as the decision tree cell) so that the
# boosted ensemble and its reported scores are reproducible.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=10,
    learning_rate=0.01,
    random_state=2021)
ada_clf.fit(X_train, y_train)

print('AdaBoostClassifier train score: ', ada_clf.score(X_train, y_train))
print('AdaBoostClassifier test score: ', ada_clf.score(X_test, y_test))
y_pred = ada_clf.predict(X_test)
print('基于决策树的adaboost acc: ', accuracy_score(y_test, y_pred))

# Base learner: logistic regression.
# The name ada_clf is rebound here, so the tree-based ensemble fitted above is
# no longer reachable once this part of the cell has run.
# NOTE(review): LogisticRegression() uses its default max_iter; the bagging
# cell's logistic regression hit the iteration limit even at max_iter=500, so
# convergence warnings are likely here as well -- confirm when running.
ada_clf = AdaBoostClassifier(
    LogisticRegression(),
    n_estimators=20,
    learning_rate=0.01,
    random_state=2021)
ada_clf.fit(X_train, y_train)

print('AdaBoostClassifier train score: ', ada_clf.score(X_train, y_train))
print('AdaBoostClassifier test score: ', ada_clf.score(X_test, y_test))
y_pred = ada_clf.predict(X_test)
print('基于逻辑回归的adaboost acc: ', accuracy_score(y_test, y_pred))

In [None]:
# SVM
# Support vector classifier with library-default settings; fit() returns the
# estimator itself, so construction and training are chained.
svc_clf = SVC().fit(X_train, y_train)

# Mean accuracy on each split.
svc_train_acc = svc_clf.score(X_train, y_train)
svc_test_acc = svc_clf.score(X_test, y_test)
print('SVC train score: ', svc_train_acc)
print('SVC test score: ', svc_test_acc)

# Test accuracy recomputed from explicit predictions; shown as the cell output.
y_pred = svc_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Voting ensemble
# Hard (majority-class) voting over the three tree-based models defined in the
# earlier cells. An empty `estimators` list is rejected by VotingClassifier,
# so the (name, estimator) pairs must be supplied explicitly.
# VotingClassifier fits clones of the estimators it is given, so dtc_clf,
# rfc_clf and gbdt_clf themselves are left untouched by voting_clf.fit().
voting_clf = VotingClassifier(
    estimators=[
        ('dtc', dtc_clf),
        ('rfc', rfc_clf),
        ('gbdt', gbdt_clf),
    ],
    voting='hard'
)

voting_clf.fit(X_train, y_train)
print('VotingClassifier train score: ', voting_clf.score(X_train, y_train))
print('VotingClassifier test score: ', voting_clf.score(X_test, y_test))

# Test accuracy recomputed from explicit predictions; shown as the cell output.
y_pred = voting_clf.predict(X_test)
accuracy_score(y_test, y_pred)
