### 集成学习

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

##### 1. 读取数据

In [8]:
# Load the training and test splits; each one is a CSV stored inside a zip
# archive under ./data, so pandas is told to decompress it on the fly.
mnist_train = pd.read_csv(
    filepath_or_buffer="./data/mnist_train.zip",
    compression='zip',
)
mnist_test = pd.read_csv(
    filepath_or_buffer="./data/mnist_test.zip",
    compression='zip',
)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### 2. 数据分析

In [16]:
# Print the structural summary (index range, column count, dtypes, memory
# usage) of the training frame first and the test frame second.
for split_frame in (mnist_train, mnist_test):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 359.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 59.9 MB


Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,4.453933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200433,0.088867,0.045633,0.019283,0.015117,0.002,0.0,0.0,0.0,0.0
std,2.88927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.042472,3.956189,2.839845,1.68677,1.678283,0.3466,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


##### 3. 构建数据集

In [17]:
# Split each frame into features (every column except 'label') and the
# 'label' target column. `drop` returns a new frame, so the source frames
# are left untouched.
X_train, y_train = mnist_train.drop(columns='label'), mnist_train['label']
X_test, y_test = mnist_test.drop(columns='label'), mnist_test['label']

(      1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  1x10  ...  28x19  28x20  \
 0       0    0    0    0    0    0    0    0    0     0  ...      0      0   
 1       0    0    0    0    0    0    0    0    0     0  ...      0      0   
 2       0    0    0    0    0    0    0    0    0     0  ...      0      0   
 3       0    0    0    0    0    0    0    0    0     0  ...      0      0   
 4       0    0    0    0    0    0    0    0    0     0  ...      0      0   
 ...   ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...    ...    ...   
 9995    0    0    0    0    0    0    0    0    0     0  ...      0      0   
 9996    0    0    0    0    0    0    0    0    0     0  ...      0      0   
 9997    0    0    0    0    0    0    0    0    0     0  ...      0      0   
 9998    0    0    0    0    0    0    0    0    0     0  ...      0      0   
 9999    0    0    0    0    0    0    0    0    0     0  ...      0      0   
 
       28x21  28x22  28x23  28x24  28x25  28x26  2

##### 4. 构建模型

In [28]:
# Decision tree
# `fit` returns the estimator itself, so construction and training are chained.
dtc_clf = DecisionTreeClassifier(random_state=2021, min_samples_leaf=9).fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('DecisionTree train score: ', dtc_clf.score(X_train, y_train))
print('DecisionTree test score: ', dtc_clf.score(X_test, y_test))

# Same test accuracy computed explicitly from the predictions; as the last
# expression of the cell it is rendered as the cell output.
y_pred = dtc_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

DecisionTree train score:  0.92625
DecisionTree test score:  0.8785


0.8785

In [27]:
# Random forest
# random_state pins the bootstrap samples and per-split feature subsets so a
# Restart & Run All reproduces the same scores (same seed as the decision-tree
# cell). n_jobs=-1 builds the trees on all CPU cores; it only affects speed,
# not the fitted model.
rfc_clf = RandomForestClassifier(n_jobs=-1, random_state=2021)
rfc_clf.fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('RandomForestClassifier train score: ', rfc_clf.score(X_train, y_train))
print('RandomForestClassifier test score: ', rfc_clf.score(X_test, y_test))

# Same test accuracy computed from explicit predictions; shown as the cell output.
y_pred = rfc_clf.predict(X_test)
accuracy_score(y_test, y_pred)

RandomForestClassifier train score:  1.0
RandomForestClassifier test score:  0.9693


0.9693

In [30]:
# GBDT (gradient-boosted decision trees)
# With the default settings every split searches all feature columns, and the
# recorded run of this cell was interrupted before the fit finished.
#   max_features='sqrt' - consider only a random sqrt(n_features) subset of
#                         columns per split, which cuts the training cost;
#   random_state=2021   - makes that random subset (and the scores) reproducible,
#                         using the same seed as the decision-tree cell;
#   verbose=1           - prints boosting progress so a long fit is observable.
# NOTE(review): max_features changes the fitted model compared with the
# defaults; drop it if the exact default GBDT baseline is required.
gbdt_clf = GradientBoostingClassifier(
    max_features='sqrt',
    random_state=2021,
    verbose=1,
)
gbdt_clf.fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('GradientBoostingClassifier train score: ', gbdt_clf.score(X_train, y_train))
print('GradientBoostingClassifier test score: ', gbdt_clf.score(X_test, y_test))

# Same test accuracy computed from explicit predictions; shown as the cell output.
y_pred = gbdt_clf.predict(X_test)
accuracy_score(y_test, y_pred)

KeyboardInterrupt: 

In [None]:
# Bagging
# random_state fixes the bootstrap resampling so repeated runs give the same
# scores (same seed as the decision-tree cell). n_jobs=-1 fits the base
# estimators in parallel on all CPU cores.
bc = BaggingClassifier(n_jobs=-1, random_state=2021)
bc.fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('BaggingClassifier train score: ', bc.score(X_train, y_train))
print('BaggingClassifier test score: ', bc.score(X_test, y_test))

# Same test accuracy computed from explicit predictions; shown as the cell output.
y_pred = bc.predict(X_test)
accuracy_score(y_test, y_pred)

In [31]:
# AdaBoost
# random_state seeds the base estimator at each boosting iteration so the
# result is reproducible (same seed as the decision-tree cell).
ada_clf = AdaBoostClassifier(random_state=2021)
ada_clf.fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('AdaBoostClassifier train score: ', ada_clf.score(X_train, y_train))
print('AdaBoostClassifier test score: ', ada_clf.score(X_test, y_test))

# Same test accuracy computed from explicit predictions; shown as the cell output.
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

AdaBoostClassifier train score:  0.7284333333333334
AdaBoostClassifier test score:  0.7299


0.7299

In [None]:
# SVM
# Support vector classifier with scikit-learn's default settings; `fit`
# returns the estimator, so construction and training are chained.
svc_clf = SVC().fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('SVC train score: ', svc_clf.score(X_train, y_train))
print('SVC test score: ', svc_clf.score(X_test, y_test))

# Same test accuracy computed from explicit predictions; shown as the cell output.
y_pred = svc_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Voting ensemble
# VotingClassifier needs a non-empty list of (name, estimator) pairs; with an
# empty list `fit` raises ValueError. The ensemble is built from the decision
# tree, random forest and AdaBoost models defined in the cells above.
# VotingClassifier fits its own clones of these estimators, so the already
# trained objects are not modified. voting='hard' predicts the majority class
# label across the members.
voting_clf = VotingClassifier(
    estimators=[
        ('dtc', dtc_clf),
        ('rfc', rfc_clf),
        ('ada', ada_clf),
    ],
    voting='hard'
)

voting_clf.fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
print('VotingClassifier train score: ', voting_clf.score(X_train, y_train))
print('VotingClassifier test score: ', voting_clf.score(X_test, y_test))

# Same test accuracy computed from explicit predictions; shown as the cell output.
y_pred = voting_clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [20]:
# NOTE(review): leftover IPython `??` introspection (displays the object's
# source/docstring). It only works inside IPython/Jupyter and is not part of
# the analysis; consider removing this cell before sharing the notebook.
DecisionTreeClassifier??