# MNIST手写数字识别 ——PCA+LDA
图像数据维数高，而且特征之间（像素之间）相关性很高，因此我们先用PCA对其降维，然后采用LDA分类器进行手写体数字分类。

In [1]:
#导入必要的工具包
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Load the training and test sets from their CSV files.
train = pd.read_csv('./data/MNIST_train.csv')
test = pd.read_csv('./data/MNIST_test.csv')

# Split the training frame into the feature matrix (everything except the
# "label" column) and the target vector; every column of the test frame is
# used as a feature.
X_train = train.drop(labels="label", axis=1).values
y_train = train["label"].values
X_test = test.values

In [3]:
# Divide by 255 so that pixel intensities (assumed to lie in [0, 255])
# are rescaled to [0, 1]; both splits get the same scaling.
X_train, X_test = (pixels / 255.0 for pixels in (X_train, X_test))

In [4]:
# Report the number of samples and the feature dimension of the raw inputs.
for template, features in (
    ('the shape of train_image: {}', X_train),
    ('the shape of test_image: {}', X_test),
):
    print(template.format(features.shape))

the shape of train_image: (42000, 784)
the shape of test_image: (28000, 784)


## PCA降维

In [6]:
# Fit PCA on the training set only. With a fractional n_components and the
# full SVD solver, scikit-learn keeps as many components as are needed for
# the explained variance to exceed that fraction (here 95%).
pca = PCA(svd_solver='full', n_components=0.95).fit(X_train)

# Project both splits onto the components learned from the training set.
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

In [8]:
# Display (n_samples, n_components) of the PCA-reduced training set.
np.shape(X_train_pca)

(42000, 154)

## LDA分类器

In [7]:
# Train the LDA classifier on the PCA-reduced training data (fit returns the
# estimator itself), then display its predictions for the reduced test set.
lda = LinearDiscriminantAnalysis().fit(X_train_pca, y_train)
lda.predict(X_test_pca)

array([2, 0, 9, ..., 3, 9, 2])

In [16]:
# Predict test-set labels with the model trained on the whole PCA-reduced
# training set. The fitted classifier in this notebook is `lda`; the name
# `clf` used here previously is not defined anywhere and raised NameError.
y_predict = lda.predict(X_test_pca)

In [17]:
# Build the submission file: one predicted label per test image, indexed
# from 1 instead of 0.
import pandas as pd

df = pd.DataFrame(y_predict, columns=['Label'])
df.index = df.index + 1
df.index.name = 'Imageid'
# NOTE(review): the file name says "SVC" although the predictions come from
# the LDA model, and the index header is spelled 'Imageid' -- confirm both
# against the expected submission format before uploading.
df.to_csv(path_or_buf='SVC_Minist_submission.csv', header=True)

##  交叉验证, 评估模型性能

In [10]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the LDA classifier on the PCA-reduced training
# data. NOTE: despite its name, `loss` holds one score per fold (reported
# below as accuracy, higher is better), not a loss value.
loss = cross_val_score(estimator=lda, X=X_train_pca, y=y_train, cv=5)
print('accuracy of each fold is: ', loss)
print('cv accuracy is:', np.mean(loss))



('accuracy of each fold is: ', array([0.8707912 , 0.86838034, 0.86510299, 0.86542813, 0.87255836]))
('cv accuracy is:', <built-in method mean of numpy.ndarray object at 0x1a0cbe5f30>)


In [12]:
# Display the mean of the per-fold cross-validation scores.
np.mean(loss)

0.8684522029481091