# PCA
## 简介
PCA（主成分分析）是一种常用的降维方法。数据中方差最大的方向就是数据变化最大的方向，通常包含了数据最重要的信息；PCA 找出这些方向，并把数据投影到其中最重要的几个方向上。
## 在Numpy中实现PCA
伪代码：
```
去除平均值
计算协方差矩阵
计算协方差矩阵的特征值和特征向量
将特征值从大到小排列
保留最大的N个特征值所对应的特征向量
将数据转换到上述N个特征向量构建的新空间中
```

In [1]:
import numpy as np

In [16]:
def load_dataset(file_name, delimiter='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Each non-blank line becomes one row; its fields are separated by
    ``delimiter`` and converted with ``float``.

    Args:
        file_name: Path of the text file to read.
        delimiter: Field separator used on every line (tab by default).

    Returns:
        ``np.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    with open(file_name) as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no data
        # and would otherwise fail in float('').
        stripped = (line.strip() for line in f)
        rows = [
            [float(field) for field in line.split(delimiter)]
            for line in stripped
            if line
        ]
    # np.mat was removed in NumPy 2.0; np.asmatrix is the supported spelling
    # and returns the same np.matrix type.
    return np.asmatrix(rows)

In [35]:
def pca(dataset, top_n_feature):
    """Project ``dataset`` onto its ``top_n_feature`` principal components.

    Args:
        dataset: 2-D array-like with one sample per row and one feature
            per column.
        top_n_feature: Number of principal components to keep. A value
            larger than the number of features keeps every component.

    Returns:
        ``np.matrix`` with one row per sample and one column per kept
        component, ordered by decreasing eigenvalue. The values are the
        mean-centred data expressed in the basis of those eigenvectors.

    Raises:
        ValueError: If ``top_n_feature`` is less than 1.
    """
    if top_n_feature < 1:
        raise ValueError('top_n_feature must be at least 1')

    dataset = np.asmatrix(dataset)
    centered = dataset - np.mean(dataset, axis=0)  # remove mean
    cov_matrix = np.cov(centered, rowvar=0)  # columns are the variables
    eig_values, eig_vectors = np.linalg.eig(np.asmatrix(cov_matrix))

    # argsort orders smallest to largest, so walk backwards from the end to
    # take the indices of the top_n_feature largest eigenvalues, largest
    # first. (Indexing with a single negative position would pick only one
    # eigenvector instead of the leading N.)
    ascending = np.argsort(eig_values)
    leading = ascending[:-(top_n_feature + 1):-1]

    principal = eig_vectors[:, leading]
    # Matrix product: transform the centred data into the new dimensions.
    return centered * principal

In [17]:
# Load the sample data; load_dataset splits fields on tabs by default.
dataset = load_dataset('pca-dataset.txt')

In [18]:
dataset

matrix([[ 10.235186,  11.321997],
        [ 10.122339,  11.810993],
        [  9.190236,   8.904943],
        ..., 
        [  9.854922,   9.201393],
        [  9.11458 ,   9.134215],
        [ 10.334899,   8.543604]])

In [36]:
# Keep a single component, reducing the dataset to one column.
result = pca(dataset, 1)

[0 1]


In [37]:
result

matrix([[ -2.51033597e+00],
        [ -2.86915379e+00],
        [  9.74108510e-02],
        [ -7.67782222e-01],
        [  1.02715333e+00],
        [ -1.44409178e+00],
        [ -2.17360352e+00],
        [ -7.73998803e-01],
        [ -1.09983463e+00],
        [ -1.70275987e+00],
        [ -5.39605615e-01],
        [ -9.15572638e-01],
        [ -2.42669452e+00],
        [  2.93613464e+00],
        [  3.92702506e-01],
        [  2.59205734e+00],
        [  4.96435625e-01],
        [ -1.52069930e+00],
        [  1.25471104e-01],
        [  1.41846162e+00],
        [  5.89902904e-01],
        [  6.69601540e-01],
        [  3.25466068e-01],
        [  1.18812333e+00],
        [  2.07790920e+00],
        [  3.24938650e-01],
        [  3.41109884e+00],
        [  1.92249899e+00],
        [ -6.57788287e-01],
        [ -3.56836313e-01],
        [  5.72240108e-02],
        [  1.13444036e+00],
        [ -2.47196176e-01],
        [  1.14678135e+00],
        [  1.62503940e+00],
        [ -2.1148913