In [None]:
import numpy as np
import pandas as pd
from scipy.linalg import svd

# 参考：PCA的数学原理
https://zhuanlan.zhihu.com/p/21580949

In [5]:
# Convention: every row is one sample, every column is one feature.
X = np.array([
    [1, -1, 1],
    [2, 1, 2],
    [-3, 2, 1],
    [1, 1, 2],
    [2, 1, 3],
    [3, 2, 2],
])
print(X.shape)
print(X)

# Center each feature (column) by subtracting its mean over all samples.
XC = X - X.mean(axis=0, keepdims=True)
XC

(6, 3)
[[ 1 -1  1]
 [ 2  1  2]
 [-3  2  1]
 [ 1  1  2]
 [ 2  1  3]
 [ 3  2  2]]


array([[ 0.        , -2.        , -0.83333333],
       [ 1.        ,  0.        ,  0.16666667],
       [-4.        ,  1.        , -0.83333333],
       [ 0.        ,  0.        ,  0.16666667],
       [ 1.        ,  0.        ,  1.16666667],
       [ 2.        ,  1.        ,  0.16666667]])

# PCA 计算协方差矩阵时会减去均值(协方差的定义) 

In [6]:
import numpy as np
from sklearn.decomposition import PCA

# PCA subtracts the mean when it builds the covariance matrix (that is part of
# the definition of covariance), so raw and pre-centered input give the same result.
# See lines 376-378 of the linked source (mean subtraction + SVD): https://github.com/scikit-learn/scikit-learn/blob/412996f/sklearn/decomposition/pca.py#L105
pca = PCA(n_components=2)
pca_raw = pca.fit_transform(X)
print(pca_raw)

print('center:')
pca_centered = pca.fit_transform(XC)
print(pca_centered)


[[-3.89363683e-03  2.15908343e+00]
 [-1.00652522e+00 -7.41792536e-02]
 [ 4.16259569e+00 -6.02573962e-01]
 [-3.84422851e-02 -5.15002105e-02]
 [-1.23717893e+00 -3.83180516e-01]
 [-1.87655562e+00 -1.04764948e+00]]
center:
[[-3.89363683e-03  2.15908343e+00]
 [-1.00652522e+00 -7.41792536e-02]
 [ 4.16259569e+00 -6.02573962e-01]
 [-3.84422851e-02 -5.15002105e-02]
 [-1.23717893e+00 -3.83180516e-01]
 [-1.87655562e+00 -1.04764948e+00]]


# TruncatedSVD 不会对数据做中心化：只有先将 X 减去均值再进行 SVD，结果才和 PCA 一样

In [70]:
# Truncated SVD
from sklearn.decomposition import TruncatedSVD
# TruncatedSVD is implemented on top of an SVD: https://github.com/scikit-learn/scikit-learn/blob/412996f/sklearn/decomposition/truncated_svd.py#L25

tsvd = TruncatedSVD(n_components=2)
tsvd_raw = tsvd.fit_transform(X)
print(tsvd_raw)
print('center:')
tsvd_centered = tsvd.fit_transform(XC)
print(tsvd_centered)

[[ 1.02138507 -0.92248091]
 [ 2.99927574  0.06586869]
 [-0.74717548  3.66377642]
 [ 2.3180092   0.73472539]
 [ 3.65822947  0.44918799]
 [ 3.99937748  0.03395549]]
center:
[[-3.89363683e-03  2.15908343e+00]
 [-1.00652522e+00 -7.41792536e-02]
 [ 4.16259569e+00 -6.02573962e-01]
 [-3.84422851e-02 -5.15002105e-02]
 [-1.23717893e+00 -3.83180516e-01]
 [-1.87655562e+00 -1.04764948e+00]]


# TruncatedSVD 可由 SVD 得到：取前 n 个最大奇异值及其对应的奇异向量（奇异向量的符号不唯一，因此下面手算结果的某些列可能与 sklearn 的输出相差一个正负号）

In [71]:
# SVD
from scipy.linalg import svd
def hand_tsvd(X, n_components=2):
    """Print the truncated-SVD projection of ``X`` built from a thin SVD.

    The thin SVD ``X = U @ diag(S) @ Vh`` is computed with ``scipy.linalg.svd``
    and the first ``n_components`` columns of ``U @ diag(S)`` are printed.

    Parameters
    ----------
    X : array_like, 2-D
        Data matrix with one sample per row.
    n_components : int, default 2
        Number of leading singular values/vectors to keep. If it exceeds the
        number of singular values, all of them are kept (plain slicing).

    Returns
    -------
    None
        The factor shapes and the projected samples are printed, not returned.

    Raises
    ------
    ValueError
        If ``n_components`` is smaller than 1.
    """
    if n_components < 1:
        raise ValueError("n_components must be a positive integer")

    # scipy hands back the right singular vectors already transposed (Vh).
    U, S, Vh = svd(X, full_matrices=False)
    print(U.shape, S.shape, Vh.shape)

    # The projected samples are U_k @ diag(S_k); Vh is only needed to map the
    # projection back to feature space, so it is not sliced here.
    U_k = U[:, :n_components]
    S_k = np.diag(S[:n_components])
    print(np.dot(U_k, S_k))

# Run the hand-rolled truncated SVD on the raw matrix first, then on the
# mean-centered copy, printing a separator between the two results.
hand_tsvd(X)
print('center:')
hand_tsvd(XC)

(6, 3) (3,) (3, 3)
[[-1.02138507 -0.92248091]
 [-2.99927574  0.06586869]
 [ 0.74717548  3.66377642]
 [-2.3180092   0.73472539]
 [-3.65822947  0.44918799]
 [-3.99937748  0.03395549]]
center:
(6, 3) (3,) (3, 3)
[[-3.89363683e-03 -2.15908343e+00]
 [-1.00652522e+00  7.41792536e-02]
 [ 4.16259569e+00  6.02573962e-01]
 [-3.84422851e-02  5.15002105e-02]
 [-1.23717893e+00  3.83180516e-01]
 [-1.87655562e+00  1.04764948e+00]]
