In [2]:
# PCA（Principal Components Analysis）叫主成分分析技术，又称为主分量分析，旨在利用降维的思想，把多指标转换为少数几个综合指标
# 在统计学中，主成分分析PCA是一种简化数据集的技术
# 它是一个线性变换，这个变换把数据变换到一个新的坐标系统中，使得任何数据投影的第一大方差在第一个坐标（称为第一主成分）上，第二大方差在第二个坐标（第二主成分）上，以此类推。
# 主成分分析经常用于减少数据集的维数，同时保持数据集中对方差贡献最大的特征。

In [4]:
# 进行PCA降维前需要做数据标准化

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [8]:
# Seed NumPy's legacy global random state so any later np.random draws repeat across runs.
# NOTE(review): no visible cell draws random numbers — confirm the seed is still needed.
np.random.seed(42)

In [10]:
# Load the sample set (three feature columns plus a `class` label) from the working directory.
df = pd.read_csv(filepath_or_buffer="./pca.csv")

In [12]:
# Display the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,var1,var2,var3,class
0,5.248357,1.571557,0.120279,2
1,4.930868,1.612157,0.170031,0
2,5.323844,1.716610,0.200262,1
3,5.761515,1.710760,0.202349,2
4,4.882923,1.224466,0.177497,2
...,...,...,...,...
195,5.192659,1.406165,0.174499,1
196,4.558071,1.157373,0.186506,1
197,5.076863,1.770774,0.151062,0
198,5.029104,1.477092,0.177785,2


In [14]:
# Work on an independent deep copy so the raw frame `df` is left untouched
# when the label column is removed from X.
X = df.copy(deep=True)

In [16]:
# Split the label column off X: X.pop removes it in place and returns it as a Series.
# Guarded so that re-running this cell is a no-op instead of a KeyError once
# `class` has already been removed from X.
if "class" in X.columns:
    y = X.pop("class")

In [20]:
# Inspect the feature frame after `class` was popped off; only var1..var3 remain.
X

Unnamed: 0,var1,var2,var3
0,5.248357,1.571557,0.120279
1,4.930868,1.612157,0.170031
2,5.323844,1.716610,0.200262
3,5.761515,1.710760,0.202349
4,4.882923,1.224466,0.177497
...,...,...,...
195,5.192659,1.406165,0.174499
196,4.558071,1.157373,0.186506
197,5.076863,1.770774,0.151062
198,5.029104,1.477092,0.177785


In [22]:
# Inspect the label Series that was popped from X.
y

0      2
1      0
2      1
3      2
4      2
      ..
195    1
196    1
197    0
198    2
199    0
Name: class, Length: 200, dtype: int64

In [24]:
# Standardizer that removes each column's mean and scales it to unit variance
# (the defaults, spelled out here for the reader).
scaler = StandardScaler(with_mean=True, with_std=True)

In [26]:
# Learn per-column mean/std from X, then apply them to the same data.
X_std = scaler.fit(X).transform(X)

In [28]:
# Show the standardized feature matrix.
# NOTE(review): this prints the whole array; the `X_std[:10]` slice used further down is easier to read.
X_std

array([[ 5.78766514e-01,  2.76190961e-01, -1.52162464e+00],
       [-1.04981285e-01,  4.82377166e-01, -5.18095954e-01],
       [ 7.41336446e-01,  1.01284857e+00,  9.16730269e-02],
       [ 1.68390811e+00,  9.83139884e-01,  1.33765444e-01],
       [-2.08235092e-01, -1.48652939e+00, -3.67514555e-01],
       [-2.08217413e-01, -1.03977521e+00,  7.14540148e-01],
       [ 1.74440619e+00,  4.35909197e-01, -9.90329964e-01],
       [ 8.70279710e-01,  4.34640254e-01, -5.72076404e-02],
       [-4.61629869e-01,  4.35921811e-01,  2.07704993e-01],
       [ 6.28133586e-01,  3.82603986e+00,  6.05205583e-01],
       [-4.55107993e-01,  4.92641913e-01,  8.04061210e-01],
       [-4.57597631e-01,  1.06618796e+00, -1.04783735e+00],
       [ 3.04448485e-01,  8.81771756e-01, -1.46079739e+00],
       [-2.01632684e+00,  5.74407302e-01,  1.37494498e+00],
       [-1.81349730e+00, -4.07439208e-01,  4.21529387e-01],
       [-5.61571500e-01,  6.83675296e-01, -6.68477628e-01],
       [-1.04671897e+00, -8.72183278e-01

# P068 数据降维PCA-自己实现PCA降维算法

In [33]:
# 预计算：对输入X做标准化处理

# PCA计算步骤：
# 1.计算X_std的协方差矩阵
# 2.计算协方差矩阵的特征向量和对应的特征值
# 3.通过特征值的降序，对特征向量排序
# 4.确定降维后的维度数量（这里取2）
# 5.取特征值最大的两个特征向量，按列拼接成投影矩阵W
# 6.将X_std和W相乘，得到降维后的X_pca

In [35]:
# Preview the first 10 standardized rows that the hand-written PCA below consumes.
X_std[:10]

array([[ 0.57876651,  0.27619096, -1.52162464],
       [-0.10498128,  0.48237717, -0.51809595],
       [ 0.74133645,  1.01284857,  0.09167303],
       [ 1.68390811,  0.98313988,  0.13376544],
       [-0.20823509, -1.48652939, -0.36751455],
       [-0.20821741, -1.03977521,  0.71454015],
       [ 1.74440619,  0.4359092 , -0.99032996],
       [ 0.87027971,  0.43464025, -0.05720764],
       [-0.46162987,  0.43592181,  0.20770499],
       [ 0.62813359,  3.82603986,  0.60520558]])

In [37]:
# Step 1: feature-by-feature covariance matrix. np.cov treats rows as variables
# by default, so hand it the transpose (features as rows, samples as columns).
cov = np.cov(X_std.T)

In [39]:
# Inspect the 3x3 covariance matrix.
# The diagonal is ~1.005 rather than exactly 1 — presumably because np.cov divides by n-1
# while StandardScaler scales by the population std (200/199 ≈ 1.005); verify.
cov

array([[ 1.00502513,  0.0956248 , -0.1348713 ],
       [ 0.0956248 ,  1.00502513, -0.03278962],
       [-0.1348713 , -0.03278962,  1.00502513]])

In [41]:
# Step 2: eigen-decomposition of the covariance matrix.
# eig_vals[i] belongs to the column eig_vecs[:, i]; the values come back in no particular order.
# NOTE(review): cov is symmetric, so np.linalg.eigh would also apply, but it returns
# ascending eigenvalues and may flip eigenvector signs — the recorded outputs would change.
eig_vals, eig_vecs = np.linalg.eig(cov)

In [43]:
# Eigenvalues in the order returned by np.linalg.eig (not sorted).
eig_vals

array([1.18685219, 0.85399738, 0.9742258 ])

In [45]:
# Eigenvectors are the COLUMNS of this matrix: column i pairs with eig_vals[i].
eig_vecs

array([[ 0.67226955,  0.73695513, -0.07036184],
       [ 0.45838603, -0.33974121,  0.82125401],
       [-0.58132254,  0.58435696,  0.56620761]])

In [49]:
# Pair every eigenvalue magnitude with its eigenvector. Eigenvectors are the
# columns of eig_vecs, so walk the rows of the transpose alongside the values.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))

In [51]:
# (eigenvalue, eigenvector) tuples, still in the unsorted order of eig_vals.
eig_pairs

[(1.186852192269717, array([ 0.67226955,  0.45838603, -0.58132254])),
 (0.8539973831410598, array([ 0.73695513, -0.33974121,  0.58435696])),
 (0.9742258014736445, array([-0.07036184,  0.82125401,  0.56620761]))]

In [53]:
# Step 3: order the pairs by eigenvalue, largest first.
# Sort on the eigenvalue only: a plain tuple sort falls through to comparing the
# ndarrays when two eigenvalues are equal, which raises
# "The truth value of an array with more than one element is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

In [55]:
# Same pairs after the in-place sort: eigenvalues now run from largest to smallest.
eig_pairs

[(1.186852192269717, array([ 0.67226955,  0.45838603, -0.58132254])),
 (0.9742258014736445, array([-0.07036184,  0.82125401,  0.56620761])),
 (0.8539973831410598, array([ 0.73695513, -0.33974121,  0.58435696]))]

In [57]:
# Steps 4-5: keep the eigenvectors of the largest eigenvalues and place them side
# by side as the columns of the projection matrix W (n_features x n_components).
# column_stack turns each 1-D eigenvector into a column, so nothing here depends
# on the feature count (the original hard-coded reshape(3, 1)).
n_components = 2
W = np.column_stack([vec for _, vec in eig_pairs[:n_components]])

In [59]:
# Projection matrix: one row per original feature, one column per kept component.
W

array([[ 0.67226955, -0.07036184],
       [ 0.45838603,  0.82125401],
       [-0.58132254,  0.56620761]])

In [61]:
# Step 6: project the standardized samples onto the kept eigenvectors.
X_pca = X_std @ W

In [89]:
# Preview the first 10 projected rows.
# NOTE(review): this cell's execution count (89) is later than the sklearn cell (85) that
# rebinds X_pca, so the recorded output may be the sklearn result rather than the manual one.
X_pca[:10]

array([[ 1.40024388, -0.67545559],
       [ 0.45172009,  0.11019099],
       [ 0.90936196,  0.83155012],
       [ 1.50493688,  0.76466371],
       [-0.60774993, -1.41425596],
       [-1.03197495, -0.43469094],
       [ 1.94822698, -0.32547982],
       [ 0.81755166,  0.26332417],
       [-0.23126283,  0.50808781],
       [ 1.82425867,  3.44062596]])

# P069 数据降维PCA-组合降维结果和标签数据

In [66]:
# First 10 rows of the hand-computed 2-D projection, before wrapping it in a DataFrame.
X_pca[:10]

array([[ 1.40024388, -0.67545559],
       [ 0.45172009,  0.11019099],
       [ 0.90936196,  0.83155012],
       [ 1.50493688,  0.76466371],
       [-0.60774993, -1.41425596],
       [-1.03197495, -0.43469094],
       [ 1.94822698, -0.32547982],
       [ 0.81755166,  0.26332417],
       [-0.23126283,  0.50808781],
       [ 1.82425867,  3.44062596]])

In [70]:
# Wrap the projected array in a frame with one named column per principal component.
pca_columns = ["pca_1", "pca_2"]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)

In [72]:
# Peek at the first rows of the component frame (labels not attached yet).
df_pca.head()

Unnamed: 0,pca_1,pca_2
0,1.400244,-0.675456
1,0.45172,0.110191
2,0.909362,0.83155
3,1.504937,0.764664
4,-0.60775,-1.414256


In [74]:
# Attach the class labels to the projected samples.
# Use `y` (the label Series popped from X) rather than df["class"]: `df` is rebound to
# other frames further down the notebook, so re-running this cell later would either
# read the wrong frame or fail on a missing column. This also matches how the sklearn
# section attaches its labels.
df_pca["class"] = y

In [76]:
# Component frame with the class label attached.
df_pca

Unnamed: 0,pca_1,pca_2,class
0,1.400244,-0.675456,2
1,0.451720,0.110191,0
2,0.909362,0.831550,1
3,1.504937,0.764664,2
4,-0.607750,-1.414256,2
...,...,...,...
195,0.298818,-0.737599,1
196,-1.339902,-1.541966,1
197,1.254769,0.532984,0
198,0.188605,-0.379464,2


# P070 数据降维PCA-使用sklearn的PCA算法

In [79]:
# Same standardized input as the manual section; it is fed to sklearn's PCA below.
X_std[:10]

array([[ 0.57876651,  0.27619096, -1.52162464],
       [-0.10498128,  0.48237717, -0.51809595],
       [ 0.74133645,  1.01284857,  0.09167303],
       [ 1.68390811,  0.98313988,  0.13376544],
       [-0.20823509, -1.48652939, -0.36751455],
       [-0.20821741, -1.03977521,  0.71454015],
       [ 1.74440619,  0.4359092 , -0.99032996],
       [ 0.87027971,  0.43464025, -0.05720764],
       [-0.46162987,  0.43592181,  0.20770499],
       [ 0.62813359,  3.82603986,  0.60520558]])

In [81]:
from sklearn.decomposition import PCA

In [83]:
# sklearn PCA configured to keep 2 components, matching the hand-written version above.
pca = PCA(n_components=2)

In [85]:
# Fit on the standardized data and project it in one call.
# NOTE(review): this rebinds X_pca, which held the hand-computed projection — use a
# distinct name if the two results need to be compared side by side.
X_pca = pca.fit_transform(X_std)

In [91]:
# First 10 rows of the sklearn projection; the recorded values match the manual result above.
X_pca[:10]

array([[ 1.40024388, -0.67545559],
       [ 0.45172009,  0.11019099],
       [ 0.90936196,  0.83155012],
       [ 1.50493688,  0.76466371],
       [-0.60774993, -1.41425596],
       [-1.03197495, -0.43469094],
       [ 1.94822698, -0.32547982],
       [ 0.81755166,  0.26332417],
       [-0.23126283,  0.50808781],
       [ 1.82425867,  3.44062596]])

In [93]:
# Wrap the sklearn projection in a frame with one named column per component.
# Note: this rebinds `df`, which until now held the frame read from pca.csv.
pca_columns = ["pca_1", "pca_2"]
df = pd.DataFrame(X_pca, columns=pca_columns)

In [95]:
# Peek at the first rows of the sklearn component frame (labels not attached yet).
df.head()

Unnamed: 0,pca_1,pca_2
0,1.400244,-0.675456
1,0.45172,0.110191
2,0.909362,0.83155
3,1.504937,0.764664
4,-0.60775,-1.414256


In [97]:
# Attach the labels popped from X earlier; the assignment aligns on the index.
df["class"] = y

In [99]:
# sklearn component frame with the class label attached.
df 

Unnamed: 0,pca_1,pca_2,class
0,1.400244,-0.675456,2
1,0.451720,0.110191,0
2,0.909362,0.831550,1
3,1.504937,0.764664,2
4,-0.607750,-1.414256,2
...,...,...,...
195,0.298818,-0.737599,1
196,-1.339902,-1.541966,1
197,1.254769,0.532984,0
198,0.188605,-0.379464,2


# P071 数据降维PCA-计算观察方差分布

In [102]:
import numpy as np
import pandas as pd

In [104]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [106]:
# Load the ten-variable data set for the explained-variance section.
# `df` is rebound here and no longer refers to the PCA result frame.
df = pd.read_csv(filepath_or_buffer="./p071.csv")

In [108]:
# Peek at the first rows: ten columns, var_1 .. var_10.
df.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10
0,13.996714,2.178894,1.702786,17.270966,107,2.979189,2.888065,0.302612,1.142997,6.735694
1,13.361736,2.280392,2.200312,12.233504,131,3.278028,2.931631,0.053146,2.004682,4.243828
2,14.147689,2.541526,2.502622,17.608818,128,2.72905,4.057878,0.388645,3.250988,3.809796
3,15.02303,2.526901,2.52349,19.066914,106,3.319446,3.585941,0.070602,0.999803,2.575046
4,13.265847,1.311165,2.274967,16.240305,103,3.783654,2.959527,0.447615,1.458019,7.91591


In [110]:
# Pull the frame's contents out as a plain NumPy array for scikit-learn.
data = df.to_numpy()

In [116]:
# First five raw rows; the columns are on visibly different scales (e.g. var_5 is ~100).
data[:5]

array([[1.39967142e+01, 2.17889368e+00, 1.70278617e+00, 1.72709658e+01,
        1.07000000e+02, 2.97918925e+00, 2.88806490e+00, 3.02612243e-01,
        1.14299668e+00, 6.73569389e+00],
       [1.33617357e+01, 2.28039226e+00, 2.20031249e+00, 1.22335040e+01,
        1.31000000e+02, 3.27802832e+00, 2.93163073e+00, 5.31457084e-02,
        2.00468187e+00, 4.24382780e+00],
       [1.41476885e+01, 2.54152562e+00, 2.50262185e+00, 1.76088178e+01,
        1.28000000e+02, 2.72905044e+00, 4.05787800e+00, 3.88644820e-01,
        3.25098849e+00, 3.80979641e+00],
       [1.50230299e+01, 2.52690103e+00, 2.52349030e+00, 1.90669136e+01,
        1.06000000e+02, 3.31944552e+00, 3.58594100e+00, 7.06023660e-02,
        9.99802802e-01, 2.57504595e+00],
       [1.32658466e+01, 1.31116532e+00, 2.27496726e+00, 1.62403047e+01,
        1.03000000e+02, 3.78365393e+00, 2.95952694e+00, 4.47614611e-01,
        1.45801947e+00, 7.91591047e+00]])

In [120]:
# Fresh standardizer for the ten-variable data: learn per-column mean/std, then apply them.
scaler = StandardScaler(with_mean=True, with_std=True)
data_std = scaler.fit(data).transform(data)

In [124]:
# First five standardized rows.
data_std[:5]

array([[ 0.57876651,  0.27619096, -1.52162464,  0.7355021 , -0.60421313,
        -0.1354866 , -0.52154334,  0.04309511, -1.08345453,  0.79149995],
       [-0.10498128,  0.48237717, -0.51809595, -0.91554733,  1.02512566,
         0.42202426, -0.43880483, -2.45542557,  0.00292335, -0.36992773],
       [ 0.74133645,  1.01284857,  0.09167303,  0.84623449,  0.82145831,
        -0.60214281,  1.70011979,  0.90475045,  1.5742164 , -0.57222434],
       [ 1.68390811,  0.98313988,  0.13376544,  1.32413156, -0.67210225,
         0.49929173,  0.80383555, -2.28058921, -1.26398758, -1.14772612],
       [-0.20823509, -1.48652939, -0.36751455,  0.39769855, -0.8757696 ,
         1.3653138 , -0.38582545,  1.49535971, -0.68628651,  1.34158416]])

In [126]:
# Keep the first 3 principal components of the ten standardized variables.
pca = PCA(n_components=3)

In [128]:
# Fit the PCA on the standardized data and project it onto the 3 kept components.
data_pca = pca.fit_transform(data_std)

In [130]:
# First 10 projected rows (3 columns, one per kept component).
data_pca[:10]

array([[-0.18418816, -0.9995602 , -0.03114969],
       [ 0.47785544, -0.43106208, -1.12761124],
       [ 1.66201618,  0.79798703,  0.48929041],
       [-1.41856562, -0.39124297, -1.00334242],
       [-0.95127995, -0.61229063, -0.2061954 ],
       [-1.50575826, -0.54983598,  1.28598988],
       [ 1.22904261, -1.39777429,  1.67880378],
       [ 0.98559739,  0.24608349,  0.17627925],
       [ 0.51968301, -0.85100247,  0.41956888],
       [ 0.54877289, -1.07788715, -0.05099202]])

In [134]:
# One row per kept component: the fraction of total variance that component explains.
variance_ratio = pd.Series(pca.explained_variance_ratio_, name="variance_ratio")
results = variance_ratio.to_frame()

In [136]:
# Running total of the explained-variance ratios, in component order.
results["cumulative"] = results["variance_ratio"].cumsum()

In [138]:
# 1-based component number derived from the 0-based row index.
results["component"] = results.index + 1

In [140]:
# Variance table: in the recorded output the three components together explain ~38.9%.
results

Unnamed: 0,variance_ratio,cumulative,component
0,0.138184,0.138184,1
1,0.127378,0.265561,2
2,0.122946,0.388508,3


# P072 数据降维PCA-指定方差百分比计算分量数

In [143]:
# 如果降维后，想保留95%的方差百分比
# 计算对应的降维后分量数目

In [147]:
# Same standardized ten-variable input as the previous section.
data_std[:5]

array([[ 0.57876651,  0.27619096, -1.52162464,  0.7355021 , -0.60421313,
        -0.1354866 , -0.52154334,  0.04309511, -1.08345453,  0.79149995],
       [-0.10498128,  0.48237717, -0.51809595, -0.91554733,  1.02512566,
         0.42202426, -0.43880483, -2.45542557,  0.00292335, -0.36992773],
       [ 0.74133645,  1.01284857,  0.09167303,  0.84623449,  0.82145831,
        -0.60214281,  1.70011979,  0.90475045,  1.5742164 , -0.57222434],
       [ 1.68390811,  0.98313988,  0.13376544,  1.32413156, -0.67210225,
         0.49929173,  0.80383555, -2.28058921, -1.26398758, -1.14772612],
       [-0.20823509, -1.48652939, -0.36751455,  0.39769855, -0.8757696 ,
         1.3653138 , -0.38582545,  1.49535971, -0.68628651,  1.34158416]])

In [150]:
# A float n_components between 0 and 1 asks scikit-learn to pick the number of components
# needed for the cumulative explained variance to exceed that fraction (here 95%).
pca = PCA(n_components=0.95)
# Fit on the standardized data and project it; rebinds data_pca from the 3-component run.
data_pca = pca.fit_transform(data_std)

In [152]:
# Number of components PCA actually kept to reach the 95% target.
# The recorded output is 10, i.e. every component of the ten-column input was needed.
pca.n_components_

10

In [154]:
# First 10 rows of the projection onto all kept components.
data_pca[:10]

array([[-0.18418816, -0.9995602 , -0.03114969,  1.34588282, -0.83972814,
        -0.12250706, -0.76773933,  0.58219552, -0.76031385,  0.80819374],
       [ 0.47785544, -0.43106208, -1.12761124, -0.73556745,  0.89203511,
         0.91739116, -0.82250836, -1.61910294, -1.32494589,  0.14251192],
       [ 1.66201618,  0.79798703,  0.48929041,  0.97079235,  0.84888231,
         0.308894  ,  1.68651816,  0.00607098,  1.12247666,  0.59214204],
       [-1.41856562, -0.39124297, -1.00334242,  1.86692035,  0.94433735,
         2.03673892, -0.35153425, -1.36454481,  0.31130046,  1.12093176],
       [-0.95127995, -0.61229063, -0.2061954 ,  0.65643496, -1.30022257,
        -2.14229975, -0.37443913,  0.79811059,  0.38298958, -0.93598398],
       [-1.50575826, -0.54983598,  1.28598988,  0.03301075, -0.03296738,
        -0.9465066 ,  2.11224545,  0.37422059, -1.24686109, -0.94096253],
       [ 1.22904261, -1.39777429,  1.67880378,  0.53777728, -1.38523162,
         0.92460052,  0.35940551,  0.81609058