---
### 12.1.2 主成分分析的应用
#### 1. sklearn.decomposition 模块的 PCA 函数
```python
sklearn.decomposition.PCA(n_components=None)
```

In [1]:
import numpy as np
from scipy.stats import zscore
from sklearn.decomposition import PCA

# Read the data matrix and standardize it column-wise
# (z-score using the sample standard deviation, ddof=1).
data = np.loadtxt('../../12第12章  主成分分析与因子分析/data12_1.txt')
B = zscore(data, ddof=1)

# Fit a PCA that keeps every component.
md = PCA()
md.fit(B)

# Report the eigenvalues and the fraction of variance explained by each component.
print("特征值：\n", md.explained_variance_, sep='')
print("各成分贡献率：\n", md.explained_variance_ratio_, sep='')

特征值：
[6.27943341e+00 1.30599080e+00 2.74277611e-01 9.97624700e-02
 2.31016993e-02 1.22233669e-02 4.69357480e-03 5.17066158e-04]
各成分贡献率：
[7.84929176e-01 1.63248850e-01 3.42847014e-02 1.24703088e-02
 2.88771241e-03 1.52792086e-03 5.86696851e-04 6.46332697e-05]


In [2]:
# Coefficient matrix of the PCA fitted in the previous cell.
# NOTE(review): per scikit-learn, each row of components_ is one principal
# component's coefficient vector, ordered by decreasing explained variance.
xs1 = md.components_
print("各主成分系数\n", xs1)

各主成分系数
 [[ 0.39186166  0.38439344  0.3059243   0.39231584  0.38537825  0.38961322
   0.38389911  0.05908768]
 [ 0.0210478  -0.02214509 -0.47783697  0.08905605  0.19878256  0.1043221
   0.11376475 -0.83634169]
 [-0.14762176 -0.39254883 -0.57745685  0.20021017  0.15914777  0.1347164
   0.4303164   0.47108809]
 [-0.47832186 -0.48941314  0.44488132  0.27708115 -0.02393188  0.46564673
  -0.07989617 -0.18222681]
 [-0.09762149 -0.27604773  0.34873203 -0.35508891  0.52307577 -0.45643408
   0.4158212  -0.10824944]
 [-0.70552867  0.53549607 -0.049896    0.32914209  0.14716623 -0.28505603
   0.02367193  0.03426265]
 [-0.08102349  0.10492282  0.11905975 -0.07206231 -0.68369188 -0.02060535
   0.68706375 -0.15212634]
 [ 0.28557308 -0.29011082  0.09180137  0.6940343  -0.15286467 -0.55886444
  -0.05098258 -0.07665729]]


#### 2. 主成分回归分析

In [3]:
# Load the regression data set: the last column is the response,
# all preceding columns are predictors.
data = np.loadtxt('../../12第12章  主成分分析与因子分析/data12_2.txt')
x, y = data[:, :-1], data[:, -1]

# Standardize the whole table, then keep only the standardized predictors.
B = zscore(data, ddof=1)
x_std = B[:, :-1]

# PCA on the standardized predictors, keeping all components.
md = PCA()
md.fit(x_std)
print("特征值：", md.explained_variance_)
print("各主成分贡献率：", md.explained_variance_ratio_)
print("累积贡献率：", md.explained_variance_ratio_.cumsum())

特征值： [2.23570403e+00 1.57606607e+00 1.86606149e-01 1.62374573e-03]
各主成分贡献率： [5.58926009e-01 3.94016518e-01 4.66515373e-02 4.05936433e-04]
累积贡献率： [0.55892601 0.95294253 0.99959406 1.        ]


In [4]:
import statsmodels.formula.api as smf

# Number of principal components retained for the regression.
n = 3
coef = md.components_[:n]

# Project the standardized predictors onto the retained components.
z = x_std.dot(coef.T)

# Regress y on the component scores; the 2-D array under key 'x'
# enters the formula as a single multi-column term.
dic = dict(x=z, y=y)
mod = smf.ols(formula='y~x', data=dic).fit()
print(mod.summary())
print("\n残差方差：", mod.mse_resid)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     164.9
Date:                Sun, 28 Aug 2022   Prob (F-statistic):           3.50e-08
Time:                        16:01:27   Log-Likelihood:                -27.008
No. Observations:                  13   AIC:                             62.02
Df Residuals:                       9   BIC:                             64.28
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     95.4231      0.644    148.167      0.0



In [5]:
# Baseline for comparison: ordinary least squares on the raw
# (unstandardized, untransformed) predictors.
dic_ori = dict(x=x, y=y)
mod_ori = smf.ols(formula='y~x', data=dic_ori).fit()
print(mod_ori.summary())
print("\n残差方差：", mod_ori.mse_resid)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.974
Method:                 Least Squares   F-statistic:                     111.5
Date:                Sun, 28 Aug 2022   Prob (F-statistic):           4.76e-07
Time:                        16:01:27   Log-Likelihood:                -26.918
No. Observations:                  13   AIC:                             63.84
Df Residuals:                       8   BIC:                             66.66
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     62.4054     70.071      0.891      0.3



利用原始数据得到的回归模型所有系数都无法通过显著性检验

#### 3. 基于核主成分分析的高校科技创新能力评价
PCA 只能去除变量之间的线性相关信息，而忽略了它们之间的非线性相关问题。核主成分分析 (KPCA) 方法不仅适合处理非线性相关问题，而且能提供更多的特征信息

In [6]:
import pandas as pd
# Load the indicator table (no header row) and standardize each column
# with the sample standard deviation (ddof=1).
data = pd.read_excel('../../12第12章  主成分分析与因子分析/data12_3.xlsx', header=None).values
B = zscore(data, ddof=1, axis=0)
n = len(B)  # number of rows (objects being evaluated)

md1 = PCA().fit(B)
print("PCA特征值：\n", md1.explained_variance_)
r1 = md1.explained_variance_ratio_
print("\n累积贡献率：\n", np.cumsum(r1))
n1 = 4      # number of principal components retained
coef1 = md1.components_[:n1]
score1 = B @ coef1.T    # score of every row on each of the n1 retained components
grade1 = score1 @ r1[:n1]   # overall score: component scores weighted by contribution ratio
print("\n主成分评价得分：\n", grade1)

# Turn scores into ranks (1 = highest score). The rank vector is sized from
# the data instead of a hard-coded 15 so the cell works for any row count.
ind1 = np.argsort(-grade1)
ind11 = np.zeros(n)
ind11[ind1] = np.arange(1, n + 1)
print("\n主成分评价排名:\n", ind11)

PCA特征值：
 [7.50963582e+00 2.27720858e+00 1.47105704e+00 8.50786152e-01
 6.50545056e-01 5.06438457e-01 3.38461134e-01 2.41850500e-01
 7.58718928e-02 5.63367008e-02 1.62614416e-02 4.86075319e-03
 5.75813307e-04 1.10664593e-04]

累积贡献率：
 [0.53640256 0.69906031 0.80413582 0.86490626 0.91137376 0.94754794
 0.97172373 0.98899877 0.99441819 0.99844224 0.99960377 0.99995097
 0.9999921  1.        ]

主成分评价得分：
 [ 0.29285016  1.01659708 -1.22657941 -0.93647745 -1.64930206  0.03800517
  0.31471543  3.28572061 -1.12046447 -0.12595321  0.06706603  3.02572877
 -1.50884546 -1.21366531 -0.25939588]

主成分评价排名:
 [ 5.  3. 13. 10. 15.  7.  4.  1. 11.  8.  6.  2. 14. 12.  9.]


In [7]:
from sklearn.decomposition import KernelPCA

# Kernel PCA with a cubic polynomial kernel, keeping all components.
# Only the eigenvalues are needed here, so fit() is used rather than
# fit_transform(), whose projected output was previously discarded.
md2 = KernelPCA(kernel='poly', gamma=1, degree=3)
md2.fit(B)
print("KPCA特征值：\n", md2.eigenvalues_)

# Contribution ratio of each kernel component (eigenvalue / total).
r2 = md2.eigenvalues_ / md2.eigenvalues_.sum()
print("\n累积贡献率：\n", np.cumsum(r2))
n2 = 2      # number of kernel principal components retained

KPCA特征值：
 [8.59350160e+04 5.18397183e+04 3.49538554e+03 2.68967243e+03
 2.07508131e+03 1.66682709e+03 1.40496329e+03 7.51371594e+02
 5.98056083e+02 3.82459237e+02 3.57692083e+02 2.30948199e+02
 1.12394486e+02 6.32737809e+01]

累积贡献率：
 [0.56684298 0.90878717 0.93184337 0.94958494 0.96327255 0.97426725
 0.98353464 0.98849082 0.99243571 0.99495848 0.99731788 0.99884126
 0.99958263 1.        ]


In [8]:
# Refit the kernel PCA keeping only the first n2 components and project the data.
kpca = KernelPCA(kernel='poly', gamma=1, degree=3, n_components=n2)
data_reduced = kpca.fit_transform(B)

# Overall score: projected coordinates weighted by the contribution ratios.
grade2 = data_reduced @ r2[:n2]
print("核主成分评价得分：\n", grade2)

# Turn scores into ranks (1 = highest score). The rank vector is sized from
# the scores themselves instead of a hard-coded 15.
ind2 = np.argsort(-grade2)
n_samples = len(grade2)
ind22 = np.zeros(n_samples)
ind22[ind2] = np.arange(1, n_samples + 1)
print("\n核主成分评价排名:\n", ind22)

核主成分评价得分：
 [-15.36545542  -9.01617549 -20.17554842 -17.75344706 -29.45292287
 -15.82920031 -14.80003324 132.0678934  -19.96494244 -16.1127206
 -15.80928431 107.2055594  -26.07868277 -22.29994938 -16.61509049]

核主成分评价排名:
 [ 5.  3. 12. 10. 15.  7.  4.  1. 11.  8.  6.  2. 14. 13.  9.]
