# 主成分分析与因子分析

本 Notebook 使用多元统计方法对指标体系进行降维处理，
以提取数据要素流动的核心综合因子。


In [3]:
import sys
import os

# 获取项目根目录（假设 notebooks 和 src 在同一层）
project_root = os.path.abspath("..")
sys.path.append(project_root)

print(project_root)

C:\Users\czh\gba_data_flow_project


In [26]:
import pandas as pd
from src.pca_fa import run_pca

df = pd.read_csv("../data/processed/gba_core_indicators_clean.csv")

pca_result = run_pca(df, n_components=3)

pca_result["explained_variance"]
pca_result["components"]
pca_result["scores"].head()

KeyError: 'explained_variance'

In [25]:
import numpy as np

weights = pca_result["explained_variance"]
scores = pca_result["scores"]

composite_index = np.dot(scores, weights)

df["composite_index"] = composite_index
df.sort_values("composite_index", ascending=False)


KeyError: 'explained_variance'

In [35]:
pca_result = run_pca(df, n_components=3)

pca_result["explained_variance_ratio"]


array([0.90415561, 0.07382468, 0.01806123])

In [10]:
weights = pca.explained_variance_ratio_[:3]
weights = weights / weights.sum()

df["composite_index"] = (
    weights[0] * df["PC1"]
  + weights[1] * df["PC2"]
  + weights[2] * df["PC3"]
)


NameError: name 'pca' is not defined

In [27]:
import pandas as pd
from src.pca_fa import run_pca

df = pd.read_csv("../data/processed/gba_core_indicators_clean.csv")

pca_result = run_pca(df, n_components=3)

pca_result["explained_variance_ratio"]

array([0.90415561, 0.07382468, 0.01806123])

In [28]:
pca_result["loadings"]


Unnamed: 0,PC1,PC2,PC3
跨境数据传输总量_TB,0.009906,-6.8e-05,0.016436
数据交易额_亿元,0.001616,0.004113,-0.002285
GDP_亿元,0.961013,-0.257871,-0.081497
人均GDP_万元,0.000223,-8.7e-05,-0.000236
第三产业占比_%,0.000426,-0.000427,0.000118
研发经费投入_亿元,0.052507,-0.01387,-0.021633
发明专利授权量,0.22497,0.930618,-0.288496
高新技术企业数,0.034385,0.039368,0.167217
互联网国际出口带宽_Gbps,0.147014,0.255797,0.937777
算力规模_PFLOPS,0.013471,0.016132,0.044374


In [29]:
pca_result["scores"].head()


Unnamed: 0,PC1,PC2,PC3
0,-4407.274258,-86.937392,-105.15779
1,-2374.187695,2952.940443,-429.569126
2,842.628672,7782.871852,1620.783625
3,1011.706989,-565.036534,2307.987299
4,6565.852261,12074.195601,2444.833189


In [30]:
# 取 PCA 得分
scores = pca_result["scores"].copy()

# 方差解释率
weights = pca_result["explained_variance_ratio"]

# 计算综合指数
scores["CI"] = (
    weights[0] * scores["PC1"]
    + weights[1] * scores["PC2"]
    + weights[2] * scores["PC3"]
)

scores.head()


Unnamed: 0,PC1,PC2,PC3,CI
0,-4407.274258,-86.937392,-105.15779,-3993.179127
1,-2374.187695,2952.940443,-429.569126,-1936.393776
2,842.628672,7782.871852,1620.783625,1365.708808
3,1011.706989,-565.036534,2307.987299,914.711997
4,6565.852261,12074.195601,2444.833189,6872.082448


In [31]:
result = pd.concat(
    [df.iloc[:, :2].reset_index(drop=True), scores],
    axis=1
)

result.head()


Unnamed: 0,城市,年份,PC1,PC2,PC3,CI
0,东莞,2019,-4407.274258,-86.937392,-105.15779,-3993.179127
1,东莞,2020,-2374.187695,2952.940443,-429.569126,-1936.393776
2,东莞,2021,842.628672,7782.871852,1620.783625,1365.708808
3,东莞,2022,1011.706989,-565.036534,2307.987299,914.711997
4,东莞,2023,6565.852261,12074.195601,2444.833189,6872.082448


In [32]:
result["排名"] = (
    result.groupby("年份")["CI"]
    .rank(ascending=False, method="min")
    .astype(int)
)

result.sort_values(["年份", "排名"]).head(15)


Unnamed: 0,城市,年份,PC1,PC2,PC3,CI,排名
30,深圳,2019,17150.867107,-294.232201,1271.535013,15508.296516,1
50,香港,2019,12281.368187,-1461.652593,-1083.147003,10976.79888,2
15,广州,2019,11367.524217,587.735986,-3106.364851,10265.295381,3
0,东莞,2019,-4407.274258,-86.937392,-105.15779,-3993.179127,4
10,佛山,2019,-4858.329956,-7086.567899,-1105.13168,-4935.809906,5
5,中山,2019,-10517.29938,7930.075301,-2447.950441,-8968.052913,6
20,惠州,2019,-10207.771898,-830.023611,-174.557132,-9293.84312,7
35,澳门,2019,-11661.647506,-1273.197758,-51.310281,-10638.8641,8
40,珠海,2019,-12709.975963,-2275.769937,2974.587663,-11606.079276,9
45,肇庆,2019,-13654.342651,-2741.488834,1949.648294,-12512.826927,10


In [33]:
import os
os.makedirs("../outputs/tables", exist_ok=True)

result.to_excel(
    "../outputs/tables/综合指数_城市排名.xlsx",
    index=False
)

result.to_csv(
    "../outputs/tables/综合指数_城市排名.csv",
    index=False,
    encoding="utf-8-sig"
)


In [34]:
from docx import Document

doc = Document()
doc.add_heading("粤港澳大湾区数据要素流动综合评价报告", level=1)

doc.add_heading("一、研究对象与数据说明", level=2)
doc.add_paragraph(
    "本文以粤港澳大湾区11个城市为研究对象，选取2019—2023年数据，"
    "共形成55个样本，构建区域数据要素流动与数字经济发展水平的综合评价体系。"
)

doc.add_heading("二、指标体系", level=2)
doc.add_paragraph(
    "研究共选取11项指标，涵盖经济发展水平、创新能力、数字基础设施与数据流动规模。"
    "为消除量纲差异，所有指标均进行标准化处理。"
)

doc.add_heading("三、研究方法", level=2)
doc.add_paragraph(
    "采用主成分分析（PCA）方法对指标进行降维处理，"
    "通过提取主要主成分构建综合评价指数。"
)

doc.add_heading("四、主成分分析结果", level=2)
doc.add_paragraph(
    "结果显示，前三个主成分的方差解释率分别为："
    "PC1=90.42%，PC2=7.38%，PC3=1.81%，累计解释率达99.63%，"
    "说明所提取主成分能够较好反映原始指标信息。"
)

doc.add_heading("五、综合指数构建", level=2)
doc.add_paragraph(
    "根据各主成分的方差解释率，构建综合评价指数（CI）：\n"
    "CI = 0.9042×PC1 + 0.0738×PC2 + 0.0181×PC3。"
)

doc.add_heading("六、综合评价与排名结果", level=2)
doc.add_paragraph(
    "基于综合指数，对粤港澳大湾区各城市在不同年份的数据要素流动水平进行排序。"
    "结果表明，不同城市在数据要素配置与数字经济发展方面存在显著差异。"
)

doc.add_heading("七、结论", level=2)
doc.add_paragraph(
    "本文通过构建综合评价指数，对粤港澳大湾区数据要素流动水平进行了系统分析，"
    "为区域数字经济协同发展提供了定量参考。"
)

doc.save("../outputs/粤港澳大湾区数据要素流动综合评价报告.docx")


ModuleNotFoundError: No module named 'docx'