In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import IncrementalPCA
from tqdm import tqdm


In [4]:
# File pairs, one per dataset: (input CSV, output CSV for the reduced features).
input_files = [
    (
        'weibo_qwen3_features_train.csv',
        'weibo_qwen3_features_train_32.csv',
    ),
    (
        'weibo_qwen3_features_predict.csv',
        'weibo_qwen3_features_predict_32.csv',
    ),
]


In [5]:
# PCA settings.
chunksize = 100_000  # row count used as the chunk size when reading CSVs
n_components = 32    # number of principal components to keep
ipca = IncrementalPCA(n_components=n_components)


In [6]:
def _parse_feature_matrix(column):
    """Parse a column of whitespace-separated numbers into a 2-D float array.

    Parameters
    ----------
    column : pandas.Series
        Non-empty string column; every entry holds one feature vector written
        as numbers separated by whitespace.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(column), vector_length) with dtype float64.

    Raises
    ------
    ValueError
        If an entry is missing or not a string, a token is not a valid float,
        or the rows do not all contain the same number of values.
    """
    # split() without a separator collapses runs of whitespace and ignores
    # leading/trailing blanks, so doubled spaces or tabs never produce the
    # empty tokens that float() would reject.
    tokens = column.str.split()
    missing = tokens.isna()
    if missing.any():
        raise ValueError(
            f"content_feature is missing or not a string in {int(missing.sum())} "
            f"row(s); first offending index: {missing.idxmax()}"
        )
    rows = [np.array([float(tok) for tok in row], dtype=np.float64) for row in tokens]
    widths = {row.shape[0] for row in rows}
    if len(widths) > 1:
        raise ValueError(
            f"content_feature rows have inconsistent lengths: {sorted(widths)}"
        )
    return np.stack(rows)


# Step 1: fit a single IncrementalPCA on the chunks of every input file.
print("📌 第一步：拟合 IncrementalPCA（考虑两个输入文件）")
for input_file, _ in input_files:
    reader = pd.read_csv(input_file, chunksize=chunksize)
    for chunk in tqdm(reader, desc=f"Fitting {input_file}"):
        if chunk.empty:
            # Nothing to fit; np.stack cannot stack zero rows.
            continue
        X = _parse_feature_matrix(chunk['content_feature'])
        # NOTE(review): some scikit-learn releases reject a batch that has
        # fewer rows than n_components, so a very short final chunk may raise
        # here -- confirm against the installed version.
        ipca.partial_fit(X)

# Step 2: transform each file with the fitted model and write its own output file.
print("📌 第二步：对每个文件进行降维并保存")
for input_file, output_file in input_files:
    reader = pd.read_csv(input_file, chunksize=chunksize)
    first_chunk = True
    for chunk in tqdm(reader, desc=f"Transforming {input_file}"):
        if chunk.empty:
            # Still emit the (header-only) frame so the output file exists.
            reduced_str = []
        else:
            X = _parse_feature_matrix(chunk['content_feature'])
            X_reduced = ipca.transform(X)
            # Serialise each reduced vector as space-separated values, 6 decimals.
            reduced_str = [' '.join(f'{num:.6f}' for num in row) for row in X_reduced]

        df_out = pd.DataFrame({
            'uid': chunk['uid'],
            'mid': chunk['mid'],
            'content_feature': reduced_str
        })

        # The first chunk creates/overwrites the file and writes the header;
        # later chunks are appended without a header.
        df_out.to_csv(output_file, mode='w' if first_chunk else 'a', index=False, header=first_chunk)
        first_chunk = False

    print(f"✅ 已完成降维并保存：{output_file}")


📌 第一步：拟合 IncrementalPCA（考虑两个输入文件）


Fitting weibo_qwen3_features_train.csv: 13it [08:28, 39.10s/it]
Fitting weibo_qwen3_features_predict.csv: 2it [01:13, 36.63s/it]


📌 第二步：对每个文件进行降维并保存


Transforming weibo_qwen3_features_train.csv: 13it [07:34, 34.94s/it]


✅ 已完成降维并保存：weibo_qwen3_features_train_32.csv


Transforming weibo_qwen3_features_predict.csv: 2it [01:05, 32.58s/it]

✅ 已完成降维并保存：weibo_qwen3_features_predict_32.csv



