In [1]:
import numpy as np
from sklearn.decomposition import PCA

# ===============================
# 1. Load the original genus-level embeddings
# ===============================

# NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, and
# unpickling can execute arbitrary code. Only load archives from a trusted
# source.
# The archive is opened as a context manager so the underlying file handle
# is closed as soon as the arrays have been read into memory.
with np.load("genus_embeddings.npz", allow_pickle=True) as data:
    genus = data["genus"]            # (G,)   genus labels
    embeddings = data["embeddings"]  # (G, D) one embedding row per genus
    counts = data["counts"]          # (G,)   sequences used per genus

# Unpacking into two names requires `embeddings` to be 2-D.
G, D = embeddings.shape
print(f"原始 embedding 形状: {embeddings.shape}  (Genus 数量 G={G}, 维度 D={D})")

# ===============================
# 2. PCA dimensionality reduction: D -> TARGET_DIM (768 -> 256)
# ===============================

TARGET_DIM = 256

# random_state pins the seed for any randomized SVD solver scikit-learn may
# select, so repeated runs give the same projection.
pca = PCA(n_components=TARGET_DIM, random_state=42)
embeddings_256 = pca.fit_transform(embeddings)   # (G, TARGET_DIM)

# Fraction of the total variance captured by each retained component.
explained = pca.explained_variance_ratio_
print(f"\n前 {TARGET_DIM} 个主成分解释的总方差比例: {explained.sum():.4f}")
print("前 10 个主成分的方差比例:", explained[:10])

print(f"\n降维后的 embedding 形状: {embeddings_256.shape}")

# ===============================
# 3. Save the reduced embeddings
# ===============================

np.savez_compressed(
    "genus_embeddings_256.npz",
    genus=genus,                   # same row order as the input archive
    embeddings=embeddings_256,     # (G, TARGET_DIM)
    counts=counts,                 # still records sequences used per genus
    explained_variance_ratio=explained  # kept for later inspection of the PCA
)

print("\n✅ 已保存降维后的文件: genus_embeddings_256.npz")


原始 embedding 形状: (1117, 768)  (Genus 数量 G=1117, 维度 D=768)

前 256 个主成分解释的总方差比例: 0.9858
前 10 个主成分的方差比例: [0.22598    0.10179504 0.07121984 0.0458119  0.04288924 0.03472724
 0.0321954  0.02696875 0.02554932 0.02055333]

降维后的 embedding 形状: (1117, 256)

✅ 已保存降维后的文件: genus_embeddings_256.npz
