# 根据bci2a数据集中的元信息，拿到能用于第一次clip实验的句向量
## 代码来自metaData_Qw3_0.6B.ipynb
## 11.2 第一次实验

## 1.加载模型与基础配置

In [1]:
#基本 GPU 信息
import torch, os, subprocess, sys
print("Torch:", torch.__version__, "| CUDA 可用:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    free, total = torch.cuda.mem_get_info()
    print(f"显存: 可用 {free/1024**3:.2f} GB / 总计 {total/1024**3:.2f} GB")

Torch: 2.8.0+cu128 | CUDA 可用: True
GPU: NVIDIA A100 80GB PCIe
显存: 可用 78.84 GB / 总计 79.25 GB


In [2]:
# 导入常用库
import os
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import random

# 固定随机种子，保证 t-SNE 可视化尽量可复现
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# 选择设备（有 GPU 就用 GPU）
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE


'cuda'

## 2.准备 BCI IV-2a 的四类英文描述（带指令）

In [3]:
import torch
import numpy as np
import pickle
import os
import pandas as pd

# ================= 1. 实验设置 =================

# 统一的英文指令（保持不变）
INSTRUCT = (
    "The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. "
    "Produce an embedding for retrieval with a strong emphasis on LATERALITY: "
    "if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; "
    "if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. "
    "Do not include any movement for the non-mentioned hand."
)

# --- 受试者特定信息 (9个受试者) ---
# A01T 到 A09T 的年龄
ages_subject = [22, 24, 26, 24, 24, 23, 25, 23, 27] # 注意：最后一个受试者A09的年龄是27岁
# A01T 到 A09T 的性别 (0=male, 1=female)
gender_subject = [1, 1, 0, 1, 0, 1, 0, 0, 0] 
# 性别数字到文本的映射
GENDERS_MAP = ["male ", "female "] 

# --- 动作模板 ---
TEMPLATES = [
    "imagined movement of the {label}",   # 动作句式1
    "motor imagery of the {label}",       # 动作句式2
]

# --- 标签映射 ---
label_text = {
    "left_hand":  "left hand",
    "right_hand": "right hand",
    "feet":       "both feet",
    "tongue":     "tongue",
}

# ================== 2. 生成 Prompt ==================

# 预备四个并行列表
all_texts, all_labels, all_subjects = [], [], []

# 遍历9个受试者
# (使用 zip 将年龄和性别列表打包在一起)
for i, (age, gender_idx) in enumerate(zip(ages_subject, gender_subject)):
    subject_id = i + 1  # 受试者编号 (1 到 9)
    gender_str = GENDERS_MAP[gender_idx] # 将 0/1 转换成 "male "/"female "
    
    # 遍历4个动作类别
    for cls, phrase in label_text.items():
        
        # 遍历2个动作模板
        for tpl in TEMPLATES:
            # 构造一条完整句子: A {age}-year-old {gender}subject, + {动作模块}
            sent = f"A {age}-year-old {gender_str}subject, " + tpl.format(label=phrase)
            
            # 加上统一指令
            final_prompt = INSTRUCT + " " + sent
            
            # --- 收集结果 ---
            all_texts.append(final_prompt)   # 存储最终的Prompt
            all_labels.append(cls)           # 存储对应的类别标签
            all_subjects.append(subject_id)  # 存储对应的受试者ID

# ================== 3. 结果检查 ==================

# 总Prompt数 = 9个受试者 × 4个类别 × 2个模板 = 72
print(f"--- 实验设置 ---")
print(f"受试者数量: {len(ages_subject)}")
print(f"动作类别数量: {len(label_text)}")
print(f"模板数量: {len(TEMPLATES)}")
print(f"--- 总计 ---")
print(f"总计生成的Prompt数量: {len(all_texts)} (预期: {len(ages_subject) * len(label_text) * len(TEMPLATES)})")
print(f"对应的标签数量: {len(all_labels)}")
print(f"对应的受试者ID数量: {len(all_subjects)}\n")


print("--- 示例: 受试者1 (A01) 的前4条Prompts ---")
# 受试者1有 4(类别) * 2(模板) = 8 条prompt，我们只看前4条
for i in range(4):
    print(f"  [受试者 {all_subjects[i]}, 类别: {all_labels[i]}]")
    print(f"  Prompt: {all_texts[i]}\n")

print("--- 示例: 受试者3 (A03) 的第一条Prompt ---")
# 受试者1有8条, 受试者2有8条, 所以受试者3从索引16开始
idx_subject_3 = (3 - 1) * len(label_text) * len(TEMPLATES)
print(f"  [受试者 {all_subjects[idx_subject_3]}, 类别: {all_labels[idx_subject_3]}]")
print(f"  Prompt: {all_texts[idx_subject_3]}\n")



--- 实验设置 ---
受试者数量: 9
动作类别数量: 4
模板数量: 2
--- 总计 ---
总计生成的Prompt数量: 72 (预期: 72)
对应的标签数量: 72
对应的受试者ID数量: 72

--- 示例: 受试者1 (A01) 的前4条Prompts ---
  [受试者 1, 类别: left_hand]
  Prompt: The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A 22-year-old female subject, imagined movement of the left hand

  [受试者 1, 类别: left_hand]
  Prompt: The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand invo

## 3.加载 Qwen3-Embedding-0.6B 并批量编码

In [4]:
# ================== 加载模型（保持不变） ==================
# 加载 Qwen3-Embedding-0.6B
model_name = "Qwen/Qwen3-Embedding-0.6B"

# 说明：
# - SentenceTransformer 会自动处理 tokenizer + pooling，并支持 normalize_embeddings=True
# - 需较新版本的 transformers>=4.51.0、sentence-transformers>=2.7.0
model = SentenceTransformer(model_name, device=DEVICE)

# （可选）查看向量维度，便于确认模型输出维数
print("Embedding 维度:", model.get_sentence_embedding_dimension())

# ================== 直接使用你已生成的 all_texts / all_labels / all_ages / all_genders ==================
# 此处不再从 prompts 重建；all_texts 等已在你上一个代码块中构造完毕
print(f"总文本: {len(all_texts)} | 示例前3条：")
for i in range(min(3, len(all_texts))):
    print("-", all_texts[i])

# ================== 批量编码（保持参数风格一致） ==================
embeddings = model.encode(
    all_texts,
    batch_size=32,                 # 可按显存调大/调小
    normalize_embeddings=True,     # 输出向量做 L2 归一化（与余弦相似度匹配）
    convert_to_numpy=True,         # 直接拿到 numpy 数组
    show_progress_bar=True,
)

print("句向量形状:", embeddings.shape)  # 形如 (N, D)

Embedding 维度: 1024
总文本: 72 | 示例前3条：
- The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A 22-year-old female subject, imagined movement of the left hand
- The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

句向量形状: (72, 1024)


## 4.合并T1，T2, 输出余弦相似距离

In [6]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# --- 1. 定义工具函数 (l2norm) ---
def l2norm(x, axis=-1, eps=1e-12):
    """对Numpy数组进行L2归一化"""
    return x / (np.linalg.norm(x, axis=axis, keepdims=True) + eps)

print("\n" + "="*50)
print("     计算 36 个受试者特定的“平均原型向量”     ")
print("="*50)

# --- 2. 创建文件夹用于保存 ---
output_dir = "subject_prototypes"
os.makedirs(output_dir, exist_ok=True)
print(f"原型向量将保存在: ./{output_dir}/\n")

# --- 3. 准备工作 ---
# (确保 all_labels 和 all_subjects 是Numpy数组，以便于索引)
labels_np = np.array(all_labels)
subjects_np = np.array(all_subjects)

# 预备一个字典来存储36个原型
prototypes = {}
prototype_labels = [] # 用于打印余弦矩阵

# --- 4. 循环计算 9x4=36 个原型 ---
for subject_id in range(1, 10): # 遍历受试者 1 到 9
    for cls in label_text.keys(): # 遍历 4 个类别
        
        # 找到所有符合条件的向量的索引
        # (不区分T1或T2模板)
        indices = np.where(
            (subjects_np == subject_id) & (labels_np == cls)
        )[0]
        
        if len(indices) == 0:
            print(f"警告：未找到 受试者{subject_id} - 类别{cls} 的向量，跳过...")
            continue
            
        # --- 核心步骤: 计算平均原型 ---
        # 1. 从 'embeddings' 中提取所有相关向量
        # 2. .mean(axis=0) 在向量空间中计算均值
        # 3. l2norm(...) 重新归一化，确保它是一个单位向量
        proto_vector = l2norm(embeddings[indices].mean(axis=0, keepdims=True))[0]
        
        # 将Numpy数组转换为PyTorch张量
        proto_tensor = torch.tensor(proto_vector)
        
        # --- 5. 存储原型 ---
        key = f"S{subject_id:02d}_{cls}" # 例如: "S01_left_hand"
        prototypes[key] = proto_tensor
        prototype_labels.append(key)
        
        # 保存为单独的 .pt 文件
        save_path = os.path.join(output_dir, f"{key}.pt")
        torch.save(proto_tensor, save_path)

print(f"成功计算并保存了 {len(prototypes)} 个原型向量。\n")



# --- 6. 简单输出余弦距离 (以受试者1为例) ---
print("="*50)
print("     示例：受试者1 (S01) 的原型间余弦距离     ")
print("="*50)

# 提取受试者1的4个原型向量
s1_keys = [f"S01_{cls}" for cls in label_text.keys()]
s1_vectors = [prototypes[key].numpy() for key in s1_keys] # 转回Numpy以便于sklearn计算


# 使用 cosine_similarity 计算 4x4 的相似度矩阵
# (注意: 1.0 - 相似度 = 余弦距离)
cos_sim_matrix = cosine_similarity(s1_vectors)
cos_dist_matrix = 1.0 - cos_sim_matrix

# 将结果用Pandas美化输出
df_dist = pd.DataFrame(
    cos_dist_matrix,
    columns=label_text.keys(),
    index=label_text.keys()
)

print(df_dist.round(4))

print("\n(提示：对角线为0.0，值越大表示区分度越好)")


     计算 36 个受试者特定的“平均原型向量”     
原型向量将保存在: ./subject_prototypes/

成功计算并保存了 36 个原型向量。

     示例：受试者1 (S01) 的原型间余弦距离     
            left_hand  right_hand    feet  tongue
left_hand     -0.0000      0.0367  0.0503  0.0908
right_hand     0.0367     -0.0000  0.0662  0.1022
feet           0.0503      0.0662 -0.0000  0.1091
tongue         0.0908      0.1022  0.1091  0.0000

(提示：对角线为0.0，值越大表示区分度越好)


In [9]:
print("维度",(prototypes[0]).shape

KeyError: 0