# 尝试加入更多元信息（年龄、性别、噪音），以获得更具区分度的余弦相似度

## 1.加载模型与基础配置

In [1]:
#基本 GPU 信息
import torch, os, subprocess, sys
# Report the PyTorch build and whether CUDA is usable; when it is, also show
# the name of device 0 and the free / total memory returned by mem_get_info().
cuda_ok = torch.cuda.is_available()
print("Torch:", torch.__version__, "| CUDA 可用:", cuda_ok)
if cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))
    free, total = torch.cuda.mem_get_info()
    gib = 1024**3  # divisor used to express the raw values in the "GB" labels
    print(f"显存: 可用 {free/gib:.2f} GB / 总计 {total/gib:.2f} GB")

Torch: 2.8.0+cu128 | CUDA 可用: True
GPU: NVIDIA A100 80GB PCIe
显存: 可用 78.84 GB / 总计 79.25 GB


In [2]:
# 导入常用库
import os
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import random

# Fix every random seed so the t-SNE visualisation is as reproducible as possible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device: use the GPU whenever CUDA is available.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE


'cuda'

## 2.准备 BCI IV-2a 的四类英文描述（带指令）

In [3]:
# ===== Modular prompt generation (action / age / gender can be freely combined) =====

# Shared English instruction that is prepended to every prompt.
INSTRUCT = (
    "The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. "
    "Produce an embedding for retrieval with a strong emphasis on LATERALITY: "
    "if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; "
    "if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. "
    "Do not include any movement for the non-mentioned hand."
)

# Ages to enumerate; add or remove entries as needed.
AGES = [18, 65]

# Gender fragments spliced directly in front of the word "subject".
# The single empty string switches the gender dimension off; a list such as
# ["male ", "female "] (note the trailing space) switches it on.
GENDERS = [""]

# Action phrasings; `{label}` is filled with the natural-language class name.
TEMPLATES = [
    "imagined movement of the {label}",          # action phrasing 1
    "motor imagery of the {label}",              # action phrasing 2
]

# Class key -> natural-language phrase inserted into the templates.
label_text = {
    "left_hand":  "left hand",
    "right_hand": "right hand",
    "feet":       "both feet",
    "tongue":     "tongue",
}

# ===== Build the "age + gender + action" sentences =====
# Maps (age, gender, class) -> list of sentences, one per template.
# NOTE(review): the wording produces "A 18-year-old ..." and "... of the both
# feet"; it is kept byte-identical because changing the text would change the
# embeddings computed from it.
prompts_age_gender = {
    (age, gender, cls): [
        f"A {age}-year-old {gender}subject, " + tpl.format(label=phrase)
        for tpl in TEMPLATES
    ]
    for age in AGES
    for gender in GENDERS
    for cls, phrase in label_text.items()
}

# ===== Flatten into four parallel lists =====
all_texts, all_labels, all_ages, all_genders = [], [], [], []
for (age, gender, cls), sents in prompts_age_gender.items():
    for s in sents:
        # Every sentence is prefixed with the shared instruction.
        all_texts.append(f"{INSTRUCT} {s}")
        all_labels.append(cls)
        all_ages.append(age)
        all_genders.append(gender)

# Quick sanity check of the generated set size and the first few prompts.
dims = "×".join(str(len(axis)) for axis in (AGES, GENDERS, label_text, TEMPLATES))
print(f"总文本: {len(all_texts)}  | 年龄×性别×类别×模板 = {dims}")
print("示例前3条：")
for text in all_texts[:3]:
    print("-", text)

# The lists can then be used directly, e.g.:
# embeddings = model.encode(all_texts, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
# and all_labels / all_ages / all_genders allow grouped comparisons
# (such as cosine-distance matrices within one age or one gender).

总文本: 16  | 年龄×性别×类别×模板 = 2×1×4×2
示例前3条：
- The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A 18-year-old subject, imagined movement of the left hand
- The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A 18

## 3.加载 Qwen3-Embedding-0.6B 并批量编码

In [4]:
# ================== Load the model ==================
# Qwen3-Embedding-0.6B, loaded through sentence-transformers.
model_name = "Qwen/Qwen3-Embedding-0.6B"

# Notes:
# - SentenceTransformer takes care of the tokenizer and pooling and supports
#   normalize_embeddings=True.
# - Needs reasonably recent versions: transformers>=4.51.0,
#   sentence-transformers>=2.7.0.
model = SentenceTransformer(model_name, device=DEVICE)

# Show the embedding width so the model's output dimension can be confirmed.
embedding_dim = model.get_sentence_embedding_dimension()
print("Embedding 维度:", embedding_dim)

# ================== Reuse all_texts / all_labels / all_ages / all_genders ==================
# The prompts are not rebuilt here; they come from the previous cell.
print(f"总文本: {len(all_texts)} | 示例前3条：")
for sample in all_texts[:3]:
    print("-", sample)

# ================== Batch encoding ==================
encode_options = {
    "batch_size": 32,               # raise or lower according to available GPU memory
    "normalize_embeddings": True,   # L2-normalise outputs (matches cosine similarity)
    "convert_to_numpy": True,       # return a numpy array directly
    "show_progress_bar": True,
}
embeddings = model.encode(all_texts, **encode_options)

print("句向量形状:", embeddings.shape)  # printed as (N, D)

Embedding 维度: 1024
总文本: 16 | 示例前3条：
- The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A 18-year-old subject, imagined movement of the left hand
- The subject is seated comfortably in an armchair facing a computer screen, performing motor imagery tasks. Produce an embedding for retrieval with a strong emphasis on LATERALITY: if the text mentions the LEFT hand, assume exclusively left-hand imagery with NO right-hand involvement; if the text mentions the RIGHT hand, assume exclusively right-hand imagery with NO left-hand involvement. Do not include any movement for the non-mentioned hand. A 18-yea

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

句向量形状: (16, 1024)


## 4.余弦距离（左右手区分度）

In [5]:
# ========= 计算：不同模板 × 不同年龄 × 不同性别 的“左右手区分度” =========
import numpy as np
import pandas as pd

# 1) Recover the template ID of each prompt from its wording.
def get_template_id_from_text(text, instruct=INSTRUCT):
    """Return the ID of the action template that appears in ``text``.

    A leading ``instruct`` prefix is removed first, then the remainder is
    matched case-insensitively against the two known phrasings.

    Args:
        text: Full prompt, with or without the instruction prefix.
        instruct: Instruction prefix to strip when ``text`` starts with it.

    Returns:
        "T1" for "imagined movement of the ...", "T2" for
        "motor imagery of the ...", otherwise "OTHER".
    """
    body = text.strip().removeprefix(instruct).strip().lower()
    # Checked in order, so "T1" wins if both phrasings are present.
    known_phrasings = (
        ("imagined movement of the", "T1"),
        ("motor imagery of the", "T2"),
    )
    for marker, template_id in known_phrasings:
        if marker in body:
            return template_id
    return "OTHER"

# One template ID per entry of all_texts, in the same order.
template_ids = list(map(get_template_id_from_text, all_texts))

# 2) Utilities: L2 normalisation + cosine distance
def l2norm(x, axis=-1, eps=1e-12):
    """Scale ``x`` to unit L2 norm along ``axis``.

    Args:
        x: Array to normalise.
        axis: Axis along which the norm is taken.
        eps: Small constant added to the norm so a zero vector does not
            cause a division by zero.

    Returns:
        ``x`` divided by ``(norm + eps)``, with the norm broadcast over ``axis``.
    """
    norms = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (norms + eps)

def cosine_distance(a, b):
    """Return ``1 - dot(a, b)`` as a Python float.

    This equals the cosine distance only when ``a`` and ``b`` are already
    unit vectors; no normalisation is done here.
    """
    similarity = np.dot(a, b)
    # Subtract before casting so the arithmetic runs in the inputs' dtype.
    return float(1.0 - similarity)

# 3) For every (template, age, gender) combination, measure the distance
#    between the left-hand prototype and the right-hand prototype.

# Group row indices by (label, age, gender, template) in one pass so each
# combination below is a dictionary lookup instead of a full scan.
rows_by_key = {}
for i, key in enumerate(zip(all_labels, all_ages, all_genders, template_ids)):
    rows_by_key.setdefault(key, []).append(i)


def _prototype(rows):
    """Return the L2-normalised mean of the embedding rows listed in ``rows``."""
    return l2norm(embeddings[rows].mean(axis=0, keepdims=True))[0]


records = []
for tpl in ["T1", "T2"]:
    for age in AGES:
        for gender in GENDERS:
            idx_left = rows_by_key.get(("left_hand", age, gender, tpl), [])
            idx_right = rows_by_key.get(("right_hand", age, gender, tpl), [])

            if idx_left and idx_right:
                # Cosine distance between the prototypes (larger = more separable).
                dist = cosine_distance(_prototype(idx_left), _prototype(idx_right))
            else:
                # No data on one side: record NaN so the table still shows the row.
                dist = np.nan

            records.append({
                "template": tpl,
                "age": age,
                "gender": gender.strip(),
                "cos_dist_left_vs_right": dist,
                "n_left": len(idx_left),
                "n_right": len(idx_right),
            })

# 4) Collect the records into a table sorted by template / age / gender and print it.
df = pd.DataFrame(records)
df = df.sort_values(["template", "age", "gender"])


def _format_distance(value):
    """Render a distance with three decimals, or "NaN" when it is missing."""
    return f"{value:.3f}" if pd.notna(value) else "NaN"


print("\n=== Left vs Right separation by (template × age × gender) ===")
print(df.to_string(index=False, formatters={"cos_dist_left_vs_right": _format_distance}))

# 5) Optional: one pivot table per template (rows = age, columns = gender)
#    for a quick comparison across ages and genders.
print("\n=== Pivot: cos_dist (rows=age, cols=gender) per template ===")
for tpl in ["T1", "T2"]:
    rows_for_template = df.loc[df["template"] == tpl]
    sub = rows_for_template.pivot(
        index="age",
        columns="gender",
        values="cos_dist_left_vs_right",
    )
    print(f"\n[Template {tpl}]")
    print(sub.round(3))

# 6) Optional: mean distance per template, to see which phrasing separates
#    left from right better overall.
tpl_mean = df.groupby("template")["cos_dist_left_vs_right"].mean()
print("\n=== Mean separation per template (higher is better) ===")
print(tpl_mean.round(3))



=== Left vs Right separation by (template × age × gender) ===
template  age gender cos_dist_left_vs_right  n_left  n_right
      T1   18                         0.037       1        1
      T1   65                         0.036       1        1
      T2   18                         0.034       1        1
      T2   65                         0.034       1        1

=== Pivot: cos_dist (rows=age, cols=gender) per template ===

[Template T1]
gender       
age          
18      0.037
65      0.036

[Template T2]
gender       
age          
18      0.034
65      0.034

=== Mean separation per template (higher is better) ===
template
T1    0.036
T2    0.034
Name: cos_dist_left_vs_right, dtype: float64


In [6]:
banner = "=" * 50
print("\n" + banner)
print("     T1 与 T2 合并后的左右手区分度 (按年龄 x 性别)     ")
print(banner)

# 1. Records for the merged (T1 + T2) comparison.
records_merged = []

# 2. For each (age, gender) pair, compare the left-hand and right-hand
#    prototypes. The template is deliberately ignored here, so sentences
#    from both templates contribute to each prototype.
for age in AGES:
    for gender in GENDERS:
        # Gather the row indices of both hand classes in a single pass.
        rows = {"left_hand": [], "right_hand": []}
        for i, (lab, ag, gd) in enumerate(zip(all_labels, all_ages, all_genders)):
            if lab in rows and ag == age and gd == gender:
                rows[lab].append(i)
        idx_left_merged = rows["left_hand"]
        idx_right_merged = rows["right_hand"]

        if idx_left_merged and idx_right_merged:
            # Prototype = mean of the class embeddings, re-normalised to unit length.
            proto_left_merged = l2norm(embeddings[idx_left_merged].mean(axis=0, keepdims=True))[0]
            proto_right_merged = l2norm(embeddings[idx_right_merged].mean(axis=0, keepdims=True))[0]
            # Cosine distance (larger = more separable).
            dist_merged = cosine_distance(proto_left_merged, proto_right_merged)
        else:
            # One side has no data: keep the row but mark the distance as NaN.
            dist_merged = np.nan

        records_merged.append({
            "age": age,
            "gender": gender.strip(),
            "cos_dist_left_vs_right": dist_merged,
            "n_left": len(idx_left_merged),
            "n_right": len(idx_right_merged),
        })

# 3. Collect the merged records into a table sorted by age / gender and print it.
def _format_distance_4dp(value):
    """Render a distance with four decimals, or "NaN" when it is missing."""
    return f"{value:.4f}" if pd.notna(value) else "NaN"


df_merged = pd.DataFrame(records_merged)
df_merged = df_merged.sort_values(["age", "gender"])
print("\n=== Left vs Right separation by (age × gender) [T1+T2 Merged] ===")
print(df_merged.to_string(index=False, formatters={"cos_dist_left_vs_right": _format_distance_4dp}))

# 4. (Optional) Overall mean across every age / gender row.
overall_mean_dist = df_merged["cos_dist_left_vs_right"].mean()
print("\n=== T1+T2 总体平均区分度 (所有年龄和性别的平均值) ===")
print(f"{overall_mean_dist:.4f}")


     T1 与 T2 合并后的左右手区分度 (按年龄 x 性别)     

=== Left vs Right separation by (age × gender) [T1+T2 Merged] ===
 age gender cos_dist_left_vs_right  n_left  n_right
  18                        0.0356       2        2
  65                        0.0349       2        2

=== T1+T2 总体平均区分度 (所有年龄和性别的平均值) ===
0.0352
