## 0. 导入依赖 & 基础配置 (v2)
在此版本中，我们不仅提取物种 Token，还将提取 Metadata 中的指定列作为 Token。

In [1]:
import pickle, pandas
import numpy as np
from MiCoGPT.utils.tools import extract_taxon
from MiCoGPT.utils.tokenizer import MiCoGPTokenizer

# --- 1. Input / output locations ---
# Abundance table (CSV).
CSV_PATH = "../data/try2_withCC/abundance_all_90338.csv"
# Sample metadata table (TSV).
META_PATH = "../data/try2_withCC/Phyloseq_ResMicroDb_metadata_merge_v2_251129.tsv"
# Destination of the pickled v2 tokenizer.
TOKENIZER_PKL_PATH = "../MiCoGPT/resources/MiCoGPTokenizer_v2.pkl"

# --- 2. Metadata columns that are turned into tokens ---
# Edit this list by hand to choose the columns. Missing values of every
# selected column get their own token, meta__{col}:<NA>.
# Candidate columns currently switched off:
#   "Sex", "Age_Group", "Continent", "Country", "Region_16S", "Is_Healthy"
TARGET_META_COLS = ["Sample_Site"]

print(f"Target Metadata Columns: {TARGET_META_COLS}")

Target Metadata Columns: ['Sample_Site']


## A. 提取物种 Token (Taxon)
与 v1 逻辑一致，从 CSV 第一列提取 Genus。

In [2]:
# Load the abundance table; the taxon names are read from its first column.
df = pandas.read_csv(CSV_PATH)
print("CSV shape:", df.shape)

full_taxon_name = df.iloc[:, 0]

# Map every name to its "Genus" token via extract_taxon, keeping the original
# order and dropping the names for which extract_taxon returns None.
genus_hits = [
    genus
    for genus in (extract_taxon(name, "Genus") for name in full_taxon_name)
    if genus is not None
]
taxon_tokens_total = len(genus_hits)

# dict.fromkeys de-duplicates while preserving first-seen order.
uniq_taxon_tokens = list(dict.fromkeys(genus_hits))
seen_taxon_tokens = set(uniq_taxon_tokens)

print(f"Total taxon tokens: {taxon_tokens_total}，Unique taxon tokens: {len(uniq_taxon_tokens)}")
print("First 5 unique taxon tokens:", uniq_taxon_tokens[:5])

CSV shape: (1117, 90339)
Total taxon tokens: 1117，Unique taxon tokens: 1117
First 5 unique taxon tokens: ['g__Stenotrophomonas', 'g__Bacteriovorax', 'g__Idiomarina', 'g__Eubacterium', 'g__Methylobacillus']


## B. 提取 Metadata Token
根据 `TARGET_META_COLS` 提取 Metadata Token。

- Token 命名规则：`meta__{col_name}:{value}`
- 空值命名规则：`meta__{col_name}:<NA>`

In [3]:
# Load the sample metadata table (tab-separated).
meta_df = pandas.read_csv(META_PATH, sep="\t", low_memory=False)
print("Metadata shape:", meta_df.shape)

# Metadata tokens in first-seen order, plus a set for O(1) duplicate checks.
meta_tokens = []
seen_meta_tokens = set()

for col in TARGET_META_COLS:
    if col not in meta_df.columns:
        print(f"[Warning] Column '{col}' not found in metadata, skipping.")
        continue

    print(f"Processing column: {col}...")
    unique_vals = meta_df[col].unique()

    # 1. Always register this column's dedicated missing-value token, even
    #    when the column has no missing entries in this table.
    na_token = f"meta__{col}:<NA>"
    if na_token not in seen_meta_tokens:
        meta_tokens.append(na_token)
        seen_meta_tokens.add(na_token)

    # 2. Register one token per distinct non-missing value.
    count = 0
    for val in unique_vals:
        if pandas.isna(val):
            continue

        # Normalise once and run every missing-value check on the stripped
        # text. Previously the "nan" comparison used the unstripped string, so
        # a whitespace-padded cell such as " NaN " slipped through and became
        # the bogus token meta__{col}:NaN instead of being treated as missing.
        val_str = str(val).strip()
        if val_str == "" or val_str.lower() == "nan":
            continue

        token = f"meta__{col}:{val_str}"

        # Values that differ only by surrounding whitespace collapse to the
        # same token, so de-duplicate on the final token string.
        if token not in seen_meta_tokens:
            meta_tokens.append(token)
            seen_meta_tokens.add(token)
            count += 1

    print(f"  -> Added {count} tokens for '{col}' (excluding <NA>)")

print(f"Total unique metadata tokens generated: {len(meta_tokens)}")
print("First 10 metadata tokens:", meta_tokens[:10])

Metadata shape: (90338, 43)
Processing column: Sample_Site...
  -> Added 10 tokens for 'Sample_Site' (excluding <NA>)
Total unique metadata tokens generated: 11
First 10 metadata tokens: ['meta__Sample_Site:<NA>', 'meta__Sample_Site:BALF', 'meta__Sample_Site:Nasopharynx', 'meta__Sample_Site:Sputum', 'meta__Sample_Site:Oropharynx', 'meta__Sample_Site:Trachea', 'meta__Sample_Site:Throat', 'meta__Sample_Site:Bronchus', 'meta__Sample_Site:Nasal', 'meta__Sample_Site:Lung Tissue']


## C. 构建并保存 Tokenizer v2

In [4]:
# Reserved control tokens; they occupy the first positions of the token list.
special_tokens = ['<pad>', '<mask>', '<bos>', '<eos>']

# Token order: special tokens, then metadata tokens, then taxon tokens.
all_tokens = [*special_tokens, *meta_tokens, *uniq_taxon_tokens]

tokenizer = MiCoGPTokenizer(all_tokens)

# Persist the tokenizer object with pickle at the configured path.
with open(TOKENIZER_PKL_PATH, "wb") as pkl_file:
    pickle.dump(tokenizer, pkl_file)

print(f"MiCoGPTokenizer v2 saved: {TOKENIZER_PKL_PATH}")
print(f"Final vocab size: {len(tokenizer.vocab)}")

MiCoGPTokenizer v2 saved: ../MiCoGPT/resources/MiCoGPTokenizer_v2.pkl
Final vocab size: 1132
