## 0. 导入依赖 & 基础配置

In [1]:
import re
from pickle import dump, load   # pickle is used to save / load the tokenizer
from typing import List, Optional

import numpy as np
import pandas as pd
from transformers import PreTrainedTokenizer

# Path to the species-composition CSV; its first column holds the prefixed taxon names.
CSV_PATH = "../data/try2_withCC/abundance_all_90338.csv"
# Column holding the taxonomy strings: an int is used positionally (iloc),
# any other value is used as a column label.
TAXON_COLUMN = 0
# File the pickled tokenizer is written to and later read back from.
TOKENIZER_PKL_PATH = "../MiCoGPT/resources/MiCoGPTokenizer.pkl"
# Output CSV receiving the per-row mean / std of relative abundance.
OUTPUT_PHYLO_FILE = "../MiCoGPT/resources/phylogeny.csv"

## 0.1. 定义 “多前缀 → g__属名” 的转换函数

In [2]:
def extract_genus_like_microcorpus(raw_name: str) -> Optional[str]:
    """Reduce a multi-prefix taxonomy string to a single genus token.

    Processing steps, in order:
      1. drop the single space that follows each semicolon;
      2. cut off the species level (``;s__...`` and everything after it);
      3. rewrite a leading ``k__`` prefix to ``sk__``;
      4. extract the first ``g__<name>`` segment.

    Args:
        raw_name: Taxonomy string such as
            ``"k__Bacteria; p__Firmicutes; ...; g__Bacteroides; s__fragilis"``.
            Non-string values are converted with ``str()``.

    Returns:
        ``"g__<name>"`` when a genus segment is present. If none is found, the
        last ``;``-separated level is returned as a fallback and a warning is
        printed. ``None`` is returned for ``None``, float NaN, and
        empty / whitespace-only input.
    """
    # Missing CSV cells come out of pandas as float NaN rather than None.
    # NaN is the only float that is not equal to itself; without this guard
    # str(nan) would turn into the bogus token 'nan'.
    if raw_name is None or (isinstance(raw_name, float) and raw_name != raw_name):
        return None

    name = str(raw_name).strip()
    if not name:
        return None

    name = name.replace("; ", ";")             # 1) drop the space after semicolons
    name = re.sub(r";s__.*", "", name)         # 2) drop the species level and anything after it
    name = re.sub(r"^k__", "sk__", name)       # 3) leading k__ becomes sk__ (only visible in the fallback path)
    m = re.search(r"(g__[^;]+)", name)         # 4) pull out 'g__XXXX'

    if m:
        return m.group(1)   # canonical form: g__Genus

    # Only reachable when the string starts with a bare "g__" that has no name
    # after it (the regex above needs at least one character after the prefix).
    if name.startswith("g__"):
        return name

    # Last resort: use the deepest level as the token and print a warning so
    # the row can be inspected, e.g. "k__Bacteria; p__Firmicutes; ...; Bacteroides".
    parts = name.split(";")
    fallback = parts[-1]
    print(f"[warning] 未找到 g__ 前缀，使用最后一级作为 token: {raw_name!r} -> {fallback!r}")
    return fallback

## A. 生成 MiCoGPTokenizer.pkl
### A.1. 定义 tokenizer

In [27]:
class MiCoGPTokenizer(PreTrainedTokenizer):
    """
    Minimal vocabulary-lookup tokenizer.

    - Takes a list of tokens (e.g. ['<pad>', '<mask>', 'g__Bacteroides', ...]).
    - Builds the token -> id and id -> token mappings; a token's id is its
      position in that list.
    - Registers the special tokens <pad>, <mask>, <bos>, <eos>; those not yet
      in the vocabulary are appended at the end.

    `_tokenize` is simply `list(text)`: a plain string is split into single
    characters, while a sequence is copied element by element.
    NOTE(review): presumably callers pass already-split genus tokens rather
    than raw strings -- confirm against the calling code.
    """
    def __init__(self, toks, **kwargs):
        # Build the lookup tables BEFORE calling the base constructor. Newer
        # transformers releases call back into _add_tokens()/get_vocab() from
        # PreTrainedTokenizer.__init__, which fails if `self.vocab` is missing.
        self.toks = list(toks)  # private copy, the caller's list is never mutated
        self.vocab = {v: i for i, v in enumerate(self.toks)}
        self.ids_to_tokens = {i: v for i, v in enumerate(self.toks)}

        super().__init__(**kwargs)

        # Register the special tokens; routed through _add_tokens below, so
        # ones already present in `toks` keep their existing id.
        self.add_special_tokens({
            'pad_token': '<pad>',
            'mask_token': '<mask>',
            'bos_token': '<bos>',
            'eos_token': '<eos>',
        })

    def _tokenize(self, text):
        """Return `list(text)` (see the class docstring for what that implies)."""
        return list(text)

    def _add_tokens(self, new_tokens: List[str], special_tokens: bool = False) -> int:
        """Append tokens that are not yet in the vocabulary.

        Args:
            new_tokens: Candidate tokens. Objects exposing a `.content`
                attribute (AddedToken-like) are reduced to that text.
            special_tokens: Accepted for interface compatibility; not used.

        Returns:
            Number of tokens actually added.
        """
        added = 0
        for tok in new_tokens:
            # Some transformers releases hand over AddedToken objects instead
            # of plain strings; keep the vocabulary keyed by str only.
            tok = getattr(tok, "content", tok)
            # Checking against the live vocab also skips a token repeated
            # within this same call, so one token never receives two ids.
            if tok in self.vocab:
                continue
            idx = len(self.toks)  # next id == position in the token list
            self.toks.append(tok)
            self.vocab[tok] = idx
            self.ids_to_tokens[idx] = tok
            added += 1
        return added

    def _convert_token_to_id(self, token):
        """Look up a token's id; raises KeyError for unknown tokens (no <unk> fallback)."""
        return self.vocab[token]

    def _convert_id_to_token(self, index):
        """Look up the token for an id; raises KeyError for unknown ids."""
        return self.ids_to_tokens[index]

    def get_vocab(self):
        """Return the token -> id dict (the live internal mapping, not a copy)."""
        return self.vocab

    def get_vocab_size(self):
        """Number of distinct tokens, including the registered special tokens."""
        return len(self.vocab)

    @property
    def vocab_size(self):
        """Same value as get_vocab_size(), exposed as a property."""
        return len(self.vocab)

### A.2. 读取 CSV，从第一列提取 genus 列表，去重

In [29]:
df = pd.read_csv(CSV_PATH)
print("CSV 维度:", df.shape)

# Select the taxonomy column: positional when TAXON_COLUMN is an int, by label otherwise.
taxon_series = df.iloc[:, TAXON_COLUMN] if isinstance(TAXON_COLUMN, int) else df[TAXON_COLUMN]
print("taxonomy 列非空行数:", taxon_series.notna().sum())

# Convert every taxonomy string to its genus token, dropping rows that yield None.
genus_tokens = [
    token
    for token in map(extract_genus_like_microcorpus, taxon_series)
    if token is not None
]
print(f"经过处理后得到的 genus token 数量（含重复）: {len(genus_tokens)}")

# Order-preserving de-duplication: dict keys keep first-seen insertion order.
uniq_non_special_toks = list(dict.fromkeys(genus_tokens))

print(f"去重后非特殊 token 数量: {len(uniq_non_special_toks)}")
print("前 10 个 genus token:", uniq_non_special_toks[:10])

CSV 维度: (1117, 90339)
taxonomy 列非空行数: 1117
经过处理后得到的 genus token 数量（含重复）: 1117
去重后非特殊 token 数量: 1117
前 10 个 genus token: ['g__Stenotrophomonas', 'g__Bacteriovorax', 'g__Idiomarina', 'g__Eubacterium', 'g__Methylobacillus', 'g__Larkinella', 'g__Fonticella', 'g__Klebsiella', 'g__Merdibacter', 'g__Fibrobacter']


### A.3. 构建 tokenizer 并保存为 MiCoGPTokenizer.pkl

In [30]:
# <pad> and <mask> lead the list, so they receive ids 0 and 1;
# <bos> / <eos> are registered by the tokenizer constructor itself.
SPECIAL_TOKENS = ['<pad>', '<mask>']
toks = [*SPECIAL_TOKENS, *uniq_non_special_toks]

print("总 token 数量（包含 <pad>, <mask>，后续还会注册 <bos>, <eos>）:", len(toks))

tokenizer = MiCoGPTokenizer(toks)

# NOTE(review): pickle stores classes by reference (module + name). A class
# defined in a notebook cell lives in __main__, so whoever loads this file
# presumably needs MiCoGPTokenizer importable under the same path -- confirm.
with open(TOKENIZER_PKL_PATH, "wb") as pkl_file:
    dump(tokenizer, pkl_file)

print("MiCoGPTokenizer 已保存到:", TOKENIZER_PKL_PATH)

总 token 数量（包含 <pad>, <mask>，后续还会注册 <bos>, <eos>）: 1119
MiCoGPTokenizer 已保存到: ../MiCoGPT/resources/MiCoGPTokenizer.pkl


### A.4. 加载并检查 MiCoGPTokenizer.pkl

In [32]:
# Round-trip check: read back the file written in the previous step.
# pickle.load can execute arbitrary code, so only use it on files you produced yourself.
with open(TOKENIZER_PKL_PATH, "rb") as pkl_file:
    tokenizer_loaded = load(pkl_file)

print("载入后的 vocab_size:", tokenizer_loaded.vocab_size)
print("pad_token:", tokenizer_loaded.pad_token, "id =", tokenizer_loaded.pad_token_id)
print("bos_token:", tokenizer_loaded.bos_token, "id =", tokenizer_loaded.bos_token_id)
print("eos_token:", tokenizer_loaded.eos_token, "id =", tokenizer_loaded.eos_token_id)

# Show the first ten vocabulary entries in dict (insertion) order.
print("\n前 10 个 token -> id 映射：")
vocab_head = list(tokenizer_loaded.get_vocab().items())[:10]
for i, (tok, idx) in enumerate(vocab_head):
    print(f"{i:2d}: token={tok!r}, id={idx}")

载入后的 vocab_size: 1121
pad_token: <pad> id = 0
bos_token: <bos> id = 1119
eos_token: <eos> id = 1120

前 10 个 token -> id 映射：
 0: token='<pad>', id=0
 1: token='<mask>', id=1
 2: token='g__Stenotrophomonas', id=2
 3: token='g__Bacteriovorax', id=3
 4: token='g__Idiomarina', id=4
 5: token='g__Eubacterium', id=5
 6: token='g__Methylobacillus', id=6
 7: token='g__Larkinella', id=7
 8: token='g__Fonticella', id=8
 9: token='g__Klebsiella', id=9


## B. 生成 phylogeny.csv

In [5]:
# Load the abundance table with the first column (taxonomy strings) as the row index.
# NOTE(review): presumably each remaining column is one sample -- confirm.
print(f"读取数据: {CSV_PATH}")
df = pd.read_csv(CSV_PATH, sep=',', index_col=0)

# Map every index entry to its genus token.
df.index = df.index.map(extract_genus_like_microcorpus)

# Several taxonomy strings can collapse onto the same genus token. The tokenizer
# vocabulary is de-duplicated, so the statistics table needs one row per genus
# too: merge duplicates by summing their abundances. sort=False keeps the
# first-seen row order. When there are no duplicates the frame is left untouched.
# NOTE: groupby drops rows whose index is missing (None / NaN) by default.
if df.index.has_duplicates:
    n_rows_before = len(df)
    df = df.groupby(level=0, sort=False).sum()
    print(f"[warning] 映射后存在重复的属名，已按属名合并求和: {n_rows_before} 行 -> {len(df)} 行")

# Relative abundance: divide every column by its own total.
df_rel = df.div(df.sum(axis=0), axis=1)

# Floor substituted for a zero standard deviation so later divisions by std stay finite.
STD_FLOOR = 1e-9

# Per-row mean and std across columns.
stats_df = pd.DataFrame({
    'mean': df_rel.mean(axis=1),
    'std':  df_rel.std(axis=1).replace(0, STD_FLOOR)
})

stats_df.index.name = '#SampleID'  # header label kept for format compatibility

stats_df.to_csv(OUTPUT_PHYLO_FILE)

print(f"已生成: {OUTPUT_PHYLO_FILE}")
print(stats_df.head())

读取数据: ../data/try2_withCC/abundance_all_90338.csv
已生成: ../MiCoGPT/resources/phylogeny.csv
                         mean       std
#SampleID                              
g__Stenotrophomonas  0.008533  0.070172
g__Bacteriovorax     0.000018  0.000271
g__Idiomarina        0.000002  0.000132
g__Eubacterium       0.000002  0.000168
g__Methylobacillus   0.000023  0.000711
