## 0. 导入依赖 & 基础配置

In [1]:
import pickle, pandas
from MiCoGPT.utils.tools import extract_taxon
from MiCoGPT.utils.tokenizer import MiCoGPTokenizer

# Path to the taxon abundance table CSV; its first column holds the prefixed taxon names.
CSV_PATH = "../data/try2_withCC/abundance_all_90338.csv"
# Path where the pickled MiCoGPTokenizer object is written.
TOKENIZER_PKL_PATH = "../MiCoGPT/resources/MiCoGPTokenizer.pkl"

## A. 生成 MiCoGPTokenizer.pkl
### A.1. 读取 CSV，从第一列提取 genus 列表并去重

In [2]:
# Load the abundance table and report its dimensions.
df = pandas.read_csv(CSV_PATH)
print("CSV shape:", df.shape)

# The first CSV column holds the full taxon names to be parsed.
full_taxon_name = df.iloc[:, 0]

# Extract the genus-level token for every name; names for which
# extract_taxon returns None are dropped.
genus_per_row = (extract_taxon(name, "Genus") for name in full_taxon_name)
genus_tokens = [genus for genus in genus_per_row if genus is not None]

taxon_tokens_total = len(genus_tokens)
# dict.fromkeys removes duplicates while keeping first-occurrence order.
uniq_taxon_tokens = list(dict.fromkeys(genus_tokens))
seen_taxon_tokens = set(uniq_taxon_tokens)

print(f"Total taxon tokens: {taxon_tokens_total}，Unique taxon tokens: {len(uniq_taxon_tokens)}")
print("First 5 unique taxon tokens:", uniq_taxon_tokens[:5])


CSV shape: (1117, 90339)
Total taxon tokens: 1117，Unique taxon tokens: 1117
First 5 unique taxon tokens: ['g__Stenotrophomonas', 'g__Bacteriovorax', 'g__Idiomarina', 'g__Eubacterium', 'g__Methylobacillus']


### A.2. 构建 tokenizer 并保存为 MiCoGPTokenizer.pkl

In [3]:
# Special tokens come first in the vocabulary, followed by the unique
# genus tokens in their first-seen order.
special_tokens = ['<pad>', '<mask>', '<bos>', '<eos>']
all_tokens = [*special_tokens, *uniq_taxon_tokens]

tokenizer = MiCoGPTokenizer(all_tokens)

# Serialize the tokenizer object to disk with pickle.
with open(TOKENIZER_PKL_PATH, mode="wb") as pkl_file:
    pickle.dump(tokenizer, pkl_file)

print("MiCoGPTokenizer saved:", TOKENIZER_PKL_PATH)

MiCoGPTokenizer saved: ../MiCoGPT/resources/MiCoGPTokenizer.pkl
