## 0. 导入依赖 & 基础配置

In [1]:
import pickle, pandas
from MiCoGPT.utils.tools import extract_taxon
from MiCoGPT.utils.tokenizer import MiCoGPTokenizer

# Path to the taxon abundance (composition) table CSV; its first column holds
# the taxon names, which carry rank prefixes.
CSV_PATH = "../data/try2_withCC/abundance_all_90338.csv"
# Destination file for the pickled MiCoGPTokenizer built in section A.
TOKENIZER_PKL_PATH = "../MiCoGPT/resources/MiCoGPTokenizer.pkl"

## A. 生成 MiCoGPTokenizer.pkl
### A.1. 读取 CSV，从第一列提取 genus 列表并去重

In [2]:
# Load the abundance table; only its first column is processed below.
df = pandas.read_csv(CSV_PATH)
print("CSV shape:", df.shape)

full_tax_name = df.iloc[:, 0]

# Ask extract_taxon for the "Genus" entry of every name in the first column,
# and drop the names for which it returns None.
genus_tokens = [
    token
    for token in (extract_taxon(name, "Genus") for name in full_tax_name)
    if token is not None
]
tax_tokens_total = len(genus_tokens)

# dict.fromkeys keeps only the first occurrence of each key and preserves
# insertion order, so this de-duplicates while keeping first-seen order.
uniq_tax_tokens = list(dict.fromkeys(genus_tokens))
seen_tax_tokens = set(uniq_tax_tokens)

print(f"Total tokens: {tax_tokens_total}，Unique tokens: {len(uniq_tax_tokens)}")
print("First 5 unique tokens:", uniq_tax_tokens[:5])

CSV shape: (1117, 90339)
Total tokens: 1117，Unique tokens: 1117
First 5 unique tokens: ['g__Stenotrophomonas', 'g__Bacteriovorax', 'g__Idiomarina', 'g__Eubacterium', 'g__Methylobacillus']


### A.2. 构建 tokenizer 并保存为 MiCoGPTokenizer.pkl

In [3]:
# The vocabulary starts with the two special tokens, followed by the unique
# taxon tokens in the order they were collected.
special_tokens = ['<pad>', '<mask>']
all_tokens = [*special_tokens, *uniq_tax_tokens]

tokenizer = MiCoGPTokenizer(all_tokens)

# Serialize the tokenizer object to TOKENIZER_PKL_PATH with pickle.
with open(TOKENIZER_PKL_PATH, "wb") as pkl_file:
    pickle.dump(tokenizer, pkl_file)

print("MiCoGPTokenizer saved:", TOKENIZER_PKL_PATH)

MiCoGPTokenizer saved: ../MiCoGPT/resources/MiCoGPTokenizer.pkl
