## 0. 导入依赖 & 基础配置 (vCross)

In [None]:
import pickle, pandas
import numpy as np
from MiCoGPT.utils.tools import extract_taxon
from MiCoGPT.utils.tokenizer import MiCoGPTokenizer

# Input abundance table; the taxon-extraction cell loads it with pandas.read_csv.
CSV_PATH = "../data/abundance_all_90338.csv"
# Output file that the tokenizer-building cell pickles the vCross tokenizer into.
TOKENIZER_PKL_PATH = "../MiCoGPT/resources/MiCoGPTokenizer_vCross.pkl"

## A. 提取物种 Token (Taxon)
从 CSV 第一列（完整的分类名称）中提取 Genus 级别的名称，跳过无法提取的行，并按首次出现的顺序去重。这些 Genus token 是我们词表的核心内容。

In [3]:
# Load the abundance table; this cell only consumes its first column.
df = pandas.read_csv(CSV_PATH)
print("CSV shape:", df.shape)

# First CSV column holds the names that extract_taxon is applied to.
full_taxon_name = df.iloc[:, 0]

# One Genus-level lookup per row, dropping rows for which extract_taxon
# returns None.
genus_per_row = [extract_taxon(name, "Genus") for name in full_taxon_name]
genus_hits = [genus for genus in genus_per_row if genus is not None]

# Number of rows that produced a token (duplicates included).
taxon_tokens_total = len(genus_hits)
# dict.fromkeys drops duplicates while keeping first-seen order, so the
# resulting list has the same order an explicit "seen" set would give.
uniq_taxon_tokens = list(dict.fromkeys(genus_hits))
seen_taxon_tokens = set(uniq_taxon_tokens)

print(f"Total taxon tokens: {taxon_tokens_total}，Unique taxon tokens: {len(uniq_taxon_tokens)}")
print("First 5 unique taxon tokens:", uniq_taxon_tokens[:5])

CSV shape: (1117, 90339)
Total taxon tokens: 1117，Unique taxon tokens: 1117
First 5 unique taxon tokens: ['g__Stenotrophomonas', 'g__Bacteriovorax', 'g__Idiomarina', 'g__Eubacterium', 'g__Methylobacillus']


## B. 构建并保存 Tokenizer vCross

In [4]:
# Control tokens are listed ahead of the taxon tokens; in the recorded run
# they come back for ids 0-3 from convert_ids_to_tokens.
special_tokens = ['<pad>', '<mask>', '<bos>', '<eos>']

# Full vocabulary: control tokens first, then the unique Genus tokens in
# their first-seen order.
all_tokens = [*special_tokens, *uniq_taxon_tokens]
tokenizer = MiCoGPTokenizer(all_tokens)

# Persist the tokenizer object itself so it can be unpickled later.
with open(TOKENIZER_PKL_PATH, "wb") as pkl_file:
    pickle.dump(tokenizer, pkl_file)

# Summary: where it was saved, the vocab size, and the tokens for ids 0-5.
vocab_size = len(tokenizer.vocab)
sample_tokens = tokenizer.convert_ids_to_tokens([0, 1, 2, 3, 4, 5])
print(f"MiCoGPTokenizer vCross saved: {TOKENIZER_PKL_PATH}")
print(f"Final vocab size: {vocab_size}")
print("Sample tokens:", sample_tokens)

MiCoGPTokenizer vCross saved: ../MiCoGPT/resources/MiCoGPTokenizer_vCross.pkl
Final vocab size: 1121
Sample tokens: ['<pad>', '<mask>', '<bos>', '<eos>', 'g__Stenotrophomonas', 'g__Bacteriovorax']
