## 0. 导入依赖

In [None]:
from configparser import ConfigParser          # 读取 config.ini
from pathlib import Path                       # 处理路径
from pickle import dump                        # 保存构建好的 MicroCorpus

import sys
import os
sys.path.append(os.path.abspath(".."))  # 从 notebooks/ 回到外面的 MiCoGPT 根目录
from MiCoGPT.utils.mgm_MicroCorpus import MicroCorpus   # 核心数据集类
from MiCoGPT.utils.mgm_utils import CustomUnpickler     # 自定义 Unpickler，用于反序列化 MicroTokenizer

## 1. 定义 construct_simple 函数

In [None]:
def construct_simple(
    input_path: str,       # path to the input abundance table (h5 / csv / tsv / txt)
    output_path: str,      # path of the MicroCorpus pickle to write
    config_path: str,      # path to config.ini
    tokenizer_path: str,   # path to MicroTokenizer.pkl
    key: str = "genus",
    no_normalize: bool = False,  # skip normalization when True
) -> "MicroCorpus":
    """Build a MicroCorpus from an abundance table and pickle it to disk.

    Steps: read ``max_len`` from the ``[construct]`` section of the config
    file, unpickle the tokenizer, build the corpus, write it to
    ``output_path`` and return it.

    Args:
        input_path: Abundance table file handed to ``MicroCorpus`` as
            ``data_path``.
        output_path: Destination pickle file. Missing parent directories
            are created.
        config_path: INI file providing ``[construct] max_len``.
        tokenizer_path: Pickled tokenizer, loaded with ``CustomUnpickler``.
        key: Forwarded unchanged to ``MicroCorpus(key=...)``.
        no_normalize: When True, ``MicroCorpus`` is built with
            ``preprocess=False``.

    Returns:
        The constructed ``MicroCorpus`` instance.

    Raises:
        FileNotFoundError: If the config file cannot be read, or the
            tokenizer file or the input file does not exist.
        configparser.Error: If the config lacks ``[construct] max_len``.
    """
    # ConfigParser.read() silently skips files it cannot open and returns
    # the list of files it actually parsed. Without this check a wrong
    # config path would surface later as a misleading NoSectionError.
    cfg = ConfigParser()
    if not cfg.read(config_path):
        raise FileNotFoundError(f"找不到或无法读取配置文件：{config_path}")
    max_len = cfg.getint("construct", "max_len")

    print(f"[construct] 从配置文件中读取到 max_len = {max_len}")

    # Validate both input files before doing any loading, so a typo in
    # either path fails immediately instead of after the tokenizer has
    # already been deserialized.
    tokenizer_path = Path(tokenizer_path)
    if not tokenizer_path.is_file():
        raise FileNotFoundError(f"找不到 tokenizer 文件：{tokenizer_path}")

    input_path = Path(input_path)
    if not input_path.is_file():
        raise FileNotFoundError(f"找不到输入丰度表文件：{input_path}")

    # SECURITY NOTE: unpickling executes code embedded in the file; only
    # load tokenizer pickles that come from a trusted source.
    with open(tokenizer_path, "rb") as f:
        # The project-specific CustomUnpickler is used (per the original
        # author's note) so the MicroTokenizer class can be located even
        # when the module layout differs between environments.
        unpickler = CustomUnpickler(f)
        tokenizer = unpickler.load()

    # Best-effort diagnostic only: a tokenizer without a sized ``vocab``
    # attribute must not abort the build.
    try:
        vocab_size = len(tokenizer.vocab)
        print(f"[construct] tokenizer 加载完成，词表大小 = {vocab_size}")
    except Exception:
        print("[construct] tokenizer 已加载（未访问 vocab，可能实现不同）")

    # Tell the user that normalization is about to be applied.
    if not no_normalize:
        print(
            "Your data will be normalized with the phylogeny mean and std. "
            "If you wish to use your own normalization, please set no_normalize=True."
        )

    # Build the MicroCorpus.
    corpus = MicroCorpus(
        data_path=str(input_path),
        tokenizer=tokenizer,
        key=key,
        max_len=max_len,
        preprocess=not no_normalize,
    )

    print("[construct] MicroCorpus 已构建完成。")
    try:
        print(f"[construct] 语料库样本数量 = {len(corpus)}")
    except TypeError:
        print("[construct] corpus 不支持 len()，可以自行检查内部实现。")

    # Create the destination directory first so the freshly built corpus
    # is not lost to a missing-folder error at save time.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "wb") as f:
        dump(corpus, f)

    print(f"[construct] 语料库已保存到：{output_path.resolve()}")

    # Return the corpus so it can be inspected further in the notebook.
    return corpus

## 2. 执行

In [3]:
# Project root (absolute path on the author's machine; adjust as needed).
project_root = Path("/Users/kiancai/STA24/CWD/STAi/MiCoGPT")

# Every path used below is derived from the project root.
data_dir = project_root / "data"
resources_dir = project_root / "MiCoGPT" / "resources"

input_path = data_dir / "try2_withCC/abundance_C_1880.csv"
output_path = data_dir / "try2_withCC/abundance_C_1880.pkl"
config_path = resources_dir / "mgm_config.ini"
tokenizer_path = resources_dir / "mgm_MicroTokenizer.pkl"

# Echo the resolved locations so a wrong root is spotted before the build.
for _label, _location in (
    ("[path] input_path    =", input_path),
    ("[path] output_path   =", output_path),
    ("[path] config_path   =", config_path),
    ("[path] tokenizer_path=", tokenizer_path),
):
    print(_label, _location)

# construct_simple takes plain strings, so convert the Path objects here.
corpus = construct_simple(
    input_path=str(input_path),
    output_path=str(output_path),
    config_path=str(config_path),
    tokenizer_path=str(tokenizer_path),
    key="genus",
    no_normalize=False,
)

[path] input_path    = /Users/kiancai/STA24/CWD/STAi/MiCoGPT/data/try2_withCC/abundance_C_1880.csv
[path] output_path   = /Users/kiancai/STA24/CWD/STAi/MiCoGPT/data/try2_withCC/abundance_C_1880.pkl
[path] config_path   = /Users/kiancai/STA24/CWD/STAi/MiCoGPT/MiCoGPT/resources/mgm_config.ini
[path] tokenizer_path= /Users/kiancai/STA24/CWD/STAi/MiCoGPT/MiCoGPT/resources/mgm_MicroTokenizer.pkl
[construct] 从配置文件中读取到 max_len = 512
[construct] tokenizer 加载完成，词表大小 = 9669
Your data will be normalized with the phylogeny mean and std. If you wish to use your own normalization, please set no_normalize=True.
0 samples are dropped for all zeroes


100%|██████████| 1880/1880 [00:00<00:00, 6819.99it/s]


Total 1880 samples.
            Max length is 470.
            Average length is 59.44042553191489.
            Min length is 7.
[construct] MicroCorpus 已构建完成。
[construct] 语料库样本数量 = 1880
[construct] 语料库已保存到：/Users/kiancai/STA24/CWD/STAi/ResMicroDb/data/try2_withCC/abundance_C_1880.pkl


## 3. 检查生成的结果结构

In [4]:
def _preview_field(shape_label, head_label, fallback_msg, values):
    """Print the shape and the first 20 entries of one sample field.

    Does nothing when ``values`` is None. If slicing or ``.tolist()`` fails,
    ``fallback_msg`` is printed instead of the values.
    """
    if values is None:
        return
    print(shape_label, getattr(values, "shape", "无 shape 属性"))
    try:
        print(head_label, values[:20].tolist())
    except Exception:
        print(fallback_msg)


print("corpus 类型:", type(corpus))

# 1. Corpus size; treat the corpus as empty when len() is unsupported.
try:
    num_samples = len(corpus)
    print("样本数（len(corpus)）:", num_samples)
except Exception as err:
    print("无法对 corpus 调用 len()，错误:", err)
    num_samples = 0

# 2. Look at what the first sample contains.
if num_samples <= 0:
    print("corpus 为空，没有样本可以查看。")
else:
    sample = corpus[0]
    print("\n第一个样本类型:", type(sample))

    if not isinstance(sample, dict):
        print("样本不是 dict，直接打印样本内容：")
        print(sample)
    else:
        print("样本包含的键:", sample.keys())
        # Show shape and leading values of input_ids / attention_mask.
        input_ids = sample.get("input_ids")
        attention_mask = sample.get("attention_mask")

        _preview_field(
            "input_ids 形状:",
            "前 20 个 token id:",
            "无法直接打印 input_ids[:20].tolist()，可以手动 print(input_ids) 看看",
            input_ids,
        )
        _preview_field(
            "attention_mask 形状:",
            "前 20 个 attention_mask:",
            "无法直接打印 attention_mask[:20].tolist()，可以手动 print(attention_mask) 看看",
            attention_mask,
        )


corpus 类型: <class 'MiCoGPT.utils.mgm_MicroCorpus.MicroCorpus'>
样本数（len(corpus)）: 1880

第一个样本类型: <class 'dict'>
样本包含的键: dict_keys(['input_ids', 'attention_mask'])
input_ids 形状: torch.Size([512])
前 20 个 token id: [2, 7858, 8081, 6053, 8597, 5318, 5456, 662, 109, 9494, 8050, 6814, 765, 2590, 6748, 228, 1298, 1070, 5464, 280]
attention_mask 形状: torch.Size([512])
前 20 个 attention_mask: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


  return {'input_ids': torch.tensor(tokens),


In [5]:
import pickle
from pathlib import Path

from MiCoGPT.utils.mgm_utils import CustomUnpickler

# Location of the pickled tokenizer; change this to match your own checkout.
pkl_path = Path("/Users/kiancai/STA24/CWD/STAi/MiCoGPT/MiCoGPT/resources/mgm_MicroTokenizer.pkl")

# Deserialize with the project's CustomUnpickler rather than plain pickle.load.
with pkl_path.open("rb") as f:
    unpickler = CustomUnpickler(f)
    tokenizer = unpickler.load()

print(type(tokenizer))
print(dir(tokenizer))                     # available attributes / methods
print(getattr(tokenizer, "vocab", None))  # the vocabulary, if the attribute exists


<class 'MiCoGPT.utils.mgm_MicroCorpus.MicroTokenizer'>
{'<pad>': 0, '<mask>': 1, '<bos>': 2, '<eos>': 3, 'g__Abarenicola': 4, 'g__Abedinium': 5, 'g__Abeoforma': 6, 'g__Aberranta': 7, 'g__Abies': 8, 'g__Abiotrophia': 9, 'g__Ablabesmyia': 10, 'g__Abolboda': 11, 'g__Abollifer': 12, 'g__Abraliopsis': 13, 'g__Absidia': 14, 'g__Abyssicoccus': 15, 'g__Abyssivirga': 16, 'g__Acanthamoeba': 17, 'g__Acanthephyra': 18, 'g__Acanthiella': 19, 'g__Acanthobdella': 20, 'g__Acanthobothrium': 21, 'g__Acanthocephalus': 22, 'g__Acanthoceras': 23, 'g__Acanthochiasma': 24, 'g__Acanthocolla': 25, 'g__Acanthocorbis': 26, 'g__Acanthocyclops': 27, 'g__Acanthocyrta': 28, 'g__Acanthodasys': 29, 'g__Acanthoeca': 30, 'g__Acantholeberis': 31, 'g__Acantholichen': 32, 'g__Acanthomacrostomum': 33, 'g__Acanthometra': 34, 'g__Acanthometron': 35, 'g__Acanthopharynx': 36, 'g__Acanthopleuribacter': 37, 'g__Acanthoptilum': 38, 'g__Acanthostaurus': 39, 'g__Acanthostigma': 40, 'g__Acaricomes': 41, 'g__Acaromyces': 42, 'g__Acaro