## 0. 导入依赖（保持和原 predict 一致）

In [4]:
import os
import pandas as pd

import torch
from pickle import load

from transformers import Trainer, GPT2ForSequenceClassification

import sys
sys.path.append(os.path.abspath(".."))  # make the parent directory importable (the MiCoGPT root when run from notebooks/)
from MiCoGPT.utils.mgm_CLI_utils import find_pkg_resource
from MiCoGPT.utils.corpus import (
    MicroCorpus,
    MiCoGPTokenizer,
    SequenceClassificationDataset,
)
from MiCoGPT.utils.mgm_utils import eval_and_save


# Used to emulate the CLI's parsed args inside the notebook
from argparse import Namespace
from configparser import ConfigParser

## 1. 在 Notebook 里手动设置 cfg 和 args（代替命令行）

In [8]:
# Optional INI configuration. ConfigParser.read() silently ignores a missing
# file, so this line can stay as-is or be pointed at a different path.
cfg = ConfigParser()
cfg.read("config.ini")

# Stand-in for the object argparse would produce on the command line,
# built from a plain mapping of option name -> value.
args = Namespace(
    **{
        "input": "../data/try2_withCC/abundance_B_13901.pkl",
        "labels": "../data/try2_withCC/flag_B_13901.csv",
        "model": "../models/finetuned_model_A_subset_55575",
        "output": "../outputs/predict",
        "evaluate": True,
    }
)

# Echo the namespace as the cell output.
args

Namespace(input='../data/try2_withCC/abundance_B_13901.pkl', labels='../data/try2_withCC/flag_B_13901.csv', model='../models/finetuned_model_A_subset_55575', output='../outputs/predict', evaluate=True)

## 2. 加载 MicroCorpus 和 tokenizer

In [6]:
# Unpickle the corpus stored at args.input. The context manager closes the
# file handle as soon as loading finishes instead of leaving it open.
# NOTE(review): unpickling can execute arbitrary code; only load corpus files
# that come from a trusted source.
with open(args.input, "rb") as corpus_file:
    corpus = load(corpus_file)

# The tokenizer is stored on the corpus object.
tokenizer = corpus.tokenizer

# Quick sanity check: number of samples and the first rows of the table.
print("样本数量:", len(corpus))
display(corpus.data.head())

样本数量: 13901


#SampleID,g__Stenotrophomonas,g__Bacteriovorax,g__Idiomarina,g__Eubacterium,g__Methylobacillus,g__Larkinella,g__Fonticella,g__Klebsiella,g__Merdibacter,g__Fibrobacter,...,g__Chujaibacter,g__Papillibacter,g__Tannerellaceae,g__Sporichthya,g__Sphingosinicella,g__Salinivibrio,g__Aquaspirillum,g__Methylibium,g__Austwickia,g__Oceanotoga
DRR452457,-0.121598,-0.066191,-0.014152,-0.014765,-0.03292,-0.01434,-0.018529,-0.047352,-0.031506,-0.016866,...,-0.015618,-0.021123,-0.014295,-0.017187,-0.048861,-0.043702,-0.021391,-0.032084,-0.023196,-0.020422
DRR452458,-0.039842,-0.066191,-0.014152,-0.014765,-0.03292,-0.01434,-0.018529,-0.047352,-0.031506,-0.016866,...,-0.015618,-0.021123,-0.014295,-0.017187,-0.048861,-0.043702,-0.021391,-0.032084,-0.023196,-0.020422
DRR452459,-0.121598,-0.066191,-0.014152,-0.014765,-0.03292,-0.01434,-0.018529,-0.047352,-0.031506,-0.016866,...,-0.015618,-0.021123,-0.014295,-0.017187,-0.048861,-0.043702,-0.021391,-0.032084,-0.023196,-0.020422
DRR452460,-0.107449,-0.066191,-0.014152,-0.014765,-0.03292,-0.01434,-0.018529,-0.047352,-0.031506,-0.016866,...,-0.015618,-0.021123,-0.014295,-0.017187,-0.048861,-0.043702,-0.021391,-0.032084,-0.023196,-0.020422
DRR452461,-0.121598,-0.066191,-0.014152,-0.014765,-0.03292,-0.01434,-0.018529,-0.047352,-0.031506,-0.016866,...,-0.015618,-0.021123,-0.014295,-0.017187,-0.048861,-0.043702,-0.021391,-0.032084,-0.023196,-0.020422


## 3. 根据是否 evaluate 处理 label，并构建 Dataset

In [9]:
# Build the dataset passed to the Trainer: real class indices when evaluating,
# placeholder labels when only predicting.
if args.evaluate and args.labels is None:
    # Evaluation needs ground-truth labels.
    raise ValueError("Please provide labels for evaluation.")

# Both modes need the label encoder stored in the model directory, so load it
# once here. The context manager closes the file handle after loading.
# NOTE(review): unpickling can execute arbitrary code; only load encoders
# that come from a trusted source.
with open(os.path.join(args.model, "label_encoder.pkl"), "rb") as encoder_file:
    le = load(encoder_file)

if args.evaluate:
    # Read the label table; its first column is used as the sample-ID index.
    labels = pd.read_csv(args.labels, index_col=0)

    # Warn when the two tables do not contain the same set of sample IDs.
    if set(corpus.data.index) != set(labels.index):
        print(
            "Warning: the sample IDs in the abundance table and the metadata table are not the same.\n"
            "The samples in the metadata table but not in the abundance table will be removed.\n"
            "This may happened because some samples may have all zero counts during the preprocessing "
            "of the abundance table."
        )

    # Every corpus sample needs a label. Fail with a readable message rather
    # than the opaque KeyError the .loc lookup below would raise.
    missing_ids = corpus.data.index[~corpus.data.index.isin(labels.index)]
    if len(missing_ids) > 0:
        raise KeyError(
            f"{len(missing_ids)} sample(s) in the abundance table have no entry "
            f"in the label table, e.g. {list(missing_ids[:5])}"
        )

    # Reorder the labels to follow the corpus order (extra label rows are dropped).
    labels = labels.loc[corpus.data.index]

    # Encode the labels, densify the result, then take the argmax of each row
    # to obtain an integer class index per sample.
    labels_array = le.transform(labels.values.reshape(-1, 1)).toarray()
    labels_tensor = torch.tensor(labels_array.argmax(axis=1))
    dataset_labels = labels_tensor
else:
    # Prediction only, no evaluation.
    print(
        "Only predict the labels, no evaluation. "
        "Please pay attention to the threshold for manual evaluation."
    )

    # Placeholder labels: all zeros, one per corpus sample.
    dummy_labels = [0] * len(corpus)
    dataset_labels = dummy_labels

# Index the corpus once and reuse the result for both model inputs.
encoded = corpus[:]
dataset = SequenceClassificationDataset(
    encoded["input_ids"],
    encoded["attention_mask"],
    dataset_labels,
)

print("Dataset 大小:", len(dataset))
print("类别数:", len(le.categories_[0]))


FileNotFoundError: [Errno 2] No such file or directory: '../models/finetuned_model_A_subset_55575/label_encoder.pkl'

## 4. 加载模型并构建 Trainer

In [10]:
# Size the classification head to the number of classes in the label encoder.
num_labels = len(le.categories_[0])

# Restore the weights from the model directory and switch to evaluation mode;
# .eval() returns the module itself, so it can be chained.
model = GPT2ForSequenceClassification.from_pretrained(args.model, num_labels=num_labels).eval()

# Wrap the model in a Trainer with default settings.
trainer = Trainer(model=model)

# Echo the model architecture as the cell output.
model

NameError: name 'le' is not defined

## 5. 运行预测、保存 y_score、可选 evaluation

In [None]:
# Run prediction, save the score matrix (y_score.csv) and, when requested,
# the evaluation results.

# Run the model over the whole dataset.
predictions = trainer.predict(dataset)

# Create the output directory if needed. exist_ok=True already tolerates an
# existing directory, so no separate existence check is required.
os.makedirs(args.output, exist_ok=True)

# Model outputs returned by the Trainer; written below with one row per
# corpus sample and one column per class.
y_score = predictions.predictions

# Class names from the label encoder, reused for the CSV header and evaluation.
class_names = le.categories_[0]

# Save the scores as CSV, indexed by the corpus sample IDs.
score_path = os.path.join(args.output, "y_score.csv")
pd.DataFrame(
    y_score,
    index=corpus.data.index,
    columns=class_names,
).to_csv(score_path)

print("y_score 已保存到:", score_path)

# Compute and save the evaluation only when it was requested.
if args.evaluate:
    y_true = predictions.label_ids  # labels returned by the Trainer
    eval_dir = os.path.join(args.output, "evaluation")

    eval_and_save(
        y_score,
        y_true,
        class_names,
        eval_dir,
    )

    print("evaluation 结果已保存到:", eval_dir)
