# MiCoGPT v2.0 Fine-tuning (Sequence Classification)

本 Notebook 演示如何加载预训练好的 `MiCoGPT v2.0` 模型 (vCross 版)，并在下游任务 (例如疾病预测/二分类) 上进行微调。

**主要流程：**
1. 加载 `vCross` 版本的多模态语料库。
2. 筛选目标子集 (例如 Split_Group=A) 并准备分类标签。
3. 加载预训练模型，并转换为分类模型 (`MiCoGPTForSequenceClassification`)。
4. 使用自定义的 `MiCoGPTClassificationCollator` 处理多模态输入和分类标签。
5. 训练并评估。

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from pickle import load as pkl_load
from argparse import Namespace
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from torch.utils.data import Subset
from transformers import Trainer, TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback

# --- v2.0 自定义模块 ---
from MiCoGPT.utils_vCross.model_vCross import MiCoGPTConfig, MiCoGPTForSequenceClassification
from MiCoGPT.utils_vCross.collator_vCross import MiCoGPTClassificationCollator

# --- 复用工具函数 ---
from MiCoGPT.utils.finetune import prepare_labels_for_subset, split_train_val_by_project_stratified_with_labels
from MiCoGPT.utils.finetune_v2 import SubsetWithLabels

import warnings
warnings.filterwarnings("ignore")

## 1. 配置参数 (Configuration)

设置输入数据路径、预训练模型路径和微调参数。

In [None]:
# Every path and task setting used by this notebook, collected in one place
# and exposed as attributes of `args`.
_finetune_settings = {
    # Input corpus (vCross format)
    "input": "../data/vCross/ResMicroDB_90338_vCross.pkl",
    # Pretrained model path (loaded from a checkpoint)
    "pretrained_model": "../models/pretrain_vCross_base",
    # Output locations for the fine-tuned model and its logs
    "output": "../models/finetuned_vCross_base",
    "log": "../logs/finetuned_vCross_base",
    # Task settings: label column to predict, the Split_Group used for
    # fine-tuning, and the fraction of samples held out for validation
    "label_col": "Is_Healthy",
    "split_group": "A",
    "val_ratio": 0.2,
}
args = Namespace(**_finetune_settings)

print("Args:", args)

## 2. 加载语料库 (Load Corpus)

读取 `MiCoGPTCorpus_vCross` 对象，包含多模态数据 (Token IDs, Value IDs, Condition IDs) 和元数据 (Metadata)。

In [None]:
# Unpickle the corpus object stored at args.input, then report its size and
# the vocabulary size of the tokenizer attached to it.
# NOTE(review): unpickling can execute arbitrary code, so only load corpus
# files that come from a trusted source.
print(f"Loading corpus from {args.input} ...")
with open(args.input, "rb") as corpus_file:
    corpus = pkl_load(corpus_file)

print(f"Loaded corpus with {len(corpus)} samples.")
vocab_size = corpus.tokenizer.vocab_size
print("Tokenizer vocab size:", vocab_size)

## 3. 数据准备 (Data Preparation)

1. **筛选子集**: 选择 `Split_Group == A` 且标签存在的样本。
2. **生成标签**: 将文本标签 (如 True/False) 转换为数字 ID。
3. **划分数据集**: 以 Project (`Project_ID` 列，对应一个 Study) 为单位、结合标签做分层划分，得到训练集/验证集，防止同一 Project 的样本同时落入两侧而造成数据泄漏。

In [None]:
# Step 1: keep only the rows that belong to the configured Split_Group and
# have a non-missing value in the label column.
meta = corpus.metadata
in_target_group = meta["Split_Group"] == args.split_group
has_label = meta[args.label_col].notna()
valid_mask = in_target_group & has_label
finetune_indices = np.where(valid_mask)[0]
finetune_subset = Subset(corpus, finetune_indices)

print(f"Filtered subset size: {len(finetune_subset)}")

# Step 2: encode the labels with prepare_labels_for_subset
# (from MiCoGPT.utils.finetune). Per the original note, the returned
# all_labels array is aligned with the full corpus and holds -1 at
# positions outside the subset.
label_outputs = prepare_labels_for_subset(
    all_corpus=corpus,
    subset=finetune_subset,
    label_col=args.label_col,
    verbose=True,
)
labels_tensor, all_labels_array, le, num_labels = label_outputs

# Step 3: split the fine-tuning subset into training and validation parts.
train_subset, val_subset = split_train_val_by_project_stratified_with_labels(
    finetune_subset,
    project_col="Project_ID",
    label_col=args.label_col,
    val_ratio=args.val_ratio,
)

print(f"Train size: {len(train_subset)}")
print(f"Val size: {len(val_subset)}")

In [None]:
# 4. Wrap a subset together with its labels as a Dataset
def create_dataset_with_labels(subset, all_labels_array):
    """Pair ``subset`` with the class ids selected from ``all_labels_array``.

    Args:
        subset: Object exposing ``indices``. These are used directly to index
            ``all_labels_array`` (the original note states they are global
            corpus positions -- TODO confirm against the split helper).
        all_labels_array: Label ids indexable by those positions. Positions
            without a label are expected to hold the placeholder ``-1``.

    Returns:
        ``SubsetWithLabels(subset, labels)`` where ``labels`` is a
        ``torch.long`` tensor with one entry per element of ``subset.indices``.

    Raises:
        ValueError: If any selected label equals the ``-1`` placeholder,
            i.e. an index points at a row that was never labelled.
    """
    unlabeled = -1  # placeholder written for rows outside the labelled subset

    # subset.indices are treated as positions into the corpus-wide label array
    selected = np.asarray(all_labels_array[subset.indices])

    # Fail here with a readable message instead of much later inside the loss,
    # where an out-of-range class id produces an opaque error.
    n_unlabeled = int((selected == unlabeled).sum())
    if n_unlabeled:
        raise ValueError(
            f"{n_unlabeled} of {selected.size} selected samples carry the "
            f"placeholder label {unlabeled}; subset.indices must point at "
            "labelled corpus rows."
        )

    subset_labels = torch.tensor(selected, dtype=torch.long)
    return SubsetWithLabels(subset, subset_labels)

# Attach labels to the training split first, then to the validation split.
train_ds, val_ds = (
    create_dataset_with_labels(split, all_labels_array)
    for split in (train_subset, val_subset)
)

# Sanity check: inspect the first training example
sample = train_ds[0]
print("Sample keys:", sample.keys())
print("Label:", sample["labels"])

## 4. 加载模型 (Load Model)

加载预训练的 `vCross` 模型，并实例化为 `MiCoGPTForSequenceClassification`。
注意：
- 我们需要指定 `num_labels`。
- 这里设置 `ignore_mismatched_sizes=True`：当检查点中某些权重的形状与分类模型不一致时，加载会跳过这些权重而不是报错。分类头 (Score Layer) 是新初始化的，与预训练的 LM Head 尺寸不同，需要在微调中训练。

In [None]:
# Read the configuration saved with the pretrained model and set the number
# of output classes to the value produced during label preparation.
config = MiCoGPTConfig.from_pretrained(args.pretrained_model)
config.num_labels = num_labels
print("Model Config:", config)

# Build the classification model from the pretrained weights.
load_options = {"config": config, "ignore_mismatched_sizes": True}
model = MiCoGPTForSequenceClassification.from_pretrained(
    args.pretrained_model,
    **load_options,
)

# Print the architecture to check that the multimodal embeddings and the
# classification head are present.
print(model)

## 5. 训练 (Training)

使用 HuggingFace Trainer 进行微调。
- 使用 `MiCoGPTClassificationCollator` 处理数据。
- 定义评估指标 (Accuracy 与加权 F1，即 `average='weighted'`)。

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is normally
            the logits array; if it is a tuple of arrays, the first element is
            taken as the logits.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (F1 averaged with
        ``average='weighted'``).
    """
    logits, labels = eval_pred

    # A model that returns extra outputs yields a tuple here; running argmax
    # over the whole tuple would not give class predictions.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Predicted class = index of the largest logit along the last axis
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average='weighted'),
    }

In [None]:
# Keyword arguments for TrainingArguments, grouped by purpose.
training_options = {
    # Where checkpoints are written
    "output_dir": args.output,
    # Epoch budget and batch sizes (the early-stopping callback below can end
    # the run before the budget is used up)
    "num_train_epochs": 100,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    # Optimisation
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint,
    # ranked by the "f1" metric, when training ends.
    # NOTE(review): newer transformers releases name the first option
    # `eval_strategy` -- confirm against the installed version.
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    # Logging
    "logging_dir": args.log,
    "logging_steps": 50,
    # NFS-friendly setting (per the original note)
    "dataloader_num_workers": 0,
    # Memory saver against OOM during evaluation, left disabled:
    #   "eval_accumulation_steps": 60,
}
training_args = TrainingArguments(**training_options)

# Custom classification collator, built on the corpus tokenizer
classification_collator = MiCoGPTClassificationCollator(corpus.tokenizer, max_length=512)
# Early stopping with a patience of 5
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=classification_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run the fine-tuning loop with the Trainer built in the previous cell
# (train_ds for training, val_ds for evaluation, early stopping attached).
trainer.train()

## 6. 保存与分析 (Save & Analyze)

保存微调后的模型、Tokenizer、Label Encoder、训练日志 (JSON) 以及 Loss/F1 训练曲线图。

In [None]:
# 1. Save the model and the tokenizer
trainer.save_model(args.output)
corpus.tokenizer.save_pretrained(args.output)

# Save the label encoder as well (key step): the reload and prediction cells
# read it back to recover the class names.
import joblib
joblib.dump(le, f"{args.output}/label_encoder.joblib")
print(f"Model and LabelEncoder saved to {args.output}")

# 2. Export the Trainer log history as JSON
log_history = trainer.state.log_history
with open(f"{args.output}/training_logs.json", "w", encoding="utf-8") as f:
    json.dump(log_history, f, indent=2)


# 3. Plot the curves
def _metric_series(records, key):
    """Return parallel (steps, values) lists for the records containing ``key``."""
    hits = [(record["step"], record[key]) for record in records if key in record]
    return [step for step, _ in hits], [value for _, value in hits]


# Each metric is paired with the steps of the records it was read from, so the
# x and y sequences always have the same length.
train_steps, train_loss = _metric_series(log_history, "loss")
eval_steps, eval_loss = _metric_series(log_history, "eval_loss")
f1_steps, eval_f1 = _metric_series(log_history, "eval_f1")

plt.figure(figsize=(12, 5))

# Loss curves
plt.subplot(1, 2, 1)
if train_steps:
    plt.plot(train_steps, train_loss, label="Train Loss", alpha=0.6)
if eval_steps:
    plt.plot(eval_steps, eval_loss, label="Val Loss", marker="o")
plt.title("Loss Curve")
if train_steps or eval_steps:
    # Only ask for a legend when at least one labelled line was drawn
    plt.legend()

# F1 curve
plt.subplot(1, 2, 2)
if eval_f1:
    plt.plot(f1_steps, eval_f1, label="Val F1", color="orange", marker="s")
    plt.legend()
plt.title("F1 Score Curve")

# Write the figure to disk before showing it
plt.savefig(f"{args.output}/training_curve.png")
plt.show()

## 7. (可选) 重载模型 (Reload Model)

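如果内核已重启 (内存已清空)，请先重新运行 import、配置参数 (`args`) 和加载语料库 (`corpus`) 的 Cell，然后运行此 Cell，从 `args.output` 重载模型和 Label Encoder 并重建 Trainer。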

In [None]:
# 0. (Optional) Reload the fine-tuned model, label encoder and Trainer

import joblib
from MiCoGPT.utils_vCross.model_vCross import MiCoGPTConfig, MiCoGPTForSequenceClassification

# Reload only when something the prediction cell needs is missing from the
# current session.
if 'trainer' not in locals() or 'model' not in locals() or 'le' not in locals():
    # The collator is built from the tokenizer attached to the corpus, so
    # check for the corpus first, before spending time loading the model.
    if 'corpus' not in locals():
        raise ValueError("Please run the 'Load Corpus' cell above to load the tokenizer first.")
    tokenizer = corpus.tokenizer

    print(f"Loading model from {args.output} ...")

    # 1. Load the label encoder written by the save cell
    le_path = f"{args.output}/label_encoder.joblib"
    if not os.path.exists(le_path):
        raise FileNotFoundError(f"Label Encoder not found at {le_path}. Please run training first.")
    le = joblib.load(le_path)
    print("Label Encoder loaded.")

    # 2. Load the model
    # The number of classes comes from whichever attribute the encoder exposes.
    if hasattr(le, "classes_"):
        num_labels = len(le.classes_)
    else:
        num_labels = len(le.categories_[0])

    config = MiCoGPTConfig.from_pretrained(args.output)
    # Report a saved model and encoder that disagree on the class count;
    # predictions would otherwise be mapped to the wrong label names.
    saved_num_labels = getattr(config, "num_labels", num_labels)
    if saved_num_labels != num_labels:
        print(
            f"Warning: saved config has {saved_num_labels} labels but the "
            f"Label Encoder has {num_labels} classes."
        )
    model = MiCoGPTForSequenceClassification.from_pretrained(args.output, config=config)

    # 3. Prepare the collator
    if 'MiCoGPTClassificationCollator' not in locals():
        from MiCoGPT.utils_vCross.collator_vCross import MiCoGPTClassificationCollator

    # Same max_length as the collator used for training, so that reloaded
    # predictions see inputs prepared the same way.
    data_collator = MiCoGPTClassificationCollator(tokenizer, max_length=512)

    # 4. Rebuild a Trainer for prediction
    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir=args.output, per_device_eval_batch_size=32),
        data_collator=data_collator
    )
    print("Model and Trainer restored.")
else:
    print("Model and Trainer already in memory. Skipping reload.")

## 8. 预测 (Prediction on Split B)

使用微调后的模型对测试集 (Split_Group='B') 进行预测，并调用 `eval_and_save` 计算多项指标。

In [None]:
from MiCoGPT.utils.mgm_utils import eval_and_save

# 1. Build the test set: rows with Split_Group "B" and a non-missing label
print("Preparing Test Set (Split_Group='B')...")
meta = corpus.metadata
test_mask = (meta["Split_Group"] == "B") & (meta[args.label_col].notna())
test_indices = np.where(test_mask)[0]
test_subset = Subset(corpus, test_indices)

print(f"Test subset size: {len(test_subset)}")

# 2. Encode the test labels with the encoder fitted during training.
# Passing `le` keeps the label-id mapping identical to the one used for
# fine-tuning.
test_label_kwargs = {
    "all_corpus": corpus,
    "subset": test_subset,
    "label_col": args.label_col,
    "encoder": le,
    "verbose": True,
}
test_labels_tensor, test_all_labels, _, _ = prepare_labels_for_subset(**test_label_kwargs)

# 3. Wrap the subset and its labels as a Dataset
test_ds = create_dataset_with_labels(test_subset, test_all_labels)

# 4. Predict
print("Running prediction...")
# Per the original note, a trainer that has just finished training holds the
# best checkpoint (load_best_model_at_end=True). A trainer rebuilt by the
# reload cell uses the weights saved under args.output.
predictions = trainer.predict(test_ds)
y_score, y_true = predictions.predictions, predictions.label_ids

# 5. Evaluate and save
# Class names: `categories_[0]` for a OneHotEncoder-style encoder, otherwise
# the stringified `classes_` of a LabelEncoder-style encoder.
label_names = (
    list(le.categories_[0])
    if hasattr(le, "categories_")
    else [str(c) for c in le.classes_]
)

save_dir = f"{args.output}/prediction_B"
os.makedirs(save_dir, exist_ok=True)

print(f"Saving results to {save_dir}...")
# The model outputs logits, so eval_and_save is asked to normalise them
# with softmax.
eval_and_save(
    y_score=y_score,
    y_true=y_true,
    label_names=label_names,
    save_dir=save_dir,
    activation="softmax",
)