## 0. 导入依赖

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import random_split

import sys
sys.path.append(os.path.abspath(".."))  # 从 notebooks/ 回到外面的 MiCoGPT 根目录
from MiCoGPT.utils.corpus import (
    SequenceClassificationDataset,
)

from pickle import load, dump
from sklearn.preprocessing import OneHotEncoder
from transformers import (
    GPT2ForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_callback import EarlyStoppingCallback

from configparser import ConfigParser
from argparse import Namespace

## 1. 加载 / 设置配置（cfg）

In [None]:
# Location of the project configuration file, relative to the notebooks/ directory.
cfg_path = "../MiCoGPT/resources/mgm_config.ini"

cfg = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so a wrong path would otherwise only surface
# later as an opaque KeyError on cfg["finetune"].
if not cfg.read(cfg_path):
    raise FileNotFoundError(f"Config file not found or unreadable: {cfg_path}")
if not cfg.has_section("finetune"):
    raise KeyError(f"Section [finetune] is missing from {cfg_path}")

# Echo the fine-tuning hyperparameters so the run documents its own settings.
print("[finetune] section:")
for k, v in cfg["finetune"].items():
    print(f"{k} = {v}")

[finetune] section:
learning_rate = 1e-3
warmup_steps = 100
weight_decay = 0.001
per_device_train_batch_size = 64
num_train_epochs = 1000
logging_steps = 5


## 2. 设置参数 args

In [None]:
# Paths and split settings for this fine-tuning run (all relative to notebooks/).
input_corpus_path = "../data/try2_withCC/ResMicroDB_90338.pkl"
pretrained_model_path = "../models/pretrain_ResMicroDB_90338"
output_model_dir = "../models/finetuned_model_ResMicroDB_90338"
log_dir = "../logs/finetuned_ResMicroDB_90338"
val_split = 0.2  # fraction of samples held out for validation

# NOTE(review): no labels path is configured here, yet section 3 reads
# `args.labels`. The previously used file was
# "../data/try2_withCC/flag_A_subgroup_55575.csv"; confirm which labels CSV
# matches this corpus and add it as `labels=...` before running the notebook.
run_settings = {
    "input": input_corpus_path,
    "model": pretrained_model_path,
    "output": output_model_dir,
    "log": log_dir,
    "val_split": val_split,
}
args = Namespace(**run_settings)

args

Namespace(input='../data/try2_withCC/abundance_A_subgroup_55575.pkl', labels='../data/try2_withCC/flag_A_subgroup_55575.csv', model='../models/pretrain_A_74557', output='../models/finetuned_model_A_subset_55575', log='../logs', val_split=0.2)

## 3. 加载语料与标签，检查样本 ID 一致性

In [None]:
# Load the pickled corpus. The context manager closes the file handle, which
# the bare `load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code; only load trusted corpus files.
with open(args.input, "rb") as corpus_file:
    corpus = load(corpus_file)
tokenizer = corpus.tokenizer

# Fail with an actionable message when no labels path has been configured,
# instead of an AttributeError on `args.labels`.
labels_path = getattr(args, "labels", None)
if labels_path is None:
    raise ValueError(
        "args.labels is not set; add the path of the labels CSV "
        "(indexed by sample ID) to the args Namespace before running this cell."
    )
labels = pd.read_csv(labels_path, index_col=0)

# Sample ID alignment check.
sample_ids = pd.Index(corpus.data.index)

# Every corpus sample needs a label; report the gap explicitly rather than
# letting the `.loc` lookup below fail on it.
unlabeled_ids = sample_ids.difference(labels.index)
if len(unlabeled_ids) > 0:
    raise KeyError(
        f"{len(unlabeled_ids)} corpus samples have no row in {labels_path}, "
        f"e.g. {list(unlabeled_ids[:5])}"
    )

# Extra rows in the labels table are tolerated and dropped by the reindex below.
if len(labels.index.difference(sample_ids)) > 0:
    print(
        "Warning: the sample IDs in the abundance table and the metadata table are not the same. "
        "The samples in the metadata table but not in the abundance table will be removed. "
        "This may happened because some samples were removed or had all zero counts during the preprocessing of the abundance table."
    )

# Keep only the corpus samples, in corpus order.
labels = labels.loc[sample_ids]

print("corpus size:", len(corpus))
print("labels size:", len(labels))
print("First few labels:\n", labels.head())

corpus size: 55575
labels size: 55575
First few labels:
            Is_Healthy
CRR768228       False
CRR768229       False
CRR768230       False
CRR768231       False
CRR768232       False


## 4. 标签编码（OneHotEncoder → class index）

In [23]:
# Label encoding, kept faithful to the original implementation: one-hot encode
# the label values, then collapse each one-hot row back to its class index.
le = OneHotEncoder()
label_column = labels.values.reshape(-1, 1)  # single column of raw label values
labels_arr = le.fit_transform(label_column).toarray()
class_indices = labels_arr.argmax(axis=1)
labels_tensor = torch.tensor(class_indices)

# Only one column was passed to the encoder, so its first category list
# enumerates all classes.
num_labels = len(le.categories_[0])
print("num_labels:", num_labels)

num_labels: 2


## 5. 打包成 Dataset（SequenceClassificationDataset）

In [24]:
# Mirrors the SequenceClassificationDataset wrapping used in the original code.
# Slice the corpus once and reuse the result: the original evaluated
# `corpus[:]` twice, repeating the work and taking input_ids and
# attention_mask from two separate passes.
encoded = corpus[:]
input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]

# Each sample must have exactly one label; a mismatch means the labels were
# built from a different sample set than the corpus.
if len(input_ids) != len(labels_tensor):
    raise ValueError(
        f"Sample/label count mismatch: {len(input_ids)} sequences "
        f"vs {len(labels_tensor)} labels"
    )

dataset = SequenceClassificationDataset(
    input_ids,
    attention_mask,
    labels_tensor,
)

# Display the dataset size and the first item as a sanity check.
len(dataset), dataset[0]

  tokens = self.tokens[index].clone()
  def __getitem__(self, idx):
  return {
  "input_ids": torch.tensor(self.seq[idx]),


(55575,
 {'input_ids': tensor([1119,  710,  397,  474,  675,  368,  191,  701,   20,  632,  138,  357,
           343,  490,  466,  307,  102,  403,   61,  791,  625,  672,  127,  156,
           267,  168,  203,  746,   60,  450,  749,   10,  380,  384,  178,  269,
           591,  524,  538,  220,  500,  136,  719,  768,  786,   48,  405,  145,
           301,  296,   63,  562,  270,  531,  104,  426,  717,  642,  130,  311,
            56,  299,  517,  142,  568,  595,  144,  371,  681,  155,  381,  383,
            41,  328,  437,  309,  321,  794,  670,  506,  648,  360,  470,  736,
           443,  495,  565,  364,  448,  576,  124,  604,   79,  223,  482,  282,
           365,  205,  339,  480,  527,    9,  501,  588,  489,    2,  430,  782,
           358,  756,  486,  751,  572,  345,  342,  278,  526,  505,  333,   90,
           460,  599,   36,  372,  512,  396,  618,  305,  359,   84,  481, 1120,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  

## 6. 加载预训练模型（GPT2ForSequenceClassification）

In [25]:
# Load the pretrained checkpoint as a GPT-2 sequence classifier whose output
# size equals the number of label classes found above.
model = GPT2ForSequenceClassification.from_pretrained(args.model, num_labels=num_labels)

# Show the module tree for inspection.
model

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ../models/pretrain_A_74557 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(1121, 256)
    (wpe): Embedding(512, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=256, out_features=2, bias=False)
)

## 7. 构造 TrainingArguments（从 cfg 读取超参）

In [26]:
# Name of the config section that holds the fine-tuning hyperparameters.
finetune_section = "finetune"

training_args_dict = dict(
    # Run mode
    do_train=True,
    do_eval=True,
    disable_tqdm=False,
    # Optimisation hyperparameters, read from the config file
    learning_rate=cfg.getfloat(finetune_section, "learning_rate"),
    lr_scheduler_type="linear",
    warmup_steps=cfg.getint(finetune_section, "warmup_steps"),
    weight_decay=cfg.getfloat(finetune_section, "weight_decay"),
    per_device_train_batch_size=cfg.getint(finetune_section, "per_device_train_batch_size"),
    num_train_epochs=cfg.getint(finetune_section, "num_train_epochs"),
    # Batching
    group_by_length=True,
    length_column_name="length",
    # Checkpointing and evaluation both happen once per epoch
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # Logging and output locations
    logging_steps=cfg.getint(finetune_section, "logging_steps"),
    output_dir=f"{args.log}/finetune_checkpoints",
    logging_dir=args.log,
)

training_args = TrainingArguments(**training_args_dict)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=True,
half_precision_backend=auto,
hub_always_push=False,
hub_mode

## 8. 划分训练 / 验证集 + 构建 Trainer

In [27]:
print("Start training...")
model = model.train()

split = args.val_split

# Fixed seed for the train/validation split so that reruns (and any later
# evaluation of saved checkpoints) see the same held-out samples.
SPLIT_SEED = 42

# Size the split from the dataset that is actually being split; random_split
# requires the two sizes to sum to len(dataset).
n_samples = len(dataset)
train_size = int(n_samples * (1 - split))
val_size = n_samples - train_size  # remainder, so the sizes always sum to n_samples
train_set, val_set = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print(f"train_size = {train_size}, val_size = {val_size}")

# Stop when the evaluation metric has not improved for 10 consecutive evaluations.
callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    callbacks=callbacks,
)

trainer

Start training...
train_size = 44460, val_size = 11115


<transformers.trainer.Trainer at 0x172d9d630>

## 9. 开始训练 + 保存模型和 label encoder

In [None]:
trainer.train()

# Save the fine-tuned model.
os.makedirs(args.output, exist_ok=True)
trainer.save_model(args.output)

# Save the fitted label encoder next to the model so predictions can be mapped
# back to the original label values. The context manager guarantees the file
# is flushed and closed; the bare `dump(le, open(...))` form never closed it.
encoder_path = os.path.join(args.output, "label_encoder.pkl")
with open(encoder_path, "wb") as encoder_file:
    dump(le, encoder_file)
print(f"Model and label encoder saved to: {args.output}")

## 10. 保存训练日志

In [None]:
# Make sure the log directory exists and decide where the CSV goes.
os.makedirs(args.log, exist_ok=True)
log_path = os.path.join(args.log, "finetune_log.csv")

# Turn the entries recorded by the Trainer into a table and write it out.
logs = trainer.state.log_history
logs_df = pd.DataFrame(logs)
logs_df.to_csv(log_path, index=False)

print(f"Training logs saved to: {log_path}")
# Show the most recent log entries.
logs_df.tail()