## 0. 导入依赖

In [1]:
import os
import pandas as pd
import numpy as np
import torch

from pickle import load, dump
from sklearn.preprocessing import OneHotEncoder
from transformers import (
    GPT2ForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_callback import EarlyStoppingCallback

from configparser import ConfigParser
from argparse import Namespace

## 1. 加载 / 设置配置（cfg）

In [2]:
# Location of the project INI file; its [finetune] section supplies the
# training hyper-parameters read further down in the notebook.
cfg_path = "../MiCoGPT/resources/mgm_config.ini"

cfg = ConfigParser()
# ConfigParser.read() does not raise for a missing/unreadable file: it returns
# the list of files it actually parsed. Fail loudly here instead of hitting an
# opaque KeyError on cfg["finetune"] a few lines later.
if not cfg.read(cfg_path):
    raise FileNotFoundError(f"Config file not found or unreadable: {cfg_path}")
if not cfg.has_section("finetune"):
    raise KeyError(f"Config file {cfg_path} has no [finetune] section")

# Echo the hyper-parameters so the run is self-documenting.
print("[finetune] section:")
for k, v in cfg["finetune"].items():
    print(f"{k} = {v}")

# Run-specific inputs / outputs (paths are relative to the notebook directory).
input_corpus_path = "../data/try2_withCC/ResMicroDB_90338.pkl"
pretrained_model_path = "../models/pretrain_ResMicroDB_90338_BERT"
output_model_dir  = "../models/finetuned_model_ResMicroDB_90338_BERT"
log_dir           = "../logs/finetuned_ResMicroDB_90338_BERT"
val_split         = 0.2         # fraction of samples targeted for the validation set

# Bundle the settings in a Namespace so later cells can use args.<name>.
args = Namespace(
    input=input_corpus_path,
    model=pretrained_model_path,
    output=output_model_dir,
    log=log_dir,
    val_split=val_split,
)

args


[finetune] section:
learning_rate = 1e-3
warmup_steps = 100
weight_decay = 0.001
per_device_train_batch_size = 64
num_train_epochs = 1000
logging_steps = 5


Namespace(input='../data/try2_withCC/ResMicroDB_90338.pkl', model='../models/pretrain_ResMicroDB_90338_BERT', output='../models/finetuned_model_ResMicroDB_90338_BERT', log='../logs/finetuned_ResMicroDB_90338_BERT', val_split=0.2)

In [3]:
from pickle import load
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder

# 1. Load the full corpus and pick out the subset used for fine-tuning.
# NOTE(review): pickle.load can execute arbitrary code; only load corpus files
# that come from a trusted source.
with open(args.input, "rb") as corpus_file:  # context manager closes the handle
    all_corpus = load(corpus_file)
tokenizer = all_corpus.tokenizer


def _finetune_row_mask(df):
    """Boolean mask of metadata rows used for fine-tuning.

    Selects rows with Split_Group == "A" and a non-missing Is_Healthy value.
    Defined once so the subset and the label lookup cannot drift apart.
    """
    return (df["Split_Group"] == "A") & df["Is_Healthy"].notna()


# Subset: Split_Group == "A" and Is_Healthy is not NA.
# (presumably a torch Subset-like object exposing `.indices` -- it is used that way below)
corpus = all_corpus.subset_by_metadata(_finetune_row_mask)

print("Total samples in all_corpus:", len(all_corpus))
print("Samples used for finetune:", len(corpus))

meta = all_corpus.metadata
mask = _finetune_row_mask(meta)
print("Label distribution (A group, non-NA):")
print(meta.loc[mask, "Is_Healthy"].value_counts())

# 2. Extract the labels from the metadata of the full corpus.
labels_series = meta.loc[mask, "Is_Healthy"]   # indexed by sample id

# 3. Align the labels with the order of the fine-tuning subset.
indices = np.array(corpus.indices)   # row numbers inside all_corpus
sample_ids = np.array(all_corpus.sample_ids)[indices]

# Look the labels up by sample id so their order matches corpus.__getitem__.
labels = labels_series.loc[sample_ids]
labels = labels.to_frame(name="Is_Healthy")

print("corpus size:", len(corpus))
print("labels size:", len(labels))
print("First few labels:\n", labels.head())

# A duplicated sample id in the metadata index would make .loc return extra
# rows; catch that here rather than as a shape error further down.
assert len(labels) == len(corpus), "labels are not aligned 1:1 with the finetune subset"

# 4. Label encoding (keeps the original OneHotEncoder flow; `le` is reused later
#    in the notebook). argmax over the one-hot rows yields the class index.
le = OneHotEncoder()
labels_arr = le.fit_transform(labels.values.reshape(-1, 1)).toarray()
labels_tensor = torch.tensor(labels_arr.argmax(axis=1), dtype=torch.long)

num_labels = len(le.categories_[0])
print("num_labels:", num_labels)

# 5. Build a "global" label array aligned with all_corpus; rows outside the
#    fine-tuning subset keep the sentinel -1.
all_labels = np.full(len(all_corpus), fill_value=-1, dtype=int)
# corpus.indices addresses the fine-tuning subset, whose labels are labels_tensor.
all_labels[indices] = labels_tensor.numpy()

# Sanity check: no sentinel may remain at the fine-tuning positions.
assert (all_labels[indices] != -1).all()


Total samples in all_corpus: 90338
Samples used for finetune: 55575
Label distribution (A group, non-NA):
Is_Healthy
False    31073
True     24502
Name: count, dtype: int64
corpus size: 55575
labels size: 55575
First few labels:
           Is_Healthy
Run                 
CRR768228      False
CRR768229      False
CRR768230      False
CRR768231      False
CRR768232      False
num_labels: 2


In [4]:
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per label class.
model = GPT2ForSequenceClassification.from_pretrained(args.model, num_labels=num_labels)

# Trainer settings. Tunable hyper-parameters are read from the [finetune]
# section of the INI config; everything else is fixed for this notebook.
training_args_dict = dict(
    # --- values taken from the config file ---
    learning_rate=cfg.getfloat("finetune", "learning_rate"),
    warmup_steps=cfg.getint("finetune", "warmup_steps"),
    weight_decay=cfg.getfloat("finetune", "weight_decay"),
    per_device_train_batch_size=cfg.getint("finetune", "per_device_train_batch_size"),
    num_train_epochs=cfg.getint("finetune", "num_train_epochs"),
    logging_steps=cfg.getint("finetune", "logging_steps"),
    # --- run mode / schedule ---
    do_train=True,
    do_eval=True,
    lr_scheduler_type="linear",
    disable_tqdm=False,
    # --- batching ---
    # NOTE(review): confirm the "length" column is actually consumed for the
    # dataset type handed to Trainer in this notebook.
    group_by_length=True,
    length_column_name="length",
    # --- evaluate and checkpoint once per epoch, then restore the best checkpoint ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- output locations ---
    output_dir=f"{args.log}/finetune_checkpoints",
    logging_dir=args.log,
)

training_args = TrainingArguments(**training_args_dict)
training_args


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ../models/pretrain_ResMicroDB_90338_BERT and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=True,
half_precision_backend=auto,
hub_always_push=False,
hub_mode

In [5]:
import numpy as np
from torch.utils.data import Subset

def split_train_val_by_project(dataset, val_ratio=0.1, project_col="Project_ID", random_state=42):
    """
    Split a corpus (or a Subset of it) into train / validation sets by project.

    Whole projects are moved into the validation set, in a seeded random order,
    until the validation set holds at least ``int(n_samples * val_ratio)``
    samples. A project therefore never appears in both splits, and the
    validation size is only approximately ``val_ratio`` of the input.
    Samples whose project id is missing always stay in the training set.

    Parameters
    ----------
    dataset : corpus object or torch.utils.data.Subset
        Either the base corpus (must expose a ``metadata`` DataFrame and
        ``__len__``) or a Subset of it. Nested Subsets are unwrapped down to
        the base corpus.
        NOTE(review): metadata rows are addressed positionally (``.iloc``), so
        this assumes metadata row i describes corpus item i -- confirm.
    val_ratio : float
        Target fraction of samples for validation; must be within [0, 1].
    project_col : str
        Name of the metadata column holding the project id.
    random_state : int
        Seed for the project shuffling.

    Returns
    -------
    (train_set, val_set) : tuple of torch.utils.data.Subset
        Both are Subsets of the *base* corpus, indexed by base-corpus row number.

    Raises
    ------
    ValueError
        If ``val_ratio`` is outside [0, 1], the metadata is None, or
        ``project_col`` is not a metadata column.
    """
    import warnings  # local import keeps this cell self-contained

    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio!r}")

    # 1. Peel off every Subset wrapper (outermost first) to reach the base corpus.
    index_chain = []
    base_corpus = dataset
    while isinstance(base_corpus, Subset):
        # Force an integer dtype so an empty index list stays usable for indexing.
        index_chain.append(np.asarray(base_corpus.indices, dtype=np.int64))
        base_corpus = base_corpus.dataset

    if index_chain:
        # Compose the index maps from the innermost wrapper outwards so that
        # base_indices[i] is the base-corpus row of dataset[i].
        base_indices = index_chain[-1]
        for outer_indices in reversed(index_chain[:-1]):
            base_indices = base_indices[outer_indices]
    else:
        base_indices = np.arange(len(base_corpus))       # every row of the corpus

    # 2. Fetch the metadata and validate it.
    meta_full = base_corpus.metadata
    if meta_full is None:
        raise ValueError("base_corpus.metadata 为空，无法按 Project_ID 划分。")

    if project_col not in meta_full.columns:
        raise ValueError(f"metadata 中没有列 '{project_col}'，请检查列名。")

    # Only the project ids of the samples currently in `dataset` matter.
    project_ids = meta_full[project_col].iloc[base_indices].to_numpy()
    n_samples = len(base_indices)
    target_val = int(n_samples * val_ratio)

    # 3. Unique project ids, ignoring missing values.
    unique_projects = np.unique(project_ids[pd.notna(project_ids)])

    # 4. Shuffle the project order reproducibly.
    rng = np.random.default_rng(random_state)
    rng.shuffle(unique_projects)

    # 5. Sample-level flag: is_val[i] is True when sample i goes to validation.
    is_val = np.zeros(n_samples, dtype=bool)
    val_projects = []
    val_count = 0

    for pid in unique_projects:
        if val_count >= target_val:
            break

        # Local (within `dataset`) positions of this project's samples.
        proj_mask = (project_ids == pid)
        proj_size = proj_mask.sum()
        if proj_size == 0:
            continue  # should not happen; purely defensive

        # Move the whole project into the validation set.
        is_val |= proj_mask
        val_projects.append(pid)
        val_count += proj_size

    # 6. Map the local flags back to base-corpus row numbers.
    val_base_indices = base_indices[is_val]
    train_base_indices = base_indices[~is_val]

    if n_samples > 0 and val_ratio < 1.0 and len(train_base_indices) == 0:
        # Possible when a few large projects dominate the data.
        warnings.warn(
            "split_train_val_by_project: every sample ended up in the validation "
            "set; the training set is empty.",
            RuntimeWarning,
            stacklevel=2,
        )

    # 7. Build the final subsets on top of the base corpus.
    train_set = Subset(base_corpus, train_base_indices.tolist())
    val_set   = Subset(base_corpus, val_base_indices.tolist())

    # 8. Report the actual sizes (always equal to len(val_set) / len(train_set)).
    print(
        f"[split_by_project] 选中 {len(val_projects)} 个 project 作为验证集，"
        f"验证样本数 {len(val_set)}，目标约 {target_val}，当前 dataset 样本数 {n_samples}"
    )
    print(f"[split_by_project] Train samples: {len(train_set)}, Val samples: {len(val_set)}")

    return train_set, val_set



In [6]:
from torch.utils.data import Dataset

class FinetuneDataset(Dataset):
    """Labelled view over selected rows of a base corpus.

    Item ``i`` is ``base_corpus[indices[i]]`` extended with:

    * ``labels`` -- ``labels_array[i]`` as a scalar ``torch.long`` tensor;
    * ``length`` -- number of non-padding tokens (int).

    Parameters
    ----------
    base_corpus : indexable
        ``base_corpus[j]`` must return a mapping that contains ``input_ids``
        and, normally, ``attention_mask`` (both tensors).
    indices : sequence of int
        Rows of ``base_corpus`` exposed by this dataset, in order.
    labels_array : sequence of int
        One integer class label per entry of ``indices``.
    pad_token_id : int, default 0
        Padding id used to count tokens when an item has no ``attention_mask``.
    """

    def __init__(self, base_corpus, indices, labels_array, pad_token_id=0):
        self.base_corpus = base_corpus
        self.indices = np.array(indices)
        self.labels = np.array(labels_array, dtype=int)
        self.pad_token_id = pad_token_id

        assert len(self.indices) == len(self.labels), \
            "indices 和 labels_array 的长度必须一致"

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        base_idx = int(self.indices[idx])
        # Shallow-copy so the extra keys never leak into an object the base
        # corpus may be caching or sharing ({'input_ids', 'attention_mask'}).
        item = dict(self.base_corpus[base_idx])

        # Attach the label.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)

        # Provide a "length" entry to go with group_by_length /
        # length_column_name="length" in TrainingArguments.
        # NOTE(review): confirm Trainer reads this key for a plain torch Dataset.
        if "attention_mask" in item:
            # attention_mask is a 0/1 tensor, so its sum is the token count.
            item["length"] = int(item["attention_mask"].sum().item())
        else:
            # Fallback if the base corpus stops returning attention_mask:
            # count the tokens that are not padding.
            item["length"] = int((item["input_ids"] != self.pad_token_id).sum().item())

        return item



In [7]:
print("Start training...")
model = model.train()

split = args.val_split  # e.g. 0.2

# 1. Split the fine-tuning subset (corpus) into train / val, grouped by Project_ID.
train_subset, val_subset = split_train_val_by_project(
    corpus,
    val_ratio=split,
    project_col="Project_ID",    # change here if the metadata column is named differently
    random_state=42,
)

print(f"Train samples (subset): {len(train_subset)}, Val samples (subset): {len(val_subset)}")

# 2. Global indices of both splits (positions inside all_corpus).
train_idx = np.array(train_subset.indices)
val_idx   = np.array(val_subset.indices)

# 3. Look the labels up in the global label array.
train_labels = all_labels[train_idx]
val_labels   = all_labels[val_idx]

# Sanity check: the -1 sentinel must not appear in either split.
assert (train_labels != -1).all()
assert (val_labels != -1).all()

print("Train labels value counts:", pd.Series(train_labels).value_counts())
print("Val labels value counts:", pd.Series(val_labels).value_counts())

# 4. Build the labelled datasets that are handed to the Trainer.
train_dataset = FinetuneDataset(
    base_corpus=all_corpus,
    indices=train_idx,
    labels_array=train_labels,
)
val_dataset = FinetuneDataset(
    base_corpus=all_corpus,
    indices=val_idx,
    labels_array=val_labels,
)

print("Train dataset size:", len(train_dataset))
print("Val dataset size:", len(val_dataset))

# 5. Trainer with early stopping (patience of 10 evaluations).
callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=callbacks,
)

trainer.train()

# Save the model.
os.makedirs(args.output, exist_ok=True)
trainer.save_model(args.output)

# Save the label encoder. The context manager guarantees the pickle is flushed
# and the handle closed even if dump() raises.
with open(os.path.join(args.output, "label_encoder.pkl"), "wb") as encoder_file:
    dump(le, encoder_file)
print(f"Model and label encoder saved to: {args.output}")

# Save the training log history as CSV.
logs = trainer.state.log_history
logs_df = pd.DataFrame(logs)

os.makedirs(args.log, exist_ok=True)
log_path = os.path.join(args.log, "finetune_log.csv")
logs_df.to_csv(log_path, index=False)

print(f"Training logs saved to: {log_path}")
logs_df.tail()


Start training...
[split_by_project] 选中 47 个 project 作为验证集，验证样本数 11272，目标约 11115，当前 dataset 样本数 55575
[split_by_project] Train samples: 44303, Val samples: 11272
Train samples (subset): 44303, Val samples (subset): 11272
Train labels value counts: 0    25302
1    19001
Name: count, dtype: int64
Val labels value counts: 0    5771
1    5501
Name: count, dtype: int64
Train dataset size: 44303
Val dataset size: 11272


  return {'input_ids': torch.tensor(tokens),


Epoch,Training Loss,Validation Loss
1,0.1974,0.821553
2,0.1457,0.931236
3,0.1455,1.005533
4,0.1606,1.060238
5,0.1182,1.078619
6,0.1304,1.227279
7,0.1082,1.629533
8,0.0921,1.255509
9,0.0992,1.384293
10,0.1326,0.969523


  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),


Model and label encoder saved to: ../models/finetuned_model_ResMicroDB_90338_BERT
Training logs saved to: ../logs/finetuned_ResMicroDB_90338_BERT/finetune_log.csv


Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
1531,0.075,0.000989,10.98,7610,,,,,,,,,
1532,0.1101,0.000989,10.99,7615,,,,,,,,,
1533,0.126,0.000989,11.0,7620,,,,,,,,,
1534,,,11.0,7623,1.090912,19.5115,577.709,72.214,,,,,
1535,,,11.0,7623,,,,,2445.0091,18119.769,283.435,9460248000000000.0,0.11982
