## 0. 导入依赖

In [1]:
import os
import pandas as pd
import numpy as np
from pickle import load, dump
from transformers import (
    Trainer,
    TrainingArguments,
)
from importlib.resources import files
from transformers.trainer_callback import EarlyStoppingCallback
from MiCoGPT.utils.finetune import split_train_val_by_project_stratified_with_labels, prepare_labels_for_subset, load_gpt2_cls_manual, print_gated_stats, FinetuneDataset
from configparser import ConfigParser
from argparse import Namespace

## 1. 加载 / 设置配置（cfg）

In [None]:
# Parse the project's INI file and echo its [finetune] section, then collect the
# run-specific paths into an argparse-style Namespace used by the later cells.
cfg_path = "../MiCoGPT/resources/config.ini"

cfg = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that result so a wrong path is reported
# here, instead of surfacing later as a bare KeyError on cfg["finetune"].
if not cfg.read(cfg_path):
    raise FileNotFoundError(f"Config file not found or unreadable: {cfg_path}")
if not cfg.has_section("finetune"):
    raise KeyError(f"Missing [finetune] section in config file: {cfg_path}")

print("[finetune] section:")
for k, v in cfg["finetune"].items():
    print(f"{k} = {v}")

# NOTE(review): re-run this cell after editing any of the paths below; a saved
# output that shows different paths than the code means the kernel state is stale.
input_corpus_path = "../data/try2_withCC/ResMicroDB_90338.pkl"
pretrained_model_path = "../models/pretrain_ResMicroDB_90338_GATED_high_init_high_scale"
output_model_dir  = "../models/finetuned_ResMicroDB_90338_GATED_high_init_high_scale"
log_dir           = "../logs/finetuned_ResMicroDB_90338_GATED_high_init_high_scale"
val_split         = 0.2         # fraction of samples held out for validation

args = Namespace(
    input=input_corpus_path,
    model=pretrained_model_path,
    output=output_model_dir,
    log=log_dir,
    val_split=val_split,
)

# Last expression: display the resolved settings as the cell output.
args


[finetune] section:
learning_rate = 1e-4
warmup_steps = 100
weight_decay = 0.001
per_device_train_batch_size = 64
num_train_epochs = 1000
logging_steps = 5


Namespace(input='../data/try2_withCC/ResMicroDB_90338.pkl', model='../models/pretrain_ResMicroDB_90338_GATED_base', output='../models/finetuned_ResMicroDB_90338_GATED_base', log='../logs/finetuned_ResMicroDB_90338_GATED_base', val_split=0.2)

## 加载语料

In [3]:
# Load the pickled corpus object and take the tokenizer it carries.
# NOTE(review): pickle can execute arbitrary code while deserializing; only
# point args.input at corpus files from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the previous bare open() call never closed it).
with open(args.input, "rb") as corpus_file:
    all_corpus = load(corpus_file)
tokenizer = all_corpus.tokenizer

# Samples used for fine-tuning (here: Split_Group == "A" with a non-null Is_Healthy).
finetune_subset = all_corpus.subset_by_metadata(
    lambda df: (df["Split_Group"] == "A") & df["Is_Healthy"].notna()
)

# Quick sanity report: corpus size, subset size, and samples per split group.
print("Number of samples in all_corpus:", len(all_corpus))
print("Number of samples in finetune_subset:", len(finetune_subset))
print(all_corpus.metadata["Split_Group"].value_counts())

Number of samples in all_corpus: 90338
Number of samples in finetune_subset: 55575
Split_Group
A    74557
B    13901
C     1880
Name: count, dtype: int64


## 生成标签

In [4]:
# Build the classification labels for the fine-tuning subset from the
# "Is_Healthy" metadata column. The helper hands back four values, unpacked
# below: a label tensor, a label array, the fitted label encoder, and the
# number of classes.
# NOTE(review): all_labels appears to be indexed by corpus position with -1
# marking unlabelled samples (the training cell indexes it by subset indices
# and asserts != -1) -- confirm against prepare_labels_for_subset.
label_outputs = prepare_labels_for_subset(
    all_corpus=all_corpus,
    subset=finetune_subset,
    label_col="Is_Healthy",
    verbose=True,
)
labels_tensor, all_labels, le, num_labels = label_outputs

[labels] subset size=55575
[labels] num_labels=2
[labels] distribution:
0    31073
1    24502
Name: count, dtype: int64


In [5]:
# Embedding mode passed to the model loader: either "vanilla" or "gated".
MODE = "gated"

# Genus-embedding archive bundled as a resource of the MiCoGPT package.
npz_path = files("MiCoGPT") / "resources" / "genus_embeddings_256.npz"

# The tokenizer and the prior file are only handed to the loader in gated mode;
# every other mode receives None for both.
use_gated_prior = MODE == "gated"

model, device = load_gpt2_cls_manual(
    model_dir=args.model,
    num_labels=num_labels,
    mode=MODE,
    tokenizer=tokenizer if use_gated_prior else None,
    npz_path=npz_path if use_gated_prior else None,
    g_min=0.0,   # NOTE(review): presumably the lower bound of the gate -- confirm in load_gpt2_cls_manual
    init_w=0.1,  # NOTE(review): presumably the initial gate weight -- confirm in load_gpt2_cls_manual
)
model.train()  # switch the module to training mode before handing it to the Trainer

if use_gated_prior:
    print_gated_stats(model, tokenizer=tokenizer, npz_path=npz_path)

# Last expression: display the module structure as the cell output.
model


[prior] npz genus: 1117
[prior] prior unique token_id: 1117
[prior] missing genus: 0
[prior] applied AUTO scale s=0.2939 to prior_matrix (p50 align)
[load:gated] missing_keys=1, unexpected_keys=1
missing keys: ['score.weight']
unexpected keys: ['lm_head.weight']
[gated-stats] vocab=1121, prior_nonzero_rows=1117, g_min=0.0
[gated-stats] base_norm  p10/p50/p90 = 2.1692 / 2.5882 / 2.9523
[gated-stats] prior_norm p10/p50/p90 = 0.2478 / 0.3209 / 0.4808
[gated-stats] current p50-align ratio (base/prior) = 8.0644  (≈1 means already aligned)
[gated-stats] gate_logits mean/min/max = -2.0937 / -2.9698 / -0.7861
[gated-stats] w_all      mean/min/max = 0.1126 / 0.0488 / 0.3130
[gated-stats] w_all      p10/p50/p90 = 0.0799 / 0.1087 / 0.1501
[gated-stats] w_genus    mean/min/max = 0.1127 / 0.0488 / 0.3130
[gated-stats] w_genus    p10/p50/p90 = 0.0799 / 0.1088 / 0.1502
[gated-stats] est scale vs raw-npz (median of norm ratios) = 0.2956
[gated-stats] scale ratio p10/p50/p90 = 0.2956 / 0.2956 / 0.2956


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): GatedPriorEmbedding(
      (base): Embedding(1121, 256)
    )
    (wpe): Embedding(512, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=256, out_features=2, bias=False)
)

In [6]:
# Assemble the Hugging Face TrainingArguments. Tunable hyperparameters come from
# the [finetune] section of the parsed config; everything else is fixed here.
FINETUNE_SECTION = "finetune"

training_args_dict = dict(
    learning_rate=cfg.getfloat(FINETUNE_SECTION, "learning_rate"),
    do_train=True,
    do_eval=True,
    group_by_length=False,  # "length_column_name" is intentionally left unset
    disable_tqdm=False,
    lr_scheduler_type="linear",
    warmup_steps=cfg.getint(FINETUNE_SECTION, "warmup_steps"),
    weight_decay=cfg.getfloat(FINETUNE_SECTION, "weight_decay"),
    per_device_train_batch_size=cfg.getint(FINETUNE_SECTION, "per_device_train_batch_size"),
    num_train_epochs=cfg.getint(FINETUNE_SECTION, "num_train_epochs"),
    # Save and evaluate on the same schedule so the best checkpoint can be
    # restored at the end of training (load_best_model_at_end below).
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_steps=cfg.getint(FINETUNE_SECTION, "logging_steps"),
    output_dir=f"{args.log}/finetune_checkpoints",
    logging_dir=args.log,
    load_best_model_at_end=True,
    # "Best" means lowest validation loss.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

training_args = TrainingArguments(**training_args_dict)

# Last expression: display the fully resolved arguments as the cell output.
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_mod

## 划分训练集和验证集

In [7]:
# Split the fine-tuning subset into train / validation parts with the
# project-stratified, label-aware helper. Options are gathered in one place so
# they are easy to tweak; random_state is fixed for a reproducible split.
split_options = dict(
    label_col="Is_Healthy",
    project_col="Project_ID",
    val_ratio=args.val_split,
    min_project_samples=20,
    min_val_per_project=2,
    random_state=42,
    # Start with 1.0; use 2.0 for stronger label balancing; 0 ignores labels.
    label_balance_strength=1.0,
)

train_subset, val_subset = split_train_val_by_project_stratified_with_labels(
    finetune_subset,
    **split_options,
)


[split] total_samples=55575, target_val~11115
[split] eligible_projects=251, eligible_samples=55398
[split] ineligible_projects=16, ineligible_samples=177
[split] label_dist (overall):
Is_Healthy
False    31073
True     24502
Name: count, dtype: int64
[split] actual_val=11115 (target~11115), train=44460
[split] label_dist (val):
Is_Healthy
False    6109
True     5006
Name: count, dtype: int64


In [None]:
# Fine-tune the classifier with early stopping, then persist the model, the
# label encoder, and the training log.
print("Start training...")

# Positions of the train / validation samples, used to index both the corpus
# and the label array.
train_idx = np.array(train_subset.indices)
val_idx   = np.array(val_subset.indices)

train_labels = all_labels[train_idx]
val_labels   = all_labels[val_idx]
# Guard: no selected sample may carry the -1 label value.
assert (train_labels != -1).all()
assert (val_labels != -1).all()

train_dataset = FinetuneDataset(all_corpus, train_idx, train_labels)
val_dataset   = FinetuneDataset(all_corpus, val_idx, val_labels)

# Stop once the monitored metric has not improved for 10 consecutive evaluations.
callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=callbacks,
)

trainer.train()

# Save the model.
os.makedirs(args.output, exist_ok=True)
trainer.save_model(args.output)

# Save the label encoder. The context manager closes (and therefore flushes)
# the file deterministically; the previous bare open() left the handle open.
label_encoder_path = os.path.join(args.output, "label_encoder.pkl")
with open(label_encoder_path, "wb") as encoder_file:
    dump(le, encoder_file)
print(f"Model and label encoder saved to: {args.output}")

# Save the training log.
logs = trainer.state.log_history
logs_df = pd.DataFrame(logs)

os.makedirs(args.log, exist_ok=True)
log_path = os.path.join(args.log, "finetune_log.csv")
logs_df.to_csv(log_path, index=False)

print(f"Training logs saved to: {log_path}")
# Last expression: show the final log rows as the cell output.
logs_df.tail()


Start training...


  return {'input_ids': torch.tensor(tokens),


Epoch,Training Loss,Validation Loss
1,0.1837,0.172317
2,0.117,0.142131


  return {'input_ids': torch.tensor(tokens),
  return {'input_ids': torch.tensor(tokens),
