## 0. 导入依赖

In [None]:
import os
import warnings
from pickle import load, dump

import torch
from torch.utils.data import random_split
import pandas as pd

from configparser import ConfigParser

import sys
sys.path.append(os.path.abspath(".."))  # 从 notebooks/ 回到外面的 MiCoGPT 根目录
from MiCoGPT.utils.corpus import MicroCorpus, MicroCorpusWithLabelTokens

from transformers import (
    GPT2LMHeadModel,
    GPT2Config,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from transformers.trainer_callback import EarlyStoppingCallback

warnings.filterwarnings("ignore")

## 1. 读取配置 & 基础设置

In [None]:
from argparse import Namespace

# Load the INI file that holds the [GPT2] and [pretrain] hyper-parameters.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config was not
# loaded; fail here rather than with a NoSectionError in a later cell.
config_path = "../MiCoGPT/resources/mgm_config.ini"
cfg = ConfigParser()
if not cfg.read(config_path):
    raise FileNotFoundError(f"Config file not found or unreadable: {config_path}")

# Run settings for this notebook, gathered in a single Namespace.
args = Namespace(
    mode="pretrain",
    input="../data/try2_withCC/abundance_A_74557.pkl",
    labels=None,  # path to the label CSV when label tokens are wanted; otherwise None
    output="../models/pretrain_A_74557",
    # Path to an existing model (e.g. "models/general_model"); only read when
    # from_scratch is False. Defined as None so the attribute always exists.
    model=None,
    val_split=0.1,                   # fraction of samples held out for validation
    log="../log",                    # root directory for logs and checkpoints
    with_label=False,                # whether label tokens are added to the tokenizer
    from_scratch=True,               # whether to train a freshly initialised model
)


## 2. 加载 corpus 和 tokenizer

In [3]:
# Load the pickled corpus. The context manager closes the file as soon as
# unpickling finishes instead of leaving the handle to the garbage collector.
# NOTE(review): unpickling can execute arbitrary code; only load corpus files
# that come from a trusted source.
with open(args.input, "rb") as corpus_file:
    corpus = load(corpus_file)
# Reuse the tokenizer stored on the corpus object.
tokenizer = corpus.tokenizer

print("Number of samples in corpus:", len(corpus))
print("Tokenizer vocab size:", tokenizer.vocab_size)

Number of samples in corpus: 1880
Tokenizer vocab size: 1121


## 3. 构建 GPT2Config（从 cfg 读超参数）

In [4]:
# Keyword arguments for GPT2Config: the model type and architecture sizes are
# read from the [GPT2] section of cfg, the vocabulary size and special-token
# ids are taken from the tokenizer.
gpt2_config_dict = {
    "model_type": cfg.get("GPT2", "model_type"),
    "vocab_size": tokenizer.vocab_size,
}
# Integer-valued options share their name between the INI file and the config.
gpt2_config_dict.update(
    (option, cfg.getint("GPT2", option))
    for option in ("n_positions", "n_embd", "n_layer", "n_head")
)
gpt2_config_dict.update(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

config = GPT2Config(**gpt2_config_dict)
config  # last expression of the cell, so the notebook displays the config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 1119,
  "embd_pdrop": 0.1,
  "eos_token_id": 1120,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 256,
  "n_head": 8,
  "n_inner": null,
  "n_layer": 8,
  "n_positions": 512,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.33.3",
  "use_cache": true,
  "vocab_size": 1121
}

## 4. 构建 TrainingArguments（从 cfg 的 [pretrain] 段读取）

In [5]:
# Keyword arguments for TrainingArguments. Numeric hyper-parameters are read
# from the [pretrain] section of cfg under the same option names; the remaining
# entries are fixed for this notebook.
training_args_dict = {
    # Float-valued options from the config file.
    **{
        option: cfg.getfloat("pretrain", option)
        for option in ("learning_rate", "weight_decay")
    },
    # Integer-valued options from the config file.
    **{
        option: cfg.getint("pretrain", option)
        for option in (
            "warmup_steps",
            "per_device_train_batch_size",
            "num_train_epochs",
            "eval_steps",
            "save_steps",
            "logging_steps",
        )
    },
    # Fixed settings: evaluation and checkpointing are both step-based.
    "do_train": True,
    "do_eval": True,
    "group_by_length": True,
    "length_column_name": "length",
    "disable_tqdm": False,
    "lr_scheduler_type": "linear",
    "evaluation_strategy": "steps",
    "save_strategy": "steps",
    "load_best_model_at_end": True,
    # Checkpoints go to a sub-directory of the log root; logs go to the root.
    "output_dir": f"{args.log}/pretrain_checkpoints",
    "logging_dir": args.log,
}

training_args = TrainingArguments(**training_args_dict)
training_args  # last expression of the cell, so the notebook displays it

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=True,
half_precision_backend=auto,
hub_always_push=False,
hub_model

## 可选的 label token 逻辑（--with-label）

In [6]:
# Optional step: extend the tokenizer with one token per label and rebuild the
# corpus so that it carries those label tokens.
if args.with_label:
    if args.labels is None:
        raise ValueError("Please provide labels for pretraining (args.labels).")

    # Read the label CSV; its first column is used as the index.
    metadata = pd.read_csv(args.labels, index_col=0)

    # Reorder the rows to follow the sample order of the corpus.
    metadata = metadata.loc[corpus.data.index]

    # Token sequences of the original corpus.
    tokens = corpus.tokens

    # Labels are taken from the first metadata column; every distinct label
    # is added to the tokenizer as a new token.
    label_column = metadata.iloc[:, 0]
    extend_words = label_column.unique().tolist()
    tokenizer.add_tokens(extend_words)

    # Persist the extended tokenizer in the output directory. The context
    # manager guarantees the pickle is flushed and the handle closed.
    os.makedirs(args.output, exist_ok=True)
    with open(f"{args.output}/tokenizer.pkl", "wb") as tokenizer_file:
        dump(tokenizer, tokenizer_file)

    # Replace the corpus with the variant that includes the label tokens.
    corpus = MicroCorpusWithLabelTokens(
        tokens,
        label_column.values.tolist(),
        tokenizer,
    )

    # Report len(tokenizer), the same size the embedding is resized to later.
    # For Hugging Face tokenizers vocab_size covers only the base vocabulary
    # and would not reflect the tokens that were just added.
    print("Extended tokenizer with labels, new vocab size:", len(tokenizer))
else:
    print("args.with_label = False，跳过标签 token 相关步骤。")


args.with_label = False，跳过标签 token 相关步骤。


## 5. 构建数据 collator + 初始化模型

In [7]:
print("Start training...")

# Collator for causal language modelling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Either build a freshly initialised model or load existing weights.
if args.from_scratch:
    model = GPT2LMHeadModel(config)
    print("Training from scratch.")
else:
    # `args` may not define `model` at all, so look it up defensively and
    # report a clear error instead of a bare AttributeError.
    pretrained_path = getattr(args, "model", None)
    if pretrained_path is None:
        raise ValueError(
            "Please provide a pretrained model path (args.model) "
            "when args.from_scratch is False."
        )
    model = GPT2LMHeadModel.from_pretrained(pretrained_path)
    print("Loading model from:", pretrained_path)

# When label tokens were added, grow the embedding to the full tokenizer size.
if args.with_label:
    model.resize_token_embeddings(len(tokenizer))
    print("Update the embedding layer to include the label embedding.")

model = model.train()
model  # last expression of the cell, so the notebook displays the architecture

Start training...
Training from scratch.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(1121, 256)
    (wpe): Embedding(512, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=256, out_features=1121, bias=False)
)

## 6. 划分 train/val、构建 Trainer 并训练 + 保存模型和日志

In [None]:
# Split the corpus into training and validation subsets by fraction.
split = args.val_split

# Use a dedicated, seeded generator so the same samples land in the validation
# set on every run. With the unseeded global RNG the split changed between
# runs, which makes validation losses incomparable and can move former
# validation samples into the training set when training is repeated.
split_generator = torch.Generator().manual_seed(training_args.seed)
train_set, val_set = random_split(
    corpus,
    [1 - split, split],
    generator=split_generator,
)

print(f"Train samples: {len(train_set)}, Val samples: {len(val_set)}")

# Early-stopping callback with a patience of 3.
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    callbacks=callbacks,
)

trainer.train()

# Save the final model to the output directory.
os.makedirs(args.output, exist_ok=True)
trainer.save_model(args.output)
print("Model saved to:", args.output)

# Write the trainer's log history to a CSV file under the log directory.
logs = pd.DataFrame(trainer.state.log_history)
os.makedirs(args.log, exist_ok=True)
log_path = os.path.join(args.log, "pretrain_log.csv")
logs.to_csv(log_path, index=False)
print("Logs saved to:", log_path)
