## 0. 导入依赖

In [None]:
import os
import warnings
from argparse import Namespace
from configparser import ConfigParser
from importlib.resources import files
from pickle import load

import pandas as pd
import torch
from torch.utils.data import random_split
from transformers import (
    GPT2LMHeadModel,
    GPT2Config,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from transformers.trainer_callback import EarlyStoppingCallback

# Silence every Python warning for the rest of the session so the notebook
# output stays compact.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries; consider narrowing it when debugging.
warnings.filterwarnings("ignore")

## 1. 基础设置

In [None]:
# Run settings for this pretraining session, collected on a single Namespace
# so that later cells can read them as `args.<name>`.
args = Namespace()
args.mode = "pretrain"
# Pickled corpus that is loaded in the next section.
args.input = "../data/try2_withCC/abundance_A_74557.pkl"
# Directory that receives the final trained model.
args.output = "../models/pretrain_A_74557"
# Fraction of the corpus held out as the validation set.
args.val_split = 0.1
# Root directory for training logs and checkpoints.
args.log = "../logs"

# Settings that are intentionally left disabled in this notebook:
# args.labels = None        # path to a label CSV when label tokens are wanted; None otherwise
# args.model = "models/general_model"  # pretrained model path when not training from scratch
# args.with_label = False   # whether labels are also added to the tokenizer
# args.from_scratch = True  # whether to train from scratch

## 2. 加载 corpus 和 tokenizer

In [None]:
# Load the pickled corpus object and take the tokenizer it carries.
# The file is opened in a `with` block so the handle is closed as soon as
# unpickling finishes (or fails), instead of being left open.
# NOTE(security): unpickling runs arbitrary code embedded in the file; only
# load corpus files that come from a trusted source.
# NOTE(review): presumably the class of the pickled corpus must be importable
# in this environment for `load` to succeed - confirm.
with open(args.input, "rb") as corpus_file:
    corpus = load(corpus_file)
tokenizer = corpus.tokenizer

print("Number of samples in corpus:", len(corpus))
print("Tokenizer vocab size:", tokenizer.vocab_size)

Number of samples in corpus: 1880
Tokenizer vocab size: 1121


## 3. 构建 GPT2Config

In [None]:
# Read the packaged config.ini and build the GPT2Config for the model.
cfg = ConfigParser()
config_path = files("MiCoGPT") / "resources/config.ini"
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with the offending path rather than
# later with a NoSectionError that does not say which file was missing.
if not cfg.read(config_path):
    raise FileNotFoundError(f"Could not read configuration file: {config_path}")

gpt2_config_dict = {
    # Model family ("gpt2" in the recorded output).
    "model_type":   cfg.get("GPT2", "model_type"),
    # Vocabulary size, taken from the tokenizer (1121 in the recorded output).
    "vocab_size":   tokenizer.vocab_size,
    # Maximum supported sequence length, i.e. the size of the position
    # embedding (512 in the recorded output).
    "n_positions":  cfg.getint("GPT2", "n_positions"),
    # Hidden size / embedding dimension (256 in the recorded output).
    "n_embd":       cfg.getint("GPT2", "n_embd"),
    # Number of Transformer blocks (8 in the recorded output).
    "n_layer":      cfg.getint("GPT2", "n_layer"),
    # Number of self-attention heads (8 in the recorded output).
    "n_head":       cfg.getint("GPT2", "n_head"),
    # Special-token ids come from the tokenizer so the model and tokenizer
    # agree (bos=1119, eos=1120, pad=0 in the recorded output).
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

# Other GPT2Config fields, left at the values shown in the recorded output:
# attn_pdrop: 0.1, dropout ratio applied to attention during training
# embd_pdrop: 0.1, dropout ratio applied to embeddings during training
# resid_pdrop: 0.1, dropout ratio applied to residual connections during training
# layer_norm_epsilon: 1e-05, epsilon of layer normalisation (avoids division by zero)
# initializer_range: 0.02, range used when initialising the weights
# activation_function: "gelu_new", the "new" GELU activation variant
# scale_attn_weights: true, whether attention weights are scaled
# scale_attn_by_inverse_layer_idx: false, whether attention weights are
#   additionally scaled by the inverse of the layer index
# reorder_and_upcast_attn: false, whether attention is reordered and upcast to
#   float32 while it is computed
# summary_type: "cls_index", how the sequence summary is taken
# summary_use_proj: true, whether the sequence summary is projected
# summary_activation: null, activation applied to the sequence summary
# summary_first_dropout: 0.1, dropout ratio applied to the sequence summary
# summary_proj_to_labels: true, whether the summary is projected to label space
# use_cache: true, whether the key/value cache is used
# transformers_version: "4.33.3"

config = GPT2Config(**gpt2_config_dict)
# Bare expression so the notebook displays the resulting configuration.
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 1119,
  "embd_pdrop": 0.1,
  "eos_token_id": 1120,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 256,
  "n_head": 8,
  "n_inner": null,
  "n_layer": 8,
  "n_positions": 512,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.33.3",
  "use_cache": true,
  "vocab_size": 1121
}

## 4. 构建 TrainingArguments（从 cfg 的 [pretrain] 段读取）

In [None]:
# Keyword arguments for TrainingArguments. Numeric hyper-parameters are read
# from the [pretrain] section of config.ini; the concrete values quoted below
# are the original author's notes and are not visible in this notebook
# (only eval_steps=500 appears in the recorded output).
training_args_dict = dict(
    # --- training / evaluation / checkpoint schedule ---
    # Run the training loop when trainer.train() is called.
    do_train=True,
    # Evaluate on the validation set during training, following
    # evaluation_strategy.
    do_eval=True,
    # Evaluate every `eval_steps` optimisation steps (500 in the recorded output).
    evaluation_strategy="steps",
    eval_steps=cfg.getint("pretrain", "eval_steps"),
    # Save a checkpoint every `save_steps` steps (noted as 500).
    save_strategy="steps",
    save_steps=cfg.getint("pretrain", "save_steps"),
    # After training, reload the checkpoint that scored best on validation.
    load_best_model_at_end=True,

    # --- batching ---
    # Batch together samples of similar length.
    group_by_length=True,
    # Name of the dataset column that holds each sample's length.
    length_column_name="length",
    # Training batch size per device, GPU or CPU (noted as 32).
    per_device_train_batch_size=cfg.getint(
        "pretrain", "per_device_train_batch_size"
    ),
    # Number of passes over the full training set (noted as 50).
    num_train_epochs=cfg.getint("pretrain", "num_train_epochs"),

    # --- optimisation ---
    # Peak learning rate (noted as 1e-3).
    learning_rate=cfg.getfloat("pretrain", "learning_rate"),
    # Linear learning-rate schedule.
    lr_scheduler_type="linear",
    # Warm-up steps during which the rate rises from 0 to learning_rate
    # (noted as 1000).
    warmup_steps=cfg.getint("pretrain", "warmup_steps"),
    # Weight-decay coefficient against over-fitting (noted as 0.001).
    weight_decay=cfg.getfloat("pretrain", "weight_decay"),

    # --- logging / output ---
    # Keep the progress bar visible.
    disable_tqdm=False,
    # Log loss, learning rate, etc. every `logging_steps` steps (noted as 100).
    logging_steps=cfg.getint("pretrain", "logging_steps"),
    # Checkpoints go under args.log/pretrain_checkpoints, logs under args.log.
    output_dir=f"{args.log}/pretrain_checkpoints",
    logging_dir=args.log,
)

# Further TrainingArguments fields left at their defaults (as recorded by the
# original author):
# adam_beta1=0.9, beta1 of the Adam optimiser
# adam_beta2=0.999, beta2 of the Adam optimiser
# adam_epsilon=1e-08, epsilon of the Adam optimiser
# optim=adamw_torch, AdamW optimiser
# max_grad_norm=1.0, maximum gradient norm
# fp16=False, whether fp16 mixed-precision training is used
# bf16=False, whether bf16 mixed-precision training is used
# no_cuda=False, whether CUDA is disabled
# use_cpu=False, whether training runs on the CPU
# use_mps_device=False, whether the MPS device (Apple Silicon) is used
# per_device_eval_batch_size=8, evaluation batch size per device
# gradient_accumulation_steps=1, number of gradient-accumulation steps
# dataloader_num_workers=0, number of data-loader workers
# dataloader_pin_memory=True, whether data is loaded into pinned memory
# dataloader_drop_last=False, whether the last incomplete batch is dropped
# seed=42, random seed
# skip_memory_metrics=True, whether memory metrics are skipped
# ddp_backend=None, DDP backend for large multi-GPU training
# fsdp=[], FSDP configuration for large multi-GPU training
# deepspeed=None, DeepSpeed configuration for large multi-GPU training
# sharded_ddp=[], Sharded DDP configuration for large multi-GPU training
# push_to_hub=False, whether the model is uploaded to the Hugging Face Hub
# hub_strategy=every_save, upload strategy
# report_to=[], services that metrics are reported to


training_args = TrainingArguments(**training_args_dict)
# Bare expression so the notebook displays the resolved arguments.
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=True,
half_precision_backend=auto,
hub_always_push=False,
hub_model

## 5. 构建数据 collator + 初始化模型

In [None]:
print("Start training...")

# Collator for causal language modelling. mlm=False turns off masked language
# modelling (the BERT-style objective), as in the original pretrain script.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Build the model from `config` alone; no pretrained weights are loaded.
model = GPT2LMHeadModel(config)
print("Training from scratch.")

# Put the model into training mode.
model.train()
# Bare expression so the notebook displays the module structure.
model

Start training...
Training from scratch.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(1121, 256)
    (wpe): Embedding(512, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=256, out_features=1121, bias=False)
)

## 6. 划分 train/val、构建 Trainer 并训练 + 保存模型和日志

In [None]:
# Split the corpus into training and validation subsets by fraction.
split = args.val_split

# random_split draws from torch's global RNG unless a generator is supplied,
# and nothing visible in this notebook seeds that RNG before this point.
# Seeding a dedicated generator from training_args.seed keeps the train/val
# partition identical across runs, so validation losses stay comparable and
# re-running the cell cannot move validation samples into the training set.
split_generator = torch.Generator().manual_seed(training_args.seed)
train_set, val_set = random_split(
    corpus,
    [1 - split, split],
    generator=split_generator,
)

print(f"Train samples: {len(train_set)}, Val samples: {len(val_set)}")

# Early-stopping callback with a patience of 3 evaluations.
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Assemble the Trainer from the model, arguments, datasets and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    callbacks=callbacks,
)

# Run training.
trainer.train()

# Save the final model.
os.makedirs(args.output, exist_ok=True)
trainer.save_model(args.output)
print("Model saved to:", args.output)

# Save the training log history as a CSV file under args.log.
logs = trainer.state.log_history
logs = pd.DataFrame(logs)
os.makedirs(args.log, exist_ok=True)
log_path = os.path.join(args.log, "pretrain_log.csv")
logs.to_csv(log_path, index=False)
print("Logs saved to:", log_path)