In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB_DISABLED"] = "true"
from pathlib import Path
import pickle
import random
import uuid
import datetime
import json

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, DebertaV2Tokenizer

class cfg:
    """Static configuration for the masked-language-model pretraining experiment."""

    # --- run identity / reproducibility ---
    exp_id = "1001"  # experiment ID
    seed = 42  # random seed

    # --- inputs ---
    data_path = "/home/xm/workspace/nbme-score-clinical-patient-notes/patient_notes.csv"  # patient-notes CSV
    pretrained_checkpoint = 'microsoft/deberta-base'  # alternatives: microsoft/deberta-large, microsoft/deberta-v3-large

    # --- optimisation ---
    lr = 1e-5  # learning rate
    batch_size = 32  # batch size
    epochs = 10  # number of training epochs
    mlm_prob = 0.2  # masked-language-modelling probability

    # --- checkpointing ---
    save_total_limit = 2  # maximum number of checkpoints to keep

def seed_everything(seed=42):
    '''
    Seed every random number generator used here so experiments can be reproduced.

    Args:
        seed: integer seed applied to Python's `random`, NumPy's global RNG
            and PyTorch (CPU and every CUDA device).

    Side effects:
        * exports PYTHONHASHSEED,
        * turns cuDNN deterministic mode on,
        * turns the cuDNN autotuner (benchmark mode) off.
    '''
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this export only influences Python processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices explicitly (silently ignored when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Pin benchmark mode off: in a long-lived notebook kernel an earlier cell may
    # have enabled it, and the autotuner can select different kernels run to run.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds up front, using the seed from the experiment config.
seed_everything(seed=cfg.seed)

In [None]:
# Load the patient-notes table from the configured CSV path.
df = pd.read_csv(cfg.data_path)

# Tokenizer matching the pretrained checkpoint.
# NOTE(review): `trim_offsets=False` is forwarded to the tokenizer; presumably it
# keeps leading whitespace inside each token's offset span. Offsets are not read
# anywhere in the visible cells - confirm the flag is still needed.
tokenizer = AutoTokenizer.from_pretrained(
    cfg.pretrained_checkpoint,
    trim_offsets=False,
)

class LineByLineTextDataset(Dataset):
    """Dataset holding one pre-tokenized example per input line.

    Every line is tokenized once, eagerly, in the constructor. Each item is a
    dict with a single "input_ids" key whose value is a `torch.long` tensor
    built from that line's token ids.
    """

    def __init__(self, tokenizer, lines, block_size):
        """Tokenize `lines` and cache the resulting examples.

        Args:
            tokenizer: callable invoked as
                `tokenizer(lines, add_special_tokens=..., truncation=..., max_length=...)`
                whose result exposes the token ids under "input_ids".
            lines: the texts to tokenize, one example per entry.
            block_size: maximum length passed to the tokenizer; longer texts
                are truncated.
        """
        encoded = tokenizer(
            lines,
            add_special_tokens=True,  # let the tokenizer add its special tokens
            truncation=True,  # cut texts that exceed max_length
            max_length=block_size,
        )

        self.examples = []
        for token_ids in encoded["input_ids"]:
            self.examples.append(
                {"input_ids": torch.tensor(token_ids, dtype=torch.long)}
            )

    def __len__(self):
        """Return the number of cached examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the example dict stored at position `i`."""
        return self.examples[i]

# Tokenize every patient-note history, capping each example at 512 tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    lines=df['pn_history'].tolist(),
    block_size=512,
)

In [None]:
# Hyper-parameters and bookkeeping options for the Trainer.
args = TrainingArguments(
    output_dir=f"/home/xm/workspace/output/{cfg.exp_id}",  # where checkpoints are written
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    learning_rate=cfg.lr,  # peak learning rate
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    num_train_epochs=cfg.epochs,
    warmup_ratio=0.2,  # fraction of the training steps spent on learning-rate warmup
    fp16=True,  # mixed-precision training
    dataloader_num_workers=4,  # number of data-loading worker subprocesses
    group_by_length=True,  # batch samples of similar length together to reduce padding
    run_name=cfg.exp_id,  # experiment ID doubles as the run name
    # Keep at most `cfg.save_total_limit` checkpoints; a non-positive value means no limit.
    save_total_limit=(cfg.save_total_limit if cfg.save_total_limit > 0 else None),
    seed=cfg.seed,  # random seed
)

# Pretrained backbone with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_checkpoint)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    # Collator that applies MLM masking with probability `cfg.mlm_prob`.
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=cfg.mlm_prob,
    ),
)

# Run the MLM pretraining.
trainer.train()