# Hugging Face Transformers 微调训练入门

## Homework: 使用完整的 YelpReviewFull 数据集训练，看 Acc 最高能到多少

### 下载/加载数据集

In [3]:
# 下载/加载数据集
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")

### 预处理数据

In [4]:
# 预处理数据
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)



### 拆分训练集

In [5]:
# 设置训练(90%)/验证数据集(10%)
# 这里我从训练集中抽出了10%作为验证集，从Chatgpt的回复看，验证集不参与训练，但是从验证集评估结果，可能会调整超参数，建议是不要和测试集重叠。
# 但是我这边练习时并不会去根据评估结果调整超参数，如果用测试集作为验证集，训练集还能更大更全点。
train_testvalid = tokenized_datasets['train'].train_test_split(test_size=0.1)
train_dataset = train_testvalid['train']
valid_dataset = train_testvalid['test']
test_dataset = tokenized_datasets['test']
print(f"train: {len(train_dataset)}; valid: {len(valid_dataset)}; test: {len(test_dataset)}")

train: 585000; valid: 65000; test: 50000


### 设置评估指标

In [6]:
# 设置评估指标
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

### 加载Bert模型

In [8]:
# 加载Bert模型
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 设置训练超参数

In [25]:
# 训练超参数
from transformers import TrainingArguments, Trainer

model_dir = "models/bert-base-cased-finetune-yelp"
training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="steps",
                                  eval_steps=1000,
                                  per_device_train_batch_size=16,
                                  num_train_epochs=3,
                                  logging_steps=1000
                                )

### 实例化训练器

In [26]:
# 实例化训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

### 开始训练

In [27]:
# 开始训练
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,1.0063,0.896294,0.609446
2000,0.925,0.988739,0.570277
3000,0.8974,0.879501,0.615354
4000,0.8813,0.908441,0.620523
5000,0.8763,0.839609,0.630754
6000,0.8637,0.824819,0.638138
7000,0.852,0.811158,0.6452
8000,0.8467,0.855841,0.616938
9000,0.8404,0.825042,0.639031
10000,0.8404,0.812349,0.652338


KeyboardInterrupt: 

### 问题：验证集评估太频繁影响训练效率

In [29]:
# 训练超参数
from transformers import TrainingArguments, Trainer

model_dir = "models/bert-base-cased-finetune-yelp"
training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  num_train_epochs=3,
                                  logging_steps=1000
                                )

In [32]:
# 实例化训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

In [34]:
# 继续训练模型
trainer.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss,Accuracy
3,0.5349,0.766864,0.689215


TrainOutput(global_step=109689, training_loss=0.04552924071479934, metrics={'train_runtime': 14528.947, 'train_samples_per_second': 120.793, 'train_steps_per_second': 7.55, 'total_flos': 4.6177234002432e+17, 'train_loss': 0.04552924071479934, 'epoch': 3.0})

### 问题: 硬盘容量不足，训练暂停

硬盘从300G扩容至1T后，继续训练。

**原因**：checkpoint占用太多
虽然保存检查点可以在训练中断时提供恢复能力，但保存过于频繁会占用大量磁盘空间并降低训练速度。通过合理设置 `save_steps`、`save_total_limit` 和 `save_strategy` 参数，可以找到一个平衡点，既保证模型训练的可靠性，又不会过多地消耗资源。

### 保存模型和训练状态

In [38]:
# 保存模型和训练状态
trainer.save_model(model_dir)
trainer.save_state()

### 使用测试集评估

In [1]:
# # 下载/加载数据集
# from datasets import load_dataset
# dataset = load_dataset("yelp_review_full")

# # 预处理数据
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

# tokenized_datasets = dataset.map(tokenize_function, batched=True)

# # 拆分数据集
# train_testvalid = tokenized_datasets['train'].train_test_split(test_size=0.1)
# train_dataset = train_testvalid['train']
# valid_dataset = train_testvalid['test']
# test_dataset = tokenized_datasets['test']
# print(f"train: {len(train_dataset)}; valid: {len(valid_dataset)}; test: {len(test_dataset)}")

# # 加载模型
# from transformers import AutoModelForSequenceClassification
# model_dir = "models/bert-base-cased-finetune-yelp"
# model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# # 设置评估指标
# import numpy as np
# import evaluate

# metric = evaluate.load("accuracy")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# # 训练超参数
# from transformers import TrainingArguments, Trainer
# training_args = TrainingArguments(output_dir=model_dir,
#                                   evaluation_strategy="epoch",
#                                   per_device_train_batch_size=16,
#                                   num_train_epochs=3,
#                                   logging_steps=500
#                                 )

# # 实例化训练器
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     compute_metrics=compute_metrics
# )

trainer.evaluate(test_dataset)



train: 585000; valid: 65000; test: 50000


{'eval_loss': 0.7690914869308472,
 'eval_accuracy': 0.68698,
 'eval_runtime': 1501.5222,
 'eval_samples_per_second': 33.3,
 'eval_steps_per_second': 4.162}

### 通过checkpoints计算评估训练中间结果（延后）
训练因为磁盘满中断，原代码块忘记复制后执行，导致训练前两轮评估记录丢失，尝试从state和checkpoints得到评估数据。

In [9]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd

# 检查点列表
checkpoints = [36000, 72000, 109000]

# 存储评估结果
results = []

# 评估每个检查点，使用tqdm显示进度条
for checkpoint in tqdm(checkpoints, desc="Evaluating checkpoints"):
    checkpoint_path = f"./models/bert-base-cased-finetune-yelp//checkpoint-{checkpoint}"
    # 加载特定检查点的模型
    chp_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    
    # 创建新的Trainer对象
    chp_trainer = Trainer(
        model=chp_model,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )
    
    # 评估模型
    eval_results = chp_trainer.evaluate()
    
    # 提取训练损失（需要从保存的训练状态中提取）
    train_loss = None
    if trainer_state and hasattr(trainer_state, 'log_history'):
        for log in trainer_state.log_history:
            if log.get('step', None) == checkpoint and 'loss' in log:
                train_loss = log['loss']
                break
    
    # 记录结果
    results.append({
        'steps': checkpoint,
        'training_loss': train_loss,
        'validation_loss': eval_results['eval_loss'],
        'accuracy': eval_results['eval_accuracy']
    })

# 创建DataFrame并打印结果
df = pd.DataFrame(results)

# 可选：将结果保存到CSV文件
df.to_csv("text_classification_eval_results.csv", index=False)

Evaluating checkpoints:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
df

Unnamed: 0,steps,training_loss,validation_loss,accuracy
0,36000,,0.713713,0.692831
1,72000,,0.623812,0.732292
2,109000,,0.526862,0.784892


跑上段代码时，valid_dataset划分改变了，里面混入了参与训练的样本，所以accuracy看起来变高了。

应该用测试集，看看在不同checkpoint的评估结果。  

traing_loss拿不到就算了。

In [11]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd

# 检查点列表
checkpoints = [36000, 72000, 109000]

# 存储评估结果
results = []

# 评估每个检查点，使用tqdm显示进度条
for checkpoint in tqdm(checkpoints, desc="Evaluating checkpoints"):
    checkpoint_path = f"./models/bert-base-cased-finetune-yelp//checkpoint-{checkpoint}"
    # 加载特定检查点的模型
    chp_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    
    # 创建新的Trainer对象
    chp_trainer = Trainer(
        model=chp_model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    
    # 评估模型
    eval_results = chp_trainer.evaluate()
    
    # 记录结果
    results.append({
        'steps': checkpoint,
        'validation_loss': eval_results['eval_loss'],
        'accuracy': eval_results['eval_accuracy']
    })

# 创建DataFrame并打印结果
df = pd.DataFrame(results)

# 可选：将结果保存到CSV文件
df.to_csv("text_classification_eval_results.csv", index=False)

Evaluating checkpoints:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
df

Unnamed: 0,steps,validation_loss,accuracy
0,36000,0.764091,0.66892
1,72000,0.731155,0.68264
2,109000,0.769079,0.68718


### 更多epoch训练（延后）

In [None]:
# 下载/加载数据集
from datasets import load_dataset
dataset = load_dataset("yelp_review_full")

# 预处理数据
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 拆分数据集
train_testvalid = tokenized_datasets['train'].train_test_split(test_size=0.1)
train_dataset = train_testvalid['train']
valid_dataset = train_testvalid['test']
test_dataset = tokenized_datasets['test']
print(f"train: {len(train_dataset)}; valid: {len(valid_dataset)}; test: {len(test_dataset)}")

# 设置评估指标
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
    
# 加载模型
from transformers import AutoModelForSequenceClassification
model_dir = "models/bert-base-cased-finetune-yelp"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)


# 训练超参数
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  num_train_epochs=3,
                                  logging_steps=500,
                                  save_total_limit=5
                                )

# 实例化训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
# trainer.train(resume_from_checkpoint=True)

In [None]:
# 保存模型和训练状态
trainer.save_model(model_dir)
trainer.save_state()

In [None]:
trainer.evaluate(test_dataset)