## 基于transformers实现模型微调训练的主要流程，包括：
- 数据集下载
- 数据预处理
- 训练超参数配置
- 训练评估指标设置
- 训练器基本介绍
- 实战训练
- 模型保存

In [None]:
### import pkgs
# Standard library
import random

# Third-party
import datasets
import evaluate
import numpy as np
import pandas as pd  # builds the DataFrame preview in show_random_elements
from datasets import load_dataset
from IPython.display import HTML, display
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
# Base configuration shared by the cells below.
# Directory passed as TrainingArguments(output_dir=...) and to trainer.save_model().
model_dir = "models/bert-base-cased-finetune-yelp"
# Placeholder for a local Hugging Face model path; it is only referenced by the
# commented-out from_pretrained(model_name_or_path) alternatives in this file.
model_name_or_path = "hugging face model file path" # TODO

## function
def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature
            type (presumably a ``datasets.Dataset`` -- confirm with callers).
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by
            ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws distinct indices in one call. It replaces a manual
    # retry loop that re-scanned a list on every draw and that would spin
    # forever if the assert above were stripped (python -O) and more rows
    # than exist were requested; random.sample raises ValueError instead.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show the label names instead of the integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

### 下载数据集

In [None]:
# Optional: download the model manually with huggingface-cli, e.g.
# huggingface-cli download --resume-download bert-base-cased --local-dir bert-base-cased
# Load the "yelp_review_full" dataset.
dataset = load_dataset("yelp_review_full")
# Bare expression: shown as the notebook cell's output.
dataset

In [None]:
# Inspect a single training example (row index 100).
dataset["train"][100]

In [None]:
# Preview random rows of the train split (num_examples defaults to 10).
show_random_elements(dataset["train"])

### 预处理数据

In [None]:
# Alternative: load the tokenizer from the path stored in model_name_or_path.
#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Tokenizer for the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Padding uses the ``"max_length"`` strategy and truncation is enabled.

    Args:
        examples: Mapping with a ``"text"`` entry to tokenize.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Apply tokenize_function to the dataset; batched=True hands it batches of
# examples instead of one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Show one random tokenized row from the train split.
show_random_elements(tokenized_datasets["train"], num_examples=1)

In [None]:
### Data sampling

# shuffle() randomly rearranges the column values. For more control over the
# algorithm used to shuffle the dataset, pass the `generator` argument to use
# a different numpy.random.Generator.

# Shuffle with a fixed seed, then keep the first 1000 rows of each split.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

### 微调训练配置

In [None]:
# Load "bert-base-cased" for sequence classification with num_labels=5.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
# Alternative: load from the path stored in model_name_or_path.
#model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=5)

In [None]:
# Most important setting: output_dir, the path where model weights are saved.
training_args = TrainingArguments(
    output_dir=model_dir,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    logging_steps=100,
)

# Print the resulting configuration.
print(training_args)

### 训练过程中的指标评估（Evaluate）

In [None]:
# Load the "accuracy" metric; compute_metrics below calls metric.compute().
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the argmax predictions
        (taken over the last axis of the logits) against ``labels``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted, references=gold)
    return result

In [None]:
# Monitor metrics during training: evaluate once per epoch.
# NOTE(review): newer transformers releases appear to rename this argument to
# `eval_strategy` -- confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=30,
)

### 开始训练

In [None]:
# Wire the model, arguments, the 1000-row train/eval subsets and the metric
# callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Check GPU usage with nvidia-smi.
# NOTE(review): `watch -n 1` re-runs the command every second until it is
# interrupted, so this cell presumably keeps running until the kernel is
# interrupted -- confirm that is intended (a separate terminal may fit better).
!watch -n 1 nvidia-smi

In [None]:
## Test

# Shuffle the test split with seed 64 and keep the first 100 rows.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=64).select(range(100))

# Evaluate the trainer's model on that subset.
trainer.evaluate(small_test_dataset)

### 保存模型和训练状态

In [None]:
# Write the model to model_dir, then persist the trainer state.
trainer.save_model(model_dir)
trainer.save_state()

### Homework：使用完整的 YelpReviewFull 数据集训练，看看准确率（ACC）最高能达到多少

In [None]:
# TODO: Homework: train bert-base-cased on the full YelpReviewFull dataset and
# see how high the accuracy (ACC) can get.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the argmax predictions
        (taken over the last axis of the logits) against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Start from a fresh pretrained checkpoint. The `model` object created in the
# earlier cells has already been fine-tuned on the 1000-example subset, so
# reusing it would not measure training bert-base-cased on the full dataset.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Disk usage is driven by checkpoints rather than by logging: the default
# step-based checkpointing would write a large number of checkpoints over a
# full-dataset run. Save once per epoch instead and keep only the two most
# recent checkpoints.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    per_device_train_batch_size=16,
    num_train_epochs=3,
)

# Use the complete train and test splits.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()  ## Check the training loss, validation loss and accuracy

In [None]:
## Test

# Evaluate on the complete "test" split.
full_test_dataset = tokenized_datasets["test"]
trainer.evaluate(full_test_dataset) ## Check the test loss and accuracy