### Step1 导入相关包

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments,Trainer
from datasets import load_dataset
import torch

### Step2 加载数据集、划分数据集、数据集预处理

In [None]:
# Load the CSV file as a single "train" split.
dataset = load_dataset(
    'csv',
    data_files='./ChnSentiCorp_htl_all.csv',
    split='train',
)
# Drop rows whose review is missing; empty reviews would make the later
# map() preprocessing step fail.
dataset = dataset.filter(lambda row: row["review"] is not None)
dataset

In [None]:
# Split the data, holding out 10% of the rows as the test split.
datasets = dataset.train_test_split(test_size=0.1)
datasets

In [None]:
# Dataset preprocessing: load the tokenizer used to encode the reviews.
# One load is enough; 'rbt3' is resolved by from_pretrained (presumably a
# local directory or a hub model id -- confirm for your environment).
tokenizer = AutoTokenizer.from_pretrained('rbt3')
def preprocess_function(examples):
    """Tokenize the 'review' field and attach the labels.

    Args:
        examples: A mapping with 'review' and 'label' entries. It is applied
            through ``datasets.map(..., batched=True)``, so each entry is
            expected to hold a batch of values.

    Returns:
        The tokenizer output (truncated to at most 128 tokens) with an
        additional 'labels' entry copied from ``examples['label']``.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded
# Apply the preprocessing in batches and drop the original columns so only
# the fields produced by preprocess_function remain.
tokenizer_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenizer_datasets

### Step3 创建模型

In [None]:
# Load the pretrained checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained('rbt3')

In [None]:
# Display the loaded model's configuration as the cell output.
model.config

### Step4 创建评估函数

In [None]:
import evaluate

# Load the accuracy and F1 metrics used by the evaluation function.
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [None]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for an evaluation pass.

    Intended for use as the Trainer's ``compute_metrics`` callback.

    Args:
        eval_predict: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over its last axis (presumably per-class
            logits -- confirm against the model output); ``labels`` holds the
            reference class ids.

    Returns:
        A dict that merges the results of the accuracy and F1 metrics.
    """
    logits, labels = eval_predict
    # Convert per-class scores into predicted class ids before scoring.
    predictions = logits.argmax(axis=-1)
    metrics = acc_metric.compute(predictions=predictions, references=labels)
    # Use the module-level f1_metric object and merge its result into the
    # accuracy dict so both values are reported together.
    metrics.update(f1_metric.compute(predictions=predictions, references=labels))
    return metrics

### Step5 创建TrainingArguments

In [None]:
train_args = TrainingArguments(
    # Output location and logging
    output_dir="./checkpoints",       # directory for checkpoints
    logging_steps=10,                 # log every 10 steps
    # Batch sizes
    per_device_train_batch_size=16,   # training batch size per device
    per_device_eval_batch_size=32,    # evaluation batch size per device
    # Evaluation and checkpointing schedule
    evaluation_strategy="epoch",      # evaluate once per epoch
    save_strategy="epoch",            # save once per epoch
    save_total_limit=3,               # maximum number of checkpoints kept
    # Optimization
    learning_rate=2e-5,
    weight_decay=0.01,
    # Best-model selection
    metric_for_best_model="f1",       # metric used to pick the best model
    load_best_model_at_end=True,      # reload the best model after training
)


### Step6 创建Trainer

In [None]:
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenizer_datasets['train'],
    eval_dataset=tokenizer_datasets['test'],
    # Data collator built from the tokenizer initialized earlier; it pads
    # the examples of each batch so they match the model's input
    # requirements.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

### Step7 模型训练

In [None]:
# Run training with the configured Trainer.
trainer.train()

### Step8 模型评估

In [None]:
# Evaluate the model on the tokenized test split.
trainer.evaluate(tokenizer_datasets['test'])

### Step9 模型预测

In [None]:
# Run prediction on the tokenized test split. The dataset variable created
# during preprocessing is named `tokenizer_datasets`.
trainer.predict(tokenizer_datasets['test'])

In [None]:
from transformers import pipeline

# Build a text-classification pipeline around the model and tokenizer.
# Use the first GPU when CUDA is available and fall back to CPU (-1)
# otherwise, so the cell also runs on machines without a GPU.
pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Classify a single sample sentence with the pipeline.
sen = '我觉得不错！'
pipe(sen)