In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

#### Step1 数据加载、划分数据集、DataLoader

In [None]:
# Load the CSV file as a single `train` split.
dataset = load_dataset(
    'csv',
    data_files='ChnSentiCorp_htl_all.csv',
    split='train',
)
# Drop rows whose `review` field is missing so the tokenizer never receives None.
dataset = dataset.filter(lambda x: x['review'] is not None)
dataset

In [None]:
# Split the data: 10% is held out as the `test` split, the rest stays in `train`.
datasets = dataset.train_test_split(
    test_size=0.1,
)
datasets

In [None]:
# Tokenize every split so it can be fed to a DataLoader.
tokenizer = AutoTokenizer.from_pretrained('rbt3')


def preprocess_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: A batch from the dataset; its 'review' and 'label'
            columns are read.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens, not padded) with an extra 'labels' entry copied from the
        'label' column.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded


# Replace the original columns with the tokenized features.
tokenizer_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets['train'].column_names,
)
tokenizer_datasets

In [None]:
from transformers import DataCollatorWithPadding

# Pick out the two tokenized splits.
trainset = tokenizer_datasets['train']
validset = tokenizer_datasets['test']

# The examples were tokenized without padding, so batching is delegated to
# DataCollatorWithPadding; one collator instance serves both loaders.
collator = DataCollatorWithPadding(tokenizer)

# Training batches are shuffled; validation batches keep dataset order.
trainloader = DataLoader(
    trainset,
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)
validloader = DataLoader(
    validset,
    batch_size=64,
    shuffle=False,
    collate_fn=collator,
)

#### Step2 模型创建与优化器

In [None]:
# Load the 'rbt3' checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained('rbt3')

# Optimize every model parameter with Adam at a learning rate of 2e-5.
optim = Adam(
    model.parameters(),
    lr=2e-5,
)

#### Step3 模型训练

In [None]:
# Train for two passes over the training loader.
for ep in range(2):
    # Put the model in training mode at the start of every epoch.
    model.train()
    for batch in trainloader:
        batch = dict(batch.items())
        # Clear gradients left over from the previous step before computing new ones.
        optim.zero_grad()
        # The batch carries 'labels', so the model output exposes a loss.
        output = model(**batch)
        output.loss.backward()
        optim.step()
    # Report the loss of the last batch processed in this epoch.
    print(f'ep:{ep},loss:{output.loss.item()}')

#### Step4 模型验证

In [None]:
import evaluate

# Bundle accuracy and F1 into a single metric object.
# `evaluate.load` expects one metric name (a string), so passing a list fails;
# `evaluate.combine` is the API for evaluating several metrics together.
clf_metrics = evaluate.combine(['accuracy', 'f1'])

In [None]:
# Evaluate the model on the whole validation loader and print the metrics once.
# The training loop leaves the model in train mode; switch to eval mode so
# train-only layers such as dropout do not perturb the predictions.
model.eval()
with torch.inference_mode():
    for batch in validloader:
        batch = {k: v for k, v in batch.items()}
        # Predicted class = index of the largest logit for each example.
        output = torch.argmax(model(**batch).logits, dim=-1)
        # Pass predictions/references by keyword: a single metric's `add_batch`
        # is keyword-only, and keywords also work for combined metrics.
        clf_metrics.add_batch(predictions=output, references=batch['labels'])
# Call `compute()` (not just reference the method) after all batches have been
# accumulated, so the result covers the full validation set.
print(clf_metrics.compute())
