In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset,DataLoader,random_split
from torch.optim import Adam
from transformers import AutoTokenizer,AutoModelForSequenceClassification

#### 1.数据读取

In [None]:
# Load the raw hotel-review CSV, drop rows with missing values, and print a
# column/dtype summary.
data = pd.read_csv('./ChnSentiCorp_htl_all.csv').dropna()
data.info()

#### 2.数据加工（dataset,数据集的划分，dataloader）

In [None]:
# dataset
class MyDataset(Dataset):
    """Map-style dataset of (review text, label) pairs read from a CSV file.

    Args:
        csv_path: Path of the CSV file to read. It must contain a ``review``
            column and a ``label`` column. Defaults to the file used
            throughout this notebook.
        max_rows: Keep only the first ``max_rows`` rows that remain after rows
            with missing values are dropped. ``None`` keeps every row.
            Defaults to 2000.
    """

    def __init__(self, csv_path='./ChnSentiCorp_htl_all.csv', max_rows=2000):
        super().__init__()
        frame = pd.read_csv(csv_path).dropna()
        # Only keep the first `max_rows` rows (2000 by default).
        if max_rows is not None:
            frame = frame.iloc[:max_rows]
        self.data = frame

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair stored at position ``index``."""
        # Fetch the row once instead of running a positional lookup per column.
        row = self.data.iloc[index]
        # Cast to a plain int so a list of labels always converts to an
        # integer tensor, whatever dtype pandas inferred for the column.
        return row['review'], int(row['label'])

    def __len__(self):
        """Return the number of rows kept."""
        return len(self.data)

In [None]:
# Build the dataset and print its first four samples as a quick sanity check.
my_dataset = MyDataset()
for idx in range(4):
    sample = my_dataset[idx]
    print(sample)

In [None]:
# Split the dataset 80% / 20% into training and test subsets.
# A seeded generator makes the split identical on every run, so the accuracy
# reported further down is comparable across kernel restarts.
train_dataset,test_dataset=random_split(
    my_dataset,
    lengths=[0.8,0.2],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# DataLoader setup: load the tokenizer that collate_fn uses to encode each batch.
tokenizer = AutoTokenizer.from_pretrained('rbt3')

def collate_fn(dataset):
    """Turn a batch of (text, label) samples into tokenized model inputs.

    Args:
        dataset: Iterable of samples; element 0 of each sample is the text and
            element 1 is its label.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens, PyTorch tensors)
        with an extra ``labels`` entry holding the labels as a tensor.
    """
    # Materialize once so texts and labels are taken from the same pass.
    pairs = [(sample[0], sample[1]) for sample in dataset]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded['labels'] = torch.tensor(labels)
    return encoded

# Training batches are reshuffled every epoch; test batches keep a fixed order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

#### 3.模型创建与优化器

In [None]:
# Load the sequence-classification model from 'rbt3' and create an Adam
# optimizer over all of its parameters.
model = AutoModelForSequenceClassification.from_pretrained('rbt3')
optim = Adam(params=model.parameters(), lr=2e-5)

#### 4.模型训练

In [None]:
# Fine-tune for two epochs.
# from_pretrained() hands the model back in evaluation mode (dropout off), so
# training mode has to be switched on explicitly before optimizing.
model.train()
for ep in range(2):
    for batch in train_dataloader:
        optim.zero_grad()
        # The batch contains `labels`, so the model output carries the loss.
        output=model(**batch)
        output.loss.backward()
        optim.step()
    # Reports the loss of the last batch of the epoch.
    print(f"ep: {ep}, loss: {output.loss.item()}")
# Switch back to evaluation mode so the evaluation / prediction cells that
# follow run without dropout.
model.eval()

In [None]:
# Measure accuracy on the test split.
model.eval()  # make sure dropout is disabled while scoring
with torch.inference_mode():
    acc_num=0      # correctly classified samples
    sample_num=0   # samples seen
    for batch in test_dataloader:
        pre=torch.argmax(model(**batch).logits,dim=-1)
        acc_num+=(pre==batch['labels']).sum().item()
        sample_num+=batch['labels'].size(0)
    # Divide by the number of samples; len(test_dataloader) is the number of
    # batches and would inflate the result by roughly the batch size.
    acc=acc_num/sample_num if sample_num else 0.0
    print(acc)

#### 5.模型预测

In [None]:
# Classify a single sentence with the fine-tuned model.
sen='我觉得这家酒店不错，饭很好吃'
id2_label={1:'好评',0:'差评'}

model.eval()  # make sure dropout is disabled for prediction
with torch.inference_mode():
    # Named `encoded` rather than `input` so the builtin input() is not
    # shadowed for the rest of the kernel session.
    encoded=tokenizer(sen,max_length=128,padding='max_length',truncation=True,return_tensors='pt')
    # Single sentence -> logits of shape (1, num_labels); take the class index.
    result=torch.argmax(model(**encoded).logits,dim=-1)
    print(id2_label.get(result.item()))


#### 使用pipeline预测

In [None]:
from transformers import pipeline

# Register the id -> label-name mapping on the model config, then wrap the
# model and tokenizer in a text-classification pipeline.
model.config.id2label = id2_label
pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
)