## 文本分类实例

### Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

### Step2 加载数据

In [2]:
# Read the hotel-review CSV as a single "train" split and drop rows whose
# review text is missing, so later tokenization never receives None.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

### Step3 创建Dataset

`load_dataset` 返回的已经是 `Dataset` 对象（见 Step2 的输出），因此这里无需再手动创建。

### Step4 划分数据集

In [3]:
# Hold out 10% of the rows for validation. The seed is fixed so that the
# train/test membership is the same on every run of this cell.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [4]:
# Number of rows in the train and test splits, in that order
tuple(len(datasets[split_name]) for split_name in ("train", "test"))

(6988, 777)

In [5]:
# Print the first ten examples of each split side by side as a sanity check
for idx in range(10):
    train_sample = datasets["train"][idx]
    test_sample = datasets["test"][idx]
    print(train_sample, test_sample)

{'label': 1, 'review': '其实这个价格相对这个状况的酒店已经不便宜了（360元），这个季节，这个城市乱七八糟的。但是相对这个（4星）级别的，已经算是还可以了'} {'label': 1, 'review': '作为喜达屋的金卡会员,喜来登就是品质的保证,所以好的不多说了,鸡蛋里挑骨头说点需要改进的吧.1.房门自动关不紧,要拉一下,有点危险.2.前两次入住一早会把帐单从门缝里塞进来,感觉很细致,这次却没有,可能是入住人太多了忙不过来.3.中午吃的自助餐,海蟹肯定不太新鲜,要重视啊!4.最不爽的是：这一年价钱涨得太快了！另外，这次入住的是高级大床房，感觉反而没有舒适大床房好。个人觉得喜达屋旗下的酒店各有千秋，象艾美是奢华和大气，豪达喜来登是复古和庄重，福朋的特色则应该是精巧雅致，房间一大反而失去了那种紧凑感。（所以楼下有人评福朋的房间小应该就是指的舒适房，但岂不知这才是她的特色。）还有携程的信息可能有点问题，舒适大床房中的大床可能是queensize的，高级大床房的大床可能是kingsize，两者不仅是房间面积的不同，有机会应该核证一下。'}
{'label': 0, 'review': '房间环境很差,没有窗户,而且入住时抽水马桶是坏的,打了两次电话给服务人员才来修理并且没有完全修好,提出换同等房间被拒绝.希望提高住宿环境和硬件质量.'} {'label': 1, 'review': '非常满意，服务贴心到位，还主动把携程的预定门票换成门票，我们可以不排队。房间干净整洁，还很漂亮，很赞的'}
{'label': 1, 'review': '1、总体感觉有些旧，地毯上污迹比较多。2、服务等还可以。3、房间总带有一些烟味，不喜欢。'} {'label': 1, 'review': '果然如同其他会员们所说的：位置优越，但不太好找。让我体会了一把京城小胡同的韵味。周围还是在拆迁。酒店感觉较温馨，要是能够在每个房间里都放上吹风机就好了。还是值得向朋友们推荐的。'}
{'label': 1, 'review': '酒店的设施是很不错的，房间大，设施新，床和被子都是很新的。洗手间也是干、湿分离。服务员态度也是很好，出入都会打招呼。就是餐厅大堂中，适合2－3人吃饭的桌子太少了。还有就是房价在衢州应该属于偏高了。'} {'label': 0, 'review':

### Step5 创建Dataloader

In [6]:
import torch

# Load the tokenizer for the "hfl/rbt3" checkpoint; the same checkpoint name is
# used for the model further down in this notebook.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples, max_length=128):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch of rows (column name -> list of values) containing a
            "review" column and a "label" column.
        max_length: Maximum number of tokens kept per review; longer reviews
            are truncated. Defaults to 128.

    Returns:
        The tokenizer output for the batch, with an extra "labels" entry
        copied from ``examples["label"]``.
    """
    # No padding here: sequences keep their own length and are padded later,
    # per batch, by the collate function used in the DataLoaders.
    tokenized_examples = tokenizer(examples["review"], max_length=max_length, truncation=True)
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples

# Tokenize both splits in batches; the original columns are dropped so that
# only the tokenizer outputs and "labels" remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [8]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

trainset = tokenized_datasets["train"]
validset = tokenized_datasets["test"]

# Training batches are shuffled; validation batches keep their order.
# DataCollatorWithPadding pads each batch when it is assembled.
trainloader = DataLoader(
    trainset,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)
validloader = DataLoader(
    validset,
    batch_size=64,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

In [9]:
# Pull one batch from the training loader to inspect its structure
next(iter(trainloader))

{'input_ids': tensor([[ 101, 6821,  702,  ...,    0,    0,    0],
        [ 101,  679, 7231,  ...,  671, 4275,  102],
        [ 101,  683, 7305,  ...,    0,    0,    0],
        ...,
        [ 101,  702,  782,  ...,    0,    0,    0],
        [ 101,  126, 3299,  ...,  119, 6844,  102],
        [ 101,  817, 3419,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 0])}

### Step6 创建模型及优化器

In [10]:
from torch.optim import Adam
# Load the "hfl/rbt3" checkpoint through the sequence-classification auto class
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU whenever CUDA is present; otherwise the model stays where it is
model = model.cuda() if torch.cuda.is_available() else model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Adam over every model parameter with a learning rate of 2e-5
optimizer = Adam(params=model.parameters(), lr=2e-5)

### Step7 训练与验证

In [14]:
import evaluate
# Bundle the four classification metrics so they are accumulated and
# computed together through one object.
clf_metrics = evaluate.combine(
    [
        "accuracy",
        "f1",
        "recall",
        "precision",
    ]
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

In [15]:
def evaluate():
    """Score the model on the validation loader.

    Puts the model in eval mode, runs every validation batch without gradient
    tracking, feeds the arg-max predictions and the reference labels into
    ``clf_metrics``, and returns the result of ``clf_metrics.compute()``.

    NOTE(review): this function reuses the name of the ``evaluate`` module
    imported in an earlier cell, so re-running that import cell afterwards
    rebinds ``evaluate`` to the module again.
    """
    on_gpu = torch.cuda.is_available()
    model.eval()
    with torch.inference_mode():
        for val_batch in validloader:
            if on_gpu:
                val_batch = {name: tensor.cuda() for name, tensor in val_batch.items()}
            logits = model(**val_batch).logits
            predicted = logits.argmax(dim=-1)
            clf_metrics.add_batch(
                predictions=predicted.long(),
                references=val_batch["labels"].long(),
            )

    return clf_metrics.compute()


def train(epoch=3, log_step=100):
    """Fine-tune the model and evaluate it after every epoch.

    Args:
        epoch: Number of passes over the training loader.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value.
    """
    on_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so switch back each epoch
        model.train()
        for train_batch in trainloader:
            if on_gpu:
                train_batch = {name: tensor.cuda() for name, tensor in train_batch.items()}
            optimizer.zero_grad()
            output = model(**train_batch)
            output.loss.backward()
            optimizer.step()
            should_log = global_step % log_step == 0
            if should_log:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1
        clf = evaluate()
        print(f"ep: {ep}, {clf}")

In [16]:
# Run training with the function's default settings, spelled out explicitly
train(epoch=3, log_step=100)

ep: 0, global_step: 0, loss: 0.29142141342163086
ep: 0, global_step: 100, loss: 0.16152766346931458
ep: 0, global_step: 200, loss: 0.21962319314479828
ep: 0, {'accuracy': 0.8751608751608752, 'f1': 0.9066410009624639, 'recall': 0.8836772983114447, 'precision': 0.9308300395256917}
ep: 1, global_step: 300, loss: 0.13791930675506592
ep: 1, global_step: 400, loss: 0.34164679050445557
ep: 1, {'accuracy': 0.879021879021879, 'f1': 0.9113207547169812, 'recall': 0.9061913696060038, 'precision': 0.9165085388994307}
ep: 2, global_step: 500, loss: 0.2195776402950287
ep: 2, global_step: 600, loss: 0.2111939787864685
ep: 2, {'accuracy': 0.8751608751608752, 'f1': 0.9110907424381302, 'recall': 0.9324577861163227, 'precision': 0.8906810035842294}


### Step8 模型预测

In [17]:
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # The model is only moved to the GPU when CUDA is available, so the inputs
    # must follow the same rule; an unconditional .cuda() fails on CPU-only
    # machines.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    # Softmax turns the logits into per-class probabilities for the score
    score = torch.softmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}\nScore:{score[0][pred.item()]}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！
Score:0.9975985884666443


In [18]:
from transformers import pipeline

# Make the pipeline report the human-readable labels instead of class indices
model.config.id2label = id2_label
# device=0 selects the first GPU and -1 selects the CPU; pick by availability
# so this cell also runs on machines without CUDA.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [19]:
# Classify the same sentence used in the manual prediction cell above
pipe(sen)

[{'label': '好评！', 'score': 0.9975985884666443}]

In [20]:
# Classify a second, short review through the pipeline
pipe("污渍不行")

[{'label': '差评！', 'score': 0.8519829511642456}]