## 文本分类实例

### Step1 导入相关包

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

### Step2 加载数据

In [5]:
# Read the review CSV as a single "train" split, then drop rows whose review text is missing.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

### Step3 创建DataSet

`load_dataset` 返回的已经是 `Dataset` 对象（见上方输出），因此无需再手动创建，本步骤可直接跳过。

### Step4 划分数据集

In [6]:
# Hold out 10% of the reviews as the validation ("test") split.
# A fixed seed keeps the split identical across kernel restarts, so the
# examples shown below and the validation set do not change from run to run.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [7]:
# Number of examples in the train and test splits, in that order.
tuple(len(datasets[split]) for split in ("train", "test"))

(6988, 777)

In [8]:
# Print the first ten train/test examples side by side as a quick sanity check.
for i in range(10):
    train_example = datasets["train"][i]
    test_example = datasets["test"][i]
    print(train_example, test_example)

{'label': 0, 'review': '我是四月初去的深圳。我订的是豪华套房，开始在网上开酒店新装修后的照片挺吸引的，但入住后才发现根本不是那样的。除了大塘有装修过的感觉外，楼层和房间都还是老样子，很旧，完全没有装修过。八百多的房费简直就是浪费，下次不会在住了。'} {'label': 1, 'review': '位置比较好，到机场火车站都很方便。豪华单人间里的布置合理有序，独具匠心。值得一提的是门口迎宾员特别热情，帮我提箱子一直到办完入住还送我上了电梯。'}
{'label': 1, 'review': '交通相当便利，既是优点也是缺点，住在十几楼还是有点吵。其它的都还可以吧，四星级嘛。'} {'label': 0, 'review': '特价房环境非常不好，只有一扇非常小的气窗通向宾馆过道，有点象住在地下室的感觉。房间下水道还往上翻难闻的臭气。请慎重选择此宾馆。'}
{'label': 0, 'review': '2/17除夕夜订房预订240标间感觉不错!所以2/25再次入住提升为豪标270元,没想到...房间竟比前一次的标间还要小,淋浴间更小到无法转身,连梳妆台,衣橱也都全消失没有了,向前台反应,怎跟前次住的房型还要糟糕~服务员回应不仅态度恶劣,更以前次入住的房间门锁坏了的理由不给转房~我与女友一气之下,索性于次日凌晨5点多赶紧退房,搬到中央大街诺曼弟酒店入住套房,虽房价多贴20多元,房型却比这"烂远达"大上3倍之多,诺曼弟酒店不仅地段好(若从远达打D到中央大街还要多花14元~"~),前台服务更好!...奉劝各位要入住远达的网友,千万要三思阿!'} {'label': 1, 'review': '住的是8楼豪海房，每天就在海边度过，一个字，爽！酒店服务不错，路上遇见都会微笑问好，早餐还算丰富，而且到11点，可以一起解决午餐问题。大堂很有特色，喜欢，还有它家的沙滩拖鞋很舒服，想买双回家，就是不对外卖。房间有点陈旧，床很舒服，上网费用高了点。一点小问题时浴室淋浴房的下面封条有漏水，害得上厕所都会有影响，鞋子湿湿的，要用毛巾垫着才行，和客房服务提出后也几天也没有解决。'}
{'label': 1, 'review': '别墅型的酒店，非常特别,离海边很近.消费很平价'} {'label': 1, 'review': '服务人员素质好,彬彬有礼.软硬件都很好的

### Step5 创建Dataloader

In [28]:
import torch

# Tokenizer for the "hfl/rbt3" checkpoint; process_function below reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch from the dataset exposing the "review" and
            "label" columns.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens, no padding requested here) with an extra "labels" entry
        copied from the "label" column.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits batch-wise and drop the original columns so that only
# the fields produced by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [29]:
# Inspect the first tokenized training example (unpadded token ids, masks and label).
tokenized_datasets["train"][0]

{'input_ids': [101,
  2769,
  3221,
  1724,
  3299,
  1159,
  1343,
  4638,
  3918,
  1766,
  511,
  2769,
  6370,
  4638,
  3221,
  6498,
  1290,
  1947,
  2791,
  8024,
  2458,
  1993,
  1762,
  5381,
  677,
  2458,
  6983,
  2421,
  3173,
  6163,
  934,
  1400,
  4638,
  4212,
  4275,
  2923,
  1429,
  2471,
  4638,
  8024,
  852,
  1057,
  857,
  1400,
  2798,
  1355,
  4385,
  3418,
  3315,
  679,
  3221,
  6929,
  3416,
  4638,
  511,
  7370,
  749,
  1920,
  1851,
  3300,
  6163,
  934,
  6814,
  4638,
  2697,
  6230,
  1912,
  8024,
  3517,
  2231,
  1469,
  2791,
  7313,
  6963,
  6820,
  3221,
  5439,
  3416,
  2094,
  8024,
  2523,
  3191,
  8024,
  2130,
  1059,
  3766,
  3300,
  6163,
  934,
  6814,
  511,
  1061,
  4636,
  1914,
  4638,
  2791,
  6589,
  5042,
  4684,
  2218,
  3221,
  3857,
  6589,
  8024,
  678,
  3613,
  679,
  833,
  1762,
  857,
  749,
  511,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [30]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Examples are stored unpadded; DataCollatorWithPadding pads each batch at collation time.
trainset = tokenized_datasets["train"]
validset = tokenized_datasets["test"]

# Training batches are reshuffled; validation keeps dataset order and uses a larger batch.
trainloader = DataLoader(
    trainset,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)
validloader = DataLoader(
    validset,
    batch_size=64,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

In [31]:
# Pull a single batch from the training loader to inspect the collated tensors.
next(iter(trainloader))

{'input_ids': tensor([[ 101,  671, 5663,  ...,    0,    0,    0],
        [ 101, 1184, 1378,  ...,    0,    0,    0],
        [ 101, 1057,  857,  ..., 4638, 4510,  102],
        ...,
        [ 101, 1139, 2345,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ..., 2769, 6230,  102],
        [ 101, 2791, 7313,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 0, 1])}

### Step6 创建模型及优化器

In [13]:
from torch.optim import Adam
# Load the "hfl/rbt3" checkpoint with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU whenever one is visible; otherwise the model stays where it was loaded.
model = model.cuda() if torch.cuda.is_available() else model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Adam over every model parameter with a learning rate of 2e-5; train() steps this optimizer.
optimizer = Adam(model.parameters(), lr=2e-5)

### Step7 训练与验证

In [15]:
def evaluate():
    """Measure classification accuracy on the validation split.

    Switches the global ``model`` to eval mode, runs every batch of
    ``validloader`` under ``torch.inference_mode()`` and counts the
    predictions (argmax over the logits) that equal ``batch["labels"]``.

    Returns:
        The number of correct predictions divided by ``len(validset)``.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    correct = 0
    with torch.inference_mode():
        for batch in validloader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = logits.argmax(dim=-1)
            matches = predictions.long() == batch["labels"].long()
            correct += matches.float().sum()
    return correct / len(validset)


def train(epoch=3, log_step=100):
    """Fine-tune the global ``model`` and print validation accuracy after each epoch.

    Args:
        epoch: Number of full passes over ``trainloader``.
        log_step: The training loss is printed whenever the running step
            counter is a multiple of this value (step 0 included).
    """
    use_gpu = torch.cuda.is_available()
    global_step = 0  # counts optimizer steps across all epochs
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so switch back every epoch.
        model.train()
        for batch in trainloader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()

            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1

        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")

In [16]:
# Run fine-tuning with the defaults: 3 epochs, loss logged every 100 steps.
train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


ep: 0, global_step: 0, loss: 0.6987050771713257
ep: 0, global_step: 100, loss: 0.3915312886238098
ep: 0, global_step: 200, loss: 0.23195083439350128
ep: 0, acc: 0.8931788802146912
ep: 1, global_step: 300, loss: 0.15785916149616241
ep: 1, global_step: 400, loss: 0.34245699644088745
ep: 1, acc: 0.8983269333839417
ep: 2, global_step: 500, loss: 0.15504734218120575
ep: 2, global_step: 600, loss: 0.23264743387699127
ep: 2, acc: 0.9009009003639221


### Step8 模型预测

In [17]:
# Classify one hand-written review with the fine-tuned model.
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Send the inputs to whichever device the model is on. An unconditional
    # .cuda() would crash on CPU-only machines, where the model stays on the CPU.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    # Softmax turns the logits into per-class probabilities for the reported score.
    score = torch.softmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}\nScore:{score[0][pred.item()]}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！
Score:0.9958785772323608


In [18]:
from transformers import pipeline

# Let the pipeline report the human-readable labels instead of LABEL_0 / LABEL_1.
model.config.id2label = id2_label
# Use GPU 0 only when CUDA is present; -1 selects the CPU, so this cell no
# longer fails on machines without a GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [20]:
# Classify the same sentence again, this time through the pipeline wrapper.
pipe(sen)

[{'label': '好评！', 'score': 0.9958785772323608}]

In [23]:
# Classify another short review with the pipeline.
pipe("污渍不行")

[{'label': '差评！', 'score': 0.9502854943275452}]