# 文本分类实战

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

## Step2 加载数据集

In [2]:
# Read the hotel-review CSV as a single `train` split, then drop rows whose
# `review` field is missing so the tokenizer never receives None.
dataset = load_dataset(
    'csv',
    data_files='./ChnSentiCorp_htl_all.csv',
    split='train',
)
dataset = dataset.filter(lambda row: row['review'] is not None)
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-3260f3d9eacc9812/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-3260f3d9eacc9812/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-fe6f140ff6bc9b22.arrow


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集

In [3]:
# Hold out 10% of the rows for validation. The fixed seed keeps the split
# identical across kernel restarts, so accuracy numbers are comparable between runs.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 创建 DataLoader

In [8]:
import torch

# Load the tokenizer from the local rbt3 checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained('../../models/rbt3/')

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch from the dataset; its 'review' entry is passed to
            the tokenizer and its 'label' entry is copied through.

    Returns:
        The tokenizer output (truncated to at most 128 tokens, not padded)
        with the labels stored under the 'labels' key.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded

# Tokenize both splits in batches and drop the original columns so that only
# the fields produced by `process_function` remain.
raw_columns = datasets['train'].column_names
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_datasets


                                                                  

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [9]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Pad each batch to the length of its longest sequence at collate time.
collator = DataCollatorWithPadding(tokenizer)

trainset = tokenized_datasets['train']
validset = tokenized_datasets['test']

# Training batches are reshuffled on every pass; validation order stays fixed.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collator)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collator)

2023-10-13 16:27:43.779737: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-13 16:27:43.812932: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Peek at the first validation batch to sanity-check the padded tensors.
next(iter(validloader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101, 3300, 2401,  ...,  676, 3517,  102],
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101, 1762, 2416,  ..., 2523, 1962,  102],
        ...,
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101, 1377,  809,  ...,    0,    0,    0],
        [ 101, 2600, 4638,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 1, 1

## Step5 创建模型及优化器

In [11]:
from torch.optim import Adam

# Load the rbt3 checkpoint with a sequence-classification head. The checkpoint
# carries no classifier weights, so the head is newly initialized and must be
# fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained('../../models/rbt3/')
# Move the model to the GPU when one is available; otherwise leave it where it was loaded.
if torch.cuda.is_available():
    model = model.cuda()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../../models/rbt3/ and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## Step6 训练与验证

In [14]:
def evaluate():
    """Compute classification accuracy of `model` over `validloader`.

    Switches the model to eval mode, runs every validation batch under
    `torch.inference_mode()`, and counts predictions that match the labels.

    Returns:
        The number of correct predictions divided by `len(validset)`.
    """
    model.eval()
    correct = 0

    with torch.inference_mode():
        for batch in validloader:
            # Batches are moved to the GPU under the same condition as the model.
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=-1)
            hits = predictions.long() == batch['labels'].long()
            correct += hits.float().sum()

    return correct / len(validset)


def train(epoch=3, log_step=100):
    """Fine-tune `model` on `trainloader` and report accuracy after each epoch.

    Args:
        epoch: Number of full passes over the training loader.
        log_step: The training loss is printed whenever the running step
            counter is a multiple of this value.
    """
    global_step = 0
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so re-enable train mode each epoch.
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()

            if global_step % log_step == 0:
                print(f'ep : {ep}, global_step: {global_step}, loss: {loss.item()}')
            global_step += 1

        acc = evaluate()
        print(f'ep: {ep}, acc: {acc}')

## Step7 模型训练

In [15]:
# Run fine-tuning with the defaults: 3 epochs, logging the loss every 100 steps.
train()

ep : 0, global_step: 0, loss: 0.7142688035964966
ep : 0, global_step: 100, loss: 0.2952375113964081
ep : 0, global_step: 200, loss: 0.31270188093185425
ep: 0, acc: 0.877734899520874
ep : 1, global_step: 300, loss: 0.18632763624191284
ep : 1, global_step: 400, loss: 0.22733277082443237
ep: 1, acc: 0.8803088665008545
ep : 2, global_step: 500, loss: 0.1526147425174713
ep : 2, global_step: 600, loss: 0.09638947248458862
ep: 2, acc: 0.8828828930854797


## Step8 模型预测

In [16]:
# Classify one sample sentence with the fine-tuned model.
sen = '我觉得这家酒店不错，饭很好吃！'
# Map class indices to human-readable labels (0 = negative review, 1 = positive review).
id2label = {0: '差评！', 1:'好评！'}
model.eval()

with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt')
    # The model is only moved to the GPU when CUDA is available, so the inputs
    # must follow the same rule; an unconditional .cuda() fails on CPU-only machines.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f'输入: {sen}\n模型预测结果: {id2label.get(pred.item())}')

输入: 我觉得这家酒店不错，饭很好吃！
模型预测结果: 好评！


In [17]:
from transformers import pipeline

# Attach the label names to the model config so the pipeline reports them
# instead of generic LABEL_0 / LABEL_1 identifiers.
model.config.id2label = id2label
# Use the first GPU when CUDA is available and fall back to the CPU (-1) otherwise;
# a hard-coded device=0 fails on machines without a GPU.
pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [18]:
# Classify the sample sentence through the pipeline; the result is a list with one label/score dict.
pipe(sen)

[{'label': '好评！', 'score': 0.9953296184539795}]