# 文本分类实例

## Step1 导入相关包

In [5]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments
    )
from datasets import load_dataset

## Step2 加载数据集

In [6]:
# Read the ChnSentiCorp CSV as a single 'train' split, then drop every row
# whose review text is missing so the tokenizer never receives None.
dataset = load_dataset(
    'csv',
    data_files='./ChnSentiCorp_htl_all.csv',
    split='train',
).filter(lambda row: row['review'] is not None)
dataset



Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集

In [7]:
# Hold out 10% of the reviews for evaluation. The seed is fixed so that the
# split (and therefore every metric reported below) is reproducible after a
# kernel restart instead of changing on every run.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 数据集预处理

In [8]:
import torch

# Load the tokenizer that belongs to the 'hfl/rbt3' checkpoint (the same
# checkpoint name is used for the model below).
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch mapping with a 'review' entry (texts passed to the
            tokenizer) and a 'label' entry (copied through unchanged).

    Returns:
        The tokenizer output for the reviews, truncated to at most 128
        tokens, with the batch's labels stored under the 'labels' key.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded

# Tokenize both splits in batches. The original columns are removed so that
# only the fields produced by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets['train'].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 10107.39 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 8289.62 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 创建模型

In [9]:
# Load the 'hfl/rbt3' checkpoint with a sequence-classification head. As the
# warning printed below notes, the classifier weights are newly initialized,
# so the model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Display the configuration of the loaded model for inspection.
model.config

BertConfig {
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

## Step6 创建评估函数

In [11]:
import evaluate

# Load the accuracy and F1 metrics once; eval_metric below calls their
# compute() method on every evaluation pass.
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')



In [12]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A (predictions, labels) pair. The predictions are
            reduced to class ids with an argmax over the last axis.

    Returns:
        A single dict containing the entries returned by the accuracy
        metric followed by the entries returned by the F1 metric.
    """
    scores_per_class, references = eval_predict
    predicted_ids = scores_per_class.argmax(axis=-1)

    results = {}
    for metric in (acc_metric, f1_metric):
        results.update(
            metric.compute(predictions=predicted_ids, references=references)
        )
    return results

## Step7 创建TrainingArguments

In [13]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best F1 score when training finishes.
train_args = TrainingArguments(
    output_dir='./checkpoints',        # directory where checkpoints are written
    per_device_train_batch_size=64,    # per-device batch size for training
    per_device_eval_batch_size=128,    # per-device batch size for evaluation
    logging_steps=10,                  # how often (in steps) the loss is logged
    evaluation_strategy='epoch',       # evaluate at the end of every epoch
    save_strategy='epoch',             # save a checkpoint at the end of every epoch
    save_total_limit=3,                # maximum number of checkpoints kept
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    metric_for_best_model='f1',        # metric used to pick the best checkpoint
    load_best_model_at_end=True,       # load the best checkpoint after training
)

train_args


TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_mod

## Step8 创建Trainer

In [14]:
from transformers import DataCollatorWithPadding

# Assemble the Trainer from the model, the training arguments, both tokenized
# splits and the metric callback. The examples were tokenized without padding,
# so DataCollatorWithPadding pads them when batches are built.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

## Step9 模型训练

In [15]:
# Run fine-tuning. The log below shows the loss every 10 steps and the
# evaluation metrics at the end of each epoch; the cell returns a TrainOutput.
trainer.train()

  0%|          | 0/330 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  3%|▎         | 10/330 [00:47<24:49,  4.65s/it]

{'loss': 0.6571, 'learning_rate': 1.9393939393939395e-05, 'epoch': 0.09}


  6%|▌         | 20/330 [01:31<23:53,  4.62s/it]

{'loss': 0.5953, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.18}


  9%|▉         | 30/330 [02:14<21:30,  4.30s/it]

{'loss': 0.4899, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.27}


 12%|█▏        | 40/330 [02:57<20:48,  4.31s/it]

{'loss': 0.4651, 'learning_rate': 1.7575757575757576e-05, 'epoch': 0.36}


 15%|█▌        | 50/330 [03:40<20:25,  4.38s/it]

{'loss': 0.4074, 'learning_rate': 1.6969696969696972e-05, 'epoch': 0.45}


 18%|█▊        | 60/330 [04:23<19:07,  4.25s/it]

{'loss': 0.3678, 'learning_rate': 1.6363636363636366e-05, 'epoch': 0.55}


 21%|██        | 70/330 [05:06<18:25,  4.25s/it]

{'loss': 0.3496, 'learning_rate': 1.575757575757576e-05, 'epoch': 0.64}


 24%|██▍       | 80/330 [05:49<17:51,  4.29s/it]

{'loss': 0.3233, 'learning_rate': 1.5151515151515153e-05, 'epoch': 0.73}


 27%|██▋       | 90/330 [06:32<17:19,  4.33s/it]

{'loss': 0.2849, 'learning_rate': 1.4545454545454546e-05, 'epoch': 0.82}


 30%|███       | 100/330 [07:16<17:03,  4.45s/it]

{'loss': 0.3212, 'learning_rate': 1.3939393939393942e-05, 'epoch': 0.91}


 33%|███▎      | 110/330 [07:58<12:58,  3.54s/it]

{'loss': 0.3232, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


                                                 
 33%|███▎      | 110/330 [08:18<12:58,  3.54s/it]

{'eval_loss': 0.32232218980789185, 'eval_accuracy': 0.8738738738738738, 'eval_f1': 0.9080675422138836, 'eval_runtime': 19.6694, 'eval_samples_per_second': 39.503, 'eval_steps_per_second': 0.356, 'epoch': 1.0}


 36%|███▋      | 120/330 [09:07<17:43,  5.06s/it]

{'loss': 0.2908, 'learning_rate': 1.2727272727272728e-05, 'epoch': 1.09}


 39%|███▉      | 130/330 [09:55<16:01,  4.81s/it]

{'loss': 0.2328, 'learning_rate': 1.2121212121212122e-05, 'epoch': 1.18}


 42%|████▏     | 140/330 [10:42<14:55,  4.71s/it]

{'loss': 0.2781, 'learning_rate': 1.1515151515151517e-05, 'epoch': 1.27}


 45%|████▌     | 150/330 [11:30<14:23,  4.80s/it]

{'loss': 0.2759, 'learning_rate': 1.0909090909090909e-05, 'epoch': 1.36}


 48%|████▊     | 160/330 [12:18<13:34,  4.79s/it]

{'loss': 0.2574, 'learning_rate': 1.0303030303030304e-05, 'epoch': 1.45}


 52%|█████▏    | 170/330 [13:06<12:38,  4.74s/it]

{'loss': 0.2684, 'learning_rate': 9.696969696969698e-06, 'epoch': 1.55}


 55%|█████▍    | 180/330 [13:53<11:48,  4.72s/it]

{'loss': 0.2838, 'learning_rate': 9.090909090909091e-06, 'epoch': 1.64}


 58%|█████▊    | 190/330 [14:41<11:06,  4.76s/it]

{'loss': 0.28, 'learning_rate': 8.484848484848486e-06, 'epoch': 1.73}


 61%|██████    | 200/330 [15:29<10:27,  4.83s/it]

{'loss': 0.2672, 'learning_rate': 7.87878787878788e-06, 'epoch': 1.82}


 64%|██████▎   | 210/330 [16:16<09:28,  4.74s/it]

{'loss': 0.2664, 'learning_rate': 7.272727272727273e-06, 'epoch': 1.91}


 67%|██████▋   | 220/330 [17:00<06:40,  3.64s/it]

{'loss': 0.2355, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


                                                 
 67%|██████▋   | 220/330 [17:20<06:40,  3.64s/it]

{'eval_loss': 0.2924743890762329, 'eval_accuracy': 0.8777348777348777, 'eval_f1': 0.9085659287776707, 'eval_runtime': 19.5457, 'eval_samples_per_second': 39.753, 'eval_steps_per_second': 0.358, 'epoch': 2.0}


 70%|██████▉   | 230/330 [18:09<08:16,  4.96s/it]

{'loss': 0.1986, 'learning_rate': 6.060606060606061e-06, 'epoch': 2.09}


 73%|███████▎  | 240/330 [18:57<07:23,  4.92s/it]

{'loss': 0.2616, 'learning_rate': 5.4545454545454545e-06, 'epoch': 2.18}


 76%|███████▌  | 250/330 [19:45<06:23,  4.80s/it]

{'loss': 0.2283, 'learning_rate': 4.848484848484849e-06, 'epoch': 2.27}


 79%|███████▉  | 260/330 [20:33<05:34,  4.78s/it]

{'loss': 0.2156, 'learning_rate': 4.242424242424243e-06, 'epoch': 2.36}


 82%|████████▏ | 270/330 [21:20<04:45,  4.76s/it]

{'loss': 0.2185, 'learning_rate': 3.6363636363636366e-06, 'epoch': 2.45}


 85%|████████▍ | 280/330 [22:07<03:56,  4.72s/it]

{'loss': 0.2456, 'learning_rate': 3.0303030303030305e-06, 'epoch': 2.55}


 88%|████████▊ | 290/330 [22:55<03:10,  4.76s/it]

{'loss': 0.2629, 'learning_rate': 2.4242424242424244e-06, 'epoch': 2.64}


 91%|█████████ | 300/330 [23:42<02:21,  4.72s/it]

{'loss': 0.2278, 'learning_rate': 1.8181818181818183e-06, 'epoch': 2.73}


 94%|█████████▍| 310/330 [24:30<01:35,  4.80s/it]

{'loss': 0.2609, 'learning_rate': 1.2121212121212122e-06, 'epoch': 2.82}


 97%|█████████▋| 320/330 [25:18<00:47,  4.78s/it]

{'loss': 0.2362, 'learning_rate': 6.060606060606061e-07, 'epoch': 2.91}


100%|██████████| 330/330 [26:02<00:00,  3.66s/it]

{'loss': 0.2314, 'learning_rate': 0.0, 'epoch': 3.0}


                                                 
100%|██████████| 330/330 [26:22<00:00,  3.66s/it]

{'eval_loss': 0.300610214471817, 'eval_accuracy': 0.8777348777348777, 'eval_f1': 0.909437559580553, 'eval_runtime': 19.8658, 'eval_samples_per_second': 39.112, 'eval_steps_per_second': 0.352, 'epoch': 3.0}


100%|██████████| 330/330 [26:23<00:00,  4.80s/it]

{'train_runtime': 1583.7008, 'train_samples_per_second': 13.237, 'train_steps_per_second': 0.208, 'train_loss': 0.3063258080771475, 'epoch': 3.0}





TrainOutput(global_step=330, training_loss=0.3063258080771475, metrics={'train_runtime': 1583.7008, 'train_samples_per_second': 13.237, 'train_steps_per_second': 0.208, 'train_loss': 0.3063258080771475, 'epoch': 3.0})

## Step10 模型评估

In [16]:
# Evaluate the trained model on the held-out split; returns a dict of
# 'eval_*' metrics (loss, accuracy, F1 and timing figures).
trainer.evaluate(tokenized_datasets['test'])

100%|██████████| 7/7 [00:16<00:00,  2.35s/it]


{'eval_loss': 0.300610214471817,
 'eval_accuracy': 0.8777348777348777,
 'eval_f1': 0.909437559580553,
 'eval_runtime': 19.6255,
 'eval_samples_per_second': 39.591,
 'eval_steps_per_second': 0.357,
 'epoch': 3.0}

## Step11 模型预测

In [17]:
# Run prediction on the held-out split; returns a PredictionOutput whose
# `predictions` hold one score per class for each example and whose
# `label_ids` hold the reference labels.
trainer.predict(tokenized_datasets['test'])

100%|██████████| 7/7 [00:16<00:00,  2.35s/it]


PredictionOutput(predictions=array([[-2.0883915,  2.6796248],
       [-1.3472697,  1.9983712],
       [-1.4260185,  1.2395971],
       ...,
       [-2.041272 ,  2.9313884],
       [-2.0050244,  2.3691764],
       [-2.357346 ,  3.2465975]], dtype=float32), label_ids=array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
    

In [19]:
# Classify a single sentence by hand with the fine-tuned model.
sen = "我觉得这家酒店不错，饭很好吃！"
# Maps the predicted class id to the text shown to the user.
id2_label = {0: "差评！", 1: "好评！"}
model.eval()  # switch off dropout for inference
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move every input tensor to the device the model lives on. The Trainer
    # places the model on an accelerator when one is available, and feeding
    # it CPU tensors would then fail with a device-mismatch error.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    # The index of the largest logit is the predicted class id.
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [20]:
from transformers import pipeline

# Store the id -> text mapping on the model config so the pipeline output
# carries the readable label (see the result of pipe(sen) below).
model.config.id2label = id2_label
# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline.
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

In [21]:
# Classify the example sentence; returns a list with one dict holding the
# predicted 'label' and its 'score'.
pipe(sen)

[{'label': '好评！', 'score': 0.9950695037841797}]