# Text classification

### Import libraries

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

### Import data

In [2]:
# load_dataset places every row of a plain CSV in a split named "train";
# passing split = "train" returns that Dataset directly.
# Rows whose review text is missing are dropped right away.
dataset = load_dataset(
    "csv",
    data_files = "./ChnSentiCorp_htl_all.csv",
    split = "train",
).filter(lambda example: example["review"] is not None)

In [3]:
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [4]:
dataset[:5]

{'label': [1, 1, 1, 1, 1],
 'review': ['距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.',
  '商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!',
  '早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。',
  '宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~',
  'CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风']}

In [5]:
len(dataset)

7765

### Split data

In [6]:
# Hold out 10% of the reviews for evaluation. The fixed seed makes the shuffle
# reproducible, so the same rows land in train/test after every kernel restart
# and metrics stay comparable between runs.
datasets = dataset.train_test_split(test_size = 0.1, seed = 42)

In [7]:
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

### Preprocess data

In [8]:
# Load the tokenizer published with the "hfl/rbt3" checkpoint (the same
# checkpoint name is used for the model further down).
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

In [9]:
# # For those who cannot access Hugging Face, you can use ModelScope as a substitute
# from modelscope.hub.snapshot_download import snapshot_download
# snapshot_download(model_id = "dienstag/rbt3", cache_dir = "./models")
# tokenizer = AutoTokenizer.from_pretrained("./models/dienstag/rbt3")

In [10]:
tokenizer

BertTokenizerFast(name_or_path='hfl/rbt3', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [11]:
def process_function(examples):
    """Tokenize a batch of reviews and attach the matching labels.

    Args:
        examples: A batch from the dataset, indexable by column name; the
            "review" and "label" columns are read.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens, not padded) with an extra "labels" entry copied from the
        batch's "label" column.
    """
    encoded = tokenizer(examples["review"], truncation = True, max_length = 128)
    encoded["labels"] = examples["label"]
    return encoded

In [12]:
datasets["train"].column_names

['label', 'review']

In [13]:
# Tokenize both splits in batches. The original columns are removed so that
# only the fields produced by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched = True,
    remove_columns = datasets["train"].column_names,
)

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [15]:
print(tokenized_datasets["test"][1])

{'input_ids': [101, 6421, 3818, 2421, 6392, 3177, 679, 1059, 684, 679, 3815, 1112, 117, 6375, 2145, 2787, 2697, 6230, 679, 1922, 3140, 3123, 2552, 886, 4500, 117, 684, 3302, 1218, 782, 1447, 2578, 2428, 679, 3221, 1922, 1962, 117, 3193, 677, 1057, 857, 2145, 782, 3119, 2896, 2791, 7313, 4638, 6862, 2428, 1922, 2714, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 0}


### Create model

In [16]:
# Load "hfl/rbt3" with a sequence-classification head. The head's weights
# (classifier.weight / classifier.bias) are not part of the checkpoint and are
# freshly initialized, so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The line above means “Load the base model (rbt3) and attach a classification head suitable for sequence-level classification tasks.”

In [17]:
# Re-pack every parameter's storage into a contiguous memory layout.
for weight in model.parameters():
    weight.data = weight.data.contiguous()

In PyTorch, tensors can be non-contiguous after operations like `.transpose()`, `.permute()`, or slicing. This means their memory layout isn't stored in a single, linear block — which can slow down certain operations or cause issues with low-level libraries (like custom CUDA kernels or ONNX export).

Calling `.contiguous()` ensures the tensor is laid out in memory in a standard, row-major format.

### Set up evaluation function

In [18]:
import evaluate

In [19]:
# Metric objects used by eval_metric below; both are fetched by name.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

These metrics are downloaded from Hugging Face. If you cannot access it, download the metrics folder (https://github.com/huggingface/evaluate/tree/main/metrics) to your local drive and run the code below instead.

In [20]:
# acc_metric = evaluate.load("./metrics/accuracy")
# f1_metric = evaluate.load("./metrics/f1")

In [21]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores and is reduced to class ids with an arg-max over
            the last axis.

    Returns:
        A single dictionary containing the entries of the accuracy result
        followed by the entries of the F1 result.
    """
    scores, references = eval_predict
    predicted_ids = scores.argmax(axis = -1)
    accuracy_result = acc_metric.compute(predictions = predicted_ids, references = references)
    f1_result = f1_metric.compute(predictions = predicted_ids, references = references)
    # Merge the two result dictionaries so both metrics are reported together.
    return {**accuracy_result, **f1_result}
    

### Configure training params

In [29]:
# Training configuration: evaluate and checkpoint once per epoch, writing
# checkpoints to ./checkpoints, and reload the best-F1 checkpoint when
# training finishes.
args = TrainingArguments(
    output_dir = "./checkpoints",
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 128,
    logging_steps = 10,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    # Only the 3 most recent checkpoints are kept. The best model is still
    # preserved, even if it is not among those 3, as long as
    # load_best_model_at_end = True is set.
    save_total_limit = 3,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    metric_for_best_model = "f1",
    load_best_model_at_end = True,
)

"\nthe best model will be preserved, even if it's not among the most recent 3 checkpoints\nas long as load_best_model_at_end=True is set.\n"

In [30]:
args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp1

### Create trainer

In [31]:
from transformers import DataCollatorWithPadding

# process_function tokenizes without padding, so padding is left to the
# collator that assembles each batch.
padding_collator = DataCollatorWithPadding(tokenizer = tokenizer)

trainer = Trainer(
    model = model,
    args = args,
    data_collator = padding_collator,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["test"],
    compute_metrics = eval_metric,
)

### Train the model

In [32]:
# Fine-tune the model. Evaluation and checkpointing run once per epoch, as
# configured in args (eval_strategy / save_strategy = "epoch").
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2702,0.297871,0.876448,0.909605
2,0.2424,0.279889,0.890605,0.919278
3,0.2648,0.291758,0.888031,0.918463


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=330, training_loss=0.29046227715232154, metrics={'train_runtime': 230.634, 'train_samples_per_second': 90.897, 'train_steps_per_second': 1.431, 'total_flos': 351909933963264.0, 'train_loss': 0.29046227715232154, 'epoch': 3.0})

### Evaluate the model

In [33]:
# Evaluate on the held-out split. In the recorded run the loss / accuracy / F1
# shown below equal the epoch-2 row of the training log rather than the last
# epoch, which is consistent with load_best_model_at_end = True having
# restored the best-F1 checkpoint.
trainer.evaluate(tokenized_datasets["test"])

  return forward_call(*args, **kwargs)


{'eval_loss': 0.27988937497138977,
 'eval_accuracy': 0.8906048906048906,
 'eval_f1': 0.9192782526115859,
 'eval_runtime': 2.3037,
 'eval_samples_per_second': 337.277,
 'eval_steps_per_second': 3.039,
 'epoch': 3.0}

### Use the model to predict

In [34]:
# Returns a PredictionOutput: `predictions` holds the raw per-class scores
# (one row per example, not yet arg-maxed) and `label_ids` the reference labels.
trainer.predict(tokenized_datasets["test"])

  return forward_call(*args, **kwargs)


PredictionOutput(predictions=array([[ 1.6808631 , -2.2803416 ],
       [ 1.1263554 , -1.0340532 ],
       [-2.2555542 ,  2.5828536 ],
       ...,
       [-0.15238175,  0.8892967 ],
       [-1.9123695 ,  2.2060564 ],
       [-1.9547391 ,  2.4958074 ]], dtype=float32), label_ids=array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0

### Predict with single sample

In [35]:
import torch

In [41]:
# "The zhajiang noodles at this noodle shop are okay; the sauce smells great."
sen = "这个面馆的炸酱面还可以，酱料很香。"
ids_label = {0: "negative", 1: "positive"}

model.eval()
with torch.inference_mode():
    # Truncate exactly as in process_function so that inference inputs match
    # what the model saw during fine-tuning.
    inputs = tokenizer(sen, return_tensors = "pt", truncation = True, max_length = 128)
    # Move the inputs to whatever device the model lives on instead of
    # hard-coding "mps", which only exists on Apple-silicon machines.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim = -1)
    print(f"Input: {sen}\nPredicted result: {ids_label.get(pred.item())}")

Input: 这个面馆的炸酱面还可以，酱料很香。
Predicted result: positive


### Use Hugging Face pipeline to predict

In [43]:
from transformers import pipeline

In [44]:
# Attach human-readable label names to the model config so the pipeline
# reports "negative"/"positive" instead of generic label names. label2id is
# set as the exact inverse so the two mappings in the config stay consistent.
model.config.id2label = ids_label
model.config.label2id = {label: idx for idx, label in ids_label.items()}
pipe = pipeline("text-classification", model = model, tokenizer = tokenizer)

Device set to use mps:0


In [48]:
# "The zhajiang noodles at this noodle shop are okay, but the sauce is a bit salty."
sen = "这个面馆的炸酱面还可以，但是酱有点咸。"
pipe(sen)

[{'label': 'positive', 'score': 0.9410847425460815}]

In [50]:
# "The zhajiang noodles at this noodle shop are okay, but the sauce is a bit
# salty and I don't like it very much."
sen = "这个面馆的炸酱面还可以，但是酱有点咸，我不怎么喜欢。"
pipe(sen)

[{'label': 'positive', 'score': 0.8703807592391968}]

In [51]:
# "The zhajiang noodles at this noodle shop are a bit salty; I don't like them very much."
sen = "这个面馆的炸酱面有点咸，我不怎么喜欢。"
pipe(sen)

[{'label': 'positive', 'score': 0.6036363840103149}]

In [52]:
# "The zhajiang noodles at this noodle shop are a bit salty; I don't like them."
sen = "这个面馆的炸酱面有点咸，我不喜欢。"
pipe(sen)

[{'label': 'positive', 'score': 0.6536489129066467}]

In [53]:
# "I don't like the zhajiang noodles at this noodle shop."
sen = "这个面馆的炸酱面我不喜欢。"
pipe(sen)

[{'label': 'positive', 'score': 0.5502287745475769}]

In [54]:
# "The zhajiang noodles at this noodle shop taste terrible."
sen = "这个面馆的炸酱面太难吃了。"
pipe(sen)

[{'label': 'positive', 'score': 0.5748376250267029}]

In [55]:
# "The zhajiang noodles at this noodle shop taste terrible - bad review!"
sen = "这个面馆的炸酱面太难吃了，差评！"
pipe(sen)

[{'label': 'negative', 'score': 0.8792062401771545}]