# 文本分类任务

降低显存消耗的办法

1. TrainingArguments打开checkout_pointing
2. 采用小批次，使用梯度累加
3. 改用占现存小的优化器
4. 冻结部分参数
5. 调整输入长度

In [14]:
import torch
import evaluate

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [6]:
dataset = load_dataset("csv", data_files="../../datas/ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda x: x["review"] is not None)
dataset = dataset.train_test_split(0.1)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("models/rbt3")

In [8]:
def process_func(examples):
    inputs = tokenizer(examples["review"], max_length=128, truncation=True, padding=True)
    inputs["labels"] = examples["label"]
    return inputs

In [9]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("./models/rbt3")

if torch.cuda.is_available():
    model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
combined_metrics = evaluate.combine(["accuracy", "f1"])

def compute_metrics(pred):
    predictions, labels = pred
    predictions = predictions.argmax(axis=-1)
    return combined_metrics.compute(
        predictions=predictions, references=labels)

In [13]:
args = TrainingArguments(
    output_dir="./trained/model_for_seqClass",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    save_total_limit=3,
    save_strategy="epoch",
    eval_strategy="epoch",
    weight_decay=0.001,
    logging_steps=50,
    learning_rate=2e-5,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    optim="adafactor"
)

In [15]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer)
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3337,0.316232,0.862291,0.894581
2,0.2635,0.257256,0.903475,0.930362
3,0.2354,0.248452,0.895753,0.923944


TrainOutput(global_step=330, training_loss=0.3053307432116884, metrics={'train_runtime': 253.009, 'train_samples_per_second': 82.859, 'train_steps_per_second': 1.304, 'total_flos': 351909933963264.0, 'train_loss': 0.3053307432116884, 'epoch': 3.0})

In [17]:
from transformers import pipeline

model.config.id2label = {0: "差评", 1: "好评"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [18]:
pipe("这地方有点差劲")

[{'label': '差评', 'score': 0.8530417680740356}]