# 训练器

In [8]:
import torch
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [2]:
dataset = load_dataset("csv", data_files="../../datas/ChnSentiCorp_htl_all.csv", split="train")

dataset = dataset.filter(lambda x: x["review"] is not None)
dataset = dataset.train_test_split(0.1)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("./models/rbt3")

In [4]:
def process_func(examples):
    inputs = tokenizer(examples["review"], max_length=128, truncation=True, padding=True)
    inputs["labels"] = examples["label"]
    return inputs

In [5]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained("./models/rbt3")

if torch.cuda.is_available():
    model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
combine_metrics = evaluate.combine(["accuracy", "f1"])

def compute_metrics(pred):
    predictions, labels = pred
    print(type(predictions), type(labels))
    predictions = predictions.argmax(axis=-1)
    return combine_metrics.compute(
                references=labels, 
                predictions=predictions)

In [13]:
args = TrainingArguments(
    output_dir="./model_for_seqClass",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    learning_rate=2e-5,
    save_total_limit=3,
    metric_for_best_model="f1",
    load_best_model_at_end=True
)

In [14]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.262544,0.894466,0.925182
2,No log,0.260076,0.895753,0.926431
3,0.290800,0.257257,0.898327,0.927985


<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


TrainOutput(global_step=657, training_loss=0.2714890199890601, metrics={'train_runtime': 246.7385, 'train_samples_per_second': 84.964, 'train_steps_per_second': 2.663, 'total_flos': 351909933963264.0, 'train_loss': 0.2714890199890601, 'epoch': 3.0})

In [19]:
outputs = trainer.predict(tokenized_data["test"])

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [30]:
outputs

PredictionOutput(predictions=array([[ 0.96792436, -1.390158  ],
       [-3.0472317 ,  3.098472  ],
       [-2.2861655 ,  2.1577227 ],
       ...,
       [-3.082925  ,  3.0366619 ],
       [ 2.0107477 , -1.7506464 ],
       [-0.65538347,  0.32873866]], dtype=float32), label_ids=array([0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1

In [29]:
trainer.evaluate(tokenized_data["test"])

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


{'eval_loss': 0.2572573721408844,
 'eval_accuracy': 0.8983268983268984,
 'eval_f1': 0.9279854147675478,
 'eval_runtime': 3.1824,
 'eval_samples_per_second': 244.154,
 'eval_steps_per_second': 7.856,
 'epoch': 3.0}

In [31]:
from transformers import pipeline

model.config.id2label = {0: "差评", 1: "好评"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [36]:
pipe("这地方还行")

[{'label': '好评', 'score': 0.8957759737968445}]