# 多项选择

In [1]:
import torch 
import evaluate

from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer, TrainingArguments, DefaultDataCollator

In [None]:
# Load the DatasetDict previously saved to disk at this relative path
# (the directory name suggests the C3 multiple-choice dataset — confirm).
dataset = DatasetDict.load_from_disk("../../datas/c3")

In [3]:
# Drop the "test" split so only the remaining splits are tokenized and used below
# (presumably the test split carries no usable answers — confirm).
# A default is supplied so that re-running this cell does not raise KeyError
# once the split has already been removed.
dataset.pop("test", None)

Dataset({
    features: ['id', 'context', 'question', 'choice', 'answer'],
    num_rows: 1625
})

In [4]:
# Load the tokenizer that belongs to the local checkpoint directory; the same
# directory is used later to load the model weights.
tokenizer = AutoTokenizer.from_pretrained("models/macbert-base")

In [5]:
def process_func(examples):
    """Convert a batch of multiple-choice examples into grouped model features.

    Every example is expanded into exactly four (context, "question choice")
    pairs. Examples with fewer than four choices are filled up with a
    placeholder choice. After tokenization the flat list of pairs is regrouped
    so that each feature holds four sequences per example.

    Args:
        examples: Batched mapping with the keys "context" (list of text
            fragments per example), "question", "choice" (list of candidate
            answers per example) and "answer" (the correct candidate's text).

    Returns:
        dict: One entry per tokenizer output key, each a list with one group of
        four sequences per example, plus "labels" holding the index of the
        correct choice for every example.

    Raises:
        ValueError: If an example has more than four choices, or if its answer
            is not one of its choices.
    """
    num_choices = 4       # every example is expanded to this many candidates
    max_length = 384
    filler_choice = "未知"  # placeholder used to fill up short choice lists

    context_list = []
    qa_list = []
    labels = []

    for fragments, question, choices, answer in zip(
        examples["context"],
        examples["question"],
        examples["choice"],
        examples["answer"],
    ):
        if len(choices) > num_choices:
            # Extra candidates would shift the fixed-size regrouping below and
            # corrupt every following example, so fail loudly instead.
            raise ValueError(
                f"expected at most {num_choices} choices, got {len(choices)}"
            )

        context = "\n".join(fragments)
        padded_choices = list(choices) + [filler_choice] * (num_choices - len(choices))

        # Build one (context, question + choice) pair per candidate.
        for choice in padded_choices:
            context_list.append(context)
            qa_list.append(question + " " + choice)

        # The label is the position of the answer among the real choices.
        labels.append(choices.index(answer))

    # Pad to a fixed length: with padding=True the length would depend on the
    # longest pair of the current map batch, so examples coming from different
    # map batches could not be stacked by a collator that does not pad.
    inputs = tokenizer(
        context_list,
        qa_list,
        max_length=max_length,
        padding="max_length",
        truncation="only_first")

    # Regroup the flat sequences: num_choices consecutive rows form one example.
    inputs = {
        key: [rows[i: i + num_choices] for i in range(0, len(rows), num_choices)]
        for key, rows in inputs.items()
    }

    inputs["labels"] = labels
    return inputs

In [6]:
# Apply process_func to every remaining split in batches. The original columns
# are removed so that only the values returned by process_func are kept.
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/11869 [00:00<?, ? examples/s]

Map:   0%|          | 0/3816 [00:00<?, ? examples/s]

In [7]:
# Load the checkpoint with a multiple-choice head. As the warning printed below
# states, the classifier weights are newly initialized and need fine-tuning.
model = AutoModelForMultipleChoice.from_pretrained("models/macbert-base")

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at models/macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the accuracy metric from a local metric script (path is relative to the
# notebook's working directory).
accuracy = evaluate.load("metric_accuracy.py")

def compute_metrics(pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    Args:
        pred: A pair ``(scores, labels)``. ``scores`` must support
            ``argmax(axis=-1)``; the index of the highest score along the last
            axis is taken as the predicted choice.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted indices and the
        reference labels.
    """
    scores, references = pred
    predicted_ids = scores.argmax(axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [11]:
# Fine-tuning configuration. Logging, evaluation and checkpointing all share
# the same 200-step interval; at most three checkpoints are kept on disk.
args = TrainingArguments(
    output_dir="trained/model_for_multichoice",
    # Run length, optimizer and precision.
    num_train_epochs=1,
    optim="adafactor",
    fp16=True,
    # Per-device batch sizes.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Logging / evaluation cadence.
    logging_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    # Checkpointing cadence and retention.
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
)

In [12]:
# Wire model, data and metric together. DefaultDataCollator is used as-is;
# presumably it relies on the features already having equal length — confirm.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DefaultDataCollator(),
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning with the configuration above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Accuracy
200,1.1177,0.951555,0.0061,0.57914


TrainOutput(global_step=371, training_loss=1.044719356732227, metrics={'train_runtime': 214.5942, 'train_samples_per_second': 55.309, 'train_steps_per_second': 1.729, 'total_flos': 9368511231366144.0, 'train_loss': 1.044719356732227, 'epoch': 1.0})

In [None]:
class MultiChoicePipeline:
    """Minimal inference helper for a multiple-choice model.

    Given a context, a question and a list of candidate answers, the pipeline
    tokenizes one (context, "question choice") pair per candidate, scores all
    pairs with the model in a single forward pass and returns the candidate
    with the highest score.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer and remember the model's device."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, context, question, choices):
        """Tokenize one (context, question + choice) pair per candidate.

        Args:
            context: The passage as a string, or a list/tuple of text
                fragments, which are joined with newlines.
            question: The question text.
            choices: Non-empty sequence of candidate answer strings.

        Returns:
            The tokenizer output with one row per candidate, as PyTorch tensors.

        Raises:
            ValueError: If ``choices`` is empty.
        """
        if not choices:
            raise ValueError("choices must contain at least one candidate")
        if isinstance(context, (list, tuple)):
            context = "\n".join(context)

        context_list = [context for _ in choices]
        qa_list = [question + " " + choice for choice in choices]

        # Use the tokenizer handed to this pipeline, not a notebook global.
        # Padding is required so that pairs of different token lengths can be
        # stacked into one tensor.
        return self.tokenizer(
            context_list,
            qa_list,
            max_length=384,
            truncation="only_first",
            padding=True,
            return_tensors="pt",
        )

    def predict(self, tokens):
        """Run the model on the tokenized candidates and return its logits."""
        inputs = {
            # [num_choice, seq_len] -> [batch_size, num_choice, seq_len] to match the model input
            k: v.unsqueeze(0).to(self.device)
            for k, v in tokens.items()
        }
        # Score in eval mode and without autograd, then restore the previous
        # mode so a model that is still being trained is left untouched.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                return self.model(**inputs).logits
        finally:
            self.model.train(was_training)

    def postprocess(self, logits, choices):
        """Map the highest-scoring position in ``logits`` back to its choice."""
        idx = logits.argmax(dim=-1).cpu().item()
        return choices[idx]

    def __call__(self, context, question, choices):
        """Return the candidate from ``choices`` that the model scores highest."""
        tokens = self.preprocess(context, question, choices)
        logits = self.predict(tokens)
        result = self.postprocess(logits, choices)
        return result

In [28]:
# Wrap the model and tokenizer objects from the cells above for ad-hoc inference.
pipe = MultiChoicePipeline(model, tokenizer)

In [29]:
# Smoke test: six candidates (including duplicates) are passed here, more than
# the four candidates per question that process_func builds for training.
pipe("小明在北京上班", "小明在哪里上班？", ["北京", "上海", "河北", "海南", "河北", "海南"])

'北京'