# 基于Transformers的多项选择

## Step1 导入相关包

In [None]:
from datasets import DatasetDict
from transformers import AutoTokenizer,AutoModelForMultipleChoice,TrainingArguments,Trainer

## Step2 加载数据集

In [None]:
# Load the C3 dataset from its on-disk copy in ./c3.
c3 = DatasetDict.load_from_disk('./c3')
# NOTE(review): the original note only says that keeping the "test" split
# raises an error; presumably the later preprocessing cannot handle it — TODO confirm.
c3.pop("test") # otherwise an error is raised
c3

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

In [31]:
# Inspect the first training example to see the raw fields.
c3['train'][0]

{'id': 0,
 'context': ['男：你今天晚上有时间吗?我们一起去看电影吧?', '女：你喜欢恐怖片和爱情片，但是我喜欢喜剧片，科幻片一般。所以……'],
 'question': '女的最喜欢哪种电影?',
 'choice': ['恐怖片', '爱情片', '喜剧片', '科幻片'],
 'answer': '喜剧片'}

## Step3 数据集预处理

In [32]:
# Load the tokenizer from a local checkpoint directory.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the checkpoint before running.
tokenizer = AutoTokenizer.from_pretrained('D:/pretrained_model/models--hfl--chinese-macbert-base')
tokenizer

BertTokenizerFast(name_or_path='D:/pretrained_model/models--hfl--chinese-macbert-base', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [33]:
# Show the first training example again; the trailing comma makes the cell
# value a 1-tuple, which is why the output is wrapped in parentheses.
c3['train'][0], 

({'id': 0,
  'context': ['男：你今天晚上有时间吗?我们一起去看电影吧?', '女：你喜欢恐怖片和爱情片，但是我喜欢喜剧片，科幻片一般。所以……'],
  'question': '女的最喜欢哪种电影?',
  'choice': ['恐怖片', '爱情片', '喜剧片', '科幻片'],
  'answer': '喜剧片'},)

In [34]:
def process_function(examples, num_choices=4):
    """Turn a batch of examples into multiple-choice model inputs.

    Every example is expanded into ``num_choices`` text pairs of the form
    (context, question + choice). Examples with fewer choices are padded with
    a dummy ``none`` option so that each example yields the same number of
    pairs, which is what the grouping step at the end relies on.

    Args:
        examples: Batched mapping with the columns ``context`` (a list of
            strings per example), ``question``, ``choice`` (a list of
            candidate answers per example) and ``answer``.
        num_choices: Number of candidate slots per example. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        A dict in which every tokenizer output field holds, per example, a
        list of ``num_choices`` encoded sequences, plus ``labels`` with the
        index of the answer inside the example's own choice list.

    Raises:
        ValueError: If an example has more than ``num_choices`` choices (the
            grouping below would otherwise silently mix choices of different
            examples), or if its answer is not one of its choices.
    """
    contents = []
    question_choices = []
    labels = []
    columns = zip(examples['context'], examples['question'], examples['choice'], examples['answer'])
    for idx, (context, question, choices, answer) in enumerate(columns):
        if len(choices) > num_choices:
            raise ValueError(
                f"example {idx} has {len(choices)} choices, "
                f"but at most {num_choices} are supported"
            )

        # The context is a list of strings; join it into one passage.
        content = "\n".join(context)

        # One (context, question:choice) pair per real choice.
        for choice in choices:
            contents.append(content)
            question_choices.append(question + ':' + choice)
        # Pad with dummy options up to num_choices (no-op when already full).
        for _ in range(num_choices - len(choices)):
            contents.append(content)
            question_choices.append(question + ' ' + 'none')

        labels.append(choices.index(answer))

    # The tokenizer accepts parallel lists for the first and second segment.
    # Only the context (first segment) is truncated, so the question and the
    # choice are always kept intact.
    tokenized_examples = tokenizer(contents, question_choices, truncation='only_first', max_length=256, padding='max_length')
    # Regroup the flat list of pairs into chunks of num_choices, because a
    # multiple-choice example is expected in the layout:
    #   question + choice 1
    #   question + choice 2
    #   ...
    #   question + choice N
    tokenized_examples = {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }
    tokenized_examples['labels'] = labels
    return tokenized_examples

In [35]:
# Smoke-test the preprocessing on the first 10 training examples.
res = c3['train'].select(range(10)).map(process_function, batched=True)
res

Dataset({
    features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

In [36]:
import numpy as np
# Check the array shape of the encoded inputs for the 10-example sample.
np.array(res['input_ids']).shape

(10, 4, 256)

In [37]:
# Apply the preprocessing to every split that is still in the DatasetDict.
tokenized_c3 = c3.map(function=process_function, batched=True)
tokenized_c3

Map:   0%|          | 0/11869 [00:00<?, ? examples/s]

Map:   0%|          | 0/3816 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3816
    })
})

## Step4 创建模型

In [39]:
# Load the multiple-choice model from the same local checkpoint directory as
# the tokenizer. The recorded warning shows that the classifier weights are
# newly initialized, so the model has to be fine-tuned before it is useful.
model = AutoModelForMultipleChoice.from_pretrained('D:/pretrained_model/models--hfl--chinese-macbert-base')

  return self.fget.__get__(instance, owner)()
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at D:/pretrained_model/models--hfl--chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step5 创建评估函数

In [44]:
import evaluate

# Load the accuracy metric; compute_metric refers to it by this exact name.
# NOTE(review): the recorded output of this cell is a FileNotFoundError (the
# metric script was found neither locally nor on the Hub), so this call needs
# network access or a local copy of the metric script.
accurcy = evaluate.load('accuracy')

def compute_metric(pred):
    """Compute accuracy from a (predictions, labels) pair.

    Args:
        pred: A two-item sequence that unpacks into the raw predictions and
            the reference labels. Presumably the predictions are per-choice
            scores with the choices on the last axis — TODO confirm.

    Returns:
        Whatever the accuracy metric's ``compute`` call returns.
    """
    predictions, label = pred
    # NumPy's argmax takes ``axis``; ``dim`` is the PyTorch spelling and makes
    # np.argmax raise a TypeError.
    predictions = np.argmax(predictions, axis=-1)
    return accurcy.compute(predictions=predictions, references=label)

FileNotFoundError: Couldn't find a module script at d:\AI\NLP\LLM-RoadMap\NLP_tasks\accuracy\accuracy.py. Module 'accuracy' doesn't exist on the Hugging Face Hub either.

## Step6 配置训练参数

In [None]:
# Training configuration for the fine-tuning run.
args = TrainingArguments(
    output_dir="./muliple_choice",  # output directory for the run
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    logging_steps=50,
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # save on the same schedule as evaluation
    # NOTE(review): no metric_for_best_model is set, so the library default
    # decides which checkpoint counts as "best" — confirm this is intended.
    load_best_model_at_end=True,
    # NOTE(review): presumably requires a GPU that supports half precision — TODO confirm.
    fp16=True
)

## Step7 创建训练器

In [None]:
# Wire the model, the training arguments, the tokenized train/validation
# splits, the tokenizer and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_c3['train'],
    eval_dataset=tokenized_c3['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
    )

## Step8 模型训练

In [None]:
# Start fine-tuning with the trainer configured in the previous step.
trainer.train()

## Step9 模型预测

In [45]:
from typing import Any
import torch

class MultipleChoicePipeline:
    """Minimal inference wrapper around a multiple-choice model.

    Calling an instance with a context, a question and a list of candidate
    answers returns the candidate that receives the highest logit.
    """

    def __init__(self, model, tokenizer) -> None:
        """Store the model and tokenizer; inputs are later moved to ``model.device``."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, context, quesiton, choices):
        """Encode one (context, question + choice) pair per candidate.

        Args:
            context: The passage as a single string, or an iterable of
                strings which is joined with newlines.
            quesiton: The question text (parameter name kept as-is for
                backward compatibility).
            choices: Non-empty sequence of candidate answer strings.

        Returns:
            The tokenizer output with PyTorch tensors, one row per choice.

        Raises:
            ValueError: If ``choices`` is empty.
        """
        if not choices:
            raise ValueError("choices must contain at least one candidate answer")
        if not isinstance(context, str):
            context = "\n".join(context)

        cs = [context] * len(choices)
        qs = [quesiton + ':' + ch for ch in choices]
        # return_tensors='pt' is required: predict() calls tensor methods
        # (unsqueeze / to) on every field, which plain Python lists lack.
        return self.tokenizer(
            cs,
            qs,
            truncation='only_first',
            max_length=256,
            padding='max_length',
            return_tensors='pt',
        )

    def predict(self, inputs):
        """Run the model on the encoded choices and return its logits."""
        # Add a leading batch dimension of size 1 in front of the choice rows.
        inputs = {k: v.unsqueeze(0).to(self.device) for k, v in inputs.items()}
        # Inference only: switch to eval mode and skip gradient tracking.
        self.model.eval()
        with torch.no_grad():
            return self.model(**inputs).logits

    def postprocess(self, logits, choices):
        """Map the highest-scoring index along the last axis back to its choice."""
        prediction = torch.argmax(logits, dim=-1).cpu().item()
        return choices[prediction]

    def __call__(self, context, question, choices) -> Any:
        """Return the element of ``choices`` that the model scores highest."""
        inputs = self.preprocess(context, question, choices)
        logits = self.predict(inputs)
        result = self.postprocess(logits, choices)
        return result

In [46]:
# Bind the pipeline to `pipe`, the name used by the demo call in the next cell
# (it was previously bound to `pipp`, so that call raised a NameError).
pipe = MultipleChoicePipeline(model, tokenizer)

In [None]:
# Demo call: one context, one question and six candidate answers, two of
# which are repeated.
pipe("小明在北京上班", "小明在哪里上班？", ["北京", "上海", "河北", "海南", "河北", "海南"])