# 基于Transformers的阅读理解实现-滑窗处理

## step1 导入包

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator

## step2 加载数据集

In [None]:
# Load the `cmrc2018` dataset, using ./data as the local cache directory.
datasets = load_dataset('cmrc2018', cache_dir='./data')
# Notebook display of the loaded object.
datasets

In [None]:
# Print the first training example to inspect its fields.
print(datasets['train'][0])

## step3 数据集预处理

In [None]:
# Load the tokenizer from a local copy of the chinese-macbert-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('D:/pretrained_model/models--hfl--chinese-macbert-base')

In [None]:
# Keep only the first five training examples as a small sample for the
# tokenization experiments in the following cells.
sample_dataset = datasets['train'].select(range(5))
# Notebook display of the sample.
sample_dataset

In [None]:
# Tokenize the sample as (question, context) pairs with a sliding window:
# only the context (second sequence) is truncated, the overflow is returned as
# extra windows that share `stride` tokens with their neighbour, and every
# window is padded to `max_length`. Character offsets are returned as well so
# tokens can be mapped back to the context text.
tokenizer_examples = tokenizer(
    text=sample_dataset['question'],
    text_pair=sample_dataset['context'],
    truncation='only_second',
    max_length=512,
    stride=128,
    padding='max_length',
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

In [None]:
# Show, for every window produced by the tokenizer, the index of the sample
# it was cut from (this is the value later used to index `sample_dataset`).
tokenizer_examples['overflow_to_sample_mapping']

In [None]:
# Show the per-token offsets of every window; for context tokens these are
# the character positions later used to slice the answer out of the context.
tokenizer_examples['offset_mapping']

In [None]:
# Display the window -> sample index list (must run before the cell that
# pops this key from the encoding).
tokenizer_examples['overflow_to_sample_mapping']

In [None]:
# Decode every window back to text and print it, one window per line, to see
# how the context was split.
decoded_windows = tokenizer.batch_decode(tokenizer_examples['input_ids'])
for sen in decoded_windows:
    print(sen)

In [None]:
# Remove the window -> sample index list from the encoding and keep it in its
# own variable; after this the key is no longer present in `tokenizer_examples`.
sample_mapping = tokenizer_examples.pop("overflow_to_sample_mapping")

In [None]:
# Display the first sample for reference while checking the answer positions.
sample_dataset[0]

In [None]:
# Locate the answer inside every sliding window of the sample, including the
# case where the window boundary cuts the answer off.

# Read the columns once instead of re-reading them for every window.
sample_answers = sample_dataset['answers']
offset_mappings = tokenizer_examples["offset_mapping"]

for idx, sample_idx in enumerate(sample_mapping):
    answer = sample_answers[sample_idx]
    start_char = answer['answer_start'][0]
    end_char = start_char + len(answer['text'][0])  # exclusive end

    # sequence_ids(idx) tells, token by token, which input sequence the token
    # belongs to (None for special/padding tokens, 0 for the question, 1 for
    # the context). The context span is found first and then narrowed from
    # both sides towards the answer.
    sequence_ids = tokenizer_examples.sequence_ids(idx)
    context_start = sequence_ids.index(1)
    # First None at or after context_start closes the context span.
    context_end = sequence_ids.index(None, context_start) - 1

    offset = offset_mappings[idx]

    # The answer does not overlap this window at all. Character spans are
    # end-exclusive, so a window that merely touches the answer boundary
    # (equality) contains none of its characters either.
    if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
        start_token_pos = 0
        end_token_pos = 0
    else:
        # Move right from the start of the context until a token starts at or
        # after the first answer character.
        token_id = context_start
        while token_id <= context_end and offset[token_id][0] < start_char:
            token_id += 1
        start_token_pos = token_id

        # Move left from the end of the context until a token ends at or
        # before the last answer character.
        token_id = context_end
        while token_id >= context_start and offset[token_id][1] > end_char:
            token_id -= 1
        end_token_pos = token_id

    print(answer, start_char, end_char, context_start, context_end, start_token_pos, end_token_pos)
    print("token answer decode:", tokenizer.decode(tokenizer_examples['input_ids'][idx][start_token_pos : end_token_pos + 1]))
    

In [None]:
def process_function(examples):
    """Tokenize a batch of QA examples with a sliding window and label each window.

    Every (question, context) pair is tokenized as a pair; only the context is
    truncated, and the overflow is returned as additional windows that share
    128 tokens with their neighbour. Each window is padded to 512 tokens.

    Args:
        examples: a batch providing the columns "question", "context",
            "answers" (with "answer_start" and "text" lists) and "id".

    Returns:
        The tokenizer output with three extra entries, one value per window:
        "example_ids" (id of the example the window was cut from),
        "start_positions" and "end_positions" (token indices of the answer
        inside the window, or 0/0 when the window does not contain any part of
        the answer or the example has no answer). "offset_mapping" is rewritten
        so that every non-context token holds None.
    """
    tokenizer_examples = tokenizer(text=examples['question'],
                                    text_pair=examples['context'],
                                    return_offsets_mapping=True,
                                    return_overflowing_tokens = True,
                                    stride=128,
                                    max_length=512,
                                    truncation='only_second',
                                    padding='max_length'
                                   )
    # One entry per window: index of the example (within this batch) it came from.
    sample_mapping = tokenizer_examples.pop('overflow_to_sample_mapping')

    batch_answers = examples["answers"]
    batch_ids = examples['id']

    start_position = []
    end_position = []
    examples_ids = []  # id of the source example for every window
    for idx, sample_idx in enumerate(sample_mapping):
        answer = batch_answers[sample_idx]

        # sequence_ids marks each token as None (special/padding), 0 (question)
        # or 1 (context). Computed once per window and reused below.
        sequence_ids = tokenizer_examples.sequence_ids(idx)
        context_start = sequence_ids.index(1)
        # First None at or after context_start closes the context span.
        context_end = sequence_ids.index(None, context_start) - 1
        offset = tokenizer_examples["offset_mapping"][idx]  # per-token character offsets

        if not answer["answer_start"]:
            # No gold answer for this example: label the window with position 0.
            start_token_pos = 0
            end_token_pos = 0
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])  # exclusive end

            # The answer does not overlap this window at all. Character spans
            # are end-exclusive, so a window that only touches the answer
            # boundary (equality) holds none of its characters either.
            if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
                start_token_pos = 0
                end_token_pos = 0
            else:
                # Move right from the start of the context until a token starts
                # at or after the first answer character.
                token_id = context_start
                while token_id <= context_end and offset[token_id][0] < start_char:
                    token_id += 1
                start_token_pos = token_id

                # Move left from the end of the context until a token ends at
                # or before the last answer character.
                token_id = context_end
                while token_id >= context_start and offset[token_id][1] > end_char:
                    token_id -= 1
                end_token_pos = token_id

        start_position.append(start_token_pos)
        end_position.append(end_token_pos)
        examples_ids.append(batch_ids[sample_idx])
        # Keep offsets only for context tokens; [CLS], the question, [SEP] and
        # padding are set to None so they can be skipped when decoding
        # predictions.
        tokenizer_examples['offset_mapping'][idx] = [
            (o if sequence_ids[k] == 1 else None)
            for k, o in enumerate(offset)
        ]

    tokenizer_examples["example_ids"] = examples_ids
    tokenizer_examples['start_positions'] = start_position
    tokenizer_examples['end_positions'] = end_position
    return tokenizer_examples


In [None]:
# Run process_function over every split in batches and drop the original
# columns (the column names of the train split) from the result.
tokenizer_datasets = datasets.map(function=process_function, batched=True, remove_columns=datasets['train'].column_names)
# Notebook display of the processed splits.
tokenizer_datasets

## step4 创建模型

In [None]:
import numpy as np
import collections

def get_result(start_logits, end_logits, exmaples, features, n_best=20, max_answer_length=30):
    """Turn per-window start/end logits into one predicted answer per example.

    Args:
        start_logits: start scores, indexed first by window and then by token.
        end_logits: end scores, same layout as ``start_logits``.
        exmaples: iterable of the original examples; each item provides "id",
            "context" and ``["answers"]["text"]``. (The misspelled parameter
            name is kept so existing callers are unaffected.)
        features: the tokenized windows. ``features["example_ids"]`` lists the
            source example id of every window, and
            ``features[i]["offset_mapping"]`` holds the per-token character
            offsets of window ``i``, with None for tokens outside the context.
        n_best: how many top-scoring start and end positions are combined per
            window.
        max_answer_length: longest allowed answer, counted in tokens.

    Returns:
        A ``(predictions, references)`` pair of dicts keyed by example id:
        the best-scoring answer text (empty string when no valid span exists)
        and the reference answer texts.
    """
    # Group window indices by the example they were cut from.
    example_to_feature = collections.defaultdict(list)
    for idx, example_id in enumerate(features["example_ids"]):
        example_to_feature[example_id].append(idx)

    predictions = {}
    references = {}

    for example in exmaples:
        example_id = example["id"]
        context = example["context"]
        best_text = ""
        best_score = None
        for feature_idx in example_to_feature[example_id]:
            start_logit = start_logits[feature_idx]
            end_logit = end_logits[feature_idx]
            offset = features[feature_idx]["offset_mapping"]
            # Sort the scores in descending order and keep the n_best token
            # positions as candidate answer boundaries.
            start_indexes = np.argsort(start_logit)[::-1][:n_best].tolist()
            end_indexes = np.argsort(end_logit)[::-1][:n_best].tolist()
            for start_index in start_indexes:
                # Positions outside the context carry no offset and cannot
                # start an answer.
                if offset[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if offset[end_index] is None:
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    score = start_logit[start_index] + end_logit[end_index]
                    # Strict comparison keeps the first candidate on ties.
                    if best_score is None or score > best_score:
                        best_score = score
                        # Offsets are character positions in the context, so
                        # the token span maps straight back to the text.
                        best_text = context[offset[start_index][0]: offset[end_index][1]]
        predictions[example_id] = best_text
        references[example_id] = example["answers"]["text"]

    return predictions, references

In [None]:
from cmrc_eval import evaluate_cmrc

def metirc(pred):
    """Compute the CMRC evaluation result for a batch of Trainer predictions.

    ``pred[0]`` is unpacked as ``(start_logits, end_logits)``. When the number
    of rows matches the number of tokenized validation windows the validation
    split is scored; otherwise the test split is used.
    """
    start_logits, end_logits = pred[0]
    is_validation = start_logits.shape[0] == len(tokenizer_datasets["validation"])
    split = "validation" if is_validation else "test"
    predictions, references = get_result(
        start_logits, end_logits, datasets[split], tokenizer_datasets[split]
    )
    return evaluate_cmrc(predictions, references)

In [None]:


# Load the question-answering model from the local checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(
    'D:/pretrained_model/models--hfl--chinese-macbert-base'
)

# Training configuration: evaluate and save once per epoch, log every 100
# steps, and stop after 800 optimisation steps.
args = TrainingArguments(
    output_dir='./models_for_qa',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=100,
    max_steps=800,
)

# Train on the tokenized train split and evaluate on the tokenized validation
# split, scoring with `metirc`.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenizer_datasets['train'],
    eval_dataset=tokenizer_datasets['validation'],
    data_collator=DefaultDataCollator(),
    compute_metrics=metirc,
)

trainer.train()

## step9 模型推理

In [None]:
from transformers import pipeline

# Build a question-answering pipeline around the fine-tuned model. The
# pipeline is placed on whatever device the model already lives on instead of
# a hard-coded GPU index, so this cell also runs on machines without CUDA.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device=model.device)

# Quick smoke test with a hand-written question/context pair.
pipe(question="小明在哪里上班？", context="小明在上海工作过，现在在深圳做了。")