# QA问题

In [50]:
import torch

from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, DefaultDataCollator, BertForQuestionAnswering, BertTokenizer

In [None]:
dataset = DatasetDict.load_from_disk("../../datas/mrc_data")

In [52]:
dataset["train"][0]

{'id': 'TRAIN_186_QUERY_0',
 'context': '范廷颂枢机（，），圣名保禄·若瑟（），是越南罗马天主教枢机。1963年被任为主教；1990年被擢升为天主教河内总教区宗座署理；1994年被擢升为总主教，同年年底被擢升为枢机；2009年2月离世。范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生；童年时接受良好教育后，被一位越南神父带到河内继续其学业。范廷颂于1940年在河内大修道院完成神学学业。范廷颂于1949年6月6日在河内的主教座堂晋铎；及后被派到圣女小德兰孤儿院服务。1950年代，范廷颂在河内堂区创建移民接待中心以收容到河内避战的难民。1954年，法越战争结束，越南民主共和国建都河内，当时很多天主教神职人员逃至越南的南方，但范廷颂仍然留在河内。翌年管理圣若望小修院；惟在1960年因捍卫修院的自由、自治及拒绝政府在修院设政治课的要求而被捕。1963年4月5日，教宗任命范廷颂为天主教北宁教区主教，同年8月15日就任；其牧铭为「我信天主的爱」。由于范廷颂被越南政府软禁差不多30年，因此他无法到所属堂区进行牧灵工作而专注研读等工作。范廷颂除了面对战争、贫困、被当局迫害天主教会等问题外，也秘密恢复修院、创建女修会团体等。1990年，教宗若望保禄二世在同年6月18日擢升范廷颂为天主教河内总教区宗座署理以填补该教区总主教的空缺。1994年3月23日，范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理；同年11月26日，若望保禄二世擢升范廷颂为枢机。范廷颂在1995年至2001年期间出任天主教越南主教团主席。2003年4月26日，教宗若望保禄二世任命天主教谅山教区兼天主教高平教区吴光杰主教为天主教河内总教区署理主教；及至2005年2月19日，范廷颂因获批辞去总主教职务而荣休；吴光杰同日真除天主教河内总教区总主教职务。范廷颂于2009年2月22日清晨在河内离世，享年89岁；其葬礼于同月26日上午在天主教河内总教区总主教座堂举行。',
 'question': '范廷颂是什么时候被任为主教的？',
 'answers': {'text': ['1963年'], 'answer_start': [30]}}

In [53]:
tokenizer = AutoTokenizer.from_pretrained("models/macbert-base")

In [54]:
def process_func(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation="only_second",
        padding=True,
        return_offsets_mapping=True
    )

    offsets_mapping = inputs.pop("offset_mapping")
    start_postitions = []
    end_postitions = []

    for idx, offset_mapping in enumerate(offsets_mapping):
        # 定位原句子中答案的位置
        answer = examples["answers"][idx]
        answer_start_pos = answer["answer_start"][0]
        answer_end_pos = answer_start_pos + len(answer["text"][0])
        
        # 将tokenize后的QA对的答案定位到A的开始位置和结束位置
        context_start = inputs.sequence_ids(batch_index=idx).index(1)
        context_end = inputs.sequence_ids(batch_index=idx).index(None, context_start) - 1

        # QA对不包含原句子的答案
        if offset_mapping[context_end][1] < answer_start_pos or \
            offset_mapping[context_start][0] > answer_end_pos:
            start_pos = 0
            end_pos = 0
        
        # QA对包含原句子的答案
        else:
            # 左指针向原答案开始位置逼近
            tokenid = context_start
            while offset_mapping[tokenid][0] < answer_start_pos and tokenid <= context_end:
                tokenid += 1
            context_start = start_pos = tokenid

            # 右指针向原答案结束位置逼近
            tokenid = context_end
            while offset_mapping[tokenid][1] > answer_end_pos and tokenid >= context_start:
                tokenid -= 1
            context_end = end_pos = tokenid
        
        start_postitions.append(start_pos)
        end_postitions.append(end_pos)
    
    inputs["start_positions"] = start_postitions
    inputs["end_positions"] = end_postitions
    return inputs

In [55]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

In [56]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1002
    })
})

In [57]:
model = AutoModelForQuestionAnswering.from_pretrained("models/macbert-base")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at models/macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
args = TrainingArguments(
    output_dir="./trained/model_for_qa",
    per_device_eval_batch_size=32,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    optim="adafactor",
    num_train_epochs=1
)

In [59]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=DefaultDataCollator()
)

In [60]:
trainer.train()

Step,Training Loss
200,1.9623


TrainOutput(global_step=317, training_loss=1.7391600984880224, metrics={'train_runtime': 178.3355, 'train_samples_per_second': 56.87, 'train_steps_per_second': 1.778, 'total_flos': 2650071706816512.0, 'train_loss': 1.7391600984880224, 'epoch': 1.0})

In [61]:
from transformers import pipeline

In [62]:
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [63]:
pipe(question="小明在哪里上班？", context="小明在北京上班。")



{'score': 0.31867194175720215, 'start': 3, 'end': 5, 'answer': '北京'}

# 滑动窗口版本

In [1]:
import torch

from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, DefaultDataCollator, BertForQuestionAnswering

In [None]:
dataset = DatasetDict.load_from_disk("../../datas/mrc_data")

In [3]:
tokenizer = AutoTokenizer.from_pretrained("models/macbert-base")

In [4]:
outputs = tokenizer(["我是一只猫", "我是哆啦A梦", "我是一个粉刷匠"], ["快乐的星猫", "野比大雄", "我要把我滴房子"], return_overflowing_tokens=1, max_length=6)
outputs

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': [[101, 2769, 102, 2571, 727, 102], [101, 3221, 102, 2571, 727, 102], [101, 3221, 102, 4638, 3215, 102], [101, 3221, 102, 4344, 102], [101, 671, 102, 2571, 727, 102], [101, 671, 102, 4638, 3215, 102], [101, 671, 102, 4344, 102], [101, 1372, 102, 2571, 727, 102], [101, 1372, 102, 4638, 3215, 102], [101, 1372, 102, 4344, 102], [101, 4344, 102, 2571, 727, 102], [101, 4344, 102, 4638, 3215, 102], [101, 4344, 102, 4344, 102], [101, 2769, 102, 4638, 3215, 102], [101, 2769, 102, 4344, 102], [101, 2769, 3221, 102, 7029, 102], [101, 1504, 1568, 102, 7029, 102], [101, 1504, 1568, 102, 3683, 102], [101, 1504, 1568, 102, 1920, 102], [101, 1504, 1568, 102, 7413, 102], [101, 143, 3457, 102, 7029, 102], [101, 143, 3457, 102, 3683, 102], [101, 143, 3457, 102, 1920, 102], [101, 143, 3457, 102, 7413, 102], [101, 2769, 3221, 102, 3683, 102], [101, 2769, 3221, 102, 1920, 102], [101, 2769, 3221, 102, 7413, 102], [101, 2769, 102, 2769, 6206, 102], [101, 3221, 102, 2769, 6206, 102], [101, 3221, 

In [5]:
dataset["train"][0]

{'id': 'TRAIN_186_QUERY_0',
 'context': '范廷颂枢机（，），圣名保禄·若瑟（），是越南罗马天主教枢机。1963年被任为主教；1990年被擢升为天主教河内总教区宗座署理；1994年被擢升为总主教，同年年底被擢升为枢机；2009年2月离世。范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生；童年时接受良好教育后，被一位越南神父带到河内继续其学业。范廷颂于1940年在河内大修道院完成神学学业。范廷颂于1949年6月6日在河内的主教座堂晋铎；及后被派到圣女小德兰孤儿院服务。1950年代，范廷颂在河内堂区创建移民接待中心以收容到河内避战的难民。1954年，法越战争结束，越南民主共和国建都河内，当时很多天主教神职人员逃至越南的南方，但范廷颂仍然留在河内。翌年管理圣若望小修院；惟在1960年因捍卫修院的自由、自治及拒绝政府在修院设政治课的要求而被捕。1963年4月5日，教宗任命范廷颂为天主教北宁教区主教，同年8月15日就任；其牧铭为「我信天主的爱」。由于范廷颂被越南政府软禁差不多30年，因此他无法到所属堂区进行牧灵工作而专注研读等工作。范廷颂除了面对战争、贫困、被当局迫害天主教会等问题外，也秘密恢复修院、创建女修会团体等。1990年，教宗若望保禄二世在同年6月18日擢升范廷颂为天主教河内总教区宗座署理以填补该教区总主教的空缺。1994年3月23日，范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理；同年11月26日，若望保禄二世擢升范廷颂为枢机。范廷颂在1995年至2001年期间出任天主教越南主教团主席。2003年4月26日，教宗若望保禄二世任命天主教谅山教区兼天主教高平教区吴光杰主教为天主教河内总教区署理主教；及至2005年2月19日，范廷颂因获批辞去总主教职务而荣休；吴光杰同日真除天主教河内总教区总主教职务。范廷颂于2009年2月22日清晨在河内离世，享年89岁；其葬礼于同月26日上午在天主教河内总教区总主教座堂举行。',
 'question': '范廷颂是什么时候被任为主教的？',
 'answers': {'text': ['1963年'], 'answer_start': [30]}}

In [6]:
def process_func(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding=True,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        stride=128
    )

    overflow_tokens = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []
    example_ids = []

    for idx, sentence_id in enumerate(overflow_tokens):
        answer = examples["answers"][sentence_id]
        answer_start_pos = answer["answer_start"][0]
        answer_end_pos = answer_start_pos + len(answer["text"][0])
    
        context_start = inputs.sequence_ids(idx).index(1)
        context_end = inputs.sequence_ids(idx).index(None, context_start) - 1
        offset_mapping = inputs["offset_mapping"][idx]

        if offset_mapping[context_end][1] < answer_start_pos or offset_mapping[context_start][0] > answer_end_pos:
            start_pos = 0
            end_pos = 0
        
        else:
            tokenid = context_start
            while offset_mapping[tokenid][0] < answer_start_pos and tokenid <= context_end:
                tokenid += 1
            context_start = start_pos = tokenid

            tokenid = context_end
            while offset_mapping[tokenid][1] > answer_end_pos and tokenid >= context_start:
                tokenid -= 1
            context_end = end_pos = tokenid

        start_positions.append(start_pos)
        end_positions.append(end_pos)
        example_ids.append(examples["id"][sentence_id]) 
        inputs["offset_mapping"][idx] = [
            (v if inputs.sequence_ids(idx)[k] == 1 else None)
            for k, v in enumerate(inputs["offset_mapping"][idx])
        ]

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["example_ids"] = example_ids
    return inputs

In [7]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [8]:
print(tokenized_data["train"][0])

{'input_ids': [101, 5745, 2455, 7563, 3221, 784, 720, 3198, 952, 6158, 818, 711, 712, 3136, 4638, 8043, 102, 5745, 2455, 7563, 3364, 3322, 8020, 8024, 8021, 8024, 1760, 1399, 924, 4882, 185, 5735, 4449, 8020, 8021, 8024, 3221, 6632, 1298, 5384, 7716, 1921, 712, 3136, 3364, 3322, 511, 9155, 2399, 6158, 818, 711, 712, 3136, 8039, 8431, 2399, 6158, 3091, 1285, 711, 1921, 712, 3136, 3777, 1079, 2600, 3136, 1277, 2134, 2429, 5392, 4415, 8039, 8447, 2399, 6158, 3091, 1285, 711, 2600, 712, 3136, 8024, 1398, 2399, 2399, 2419, 6158, 3091, 1285, 711, 3364, 3322, 8039, 8170, 2399, 123, 3299, 4895, 686, 511, 5745, 2455, 7563, 754, 9915, 2399, 127, 3299, 8115, 3189, 1762, 6632, 1298, 2123, 2398, 4689, 1921, 712, 3136, 1355, 5683, 3136, 1277, 1139, 4495, 8039, 4997, 2399, 3198, 2970, 1358, 5679, 1962, 3136, 5509, 1400, 8024, 6158, 671, 855, 6632, 1298, 4868, 4266, 2372, 1168, 3777, 1079, 5326, 5330, 1071, 2110, 689, 511, 5745, 2455, 7563, 754, 9211, 2399, 1762, 3777, 1079, 1920, 934, 6887, 7368, 213

In [35]:
import collections # 聚合答案

def gather_result(start_logits, end_logits, dataset, tokenized_data):
    predictions = {}
    references = {}

    example_to_tokens = collections.defaultdict(list)
    for idx, example_id in enumerate(tokenized_data["example_ids"]):
        example_to_tokens[example_id].append(idx)

    n_best = 20
    max_ans_length = 30

    for data in dataset:
        example_id = data["id"]
        answers = []

        for idx in example_to_tokens[example_id]:
            offset_mapping = tokenized_data[idx]["offset_mapping"]
            token_start_logits = start_logits[idx].argsort()[::-1][:n_best].tolist()
            token_end_logits = end_logits[idx].argsort()[::-1][:n_best].tolist()

            for start_pos in token_start_logits:
                for end_pos in token_end_logits:
                    if (end_pos < start_pos) or (end_pos - start_pos + 1 > max_ans_length):
                        continue
                    if offset_mapping[start_pos] is None or offset_mapping[end_pos] is None:
                        continue
                    answers.append({
                        "text": data["context"][offset_mapping[start_pos][0]: offset_mapping[end_pos][1]],
                        "score": start_logits[idx][start_pos] + end_logits[idx][end_pos]
                    })
            
            if len(answers) > 0:
                best_answer = max(answers, key=lambda x: x["score"])
                predictions[example_id] = best_answer["text"]
            else:
                predictions[example_id] = ""
            references[example_id] = data["answers"]["text"]

    return predictions, references

# trainer.evaluate(tokenized_data["test"])

In [36]:
!pip install nltk

1545.27s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0m

In [37]:
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [38]:
from cmrc_eval import evaluate_cmrc

def compute_metrics(pred):
    start_logits, end_logits = pred[0]

    label = "validation" if start_logits.shape[0] == len(tokenized_data["validation"]) else "test"
    predictions, labels = gather_result(start_logits, end_logits, dataset[label], tokenized_data[label])

    return evaluate_cmrc(
        predictions=predictions,
        references=labels
    )

In [39]:
model = AutoModelForQuestionAnswering.from_pretrained("models/macbert-base")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at models/macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
args = TrainingArguments(
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    output_dir="./trained/model_for_qa_slide",
    gradient_checkpointing=True,
    optim="adafactor",
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    num_train_epochs=1,
    logging_steps=100
)

In [41]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=DefaultDataCollator(),
    compute_metrics=compute_metrics
)

In [42]:
trainer.train()

Step,Training Loss,Validation Loss,Avg,F1,Em,Total,Skip
200,1.5784,1.27977,75.816359,85.028182,66.604536,3219,0
400,1.3992,1.174158,78.587818,87.402414,69.773221,3219,0
600,1.3163,1.167351,77.824613,87.398216,68.25101,3219,0


TrainOutput(global_step=600, training_loss=1.5798635609944662, metrics={'train_runtime': 316.0931, 'train_samples_per_second': 60.707, 'train_steps_per_second': 1.898, 'total_flos': 3760517598755328.0, 'train_loss': 1.5798635609944662, 'epoch': 1.0})

In [43]:
trainer.evaluate(tokenized_data["test"])

{'eval_loss': 1.415410041809082,
 'eval_avg': 51.84703984041357,
 'eval_f1': 69.46254275467945,
 'eval_em': 34.231536926147704,
 'eval_total': 1002,
 'eval_skip': 0,
 'eval_runtime': 10.1608,
 'eval_samples_per_second': 195.653,
 'eval_steps_per_second': 6.2,
 'epoch': 1.0}

In [47]:
from transformers import pipeline

In [48]:
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [49]:
pipe(question="小明在哪里上班？", context="小明在北京上班。")

{'score': 0.4139211177825928, 'start': 3, 'end': 5, 'answer': '北京'}