# 对数据处理
**注意**：在处理数据之前，需要先弄清楚传入 model 的 input 和 label 分别应该是什么样子的。

In [None]:
# The dataset used is SQuAD, read from local parquet files.

from datasets import load_dataset

# Placeholder paths: replace "your path" with the real location of each parquet file.
data_file = {
                "train":"your path/data/plain_text/train-00000-of-00001.parquet",
                "val":"your path/plain_text/validation-00000-of-00001.parquet"
             }

dataset = load_dataset("parquet", data_files=data_file)
# Hold out 30% of the training split; the fixed seed keeps the split identical
# across kernel restarts so results can be reproduced.
train_test_dataset = dataset["train"].train_test_split(test_size=0.3, seed=42)
validation_dataset = dataset["val"]
train_test_dataset, validation_dataset

In [None]:
# answer_start is the character position where the answer starts in the context;
# start character + len(text) gives the character position where it ends.
train_test_dataset["train"][0]

**需要面临的问题**
1. 有一些文本比较长，超过了模型可以接收的最大长度
2. 需要根据提供的answers中的answer_start和answer_text映射出答案在上下文中的索引
3. 找出哪一部分对应于上下文，哪些部分对应于问题

In [None]:
# The model used is DistilBERT; load the tokenizer that matches the checkpoint.

from transformers import AutoTokenizer

# Placeholder: replace "your model path" with the checkpoint directory or model id.
tokenizer = AutoTokenizer.from_pretrained("your model path")

In [None]:
def process_train_function(examples):
    """Tokenize a batch of QA examples and label each feature with the answer span.

    Every (question, context) pair is tokenized with the module-level
    ``tokenizer``. Only the context is truncated (``truncation="only_second"``);
    with ``max_length=384``, ``stride=120`` and ``return_overflowing_tokens=True``
    one example can yield several features, so each feature is mapped back to
    its source example before the answer is located.

    Args:
        examples: Batch mapping with the columns ``"question"``, ``"context"``
            and ``"answers"``. Each ``answers`` entry holds the parallel lists
            ``"answer_start"`` (character offsets) and ``"text"``; only the
            first answer of each entry is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (token indices, one per feature). A feature is
        labelled ``(0, 0)`` when its context window does not fully contain the
        answer, when the example has no answer, or when the feature has no
        context tokens.
    """
    questions = [q.strip() for q in examples["question"]]

    # return_offsets_mapping gives, for every token, the (start, end) character
    # span it covers in its source text; special tokens get (0, 0).
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=120,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Several features can come from the same example; fetch that example's answer.
        answer = answers[sample_map[i]]

        if len(answer["answer_start"]) == 0:
            # No annotated answer for this example.
            start_positions.append(0)
            end_positions.append(0)
            continue

        # These are character positions in the context, not token indices.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # sequence_ids marks question tokens with 0, context tokens with 1 and
        # special tokens with None.
        sequence_ids = inputs.sequence_ids(i)
        n_tokens = len(sequence_ids)

        # Find the first and last token index that belong to the context.
        idx = 0
        while idx < n_tokens and sequence_ids[idx] != 1:
            idx += 1
        content_start = idx
        while idx < n_tokens and sequence_ids[idx] == 1:
            idx += 1
        content_end = idx - 1

        # The answer must lie completely inside this window. A partially
        # truncated answer would otherwise be labelled with a position just
        # outside the context (a special token).
        if (
            content_start > content_end
            or offset[content_start][0] > start_char
            or offset[content_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to one past the last token starting at or before
        # start_char. The bound is inclusive so an answer starting on the
        # final context token is still found.
        idx = content_start
        while idx <= content_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Walk backward to one before the first token ending at or after
        # end_char. The bound is inclusive so an answer ending on the first
        # context token is still found.
        idx = content_end
        while idx >= content_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs



In [None]:
def process_validation_function(examples):
    """Tokenize a batch for evaluation and keep what is needed to decode answers.

    Two things are prepared on top of the tokenizer output:
      1. ``example_id``: for every feature, the ``id`` of the example it was
         produced from (one example can yield several features).
      2. ``offset_mapping``: entries of tokens that are not part of the context
         (sequence id other than 1) are replaced with ``None``.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=120,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    feature_count = len(inputs["input_ids"])

    # Record which original example every feature belongs to.
    inputs["example_id"] = [
        source_ids[feature_to_example[feature_idx]]
        for feature_idx in range(feature_count)
    ]

    # Keep character offsets only for context tokens.
    all_offsets = inputs["offset_mapping"]
    for feature_idx in range(feature_count):
        token_sources = inputs.sequence_ids(feature_idx)
        all_offsets[feature_idx] = [
            span if source == 1 else None
            for span, source in zip(all_offsets[feature_idx], token_sources)
        ]

    return inputs

In [None]:
# Tokenize both splits of the train/test dataset with the training preprocessing,
# dropping the raw columns so only model inputs and labels remain.
raw_train_columns = dataset["train"].column_names
tokenized_dataset_train_test = train_test_dataset.map(
    process_train_function,
    batched=True,
    remove_columns=raw_train_columns,
)

# Tokenize the validation set with the evaluation preprocessing.
raw_validation_columns = validation_dataset.column_names
tokenized_dataset_validation = validation_dataset.map(
    process_validation_function,
    batched=True,
    remove_columns=raw_validation_columns,
)

# 训练模型

In [None]:
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Placeholder: replace "your model path" with the checkpoint directory or model id.
model = AutoModelForQuestionAnswering.from_pretrained("your model path")
# Features are already padded to max_length during preprocessing, so the
# default collator is used to batch them.
data_collator = DefaultDataCollator()

In [None]:
# Training configuration: checkpoints go to ./checkout, the loss is logged every
# 10 steps, evaluation runs every 25 steps and a checkpoint is saved per epoch.
training_args = TrainingArguments(
    output_dir="./checkout",
    per_device_train_batch_size=2,
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=3e-5,
    num_train_epochs=3,
    warmup_steps=15,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,  # keep at most 3 checkpoints on disk
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=25
)

In [None]:
# Build the Trainer. Evaluation during training uses only the first 100
# features of the held-out "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train_test["train"],
    eval_dataset=tokenized_dataset_train_test["test"].select(range(100)),
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

## 对模型的预测结果进行后处理

In [None]:
import evaluate
# Load the squad metric from a local copy of the evaluate scripts (relative path).
metrics = evaluate.load("../../evaluate/squad")
metrics

In [None]:
import torch

# NOTE(review): `device` is not referenced by any cell visible here — confirm
# whether it is still needed.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Run the fine-tuned model over the tokenized validation features; the
# predictions unpack into the start logits and the end logits.
predictions, _, _ = trainer.predict(tokenized_dataset_validation)
start_logits, end_logits = predictions
tokenized_dataset_validation


In [None]:
import numpy as np
import collections
from tqdm.auto import tqdm

def compute_metrics(metric,start_logits, end_logits, features, examples, n_best=20, max_answer_length=30):
    """Decode the best answer text per example from span logits and score it.

    Args:
        metric: Object with a ``compute(predictions=..., references=...)``
            method.
        start_logits: Per-feature start logits, indexed like ``features``.
        end_logits: Per-feature end logits, indexed like ``features``.
        features: Tokenized features; each provides ``"example_id"`` and an
            ``"offset_mapping"`` whose entries are ``None`` for tokens that
            must not be part of an answer.
        examples: Original examples providing ``"id"``, ``"context"`` and
            ``"answers"``.
        n_best: Number of top-scoring start candidates and of top-scoring end
            candidates considered per feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions.
    """
    # Map each example id to the indices of the features produced from it.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        content = example["context"]

        # Candidate answers collected across all features of this example.
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best highest logits, best first. Both slices
            # take exactly n_best candidates.
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip tokens whose offsets were masked out.
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip reversed spans and spans longer than allowed.
                    if (end_index < start_index) or ((end_index - start_index + 1) > max_answer_length):
                        continue
                    # Offsets are character positions, so the answer text is
                    # sliced directly out of the original context.
                    answers.append(
                        {
                            "text": content[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty string when there is none.
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append(
                {"id": example_id, "prediction_text": ""}
            )

    reference_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=reference_answers)



In [None]:
# Score the validation predictions: the tokenized features supply the offsets
# and the raw validation examples supply the contexts and reference answers.
compute_metrics(metrics, start_logits, end_logits, tokenized_dataset_validation, validation_dataset)