In [35]:

import torch
import datasets
import os, sys
import json

from datasets import load_dataset, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint

from typing import List, Tuple

from torch.utils.data import DataLoader

In [None]:
metric = load_metric("squad")
metric

### Load dataset

In [54]:
def get_squad_answer_str(context, qas):
    context_qa_pairs = []
    for qa in qas:
        question = qa['question']
        answer = qa['answers'][0]['text']
        answer_start = qa['answers'][0]['answer_start']
        context_qa_pairs.append((context, question, answer, answer_start))
    return context_qa_pairs

In [48]:
XQUAD_BASE_DIR = '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/data/xquad/xx/'
XQUAD_CACHE_DIR = '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/data/xquad/_cache/'
xquad_en = json.load(open(os.path.join(XQUAD_BASE_DIR, 'xquad.en.json'), 'r'))
xquad_en.keys(), \
xquad_en['version']


xquad_xx = {}
XQUAD_LANGS = ['ar', 'de', 'el', 'en', 'es', 'hi', 'ro', 'ru', 'th', 'tr', 'vi', 'zh']
for lang in XQUAD_LANGS:
    xquad_xx[f'{lang}'] = json.load(open(os.path.join(XQUAD_BASE_DIR, f'xquad.{lang}.json'), 'r'))['data']


In [49]:
features = datasets.Features({'question': str, 'context': str, 'answer': str})
features

{'question': str, 'context': str, 'answer': str}

In [50]:
raw_datasets = { lang: datasets.Dataset.from_dict(mapping=data) for lang, data in xquad_xx.items() }
raw_datasets.keys()

AttributeError: 'list' object has no attribute 'items'

### Process

In [5]:
def generate_input(_question:str, _context:str):
    return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()])

generate_input('What is the capital city of Thailand?','เมืองหลวงของราชอาณาจักรไทย คือกรุงเทพมหานคร (Bangkok; BKK)')

'question: What is the capital city of Thailand? context: เมืองหลวงของราชอาณาจักรไทย คือกรุงเทพมหานคร (Bangkok; BKK)'

In [14]:
PRETRAINED_TOKENIZER_DIR = '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/models/mt5-large/'
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_TOKENIZER_DIR)
tokenizer

PreTrainedTokenizer(name_or_path='/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/models/mt5-large/', vocab_size=250100, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [7]:
def preprocess_squad_batch(
    examples,
    question_column: str,
    context_column: str,
    answer_column: str,
) -> Tuple[List[str], List[str]]:
    questions = examples[question_column]
    contexts = examples[context_column]
    answers = examples[answer_column]

    def generate_input(_question, _context):
        return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()])

    inputs = [generate_input(question, context) for question, context in zip(questions, contexts)]
    targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers]
    return inputs, targets

In [15]:
examples ={
    'question': ['What is the capital city of Thailand?', 'Who is the governor of Bangkok?'],
    'context': ['เมืองหลวงของราชอาณาจักรไทย คือกรุงเทพมหานคร (Bangkok; BKK)', 'เมืองหลวงของราชอาณาจักรไทย คือกรุงเทพมหานคร (Bangkok; BKK)'],
    'answers': [{'text': ['กรุงเทพมหานคร'] }, {'text': ['ชัชชาติ สิทธิพันธุ์'] }],
}
preprocess_squad_batch(examples=examples,  question_column='question',
    context_column='context',
    answer_column='answers')

(['question: What is the capital city of Thailand? context: เมืองหลวงของราชอาณาจักรไทย คือกรุงเทพมหานคร (Bangkok; BKK)',
  'question: Who is the governor of Bangkok? context: เมืองหลวงของราชอาณาจักรไทย คือกรุงเทพมหานคร (Bangkok; BKK)'],
 ['กรุงเทพมหานคร', 'ชัชชาติ สิทธิพันธุ์'])

In [22]:
max_seq_length = 512
max_answer_length = 30
padding=True

def preprocess_function(examples, question_column, context_column, answer_column):
    inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)

    model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [28]:
model_ins = preprocess_function(examples=examples,
                    question_column='question',
                    context_column='context',
                    answer_column='answers')

model_ins.keys(), torch.LongTensor(model_ins['input_ids']).shape, torch.LongTensor(model_ins['input_ids'])

(dict_keys(['input_ids', 'attention_mask', 'labels']),
 torch.Size([2, 30]),
 tensor([[  7680,    267,   5126,    339,    287,   8646,   9416,    304,  25828,
             291,  19730,    267,    259,  24633, 108364,   1881,  47815,  44628,
          208385, 172171,  11984,  33555, 111284,    274, 220992,    296,    364,
           21058,    271,      1],
         [  7680,    267,  26104,    339,    287,  16780,    723,    304,  35878,
             291,  19730,    267,    259,  24633, 108364,   1881,  47815,  44628,
          208385, 172171,  11984,  33555, 111284,    274, 220992,    296,    364,
           21058,    271,      1]]))

In [33]:
# Post-processing:
def post_processing_function(
    examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval"
):
    # Decode the predicted tokens.
    preds = outputs.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
    predictions = {}
    # Let's loop over all the examples!
    for example_index, example in enumerate(examples):
        # This is the index of the feature associated to the current example.
        feature_index = feature_per_example[example_index]
        predictions[example["id"]] = decoded_preds[feature_index]

    # Format the result to the format the metric expects.
    if data_args.version_2_with_negative:
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
        ]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)


In [None]:
  data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )


In [None]:

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


        
input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Das Haus ist wunderbar.