# Entrenamiento de modelos tipo QA para preguntas en español con contextos en otros idiomas

In [None]:
!pip install transformers -q
!pip install huggingface_hub -q

[K     |████████████████████████████████| 5.8 MB 4.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 47.2 MB/s 
[K     |████████████████████████████████| 182 kB 59.3 MB/s 
[?25h

In [None]:
import torch
from torch.utils.data import DataLoader

from transformers import AdamW

from pathlib import Path
from urllib.request import urlopen
import json

Aquí se realiza el inicio de sesión en Hugging Face, **NO** ejecutar esta celda si no se cuenta con una cuenta creada:

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login; the credentials are used by the push_to_hub
# calls made later in this notebook.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Cargue de los datos

In [None]:
# Taken from https://github.com/deepmind/xquad
base_url = 'https://raw.githubusercontent.com/deepmind/xquad/master/{}'

# Language name -> XQuAD file name. The file names only differ in the language
# code, so the mapping is derived from (name, code) pairs; order is preserved.
dataset = {
    name: 'xquad.{}.json'.format(code)
    for name, code in (
        ('Arabic', 'ar'),
        ('German', 'de'),
        ('Greek', 'el'),
        ('English', 'en'),
        ('Spanish', 'es'),
        ('Hindi', 'hi'),
        ('Russian', 'ru'),
        ('Thai', 'th'),
        ('Turkish', 'tr'),
        ('Vietnamese', 'vi'),
        ('Chinese', 'zh'),
        ('Romanian', 'ro'),
    )
}

def get_XQuAD_url(language):
  """Return the raw GitHub URL of the XQuAD file for ``language``.

  ``language`` must be one of the keys of ``dataset``; any other value raises
  ``KeyError``.
  """
  file_name = dataset[language]
  return base_url.format(file_name)

Funciones adaptadas de https://huggingface.co/transformers/v3.2.0/custom_datasets.html


In [None]:
def read_squad(path, is_url=True):
  """Load a SQuAD-format file and pair its contexts/answers with Spanish questions.

  Args:
    path: URL of the JSON file when ``is_url`` is true, otherwise the path of
      a local JSON file.
    is_url: If true, ``path`` is fetched with ``urlopen``; if false, it is
      read from disk.

  Returns:
    A list of ``(context, question_es, answer)`` tuples, one per answer.
    ``context`` and ``answer`` come from the file at ``path``; ``question_es``
    is the question found at the same position in the Spanish XQuAD file.
    The ``answer`` dicts are the objects parsed from ``path`` (not copies).

  Note:
    The Spanish XQuAD file is always downloaded, even when ``is_url`` is false.
  """
  if is_url:
    # Close the HTTP response as soon as the payload has been read.
    with urlopen(path) as response:
      squad_dict = json.loads(response.read())
  else:
    # json.load needs an open file object; passing the Path itself fails.
    with Path(path).open(encoding='utf-8') as local_file:
      squad_dict = json.load(local_file)

  es_url = 'https://raw.githubusercontent.com/deepmind/xquad/master/xquad.es.json'
  with urlopen(es_url) as response:
    es_squad_dict = json.loads(response.read())

  data = []
  # Both files are walked in lockstep: entries are matched purely by position.
  for group, group_es in zip(squad_dict['data'], es_squad_dict['data']):
    for passage, passage_es in zip(group['paragraphs'], group_es['paragraphs']):
      context = passage['context']
      for qa, qa_es in zip(passage['qas'], passage_es['qas']):
        question_es = qa_es['question']
        # Zipping with the Spanish answers keeps the original behaviour of
        # emitting at most as many rows as there are Spanish answers.
        for answer, _ in zip(qa['answers'], qa_es['answers']):
          data.append((context, question_es, answer))

  return data

In [None]:
from sklearn.model_selection import train_test_split

def split_squad(data, train_size=10):
  """Split ``(context, question, answer)`` tuples into train and test columns.

  ``data`` and ``train_size`` are forwarded to ``train_test_split`` with a
  fixed ``random_state=13``.

  Returns:
    ``((train_contexts, train_questions, train_answers),
    (test_contexts, test_questions, test_answers))`` where every element is a
    list.
  """
  def as_columns(rows):
    # Turn a list of 3-tuples into three parallel lists.
    contexts, questions, answers = [], [], []
    for row in rows:
      contexts.append(row[0])
      questions.append(row[1])
      answers.append(row[2])
    return contexts, questions, answers

  train_rows, test_rows = train_test_split(data, train_size=train_size, random_state=13)
  return as_columns(train_rows), as_columns(test_rows)

# Accumulators shared by every language.
train_contexts, train_questions, train_answers = [], [], []
test_contexts, test_questions, test_answers = [], [], []

# For each language: download the file, keep 10 samples for training and send
# the remaining ones to the test split.
for language in dataset:
  l_data = read_squad(get_XQuAD_url(language=language))
  train_data, test_data = split_squad(l_data, train_size=10)
  train_contexts.extend(train_data[0])
  train_questions.extend(train_data[1])
  train_answers.extend(train_data[2])
  test_contexts.extend(test_data[0])
  test_questions.extend(test_data[1])
  test_answers.extend(test_data[2])

# Preprocesamiento de los datos

En el dataset XQuAD puede haber un desfase de 1 o 2 caracteres entre el índice indicado en la respuesta y su posición real sobre el párrafo (la respuesta real comienza 1 o 2 posiciones antes de lo anotado). La siguiente función permite realizar la corrección de este error.


In [None]:
def add_end_idx(answers, contexts):
    """Add ``answer_end`` to every answer, fixing small ``answer_start`` offsets.

    Each answer dict is updated in place. The annotated span is compared with
    ``answer['text']``; when it does not match, the span shifted 1 and then 2
    characters to the left is tried, and ``answer_start`` is corrected to the
    first shift that matches.

    If no shift matches, the annotated ``answer_start`` is kept and
    ``answer_end`` is still set to ``answer_start + len(text)``, so that every
    answer leaves this function with an ``answer_end`` key.

    Args:
        answers: list of dicts with the keys ``'text'`` and ``'answer_start'``.
        contexts: list of context strings, parallel to ``answers``.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        matched_shift = None
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            # A negative start would silently index from the end of the string.
            if shifted_start >= 0 and context[shifted_start:end_idx - shift] == gold_text:
                matched_shift = shift
                break

        if matched_shift is None:
            # Best effort: keep the annotated span instead of leaving the
            # answer without 'answer_end'.
            answer['answer_end'] = end_idx
        else:
            answer['answer_start'] = start_idx - matched_shift
            answer['answer_end'] = end_idx - matched_shift

# Correct the answer offsets of both splits; the answer dicts are updated in place.
add_end_idx(train_answers, train_contexts)
add_end_idx(test_answers, test_contexts)

In [None]:
from transformers import DistilBertTokenizerFast

# NOTE(review): the warning printed by this cell reports that the checkpoint
# declares a 'BertTokenizer', while a DistilBERT tokenizer class is used to load
# it. Presumably intentional -- confirm before switching classes, because the
# encodings built with this tokenizer feed the training loop and the tokenizer
# itself is later pushed to the Hub.
tokenizer = DistilBertTokenizerFast.from_pretrained('mrm8488/bert-multi-cased-finetuned-xquadv1')

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


In [None]:
# Tokenize (context, question) pairs: the context is passed as the first
# sequence and the Spanish question as the second, with truncation and padding
# enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers):
    """Attach token-level ``start_positions``/``end_positions`` to ``encodings``.

    For sample ``i`` the first answer character (``answer_start``) and the last
    one (``answer_end - 1``) are mapped to token indices with
    ``encodings.char_to_token``. Whenever that lookup returns ``None`` the
    position is replaced with ``tokenizer.model_max_length``.

    ``encodings`` is updated in place; nothing is returned.
    """
    def token_index(sample_idx, char_idx):
        # Map a character offset to its token, using the fallback value for None.
        token_idx = encodings.char_to_token(sample_idx, char_idx)
        return tokenizer.model_max_length if token_idx is None else token_idx

    starts, ends = [], []
    for sample_idx, answer in enumerate(answers):
        starts.append(token_index(sample_idx, answer['answer_start']))
        ends.append(token_index(sample_idx, answer['answer_end'] - 1))
    encodings.update({'start_positions': starts, 'end_positions': ends})

# Add the start/end token labels to both sets of encodings (updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(test_encodings, test_answers)

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset wrapper around tokenizer encodings.

    ``encodings`` must provide ``items()`` yielding ``(field, per-sample values)``
    pairs and an ``input_ids`` attribute whose length is the number of samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one sample: every field of the encodings, converted to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the training encodings so they can be consumed by a DataLoader.
train_dataset = SquadDataset(train_encodings)

# Fine tuning

In [None]:
from transformers import DistilBertForQuestionAnswering, BertForQuestionAnswering

# Load the base QA checkpoint at notebook level. Note that pipeline() below
# loads its own fresh copy of the same checkpoint for every run.
model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-multi-cased-finetuned-xquadv1')

In [None]:
def progress(idx, module=50, middle='.', end='\n'):
  """Print a compact trace of the training progress, one call per batch.

  Writes ``middle`` on every call, except when ``idx`` is a multiple of
  ``module``: in that case ``idx`` followed by ``end`` is written instead.
  """
  if idx % module:
    chunk = middle
  else:
    chunk = str(idx) + end
  print(end=chunk)

La siguiente rutina permite realizar el entrenamiento del modelo base `mrm8488/bert-multi-cased-finetuned-xquadv1` usando *n* muestras por idioma del dataset XQuAD. Este pipeline se usará con n = 1, 2, 5, 10, 20, 25. El modelo resultante se almacenará en HuggingFace.

In [None]:
def pipeline(train_size):
  """Fine-tune the base QA checkpoint on ``train_size`` samples per language.

  Steps:
    1. Download every language listed in ``dataset`` and keep ``train_size``
       training samples from each one.
    2. Fix the answer offsets, tokenize and build the training dataset.
    3. Train a fresh copy of ``mrm8488/bert-multi-cased-finetuned-xquadv1``
       for one epoch (batch size 16, AdamW, lr 5e-5).
    4. Write the per-batch losses to
       ``losses_<train_size>samplesByLanguage_.txt`` as a list of floats.
    5. Push the model and the tokenizer to the Hugging Face Hub.

  Args:
    train_size: forwarded to ``split_squad`` as the number of training samples
      taken from each language.
  """
  # Release cached GPU memory left over from a previous run.
  torch.cuda.empty_cache()

  train_contexts, train_questions, train_answers = [], [], []

  for language in dataset.keys():
    l_data = read_squad(get_XQuAD_url(language=language))
    train_data, _ = split_squad(l_data, train_size=train_size)
    train_contexts += train_data[0]
    train_questions += train_data[1]
    train_answers += train_data[2]

  # Preprocess once, after every language has been collected. Doing it inside
  # the loop re-tokenized the whole accumulated set on each iteration only to
  # throw the result away.
  add_end_idx(train_answers, train_contexts)
  train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
  add_token_positions(train_encodings, train_answers)
  train_dataset = SquadDataset(train_encodings)

  model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-multi-cased-finetuned-xquadv1')

  model.to(device)
  model.train()

  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

  optim = AdamW(model.parameters(), lr=5e-5)

  losses = []
  for epoch in range(1):
    for idx, batch in enumerate(train_loader):
      progress(idx+1)
      optim.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0]

      loss.backward()
      optim.step()

      # Keep a plain float: storing the tensor itself would keep a device
      # tensor (and its autograd history) alive for every batch and would
      # write tensor reprs to the log file.
      losses.append(loss.item())

  with open("losses_{}samplesByLanguage_.txt".format(train_size), "w") as text_file:
      text_file.write(str(losses))
  model.eval()

  # Comment out these 2 lines if you do NOT have a Hugging Face account
  model.push_to_hub("LeoAngel/bert-finetuned-crossxquadv1_{}sbl".format(train_size))
  tokenizer.push_to_hub("LeoAngel/bert-finetuned-crossxquadv1_{}sbl".format(train_size))

In [None]:
# Train and publish one model per samples-per-language budget.
for train_size in [1,2,5,10,20,25]:
  # Free cached GPU memory before starting the next run.
  with torch.no_grad():
    torch.cuda.empty_cache()
  pipeline(train_size)