# **Perguntas e Respostas FAQUAD**  


In [None]:
import requests
import json
import torch
import os
from tqdm import tqdm

In [None]:

!pip install transformers

### **Download FAQUAD**

In [None]:
!wget -nc https://raw.githubusercontent.com/liafacom/faquad/master/data/train.json
!wget -nc https://raw.githubusercontent.com/liafacom/faquad/master/data/dev.json

In [None]:
# Load the training split from train.json into memory for inspection
with open('train.json', 'rb') as f:
  faquad = json.load(f)

In [None]:
# Keys of the first entry in faquad['data'] (titles and paragraphs)
faquad['data'][0].keys()

In [None]:
# Print the title of every content group in FaQuAD
for group in faquad['data']:
    print(group['title'])

### **Contexto, perguntas e respostas**

In [None]:
def read_data(path):
  """Flatten a SQuAD-style JSON file into three parallel lists.

  The file is expected to hold ``data -> paragraphs -> qas -> answers``.
  One entry is produced per answer, so a question with several answers
  appears several times (together with its context).

  Args:
    path: Path of the JSON file to read.

  Returns:
    A tuple ``(contexts, questions, answers)`` of equally long lists, where
    each answer is the answer dict exactly as stored in the file.
  """
  with open(path, 'rb') as handle:
    dataset = json.load(handle)

  contexts, questions, answers = [], [], []

  for article in dataset['data']:
    for paragraph in article['paragraphs']:
      context = paragraph['context']
      for qa in paragraph['qas']:
        question = qa['question']
        gold_answers = list(qa['answers'])
        # Repeat the context/question once per gold answer to keep the lists aligned.
        contexts.extend([context] * len(gold_answers))
        questions.extend([question] * len(gold_answers))
        answers.extend(gold_answers)

  return contexts, questions, answers

Separação dos dados para treino e validação

In [None]:
# Build the parallel context/question/answer lists for the training and validation splits
train_contexts, train_questions, train_answers = read_data('train.json')
valid_contexts, valid_questions, valid_answers = read_data('dev.json')

In [None]:
# Report how many question entries were loaded, then show the last question and its answer
print(f'Há {len(train_questions)} perguntas')
print(train_questions[-1])
print(train_answers[-1])

A base de dados informa apenas onde as respostas começam; é necessário descobrir onde elas terminam.

In [None]:
def add_end_idx(answers, contexts):
  """Add an ``answer_end`` character index to every answer dict, in place.

  For each ``(answer, context)`` pair the function looks for the answer text
  in the context, in this order:

  1. at the annotated ``answer_start``;
  2. one or two characters to the left of it (annotations that are off by
     one or two characters);
  3. at the first occurrence of the text anywhere in the context.

  When a match is found, ``answer_start`` is corrected if necessary and
  ``answer_end`` is set to the exclusive end of the match. When the text does
  not occur in the context at all, the annotated start is kept and
  ``answer_end`` is set to ``answer_start + len(text)`` (unverified).

  Args:
    answers: List of dicts with the keys ``'text'`` and ``'answer_start'``.
    contexts: List of context strings, aligned with ``answers``.

  Returns:
    None. The answer dicts are modified in place.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    length = len(gold_text)

    for shift in (0, 1, 2):
      candidate = start_idx - shift
      # A negative candidate would wrap around to the end of the string when
      # slicing, so it is never a valid match.
      if candidate >= 0 and context[candidate:candidate + length] == gold_text:
        answer['answer_start'] = candidate
        answer['answer_end'] = candidate + length
        break
    else:
      found = context.find(gold_text)
      # str.find returns -1 when the text is absent; test that explicitly
      # instead of relying on the truthiness of a slice built from -1.
      if found != -1:
        answer['answer_start'] = found
        answer['answer_end'] = found + length
      else:
        answer['answer_start'] = start_idx
        answer['answer_end'] = start_idx + length

# Annotate both splits in place: every answer dict gains an 'answer_end' key
add_end_idx(train_answers, train_contexts)
add_end_idx(valid_answers, valid_contexts)

In [None]:
# With the end index of each answer now available, we can move on to tokenization
print(train_questions[720])
print(train_answers[720])
print(train_contexts[720])

In [None]:
# Drop a leading space from training answers so that 'answer_start' points at the
# first character after it.
for answer, context in zip(train_answers, train_contexts):
    start = answer['answer_start']
    # Bounds check: the start index may be unverified when the answer text was
    # not found in the context, so it can fall outside the string.
    if 0 <= start < len(context) and context[start] == " ":
        answer['answer_start'] = start + 1
        # Only remove the first character of the text when it really is the space;
        # otherwise a meaningful character would be lost.
        if answer['text'].startswith(" "):
            answer['text'] = answer['text'][1:]

In [None]:
# Trim trailing whitespace from validation answers and make the stored text agree
# with the context span [answer_start, answer_end).
for answer, context in zip(valid_answers, valid_contexts):
    start = answer['answer_start']
    end = answer['answer_end']

    stripped = answer['text'].rstrip()
    n_trailing = len(answer['text']) - len(stripped)
    if n_trailing:
        # Move the end index by exactly the number of characters removed and keep
        # the local `end` in sync, so the comparison below uses the trimmed span
        # instead of restoring the whitespace that was just removed.
        end -= n_trailing
        answer['answer_end'] = end
        answer['text'] = stripped

    # String slicing never raises, so no try/except is needed here.
    if answer['text'] != context[start:end]:
        answer['text'] = context[start:end]

### **Tokenization 🔢**

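Usamos o `AutoTokenizer`, que por padrão carrega a versão rápida do tokenizador do modelo (`BertTokenizerFast`), mais eficiente para gerar os tokens. O `padding=True` completa as sentenças mais curtas com o marcador [PAD] até o tamanho da maior sentença, e o `truncation=True`, junto com `max_length=512`, corta as sentenças com mais de 512 tokens.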

In [None]:
from transformers import BertTokenizerFast, AutoTokenizer

# Load the tokenizer published with the pretrained Portuguese BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Encode (context, question) pairs, in that order, truncated/padded with a 512-token cap
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True, max_length=512)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True, max_length=512)

Abaixo há um exemplo de sentença em que foi usado o padding.

In [None]:
# Decode the first encoded training example back into text
tokenizer.decode(train_encodings['input_ids'][0])

A próxima etapa é converter as posições de início/fim em caracteres para posições de início/fim em tokens. Por que fazer isso? Porque o texto é convertido em tokens, então o início/fim da resposta precisa indicar o índice do token inicial/final que contém a resposta, e não os caracteres específicos no contexto.

In [None]:
def add_token_positions(encodings, answers):
  """Convert character-level answer spans into token-level labels, in place.

  For every answer, the token containing ``answer_start`` and the token
  containing the last answer character (``answer_end - 1``) are looked up with
  ``encodings.char_to_token``. The results are stored in ``encodings`` under
  the keys ``'start_positions'`` and ``'end_positions'``.

  When a character has no token (``char_to_token`` returns ``None``, e.g.
  because the answer was truncated away), the label is set to the length of
  that encoded sequence. This is one past the last valid token index, and it
  is always a small integer. Hugging Face question-answering heads are
  documented to clamp positions to the sequence length and to leave positions
  outside the sequence out of the loss.

  Args:
    encodings: Tokenizer output for the batch; must support
      ``char_to_token(batch_index, char_index)``, ``['input_ids']`` and
      ``update``.
    answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
      character indices, aligned with the encoded examples.

  Returns:
    None. ``encodings`` is updated in place.
  """
  start_positions = []
  end_positions = []
  for i, answer in enumerate(answers):
    # Label used when the character could not be mapped to a token.
    ignore_index = len(encodings['input_ids'][i])

    start_token = encodings.char_to_token(i, answer['answer_start'])
    # answer_end is exclusive, so the last answer character is at answer_end - 1.
    end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

    start_positions.append(ignore_index if start_token is None else start_token)
    end_positions.append(ignore_index if end_token is None else end_token)

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach 'start_positions' / 'end_positions' to both sets of encodings, in place
add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

### **Preparação dos dados para treinamento**

In [None]:
class FAQuaD_Dataset(torch.utils.data.Dataset):
  """Map-style dataset that serves tokenizer encodings one example at a time."""

  def __init__(self, encodings):
    """Keep a reference to the encodings (a mapping of field name -> per-example values)."""
    self.encodings = encodings

  def __len__(self):
    """Number of examples, taken from the ``input_ids`` field."""
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict with one tensor per encoded field."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    return sample

In [None]:
# Number of encoded training examples
len(train_encodings['input_ids'])

In [None]:
# for i in range(len(valid_encodings['input_ids'])):
#     try:
#         teste = {key: torch.tensor(val[i]) for key, val in valid_encodings.items()}
#     except:
#         print(i)

In [None]:
# Wrap the encodings (with their start/end labels) as torch datasets
train_dataset = FAQuaD_Dataset(train_encodings)
valid_dataset = FAQuaD_Dataset(valid_encodings)

### **Dataloaders**

In [None]:
from torch.utils.data import DataLoader

# Define the dataloaders: batches of 8 examples; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8)

## **Fine-Tuning**

In [None]:
from transformers import BertForQuestionAnswering, AutoModelForQuestionAnswering

# Load the pretrained Portuguese BERT checkpoint through the question-answering auto class
model = AutoModelForQuestionAnswering.from_pretrained("neuralmind/bert-base-portuguese-cased")

### **Training 🏋️‍♂️**

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'{device}')

In [None]:
# Fine-tune the QA model on the training loader.
# The optimizer is torch.optim.AdamW. The former `from transformers import AdamW`
# line was unused, and that class is deprecated in transformers and no longer
# exported by recent releases, so importing it can fail before training starts.
N_EPOCHS = 20
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)

model.to(device)
model.train()

for epoch in range(N_EPOCHS):
  loop = tqdm(train_loader, leave=True)
  loop.set_description(f'Epoch {epoch+1}')
  for batch in loop:
    optim.zero_grad()

    # Move the batch to the same device as the model
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)

    # Passing the gold positions makes the model return the training loss
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs.loss
    loss.backward()
    optim.step()

    # Show the loss of the latest batch in the progress bar
    loop.set_postfix(loss=loss.item())

**Salvar modelo**

In [None]:
# Save the fine-tuned model and the tokenizer side by side in one directory
model_path = 'BERT-FaQuAD/'
os.makedirs(model_path, exist_ok=True)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

**Carregar o Modelo**

In [None]:
#from transformers import BertForQuestionAnswering, BertTokenizerFast

#model_path = '/content/drive/MyDrive/BERT-SQuAD'
#model = BertForQuestionAnswering.from_pretrained(model_path)
#tokenizer = BertTokenizerFast.from_pretrained(model_path)

#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#print(f'Working on {device}')

#model = model.to(device)

### **Teste**

In [None]:
# Token-position accuracy on the validation set: the fraction of start and end
# predictions that match the gold token indices exactly.
model.eval()

n_correct = 0
n_total = 0

with torch.no_grad():
  for batch in tqdm(valid_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)

    # Most likely start/end token for every example in the batch
    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    # Count hits over examples instead of averaging per-batch accuracies, so a
    # smaller last batch is not over-weighted.
    n_correct += (start_pred == start_true).sum().item()
    n_correct += (end_pred == end_true).sum().item()
    n_total += 2 * len(start_pred)

acc = n_correct / n_total if n_total else 0.0
print(acc)
# print("\n\nT/P\tanswer_start\tanswer_end\n")
# for i in range(len(start_true)):
#   print(f"true\t{start_true[i]}\t{end_true[i]}\n"
#         f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

### **Fazer as perguntas**

In [None]:
def get_prediction(context, question):
  """Return the answer span the model predicts for ``question`` in ``context``.

  The pair is encoded as ``(context, question)`` with truncation at 512
  tokens. That is the same order and length cap used to build the training
  encodings in this notebook, so the model sees the input layout it was
  fine-tuned on.

  Args:
    context: Passage that should contain the answer.
    question: Question to answer.

  Returns:
    The predicted answer as a string, decoded from the tokens between the most
    likely start token and the most likely end token (inclusive). The string
    is empty when the predicted end comes before the predicted start.
  """
  inputs = tokenizer(context, question, truncation=True, max_length=512, return_tensors='pt').to(device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    outputs = model(**inputs)

  answer_start = torch.argmax(outputs.start_logits)
  # +1 because the slice end is exclusive.
  answer_end = torch.argmax(outputs.end_logits) + 1

  answer_ids = inputs['input_ids'][0][answer_start:answer_end]
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))

  return answer

def normalize_text(s):
  """Normalize an answer string before comparing it with another one.

  Steps, in order: lowercase; remove ASCII punctuation; replace the
  standalone words "um", "uma" and "o" with a space; collapse all runs of
  whitespace into single spaces and trim both ends.

  Args:
    s: Text to normalize.

  Returns:
    The normalized string.
  """
  import string, re

  lowered = s.lower()

  punctuation = set(string.punctuation)
  without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)

  article_pattern = re.compile(r"\b(um|uma|o)\b", re.UNICODE)
  without_articles = article_pattern.sub(" ", without_punctuation)

  return " ".join(without_articles.split())

def exact_match(prediction, truth):
    """Return True when prediction and truth are equal after normalization."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return normalized_prediction == normalized_truth

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted answer and the gold answer.

  Both strings are normalized and split on whitespace. The overlap counts
  each token as many times as it occurs in both answers (multiset
  intersection), so two identical answers score 1 even when they contain
  repeated tokens.

  Args:
    prediction: Predicted answer text.
    truth: Gold answer text.

  Returns:
    The F1 score rounded to two decimals. When either side has no tokens the
    result is 1 if both are empty and 0 otherwise; 0 is also returned when
    there is no token in common.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Multiset intersection: a token repeated k times in both answers counts k times.
  overlap = Counter(pred_tokens) & Counter(truth_tokens)
  n_common = sum(overlap.values())

  # if there are no common tokens then f1 = 0
  if n_common == 0:
    return 0

  prec = n_common / len(pred_tokens)
  rec = n_common / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)

def question_answer(context, question, answer):
  """Predict an answer and score it against the gold answer.

  Args:
    context: Passage in which the answer is searched.
    question: Question to answer.
    answer: Gold answer text.

  Returns:
    A tuple ``(exact match, F1 score)`` for the prediction.
  """
  predicted = get_prediction(context, question)
  return exact_match(predicted, answer), compute_f1(predicted, answer)

#   print(f'Question: {question}')
#   print(f'Prediction: {prediction}')
#   print(f'True Answer: {answer}')
#   print(f'Exact match: {em_score}')
#   print(f'F1 score: {f1_score}\n')

In [None]:
# Gold answer texts of the validation split, in the same order as valid_answers
answers = [gold['text'] for gold in valid_answers]

In [None]:
import numpy as np

# Score every validation example and collect the per-example metrics
em_score_results = []
f1_score_results = []
for context, question, answer in zip(valid_contexts, valid_questions, answers):
  em_score, f1_score = question_answer(context, question, answer)
  em_score_results.append(em_score)
  f1_score_results.append(f1_score)

# Report the mean of each metric over all validation examples
print(f"Exact match: {np.asarray(em_score_results).mean()}")
print(f"F1 score: {np.asarray(f1_score_results).mean()}")