In [None]:
%pip install transformers==4.28.0 #Installing latest version is running into dependency problems
%pip install datasets
%pip install sentencepiece
%pip install nbformat
%pip install plotly
%pip install torch
%pip install evaluate
%pip install sklearn
%pip install numpy

# Testing fine-tuned models

### Define support class for data preprocessing

In [1]:
from torch.utils.data import Dataset
class TestDataset(Dataset):
    """
    Exposes the test split as (prompt, expected wrong answer) pairs so that large
    test sets can be processed in batches, which speeds up inference.

    Each question contributes one sample per incorrect choice:
      * prompt: "Tópico: <subject> Questão: <stem> Resposta: <text of the correct
        choice> Contexto: <'para' entry of the incorrect choice>"
      * expected output: the text of that incorrect choice.

    dataset: container indexable by column name (e.g. the pandas DataFrame of the
        test split) with the columns 'question' (items holding 'stem' and
        'choices', the latter with 'text' and 'para' sequences), 'info' (items
        holding 'subject') and 'answerKey' (one of 'A', 'B', 'C', 'D').

    Raises ValueError when an answer key is not one of 'A'-'D'; such a row cannot
    be turned into consistent samples because the correct choice is unknown.
    """

    # Position of the correct choice for each supported answer-key letter.
    _ANSWER_KEY_TO_INDEX = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

    def __init__(self, dataset):
        questions = [x['stem'] for x in dataset['question']]
        subjects = [x['subject'] for x in dataset['info']]
        choices = [x['choices'] for x in dataset['question']]

        inputs = list()
        outputs = list()
        rows = zip(subjects, questions, choices, dataset['answerKey'])
        for row_number, (subject, question, choice, answerKey) in enumerate(rows):
            if answerKey not in self._ANSWER_KEY_TO_INDEX:
                raise ValueError(
                    "Unsupported answerKey {!r} at row {}; expected one of {}".format(
                        answerKey, row_number, sorted(self._ANSWER_KEY_TO_INDEX)))
            correct_index = self._ANSWER_KEY_TO_INDEX[answerKey]
            # Prompt and expected output are appended together so they always stay aligned.
            for i in range(len(self._ANSWER_KEY_TO_INDEX)):
                if i == correct_index:
                    continue  # only the incorrect choices become samples
                inputs.append("Tópico: {} Questão: {} Resposta: {} Contexto: {}".format(
                    subject,
                    question,
                    choice['text'][correct_index],
                    choice['para'][i]
                    ))
                outputs.append(choice['text'][i])

        self.data = dataset
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        """Number of (prompt, expected output) samples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the (prompt, expected output) pair stored at `index`."""
        return self.inputs[index],self.outputs[index]

### Load dataset

In [2]:
from datasets import load_dataset
from datasets import concatenate_datasets
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Load dataset from huggingface
data_raw = load_dataset("exams", "crosslingual_with_para_pt")

# Concatenate the original splits so they can be re-partitioned below
concatenated_dataset = concatenate_datasets([data_raw['train'], data_raw['validation']])

# First split the dataset into 'train' and 'validation and test', using the question
# subject as the stratification label. Only the LAST of the 5 folds is kept, so it is
# taken directly instead of materialising (and discarding) the other folds with .select().
# NOTE(review): the split is only reproducible because StratifiedKFold is used without
# shuffling — keep it that way so the test rows stay disjoint from the training rows.
folds = StratifiedKFold(n_splits=5)
concatenated_subject_list = [x['subject'] for x in concatenated_dataset['info']]
splits = folds.split(np.zeros(concatenated_dataset.num_rows), concatenated_subject_list)
*_, (train_index, test_index) = splits
test_and_validation_splits = concatenated_dataset.select(test_index)
data_raw["train"] = concatenated_dataset.select(train_index)

# Now split 'validation and test' into 'validation' and 'test', again keeping the last fold
folds = StratifiedKFold(n_splits=2)
test_and_validation_subject_list = [x['subject'] for x in test_and_validation_splits['info']]
splits = folds.split(np.zeros(test_and_validation_splits.num_rows), test_and_validation_subject_list)
*_, (validation_index, test_index) = splits
data_raw["validation"] = test_and_validation_splits.select(validation_index)
data_raw["test"] = test_and_validation_splits.select(test_index)

# The testing code below consumes the test split as a pandas DataFrame
dataset = data_raw['test'].to_pandas()


Found cached dataset exams (/home/hiroki/.cache/huggingface/datasets/exams/crosslingual_with_para_pt/1.0.0/4330a7899d757352941782397b09a4603093ef04e23873967042f5cee03cadd8)


  0%|          | 0/2 [00:00<?, ?it/s]

### Define metric computation functions

In [None]:
# from fast_bleu import BLEU
# from nltk.tokenize import RegexpTokenizer
# def bleu(original, generated):
#     tokenizer= RegexpTokenizer(r'\w+')
#     original = original.lower()
#     generated = generated.lower()
#     sentence1 = tokenizer.tokenize(original)
#     sentence2 = tokenizer.tokenize(generated)
#     list_of_references = [sentence1]
#     hypotheses = [sentence2]
#     weights = {'bigram': (1/2., 1/2.)}
#     bleu = BLEU(list_of_references, weights)
#     return round(bleu.get_score(hypotheses)['bigram'][0],6)

In [3]:
import evaluate
from functools import lru_cache


@lru_cache(maxsize=None)
def _bleu_metric():
    """Load the `bleu` metric once; later calls reuse the same object."""
    return evaluate.load('bleu')


def bleu(references, predictions):
    """
    Compute the BLEU score of `predictions` against `references`.

    references: a single reference string, or an already prepared list of references
    predictions: a single predicted string, or an already prepared list of predictions

    A bare string is lower-cased and wrapped in a one-element list; lists are
    forwarded unchanged. Returns the 'bleu' entry of the metric result.

    The metric is loaded through a cached helper because this function is called
    once per sample, and reloading the metric on every call dominated the runtime.
    """
    if isinstance(references, str):
        references = [references.lower()]
    if isinstance(predictions, str):
        predictions = [predictions.lower()]
    results = _bleu_metric().compute(predictions=predictions, references=references)
    return results['bleu']

In [4]:
import evaluate
from functools import lru_cache


@lru_cache(maxsize=None)
def _rouge_metric():
    """Load the `rouge` metric once; later calls reuse the same object."""
    return evaluate.load('rouge')


def rouge(references, predictions):
    """
    Compute ROUGE scores of `predictions` against `references`.

    references: a single reference string, or an already prepared list of references
    predictions: a single predicted string, or an already prepared list of predictions

    A bare string is lower-cased and wrapped in a one-element list; lists are
    forwarded unchanged. Returns the tuple (rouge1, rouge2, rougeL) taken from
    the metric result.

    The metric is loaded through a cached helper because this function is called
    once per sample, and reloading the metric on every call dominated the runtime.
    """
    if isinstance(references, str):
        references = [references.lower()]
    if isinstance(predictions, str):
        predictions = [predictions.lower()]
    results = _rouge_metric().compute(predictions=predictions, references=references)
    return results['rouge1'], results['rouge2'], results['rougeL']

### Define testing function

In [24]:
from torch.utils.data import DataLoader
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
def test_generations(model,dataset):
    """
    Essa funcao usa dataset em formato pandas
    model: modelo da huggingface (pipeline)
    dataset: split de teste do dataset no formato pandas
    output_name: nome do arquivo para salvar
    """
    #data = pd.read_csv(dataset)
    test = TestDataset(dataset)
    ##################################################
    # batch_size pode dar problemas se for muito grande reduzir
    ##################################################
    dataloader = DataLoader(test, batch_size=64, shuffle=False)
    total_result = defaultdict(list)
    for batch in tqdm(dataloader):
        entrada,saida_esperada = batch
        saida_predita = model.predict([x for x in entrada])
        saida_predita = list(map(lambda x: x['generated_text'], saida_predita))
        saida_esperada = list(saida_esperada)
        total_result['saida_esperada'] += saida_esperada
        total_result['saida_predita'] += saida_predita
        # Compute metrics
        total_result['bleu'] += [bleu(x,y) for x,y in zip(saida_esperada, saida_predita)]
        rouge1, rouge2, rougeL = zip(*[rouge(x,y) for x,y in zip(saida_esperada, saida_predita)])
        total_result['rouge1'] += rouge1
        total_result['rouge2'] += rouge2
        total_result['rougeL'] += rougeL
    return pd.DataFrame(total_result)

### Load Model and create pipeline

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [None]:
# Use this only if running on Google Colab
# Only the .pt file was saved during model fine-tuning, so we must load and re-save it to get the config files
from transformers import T5Model, T5ForConditionalGeneration
from transformers import T5Tokenizer
import torch

# Adjust these two settings for each testing session
model_save_name = 'wrong_answers_v5.pt'
new_folder = 'DatasetBalance_CorrectInInput'

# Base model: its weights are replaced by the checkpoint loaded further down
model_pt = T5ForConditionalGeneration.from_pretrained(
    'unicamp-dl/ptt5-base-portuguese-vocab'
)

# Tokenizer of the base model
tokenizer = T5Tokenizer.from_pretrained(
    'unicamp-dl/ptt5-base-portuguese-vocab',
    model_max_length=512,
)

# Restore our checkpoint, mapping its tensors to the CPU
path = f'../models/{model_save_name}'
model_pt.load_state_dict(
    state_dict=torch.load(path, map_location=torch.device('cpu'))
)

# Re-save the model together with its config files under the new folder
model_pt.save_pretrained(f'../models/WrongAnswers/{new_folder}')

In [20]:
from transformers import pipeline
# source_model: name of a model in the Hugging Face public repository, or a local model directory
# model_save_name = 'wrong_answers_v5.pt'
# source_model = F'/content/gdrive/My Drive/Models/WrongAnswer/{new_folder}' # Gdrive directory
# `new_folder` is set in the checkpoint-conversion cell above. The f-prefix is required:
# without it the literal text "{new_folder}" is used as the directory name.
source_model = f'../models/WrongAnswers/{new_folder}' #Local directory
source_tokenizer = 'unicamp-dl/ptt5-base-portuguese-vocab'
model = pipeline("text2text-generation",
                 model=source_model,
                 tokenizer=source_tokenizer,
                 device=-1)  # >=0 selects a GPU, -1 runs on the CPU (default)
                # Running without a GPU can take more than twice as long as running on one


### Run Tests

In [26]:
# Generate and score an answer for every sample of the test split.
# This is slow: the run recorded below took about 21 minutes for 5 batches.
result_exams = test_generations(model=model,dataset=dataset)

100%|██████████| 5/5 [21:36<00:00, 259.25s/it]


In [27]:
from pathlib import Path

# Create the results folder first: to_csv does not create missing parent directories,
# and failing here would throw away the long test run above.
results_path = Path('test_results/wrong_answers/dataset_balancing_+_correct_answer_in_input.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
result_exams.to_csv(results_path,index=False)

### Report Results

In [28]:
result_exams.head()

Unnamed: 0,saida_esperada,saida_predita,bleu,rouge1,rouge2,rougeL
0,"NE-SO, em que a compactação dos materiais cont...","NE-SE, em que a resistência dos materiais cont...",0.547586,0.823529,0.6875,0.764706
1,"NO-SE, em que a erosão dos materiais contribui...","NO-SE, em que a erosão contribuiu para a forma...",0.436499,0.8,0.642857,0.733333
2,"NE-SO, em que o afundimento dos materiais cont...","NE-SE, em que a resistência dos materiais cont...",0.616261,0.774194,0.62069,0.774194
3,um argumento a favor do catastrofismo,uma declaração de amor à vida,0.0,0.0,0.0,0.0
4,uma evidência de um episódio de orogenia,uma evidência estrutural da colisão de placas ...,0.0,0.4,0.222222,0.4


In [29]:
# Show metrics
result_exams.describe()

Unnamed: 0,bleu,rouge1,rouge2,rougeL
count,276.0,276.0,276.0,276.0
mean,0.121412,0.47983,0.28742,0.451213
std,0.238848,0.288522,0.308156,0.287471
min,0.0,0.0,0.0,0.0
25%,0.0,0.285714,0.0,0.25
50%,0.0,0.5,0.2,0.416199
75%,0.0,0.666667,0.5,0.666667
max,1.0,1.0,1.0,1.0
