In [None]:
%pip install google.colab

In [None]:
# Mount Google Drive at /content/gdrive using the google.colab helper
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
%pip install transformers
%pip install datasets
%pip install sentencepiece
%pip install rouge_score
%pip install nbformat
%pip install plotly
%pip install torch

In [None]:
# Helper that prints a long text wrapped across several lines
import textwrap
def print_lines(texto, w=50):
    """Print ``texto`` wrapped so that no output line is wider than ``w``.

    Args:
        texto: Text to display.
        w: Maximum line width handed to ``textwrap.wrap``.

    Returns:
        None. The wrapped lines are written to standard output, one per line;
        nothing is printed when wrapping yields no lines.
    """
    wrapped = textwrap.wrap(texto, width=w)
    if wrapped:
        print("\n".join(wrapped))

In [None]:
from datasets import load_dataset,load_metric
# Load the "exams" dataset with the "crosslingual_with_para_pt" configuration from the Hugging Face Hub
dataset = load_dataset("exams", "crosslingual_with_para_pt")
# Bare expression: shows the loaded dataset object as the cell output
dataset

In [None]:
# Build the model input strings from the train split.
# `correto` holds four flags per question (positions A, B, C, D): 'Sim' for
# the alternative matching the answer key, 'Não' for the others.
train_questions = dataset['train']['question']
questions = [item['stem'] for item in train_questions]
choices = [item['choices'] for item in train_questions]
subjects = [meta['subject'] for meta in dataset['train']['info']]
correto = [
    'Sim' if key == letter else 'Não'
    for key in dataset['train']['answerKey']
    for letter in 'ABCD'
]
# One input per alternative flagged 'Não': topic, question stem and the
# paragraph attached to that alternative.
inputs = []
for flat_idx, flag in enumerate(correto):
    if flag != 'Não':
        continue
    q_idx, alt_idx = divmod(flat_idx, 4)
    inputs.append(
        "Tópico: {} Questão: {} Context: {}".format(
            subjects[q_idx], questions[q_idx], choices[q_idx]['para'][alt_idx]
        )
    )

In [None]:
# Build the expected outputs: the text of every alternative flagged 'Não' in
# `correto`, in the same question/position order used for the inputs
choices = [item['choices'] for item in dataset['train']['question']]
outputs = [
    alternativas['text'][pos]
    for q_idx, alternativas in enumerate(choices)
    for pos in range(4)
    if correto[4 * q_idx + pos] == "Não"
]

In [None]:
# Preview only the first targets; displaying the whole list floods the cell output
outputs[:10]

In [None]:
# Task prefix string; it is not referenced anywhere else in the visible notebook
prefix = "question: "
# Maximum number of tokens kept when tokenizing the model inputs
max_input_length = 512
# Maximum number of tokens kept when tokenizing the targets (also caps generation later on)
max_target_length = 32

In [None]:
def preprocess_examples(examples):
    """Tokenize one batch of rows into seq2seq model inputs and labels.

    Each question is expanded into one training pair per *incorrect*
    alternative (alternatives are mapped by position to the letters A-D):
    the input is a string with the topic, the question stem and the paragraph
    attached to that alternative, and the target is the alternative's text.

    Only the rows in ``examples`` are used, so the function works for any
    split passed through ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch as a dict of columns. It must provide ``question``
            (items with ``stem`` and ``choices`` holding parallel ``text`` and
            ``para`` lists), ``info`` (items with ``subject``) and
            ``answerKey``.

    Returns:
        The tokenizer output for the input strings, with an added ``labels``
        entry holding the target token ids in which padding positions are
        replaced by -100.
    """
    inputs = []
    outputs = []
    rows = zip(examples['question'], examples['info'], examples['answerKey'])
    for question, info, answer_key in rows:
        alternatives = question['choices']
        # zip stops at the shortest sequence, so a question with fewer than
        # four alternatives no longer raises an IndexError.
        for letter, text, para in zip('ABCD', alternatives['text'], alternatives['para']):
            if letter == answer_key:
                # The correct alternative is not used as a training target.
                continue
            inputs.append(
                "Tópico: {} Questão: {} Context: {}".format(info['subject'], question['stem'], para)
            )
            outputs.append(text)
    # inputs and outputs are plain text here; tokenize them to fixed lengths.
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)
    # Only the token ids of the targets are kept as labels.
    labels = tokenizer(outputs, max_length=max_target_length, padding="max_length", truncation=True).input_ids
    # Padding positions are replaced by -100 so they are not treated as real target tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels
    ]
    return model_inputs

In [None]:
from torch.utils.data import DataLoader
def create_dataloaders(train_batch_size=8, eval_batch_size=32):
    """Build the PyTorch data loaders for the encoded train and validation sets.

    Reads the module-level ``encoded_train_ds`` and ``encoded_val_ds``.

    Args:
        train_batch_size: Batch size of the (shuffled) training loader.
        eval_batch_size: Batch size of the (unshuffled) validation loader.

    Returns:
        Tuple ``(train_dataloader, val_dataloader)``.
    """
    return (
        DataLoader(encoded_train_ds, shuffle=True, batch_size=train_batch_size),
        DataLoader(encoded_val_ds, shuffle=False, batch_size=eval_batch_size),
    )

In [None]:
from transformers import T5Tokenizer
# Base checkpoint used for the fine-tuning
checkpoint = 'unicamp-dl/ptt5-base-portuguese-vocab'
# Initialize the tokenizer, with model_max_length set to max_input_length
tokenizer = T5Tokenizer.from_pretrained(checkpoint,model_max_length=max_input_length)

In [None]:
# Take the train and validation splits out of the loaded dataset
train,val = dataset['train'],dataset['validation']

In [None]:
# Run preprocess_examples over each split in batches; the original columns are
# removed, so the encoded datasets keep only what preprocess_examples returns
encoded_train_ds = train.map(preprocess_examples, batched=True, remove_columns=train.column_names)
encoded_val_ds = val.map(preprocess_examples, batched=True, remove_columns=val.column_names)

In [None]:
# Switch both encoded datasets to the "torch" output format
encoded_train_ds.set_format(type="torch")
encoded_val_ds.set_format(type="torch")

In [None]:
from transformers import EarlyStoppingCallback
from transformers import T5Model, T5ForConditionalGeneration

# PyTorch model loaded from the base checkpoint
model_pt = T5ForConditionalGeneration.from_pretrained(checkpoint)
# Early stopping is a technique that halts the fine-tuning when it has not improved for 3 consecutive evaluations
early = EarlyStoppingCallback(early_stopping_patience=3)

In [None]:
import nltk
# nltk is needed by compute_metrics (it calls nltk.sent_tokenize); the 'punkt' resource is downloaded for it
nltk.download('punkt')
# ROUGE metric object used by compute_metrics
# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases — confirm against the installed version
metric = load_metric("rouge")

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and the mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids. Positions equal to -100 are treated as padding in
            both arrays.

    Returns:
        Dict mapping every key returned by the ROUGE metric to its mid
        F-measure scaled to 0-100, plus ``gen_len`` (mean number of non-pad
        tokens per prediction). All values are rounded to 4 decimals.
    """
    # Predictions and the labels (what is expected)
    predictions, labels = eval_pred
    # Some models hand back extra outputs next to the generated ids; keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 cannot be decoded, so map it back to the pad token in both arrays.
    # Predictions can carry -100 too when sequences of different lengths are padded together.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (computed after the -100 clean-up so padding is not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import DataCollatorForSeq2Seq

# Seq2seq data collator built from the tokenizer and the model being fine-tuned; it is passed to the trainer below
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pt)

In [None]:
from transformers import Seq2SeqTrainingArguments

import torch

batch_size = 8
num_train_epochs = 20
# Show the training loss with every epoch. Clamped to at least 1: a dataset
# smaller than one batch would otherwise give logging_steps == 0, which the
# step-based logging strategy rejects.
logging_steps = max(1, len(encoded_train_ds) // batch_size)
# Name used for the output directory; change it if a different convention is needed
model_name = "question2answer"
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep a single checkpoint; on Colab it is important not to use much storage
    save_total_limit=1,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    # Let evaluation generate up to the same length the targets were tokenized
    # to, instead of the model's default generation length
    generation_max_length=max_target_length,
    logging_steps=logging_steps,
    # Make sure the best model is the one loaded at the end of training
    load_best_model_at_end=True,
    save_strategy="epoch",
    # Mixed precision needs a CUDA device; fall back to full precision otherwise
    fp16=torch.cuda.is_available()
)

In [None]:
from transformers import Seq2SeqTrainer
# Create the Trainer object, which receives the arguments defined above together with the encoded (numeric) datasets,
# the data collator, the tokenizer, the metrics function and the early-stopping callback
trainer = Seq2SeqTrainer(
    model_pt,
    args,
    train_dataset=encoded_train_ds,
    eval_dataset=encoded_val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early]
    
)

In [None]:
# Run the fine-tuning with the trainer configured above
trainer.train()

In [None]:
# Move the model to the CPU and save it with save_pretrained under "temporal_best/"
model_pt.to("cpu")
model_pt.save_pretrained("temporal"+"_best/")

In [None]:
# Bare expression: shows the validation split object as the cell output
val

In [None]:
# Paragraph attached to the first alternative of validation row 100.
# The row is indexed first so a whole column is not fetched to read one value.
val[100]['question']['choices']['para'][0]

In [None]:
# Question stem of validation row 100.
# The row is indexed first so a whole column is not fetched to read one value.
val[100]['question']['stem']

In [None]:
# Text of the first alternative of validation row 100.
# The row is indexed first so a whole column is not fetched to read one value.
val[100]['question']['choices']['text'][0]

In [None]:
# Build the prompt for validation row 100 from its topic, question stem and the
# paragraph of its first alternative. The row is fetched once instead of
# reading three full columns.
sample = val[100]
query = [
    "Tópico: {} Questão: {} Context: {}".format(
        sample['info']['subject'],
        sample['question']['stem'],
        sample['question']['choices']['para'][0],
    )
]

In [None]:
# Tokenize the prompt and keep only the input ids, returned as PyTorch tensors.
# NOTE(review): the name `input` shadows the Python builtin; it is kept because the generation cell reads it.
input = tokenizer(query, return_tensors="pt",padding=True,truncation=True).input_ids

In [None]:
# Generate with 10 beams and return 10 sequences, each limited to max_target_length new tokens
output = model_pt.generate(input,max_new_tokens=max_target_length,num_beams=10,num_return_sequences=10)

In [None]:
# Decode the generated ids back to text, skipping special tokens
alternativas = tokenizer.batch_decode(output, skip_special_tokens=True)

In [None]:
# Show every generated alternative on its own line
for alternativa in alternativas:
    print(alternativa)