## Concatenating the original dataset's training and validation splits

In [None]:
%pip install transformers==4.28.0 #Installing latest version is running into dependency problems
%pip install datasets
%pip install sentencepiece
%pip install rouge_score
%pip install nbformat
%pip install plotly
%pip install torch
%pip install evaluate
%pip install sklearn
%pip install numpy
%pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transfor

In [None]:
# Log in to Weights & Biases; per the captured output this prompts for an API key
# interactively and stores it in the netrc file.
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Set WANDB project name env variable
%env WANDB_PROJECT=question2wrongAnswer
%env WANDB_LOG_MODEL='end'
%env WANDB_WATCH=all

env: WANDB_PROJECT=question2wrongAnswer
env: WANDB_LOG_MODEL='end'
env: WANDB_WATCH=all


In [None]:
# NOTE(review): load_metric is imported here but not used in the visible cells.
from datasets import load_dataset,load_metric
# Load the "crosslingual_with_para_pt" configuration of the "exams" dataset from the Hugging Face hub
dataset = load_dataset("exams", "crosslingual_with_para_pt")
# Display the loaded object as the cell output
dataset

In [None]:
from datasets import concatenate_datasets
# Merge the original 'train' and 'validation' splits into a single dataset so that
# it can be re-split in the next section.
concatenated_dataset = concatenate_datasets([dataset['train'], dataset['validation']])

## Split concatenated dataset into train, validation and test splits

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Stage 1: separate 'train' from a held-out 'validation + test' portion, stratifying on
# the question subject. Five folds are generated and the last one is the one kept.
folds = StratifiedKFold(n_splits=5)
concatenated_subject_list = [info['subject'] for info in concatenated_dataset['info']]
# StratifiedKFold only needs the labels; the feature array is a placeholder of the right length
placeholder_features = np.zeros(concatenated_dataset.num_rows)
*_, (train_index, test_index) = folds.split(placeholder_features, concatenated_subject_list)
test_and_validation_splits = concatenated_dataset.select(test_index)
dataset["train"] = concatenated_dataset.select(train_index)

# Stage 2: split the held-out portion into 'validation' and 'test', again stratified by
# subject. Two folds are generated and the last one is the one kept.
folds = StratifiedKFold(n_splits=2)
held_out_subject_list = [info['subject'] for info in test_and_validation_splits['info']]
placeholder_features = np.zeros(test_and_validation_splits.num_rows)
*_, (validation_index, test_index) = folds.split(placeholder_features, held_out_subject_list)
dataset["validation"] = test_and_validation_splits.select(validation_index)
dataset["test"] = test_and_validation_splits.select(test_index)


# Train model with balanced dataset

### Preprocess data

In [None]:
# Run configuration shared by the cells below
# Name of the model; change it here if a different naming pattern is needed
model_name = "question2answer"
prefix = "question: "

# Sequence length limits (in tokens) for model inputs and targets
max_input_length, max_target_length = 512, 32

# Optimisation schedule
num_train_epochs = 20
batch_size = 8

In [None]:
from transformers import AutoTokenizer
# Base model used as the starting point for fine-tuning
checkpoint = 'Narrativa/mT5-base-finetuned-tydiQA-xqa'
# Initialise the tokenizer from the same checkpoint (single source of truth for the name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/8.33M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [None]:
def _correct_choice_index(choice_set, answer_key):
    """Return the position of the correct alternative, or -1 when it cannot be determined."""
    labels = list(choice_set.get('label') or [])
    if answer_key in labels:
        return labels.index(answer_key)
    # Fallback when the choices carry no usable labels: interpret the key as a letter
    if isinstance(answer_key, str) and len(answer_key) == 1:
        return "ABCDE".find(answer_key)
    return -1


def preprocess_examples(examples):
    """
    Tokenize a batch of multiple-choice questions into (input, wrong answer) pairs.

    One pair is produced for every *incorrect* alternative of every question in the
    batch. The model input is a string holding the subject, the question stem, the
    text of the correct answer and the context paragraph attached to the wrong
    alternative; the target is the text of that wrong alternative.

    Args:
        examples: batch dict as passed by ``Dataset.map(..., batched=True)``. It must
            provide the columns ``question`` (dicts with ``stem`` and ``choices``,
            where ``choices`` holds parallel ``text`` / ``para`` lists and optionally
            ``label``), ``info`` (dicts with ``subject``) and ``answerKey``.

    Returns:
        The tokenizer output for the inputs with an extra ``labels`` entry containing
        the target token ids, where padding positions are replaced by -100 so they
        are ignored by the loss. The number of returned rows generally differs from
        the number of rows in ``examples``.
    """
    inputs = []
    outputs = []
    # Use the batch that was handed in, never a global split, so that every split is
    # encoded from its own rows.
    rows = zip(examples['question'], examples['info'], examples['answerKey'])
    for question, info, answer_key in rows:
        choice_set = question['choices']
        texts = choice_set['text']
        contexts = choice_set['para']
        correct_index = _correct_choice_index(choice_set, answer_key)
        if not 0 <= correct_index < len(texts):
            # The answer key matches no alternative: skip rather than mislabel
            continue
        correct_text = texts[correct_index]
        for position, wrong_text in enumerate(texts):
            if position == correct_index:
                continue
            context = contexts[position] if position < len(contexts) else ""
            inputs.append("Tópico: {} Questão: {} Resposta: {} Contexto: {}".format(
                info['subject'],
                question['stem'],
                correct_text,
                context
            ))
            outputs.append(wrong_text)

    if not inputs:
        # Nothing usable in this batch; return empty columns instead of tokenizing []
        return {"input_ids": [], "attention_mask": [], "labels": []}

    # inputs and outputs are plain text at this point; tokenize both
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)
    # only the token ids of the targets are kept
    label_ids = tokenizer(outputs, max_length=max_target_length, padding="max_length", truncation=True).input_ids
    # padding positions must not contribute to the loss, so they are mapped to -100
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

In [None]:
# Pull the three splits out of the dataset dictionary
train, val, test = (dataset[split_name] for split_name in ('train', 'validation', 'test'))
# Run the preprocessing over the train and validation splits, dropping the raw columns
encoded_train_ds = train.map(preprocess_examples, batched=True, remove_columns=train.column_names)
encoded_val_ds = val.map(preprocess_examples, batched=True, remove_columns=val.column_names)
# Have both encoded datasets return torch tensors
for encoded_split in (encoded_train_ds, encoded_val_ds):
    encoded_split.set_format(type="torch")

Map:   0%|          | 0/740 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader
def create_dataloaders(train_batch_size=8, eval_batch_size=32):
    """
    Build the training and validation data loaders.

    Reads the module-level ``encoded_train_ds`` and ``encoded_val_ds``.

    Args:
        train_batch_size: batch size of the (shuffled) training loader.
        eval_batch_size: batch size of the (unshuffled) validation loader.

    Returns:
        A ``(train_dataloader, val_dataloader)`` tuple.
    """
    loaders = (
        DataLoader(encoded_train_ds, batch_size=train_batch_size, shuffle=True),
        DataLoader(encoded_val_ds, batch_size=eval_batch_size, shuffle=False),
    )
    return loaders

In [None]:
from transformers import EarlyStoppingCallback
from transformers import T5Model, MT5ForConditionalGeneration
import torch

# Load the model that will be fine-tuned, from the same `checkpoint` the tokenizer was
# built from, and move it to the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MT5ForConditionalGeneration.from_pretrained(checkpoint).to(device)
# Early stopping: training is halted once the monitored metric has failed to improve
# for 3 consecutive evaluations.
early = EarlyStoppingCallback(early_stopping_patience=3)

Downloading (…)lve/main/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Build the seq2seq data collator from the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Log the training loss roughly once per epoch; never let the interval drop to 0
# when the dataset holds fewer examples than one batch.
logging_steps = max(1, len(encoded_train_ds) // batch_size)

# Instantiate Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # keep a single checkpoint on disk; storage on Colab is limited
    save_total_limit=1,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    # reload the best checkpoint once training finishes
    load_best_model_at_end=True,
    save_strategy="epoch",
    # fp16 is disabled: with this mT5 checkpoint it produced a training loss of 0.0
    # and a NaN validation loss. Train in full precision instead.
    fp16=False,
    report_to="wandb",
    run_name='dataset-balancing-+-correct-answer-in-input-+-multilingual'
)

### Set variables and methods related to metrics computation

In [None]:
import nltk
import evaluate
# Download the nltk 'punkt' data; compute_metrics calls nltk.sent_tokenize,
# which presumably relies on it (verify against the nltk version in use).
nltk.download('punkt')

# Load the ROUGE and BLEU metrics through the `evaluate` library
rouge = evaluate.load('rouge', module_type="metric")
bleu = evaluate.load('bleu', module_type="metric")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
import nltk
import numpy as np

def postprocess_text(preds, labels):
    """
    Strip surrounding whitespace from predictions and references.

    Args:
        preds: iterable of predicted strings.
        labels: iterable of reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of single-element lists, each holding one stripped
        reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """
    Compute ROUGE, BLEU and the mean generation length for an evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; ``predictions``
            may also be a tuple, in which case its first element is used.

    Returns:
        Dict with the ROUGE scores and ``"bleu"`` scaled by 100 and rounded to two
        decimals, plus ``"gen_len"``, the mean number of non-padding tokens per
        prediction.
    """
    preds, labels = eval_preds
    preds = preds[0] if isinstance(preds, tuple) else preds
    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded, so swap it for the pad id before decoding
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def one_sentence_per_line(text):
        # ROUGE expects a newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    rouge_decoded_preds = [one_sentence_per_line(text) for text in decoded_preds]
    rouge_decoded_labels = [one_sentence_per_line(text) for text in decoded_labels]

    # BLEU takes stripped predictions and a list of references per prediction
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Metric computation
    result = rouge.compute(predictions=rouge_decoded_preds, references=rouge_decoded_labels, use_stemmer=True)
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result["bleu"] = bleu_result["bleu"]

    # Report the scores as percentages with two decimals
    result = {name: round(score*100, 2) for name, score in result.items()}
    result["gen_len"] = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    return result

### Finetune the model

In [None]:
from transformers import Seq2SeqTrainer
# Create the trainer object, handing it the training arguments defined earlier
# together with the encoded (numeric) datasets.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=encoded_train_ds,
    eval_dataset=encoded_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early],
)

In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the model is saved under this
# mount point at the end of the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Run the fine-tuning, then close the Weights & Biases run
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mhirokibastos[0m ([33mquizzing[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Gen Len
1,0.0,,19.25,6.29,17.39,17.39,5.27,15.963964
2,0.0,,19.25,6.29,17.39,17.39,5.27,15.963964
3,0.0,,19.25,6.29,17.39,17.39,5.27,15.963964
4,0.0,,19.25,6.29,17.39,17.39,5.27,15.963964




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/bleu,▁▁▁▁
eval/gen_len,▁▁▁▁
eval/rouge1,▁▁▁▁
eval/rouge2,▁▁▁▁
eval/rougeL,▁▁▁▁
eval/rougeLsum,▁▁▁▁
eval/runtime,▁█▇█
eval/samples_per_second,█▁▂▁
eval/steps_per_second,█▂▂▁
train/epoch,▁▁▃▃▆▆███

0,1
eval/bleu,5.27
eval/gen_len,15.96396
eval/loss,
eval/rouge1,19.25
eval/rouge2,6.29
eval/rougeL,17.39
eval/rougeLsum,17.39
eval/runtime,565.7732
eval/samples_per_second,3.924
eval/steps_per_second,0.491


# Save the fine-tuned model to Google Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [None]:
import os
import torch

# Persist only the model weights (state dict) to the mounted Google Drive
model_save_name = 'wrong_answers_datasetbalancing_correctanswerinput_multilingual.pt'
save_dir = "/content/gdrive/MyDrive/Models"
# torch.save does not create missing directories, so make sure the target exists
os.makedirs(save_dir, exist_ok=True)
path = os.path.join(save_dir, model_save_name)
torch.save(model.state_dict(), path)

References:

- Metrics
  https://huggingface.co/spaces/evaluate-metric/bleu

  https://huggingface.co/docs/evaluate/transformers_integrations

  https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py

  https://huggingface.co/spaces/evaluate-metric/bleu

- Balancing dataset
  https://discuss.huggingface.co/t/k-fold-cross-validation/5765/5

  https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html