---
## 💾 Drive
---

In [None]:
# Mount Google Drive under /content/drive; force_remount=True remounts even
# when a mount from a previous run is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Root of the project on the mounted Drive; every other path hangs off it.
BASE_FOLDER = '/content/drive/MyDrive/TFM/'

# RACE: directory holding the extracted train/dev/test splits.
RACE_DIR = BASE_FOLDER + 'RACE_DATASET/output_dir/RACE'

# EE: Entrance Exams reading-comprehension CSVs (English and Spanish).
_EE_FOLDER = BASE_FOLDER + 'EntranceExam/'
EE_EN_PATH = _EE_FOLDER + 'qa2015-exam-readingENGLISH.csv'
EE_ES_PATH = _EE_FOLDER + 'qa2015-exam-readingSPANISH.csv'

---
# 🧹 Clean Caches
---

## Check GPU & RAM

In [None]:
from psutil import virtual_memory

# Total physical memory of the runtime, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to recognise a high-RAM runtime.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

## Flush Memory

In [None]:
# Delete the local Hugging Face cache directory to free disk space.
!rm -rf /root/.cache/huggingface

In [None]:
import torch

# Release cached GPU memory that PyTorch's allocator is holding but not using.
torch.cuda.empty_cache() 

---
# 🔥 Pytorch Lightning
---

## Imports

In [None]:
# Install the training framework and torchtext into the Colab runtime.
# NOTE(review): versions are not pinned, so results may change between runs.
!pip install pytorch-lightning
!pip install torchtext

In [None]:
# Install lineflow (dataset pipeline) and transformers (models/tokenizers).
# NOTE(review): versions are not pinned.
!pip install lineflow transformers

In [None]:
from typing import Dict
from pathlib import Path

import json
import pandas as pd

from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
import transformers
from transformers import DistilBertForMultipleChoice, DistilBertTokenizerFast, AdamW
transformers.logging.set_verbosity_error()

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler


## RACE

### Download models etc

#### Download Original File
https://www.cs.cmu.edu/~glai1/data/race/


### Download from datasets library
> from datasets import load_dataset

> race_all = load_dataset("race", "all")

### Extract from drive to colab root 
!tar -xvf /content/drive/MyDrive/TFM/RACE_DATASET/RACE.tar.gz 

### Extract from drive to drive directory

!tar -xvf /content/drive/MyDrive/TFM/RACE_DATASET/RACE.tar.gz -C /content/drive/MyDrive/TFM/RACE_DATASET/output_dir

In [None]:
#!tar -xvf /content/drive/MyDrive/TFM/RACE_DATASET/RACE.tar.gz -C ./RACE

## Constants

In [None]:
# Checkpoint name passed to from_pretrained for both the model and tokenizer.
MODEL_ID = "distilbert-base-uncased"
# Model class used to build the multiple-choice network.
MODEL_TRF = DistilBertForMultipleChoice
# Tokenizer class matching MODEL_TRF.
MODEL_TKN = DistilBertTokenizerFast

### Race loaders

In [None]:
# Length every (article, question + option) sequence is padded/truncated to.
# DistilBERT only has 512 position embeddings, so a larger value makes the
# forward pass index past the embedding table.
MAX_LEN = 512
# Each question comes with four candidate answers.
NUM_LABELS = 4
# Answer letter as stored in the datasets -> class index.
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
# RACE: directory holding the extracted train/dev/test splits.
RACE_DIR = '/content/drive/MyDrive/TFM/RACE_DATASET/output_dir/RACE'

In [None]:
import os
from pathlib import Path
from os.path import exists

def raw_samples_to_dataset(samples, size=100):
    """Flatten per-article RACE records into one row per question.

    Each sample carries an article plus index-aligned "answers", "options"
    and "questions" lists; one output row is produced per answer index.
    The identifier is read from "id" when present, otherwise "example_id".

    Args:
        samples: iterable of per-article dicts.
        size: accepted for compatibility; not used by the function.

    Returns:
        An ``lf.Dataset`` wrapping the flattened rows.
    """
    rows = []
    for record in samples:
        for position, answer in enumerate(record["answers"]):
            rows.append({
                "id": record["id"] if "id" in record else record["example_id"],
                "article": record["article"],
                "answer": answer,
                "options": record["options"][position],
                "question": record["questions"][position],
            })
    return lf.Dataset(rows)


def preprocess(tokenizer: MODEL_TKN, x: Dict) -> Dict:
    """Encode one RACE question as fixed-length inputs, one row per option.

    For every option the article is paired with the question: if the
    question contains "_" the option replaces it, otherwise the option is
    appended after a space. Each pair is truncated and right-padded to
    ``MAX_LEN`` tokens.

    Args:
        tokenizer: tokenizer providing ``encode_plus`` and ``pad_token_id``.
        x: row with "article", "question", "options", "answer" and either
            "id" or "example_id".

    Returns:
        Dict with "id", "label" (long tensor; -1 when the answer letter is
        not in ``label_map``), and "input_ids" / "attention_mask" tensors
        with one row per option and ``MAX_LEN`` columns.
    """
    # Invariant across options, so look it up once.
    pad_token_id = tokenizer.pad_token_id

    choices_features = []
    for option in x["options"]:
        text_a = x["article"]
        if "_" in x["question"]:
            text_b = x["question"].replace("_", option)
        else:
            text_b = x["question"] + " " + option

        # Truncate explicitly: max_length alone does not guarantee it, and
        # an over-long pair would trip the length assertions below.
        inputs = tokenizer.encode_plus(
            text_a,
            text_b,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = [1] * len(input_ids)

        # Right-pad to exactly MAX_LEN; padded positions get mask 0.
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask
        })

    label = torch.tensor(label_map.get(x["answer"], -1)).long()

    return {
      "id": x["id"] if "id" in x else x["example_id"],
      "label": label,
      "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
      "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
    }


def get_dataloader_race(datadir=None, cachedir: str = "/content/drive/MyDrive/TFM/RACE_DATASET/race_cache", batch_size: int = 8):
    """Build the train / validation / test dataloaders for RACE.

    Each split is read from ``datadir/<split>/{middle,high}/*`` (one JSON
    document per file), flattened with ``raw_samples_to_dataset``, encoded
    with ``preprocess`` and stored through lineflow's ``save`` under
    ``cachedir``.

    Args:
        datadir: root of the extracted RACE corpus; defaults to "./RACE".
        cachedir: directory for the ``train.cache`` / ``val.cache`` /
            ``test.cache`` files.
        batch_size: batch size for all three loaders (default 8).

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``. Training is
        sampled randomly; validation and test are sequential.
    """
    if datadir is None:
        datadir = "./RACE"

    datadir = Path(datadir)
    cachedir = Path(cachedir)

    tokenizer = MODEL_TKN.from_pretrained(MODEL_ID, do_lower_case=True)
    preprocessor = partial(preprocess, tokenizer)

    def _build_loader(split, cache_name, sampler_cls):
        """Read one split from disk and wrap it in a DataLoader."""
        samples = [
            json.loads(_path.read_text())
            for grade in ("middle", "high")
            for _path in (datadir / split / grade).iterdir()
        ]
        dataset = raw_samples_to_dataset(samples)
        return DataLoader(
            dataset.map(preprocessor).save(cachedir / cache_name),
            sampler=sampler_cls(dataset),
            batch_size=batch_size,
        )

    # The on-disk validation split is called "dev".
    train_dataloader = _build_loader("train", "train.cache", RandomSampler)
    val_dataloader = _build_loader("dev", "val.cache", SequentialSampler)
    test_dataloader = _build_loader("test", "test.cache", SequentialSampler)

    return train_dataloader, val_dataloader, test_dataloader


## EE

In [None]:
def preprocess_ee(tokenizer: MODEL_TKN, x: Dict) -> Dict:
    """Encode one Entrance Exams question as fixed-length inputs per option.

    The article is paired with "<question> <option>" for every option; each
    pair is truncated and right-padded to ``MAX_LEN`` tokens.

    Args:
        tokenizer: tokenizer providing ``encode_plus`` and ``pad_token_id``.
        x: row with "id", "article", "question", "options" and "answer".

    Returns:
        Dict with "id", "label" (long tensor; -1 when the answer letter is
        not in ``label_map``), and "input_ids" / "attention_mask" tensors
        with one row per option and ``MAX_LEN`` columns.
    """
    # Invariant across options, so look it up once.
    pad_token_id = tokenizer.pad_token_id

    choices_features = []
    for option in x["options"]:
        text_a = x["article"]
        text_b = x["question"] + " " + option

        # Truncate explicitly: max_length alone does not guarantee it, and
        # an over-long pair would trip the length assertions below.
        inputs = tokenizer.encode_plus(
            text_a,
            text_b,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = [1] * len(input_ids)

        # Right-pad to exactly MAX_LEN; padded positions get mask 0.
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask
        })

    label = torch.tensor(label_map.get(x["answer"], -1)).long()

    return {
      "id": x["id"],
      "label": label,
      "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
      "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
    }

def raw_samples_to_dataset_ee(samples, size=100):
    """Flatten Entrance Exams records into one row per question.

    Records with an "answers" key hold index-aligned "answers", "options"
    and "questions" lists and are expanded into one row per answer index
    (the identifier comes from "id", falling back to "example_id").
    Records without "answers" are already single-question rows (the
    auto-generated documents) and are passed through untouched.

    Args:
        samples: iterable of record dicts.
        size: accepted for compatibility; not used by the function.

    Returns:
        A plain list of row dicts.
    """
    rows = []
    for record in samples:
        if "answers" not in record:
            # Auto-generated docs are already in row form.
            rows.append(record)
            continue

        for position, answer in enumerate(record["answers"]):
            rows.append({
                "id": record["id"] if "id" in record else record["example_id"],
                "article": record["article"],
                "answer": answer,
                "options": record["options"][position],
                "question": record["questions"][position],
            })

    return rows

def get_dataloader_ee(train_val_dir, test_dirs=None, extensions=None, cachedir: str = "/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en_ext", batch_size: int = 8):
    """Build the train / validation / test dataloaders for Entrance Exams.

    The train/validation pool is read from ``train_val_dir`` (optionally
    preceded by the rows in ``extensions``) and split 80/20 in file order;
    the test set is the concatenation of ``test_dirs``.

    Args:
        train_val_dir: JSON file whose "data" column holds the exam records.
        test_dirs: list of JSON files used for testing; defaults to the
            2013 and 2014 English exams on Drive.
        extensions: optional JSON file with extra rows that are placed
            before the original samples.
        cachedir: kept for interface compatibility; caching is currently
            disabled, so nothing is written there.
        batch_size: batch size for all three loaders (default 8).

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``. Training is
        sampled randomly; validation and test are sequential.
    """
    if test_dirs is None:
        test_dirs = [
            '/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2013.json',
            '/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2014.json',
        ]

    train_val_dir = Path(train_val_dir)
    cachedir = Path(cachedir)

    tokenizer = MODEL_TKN.from_pretrained(MODEL_ID, do_lower_case=True)
    preprocessor = partial(preprocess_ee, tokenizer)

    train_val_samples = pd.read_json(train_val_dir)['data'].tolist()
    test_samples = [
        sample
        for datadir in test_dirs
        for sample in pd.read_json(datadir)['data'].tolist()
    ]

    test_samples = raw_samples_to_dataset_ee(test_samples)
    train_val_samples = raw_samples_to_dataset_ee(train_val_samples)

    if extensions is not None:
        # Context manager so the file handle is closed right after parsing.
        with open(extensions) as extensions_file:
            train_val_samples = [*json.load(extensions_file), *train_val_samples]

    # First 80% of the pool trains, the remaining 20% validates.
    div = 4*len(train_val_samples)//5
    train_samples = train_val_samples[:div]
    val_samples = train_val_samples[div:]

    def _make_loader(samples, sampler_cls):
        """Wrap a list of rows in a lazily preprocessed DataLoader."""
        dataset = lf.Dataset(samples)
        return DataLoader(
            dataset.map(preprocessor),
            sampler=sampler_cls(dataset),
            batch_size=batch_size,
        )

    print(f"Configuring trainer with {len(train_samples)} samples")
    train_dataloader = _make_loader(train_samples, RandomSampler)
    print(f"Configuring val with {len(val_samples)} samples")
    val_dataloader = _make_loader(val_samples, SequentialSampler)
    print(f"Configuring test with {len(test_samples)} samples")
    test_dataloader = _make_loader(test_samples, SequentialSampler)

    return train_dataloader, val_dataloader, test_dataloader

## Model Class

In [None]:
class Model(pl.LightningModule):
    """Lightning module that fine-tunes a multiple-choice transformer.

    The dataloaders for the selected corpus are built once in ``__init__``
    and handed back by the ``*_dataloader`` hooks.
    """

    def __init__(self, mode, model_id=MODEL_ID, model_trf=MODEL_TRF, num_labels=NUM_LABELS,
                 learning_rate=0.02, **kwargs):
        """Load the pretrained network and build the dataloaders.

        Args:
            mode: ``"RACE"`` selects ``get_dataloader_race``; any other
                value selects ``get_dataloader_ee``.
            model_id: checkpoint name or path given to ``from_pretrained``.
            model_trf: model class exposing ``from_pretrained``.
            num_labels: forwarded to ``from_pretrained``.
            learning_rate: learning rate for the Adam optimizer.
            **kwargs: forwarded unchanged to the selected dataloader factory.
        """
        super(Model, self).__init__()
        self.model = model_trf.from_pretrained(model_id, num_labels=num_labels)
        # NOTE(review): the default of 0.02 is far above the 5e-5 used by the
        # TrainingArguments cells in this notebook — confirm it is intended.
        self.learning_rate = learning_rate

        dataloader = get_dataloader_race if mode == "RACE" else get_dataloader_ee
        train_dataloader, val_dataloader, test_dataloader = dataloader(**kwargs)
        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters."""
        return torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def _forward_batch(self, batch):
        """Run the wrapped model on a batch; returns ``(loss, logits, labels)``."""
        labels = batch["label"]
        loss, logits = self.model(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels,
            return_dict=False,
        )
        return loss, logits, labels

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it as ``train_loss``."""
        loss, _, _ = self._forward_batch(batch)
        # Log through self.log, like the validation/test steps do, instead of
        # the legacy "progress_bar"/"log" return-dict keys.
        self.log("train_loss", loss, prog_bar=True)
        return OrderedDict({"loss": loss})

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` for one batch and return the raw counts."""
        loss, logits, labels = self._forward_batch(batch)

        labels_hat = torch.argmax(logits, dim=1)
        # Computed from tensors already on the model's device, so no explicit
        # device transfer is needed.
        correct_count = torch.sum(labels == labels_hat)
        val_acc = correct_count / len(labels)

        self.log("val_loss", loss)
        self.log("val_acc", val_acc)
        return OrderedDict({
            "val_loss": loss,
            "val_acc": val_acc,
            "correct_count": correct_count,
            "batch_size": len(labels),
        })

    def test_step(self, batch, batch_idx):
        """Log the batch accuracy as ``test_acc``."""
        _, logits, labels = self._forward_batch(batch)

        labels_hat = torch.argmax(logits, dim=1)
        correct_count = torch.sum(labels == labels_hat)

        self.log("test_acc", correct_count / len(labels))

    def validation_end(self, outputs):
        """Aggregate the per-batch validation outputs into epoch metrics.

        Accuracy is the total number of correct predictions divided by the
        total number of examples; loss is the plain mean of the batch losses.

        NOTE(review): this is an old hook name; presumably the installed
        Lightning version no longer calls it — confirm before relying on it.
        """
        val_acc = sum([out["correct_count"] for out in outputs]).float() / sum(out["batch_size"] for out in outputs)
        val_loss = sum([out["val_loss"] for out in outputs]) / len(outputs)
        tqdm_dict = {
                "val_loss": val_loss,
                "val_acc": val_acc,
                }
        self.log("val_loss", val_loss)
        self.log("val_acc", val_acc)
        return {"progress_bar": tqdm_dict, "log": tqdm_dict, "val_loss": val_loss}

    def train_dataloader(self):
        """Return the training dataloader built in ``__init__``."""
        return self._train_dataloader

    def val_dataloader(self):
        """Return the validation dataloader built in ``__init__``."""
        return self._val_dataloader

    def test_dataloader(self):
        """Return the test dataloader built in ``__init__``."""
        return self._test_dataloader

## Train Model

### EE - no extensions

In [None]:
train_val_dir = '/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2015.json'

results = []
for i in range(10):
  # Build a fresh callback and trainer for every repetition: both keep state
  # between fit() calls (early-stopping patience/best score, epoch counters),
  # so reusing them would let one run influence the next.
  early_stop_callback = EarlyStopping(
    monitor="val_acc",
    min_delta=0.0,
    patience=4,
    verbose=True,
    mode="max",
  )

  trainer = pl.Trainer(
    gpus=1,
    callbacks=[early_stop_callback],
    #limit_train_batches=0.0001,
    #limit_test_batches=0.0001,
    #limit_val_batches=0.01,
    #max_epochs=5,
    #max_steps=1,
    # Checkpoints live next to the Entrance Exams data on Drive.
    default_root_dir='/content/drive/MyDrive/TFM/EntranceExam/ee_checkpoints'
  )

  model = Model("ee", train_val_dir=train_val_dir)
  trainer.fit(model)
  results.append(trainer.test(model))


print(results)

In [None]:
import numpy as np
# Mean test accuracy over the runs collected in `results`; each entry is the
# list returned by trainer.test, whose first element holds 'test_acc'.
np.mean([res[0]['test_acc'] for res in results])

### EE - extensions

Experiment zero consists of texts generated artificially from the rc-test-english-2015 dataset, plus the original dataset. We generate 190 new answers, which effectively triples the data available for training.
Let's see where we get.

In [None]:
train_val_dir = '/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2015.json'
extensions='/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment0.json'


results_ext = []
for i in range(10):
  # Build a fresh callback and trainer for every repetition: both keep state
  # between fit() calls (early-stopping patience/best score, epoch counters),
  # so reusing them would let one run influence the next.
  early_stop_callback = EarlyStopping(
    monitor="val_acc",
    min_delta=0.0,
    patience=3,
    verbose=True,
    mode="max",
  )

  trainer = pl.Trainer(
    gpus=1,
    callbacks=[early_stop_callback])

  ext_model = Model("ee", train_val_dir=train_val_dir, extensions=extensions)

  trainer.fit(ext_model)
  results_ext.append(trainer.test(ext_model))

In [None]:
import numpy as np
# Mean test accuracy over the runs collected in `results_ext`.
np.mean([res[0]['test_acc'] for res in results_ext])

## EE - larger extensions

## Model checkpoints

In [None]:
# Restore a Model from a Lightning checkpoint file.
# NOTE(review): Model.__init__ requires `mode` and the class does not appear to
# save its hyperparameters — confirm this call can rebuild the module.
loaded_model = Model.load_from_checkpoint(checkpoint_path="example.ckpt")

## Save model



In [None]:
# Show whether the module is currently in training mode.
model.training

In [None]:
# Save the wrapped transformer (not the Lightning module) to Drive.
model.model.save_pretrained("/content/drive/MyDrive/Dataset_reviews/baseline")

In [None]:
# Reload a transformer from "baseline".
# NOTE(review): the model above was saved under
# /content/drive/MyDrive/Dataset_reviews/baseline, not ./baseline — confirm
# this path points at the intended weights.
saved_model = MODEL_TRF.from_pretrained("baseline")

In [None]:
trainer = pl.Trainer(
  gpus=1,
  #callbacks=[early_stop_callback],
  #limit_train_batches=0.0001,
  #limit_test_batches=0.00001,
  # limit_val_batches=0.0,
  #max_epochs=5,
  max_time="00:00:00:03", # max time: 3 seconds (format is DD:HH:MM:SS)
  #max_steps=1
)

# `mode` is Model's first positional argument and only the exact string "RACE"
# selects the RACE dataloaders; the corpus location goes through `datadir`.
# NOTE(review): "patata" looks like a placeholder model id — replace it with a
# real checkpoint name or path before running.
saved_model = Model("RACE", datadir="./RACE", model_id="patata")

In [None]:
# Show whether the reloaded module is currently in training mode.
saved_model.training

In [None]:
# Evaluate the reloaded module on its test dataloader.
trainer.test(saved_model)

---
# 🤗 Transformers
---

## Pip

In [None]:
# Install the Hugging Face libraries used by the rest of the notebook.
# NOTE(review): versions are not pinned.
!pip install transformers datasets

## Imports

In [None]:
import torch
import json
import pandas as pd
import numpy as np

from dataclasses import dataclass
from typing import Optional, Union, Dict
from pathlib import Path
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

from datasets import load_dataset, Dataset, ClassLabel, DatasetDict
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

## DataCollator

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Every feature holds one list per tokenizer key with one entry per answer
    choice. The choices are flattened so the tokenizer can pad them together,
    then reshaped back to ``(batch_size, num_choices, seq_len)``.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Padding strategy forwarded to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Optional maximum length forwarded to ``tokenizer.pad``.
    max_length: Optional[int] = None
    # Optional multiple the padded length is rounded up to.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Pad a list of features and return a dict of batched tensors.

        Args:
            features: list of dicts holding the per-choice tokenizer outputs
                plus the target under "labels" (or "label").

        Returns:
            Dict with every tokenizer key shaped
            ``(batch_size, num_choices, seq_len)`` and an int64 "labels"
            tensor with one entry per feature.
        """
        # Accept either spelling of the target column.
        label_name = "label" if "label" in features[0] else "labels"
        # Read the labels without popping so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, in example-major order.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )

        # Restore the (example, choice) structure.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

## Preprocess

In [None]:
# Maximum sequence length in tokens.
MAX_LEN = 512
# Each question comes with four candidate answers.
NUM_LABELS = 4
# Answer letter as stored in the datasets -> class index.
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
# RACE 
RACE_DIR = '/content/drive/MyDrive/TFM/RACE_DATASET/output_dir/RACE'

In [None]:
def append_option(question, option):
    """Combine a question with one candidate answer.

    When the question contains "_" every underscore is replaced by the
    option preceded by a space; otherwise the option is appended after a
    single space.
    """
    if "_" not in question:
        return f"{question} {option}"
    return question.replace("_", f" {option}")

def preprocess(tokenizer, exam):
    """Tokenize one exam row as (article, question + option) pairs.

    The article is repeated once per option and paired with the question
    merged with that option; the tokenizer is called on both lists with
    truncation enabled and its output is returned as-is.
    """
    article = exam["article"]
    options = exam["options"]

    first_sentences = len(options) * [article]
    second_sentences = []
    for option in options:
        second_sentences.append(append_option(exam["question"], option))

    return tokenizer(first_sentences, second_sentences, truncation=True)

def preprocess_dataset(ds, tokenizer):
  """Tokenize a dataset and prepare its label column for training.

  Applies ``preprocess`` to every row, renames "answer" to "labels", drops
  the raw text columns and casts "labels" to a four-way ``ClassLabel``
  ("A".."D").

  Args:
    ds: object exposing ``map`` / ``rename_column`` / ``remove_columns`` /
      ``cast_column`` with "article", "question", "options" and "answer"
      columns.
    tokenizer: tokenizer handed to ``preprocess``.

  Returns:
    The tokenized dataset.
  """
  tokenized_ds = ds.map(partial(preprocess, tokenizer))
  tokenized_ds = tokenized_ds.rename_column("answer", "labels")
  # The unused id-column lookup was removed; on a plain Dataset its
  # `"train" in ...` membership test walked every row for no result.
  tokenized_ds = tokenized_ds.remove_columns(["options", "question", "article"])
  tokenized_ds = tokenized_ds.cast_column("labels", ClassLabel(num_classes=4, names=["A", "B", "C", "D"]))
  return tokenized_ds


def peak_encoding(ds, tokenizer):
  """Decode the first collated example back to text, one string per choice.

  Up to ten rows of ``ds`` are collated with
  ``DataCollatorForMultipleChoice`` and every choice of the first example
  is decoded, padding tokens included.

  Args:
    ds: tokenized dataset supporting ``len`` and integer indexing.
    tokenizer: tokenizer used for collation and decoding.

  Returns:
    List of decoded strings, one per answer choice.
  """
  accepted_keys = {"input_ids", "attention_mask", "labels", "label"}

  # Never index past the end of a dataset with fewer than ten rows.
  n_rows = min(10, len(ds))
  features = [{k: v for k, v in ds[i].items() if k in accepted_keys} for i in range(n_rows)]
  batch = DataCollatorForMultipleChoice(tokenizer)(features)

  # Take the number of choices from the batch instead of assuming four.
  first_example = batch["input_ids"][0]
  return [tokenizer.decode(first_example[i].tolist()) for i in range(first_example.shape[0])]

In [None]:
def raw_samples_to_dataset_ee(samples):
    """Flatten Entrance Exams records into one row per question.

    Records with an "answers" key hold index-aligned "answers", "options"
    and "questions" lists and are expanded into one row per answer index
    (the identifier comes from "id", falling back to "example_id").
    Records without "answers" are already single-question rows (the
    auto-generated documents) and are passed through untouched.

    Returns:
        A plain list of row dicts.
    """
    rows = []
    for record in samples:
        if "answers" not in record:
            # Auto-generated docs are already in row form.
            rows.append(record)
            continue

        for position, answer in enumerate(record["answers"]):
            rows.append({
                "id": record["id"] if "id" in record else record["example_id"],
                "article": record["article"],
                "answer": answer,
                "options": record["options"][position],
                "question": record["questions"][position],
            })

    return rows

def load_ee(train_val_dir='/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2015.json', test_dirs=None, extensions=None, split=True):
    """Load the Entrance Exams data as ``datasets.Dataset`` objects.

    Args:
        train_val_dir: JSON file whose "data" column holds the train/val
            exam records.
        test_dirs: list of JSON files used for testing; defaults to the
            2013 and 2014 English exams on Drive.
        extensions: optional JSON file with extra rows that are placed
            before the original train/val samples.
        split: when True return ``(train, val, test)`` with an 80/20
            train/val split in file order; when False return one dataset
            containing the train/val rows followed by the test rows.

    Returns:
        A single ``Dataset`` or a 3-tuple of them, depending on ``split``.
    """
    if test_dirs is None:
        test_dirs = [
            '/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2013.json',
            '/content/drive/MyDrive/TFM/EntranceExam/rc-test-english-2014.json',
        ]

    train_val_dir = Path(train_val_dir)

    train_val_samples = pd.read_json(train_val_dir)['data'].tolist()
    test_samples = [
        sample
        for datadir in test_dirs
        for sample in pd.read_json(datadir)['data'].tolist()
    ]

    test_samples = raw_samples_to_dataset_ee(test_samples)
    train_val_samples = raw_samples_to_dataset_ee(train_val_samples)

    if extensions is not None:
        # Context manager so the file handle is closed right after parsing.
        with open(extensions) as extensions_file:
            train_val_samples = [*json.load(extensions_file), *train_val_samples]

    def _to_dataset(rows):
        """Convert a list of row dicts into a Dataset via a DataFrame."""
        return Dataset.from_pandas(pd.DataFrame(rows))

    if not split:
        return _to_dataset([*train_val_samples, *test_samples])

    # First 80% of the pool trains, the remaining 20% validates.
    div = 4*len(train_val_samples)//5
    return (
        _to_dataset(train_val_samples[:div]),
        _to_dataset(train_val_samples[div:]),
        _to_dataset(test_samples),
    )

## Utils

In [None]:
def get_baseline_acc(model_name, tokenized_ds):
  """Print the accuracy of an un-fine-tuned checkpoint on a tokenized dataset.

  The checkpoint and its tokenizer are loaded by name, wrapped in a
  ``Trainer`` that is only used for prediction, and the accuracy of the
  arg-max predictions against the "labels" column is printed. Nothing is
  returned.
  """
  baseline_model = AutoModelForMultipleChoice.from_pretrained(model_name)
  baseline_tokenizer = AutoTokenizer.from_pretrained(
    model_name, do_lower_case=True, truncation=True, use_fast=True
  )

  # No training happens here; the arguments only configure prediction.
  eval_args = TrainingArguments(
    output_dir="./baseline",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
  )
  collator = DataCollatorForMultipleChoice(tokenizer=baseline_tokenizer)
  evaluator = Trainer(
      model=baseline_model,
      args=eval_args,
      tokenizer=baseline_tokenizer,
      data_collator=collator,
      compute_metrics=compute_metrics
  )

  prediction_output = evaluator.predict(tokenized_ds)
  predicted_labels = [np.argmax(x) for x in prediction_output.predictions]

  print(f"Baseline accuracy for {model_name}")
  print(acc(tokenized_ds["labels"], predicted_labels))

def compute_metrics(eval_predictions):
    """Return ``{"accuracy": ...}`` for a ``(predictions, label_ids)`` pair.

    The predicted class is the arg-max along axis 1 of the predictions;
    accuracy is the float32 mean of the element-wise matches.
    """
    scores, references = eval_predictions
    hits = np.argmax(scores, axis=1) == references
    accuracy = hits.astype(np.float32).mean().item()
    return {"accuracy": accuracy}

def acc(labels, predictions):
  """Fraction of positions where label and prediction are equal.

  Pairs are formed with ``zip``; the count of equal pairs is divided by
  ``len(predictions)``.
  """
  matches = 0
  for expected, predicted in zip(labels, predictions):
    matches += expected == predicted
  return matches / len(predictions)


---
# 🔮 Model baselines
---

## Distilbert base

In [None]:
# Download RACE (all difficulty levels) and load the Entrance Exams data as a
# single unsplit dataset.
race = load_dataset("race", "all")
ee = load_ee(split=False)

# NOTE(review): despite its name, `tokenizer_distilbert` is loaded from the
# RoBERTa checkpoint, and the result of the preprocess_dataset call below is
# discarded — confirm whether these two lines are still needed.
tokenizer_distilbert = AutoTokenizer.from_pretrained("LIAMF-USP/roberta-large-finetuned-race", use_fast=True, do_lower_case=True, truncation=True)
preprocess_dataset(ee, tokenizer_distilbert)
# Tokenizer used for the DistilBERT baselines in the following cells.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True, truncation=True, use_fast=True)

In [None]:
# Tokenize both corpora with the distilbert-base-uncased tokenizer.
tokenized_race = preprocess_dataset(race, tokenizer)
tokenized_ee = preprocess_dataset(ee, tokenizer)

In [None]:
# Untrained DistilBERT accuracy on the full Entrance Exams set.
get_baseline_acc('distilbert-base-uncased', tokenized_ee)

In [None]:
# Untrained DistilBERT accuracy on the RACE test split.
get_baseline_acc('distilbert-base-uncased', tokenized_race["test"])

## bert-large-uncased

In [None]:
# Release cached GPU memory before loading the larger model.
torch.cuda.empty_cache() 

In [None]:
# Re-tokenize Entrance Exams with the bert-large-uncased tokenizer and print
# the untrained checkpoint's accuracy.
tokenizer_bertlarge = AutoTokenizer.from_pretrained(
"bert-large-uncased", use_fast=True)
tokenized_eelarge = preprocess_dataset(ee, tokenizer_bertlarge)
get_baseline_acc('bert-large-uncased', tokenized_eelarge)

## roberta-large

In [None]:
# Delete the local Hugging Face cache directory to free disk space.
!rm -rf /root/.cache/huggingface

In [None]:
# Re-tokenize Entrance Exams with the roberta-large tokenizer and print the
# untrained checkpoint's accuracy.
tokenizer_roberta = AutoTokenizer.from_pretrained(
"roberta-large", use_fast=True)
tokenized_ee_roberta = preprocess_dataset(ee, tokenizer_roberta)
get_baseline_acc('roberta-large', tokenized_ee_roberta)

## roberta-large finetuned race

In [None]:
# Tokenize Entrance Exams with the roberta-large tokenizer, then print the
# accuracy of the RACE-fine-tuned RoBERTa checkpoint without further training.
tokenized_ee_roberta = preprocess_dataset(ee, tokenizer_roberta)

get_baseline_acc('LIAMF-USP/roberta-large-finetuned-race', tokenized_ee_roberta)

In [None]:
# Show the free and total memory of the current GPU, in bytes.
torch.cuda.mem_get_info() 

In [None]:
# Fine-tune the RACE-fine-tuned RoBERTa checkpoint on Entrance Exams extended
# with the experiment1 rows, then report test accuracy.
ee_train, ee_val, ee_test = load_ee(extensions='/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment1.json', split=True)
# NOTE(review): the variable is named `tokenizer_distilbert` but holds the
# RoBERTa tokenizer; later cells that read this name get RoBERTa as well.
tokenizer_distilbert = AutoTokenizer.from_pretrained("LIAMF-USP/roberta-large-finetuned-race", use_fast=True, do_lower_case=True, truncation=True)

ee_train = preprocess_dataset(ee_train, tokenizer_distilbert)
ee_val = preprocess_dataset(ee_val, tokenizer_distilbert)
ee_test = preprocess_dataset(ee_test, tokenizer_distilbert)

# Single epoch with a small batch size.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
)

trainer = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('LIAMF-USP/roberta-large-finetuned-race'),
    args=training_args,
    train_dataset=ee_train,
    eval_dataset=ee_val,
    tokenizer=tokenizer_distilbert,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_distilbert),
    compute_metrics=compute_metrics
)

trainer.train()
# Accuracy of the arg-max predictions on the held-out test exams.
output = trainer.predict(ee_test)
acc(ee_test["labels"], [np.argmax(x) for x in output.predictions])

---
# 🧪   Experiments
---

## 📈 EntranceExams


### Baseline- no training

In [None]:
# Load and tokenize the data in this cell so it does not depend on variables
# that are only created by cells further down the notebook (which would fail,
# or silently reuse RoBERTa-tokenized data, on a top-to-bottom run).
ee_train, ee_val, ee_test = load_ee(split=True)
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)

ee_train = preprocess_dataset(ee_train, tokenizer_bert)
ee_val = preprocess_dataset(ee_val, tokenizer_bert)
ee_test = preprocess_dataset(ee_test, tokenizer_bert)

training_args = TrainingArguments(**{
    "do_train": True,
    "do_eval": True,
    "fp16": True,
    "fp16_opt_level": "O1",
    "save_total_limit": 0,
    "save_steps": 0,
    "evaluation_strategy": "steps",
    "num_train_epochs": 3,
    "per_device_eval_batch_size": 8,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5.0e-05,
    "warmup_steps": 500,
    "output_dir" : "./results5",
    "eval_steps": 10
  })

trainer = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=ee_train,
    eval_dataset=ee_val,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics
)

# No call to train(): this measures the pretrained checkpoint as-is.
output_notrain = trainer.predict(ee_test)
acc(ee_test["labels"], [np.argmax(x) for x in output_notrain.predictions])

### Base case - training so little it has no effect

In [None]:
# Load the original Entrance Exams split (no extensions) and tokenize every
# part with the bert-base-uncased tokenizer.
ee_train, ee_val, ee_test = load_ee(split=True)
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)

ee_train = preprocess_dataset(ee_train, tokenizer_bert)
ee_val = preprocess_dataset(ee_val, tokenizer_bert)
ee_test = preprocess_dataset(ee_test, tokenizer_bert)

In [None]:
# Shared hyper-parameters for the ten repetitions below.
training_args = TrainingArguments(
    output_dir="./results5",
    do_train=True,
    do_eval=True,
    fp16=True,
    fp16_opt_level="O1",
    save_total_limit=0,
    save_steps=0,
    evaluation_strategy="steps",
    eval_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=5.0e-05,
    warmup_steps=500,
)

# Train ten fresh bert-base-uncased models and collect each test accuracy.
outputs = []
for i in range(10):
  trainer = Trainer(
      model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
      args=training_args,
      train_dataset=ee_train,
      eval_dataset=ee_val,
      data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
      compute_metrics=compute_metrics
  )
  trainer.train()

  output_ee = trainer.predict(ee_test)
  predicted_labels = [np.argmax(x) for x in output_ee.predictions]
  outputs.append(acc(ee_test["labels"], predicted_labels))

print(f"Mean of outputs is {np.mean(outputs)}")

In [None]:
# Display the per-run test accuracies collected above.
outputs

### With Extension -1st experiment - worse

In [None]:
# Entrance Exams extended with the experiment0 rows, tokenized for BERT.
ee_train, ee_val, ee_test = load_ee(extensions='/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment0.json', split=True)
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)

ee_train = preprocess_dataset(ee_train, tokenizer_bert)
ee_val = preprocess_dataset(ee_val, tokenizer_bert)
ee_test = preprocess_dataset(ee_test, tokenizer_bert)

training_args = TrainingArguments(**{
    "do_train": True,
    "do_eval": True,
    "fp16": True,
    "fp16_opt_level": "O1",
    "save_total_limit": 0,
    "save_steps": 0,
    "evaluation_strategy": "steps",
    "num_train_epochs": 3,
    "per_device_eval_batch_size": 8,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5.0e-05,
    "warmup_steps": 500,
    "output_dir" : "./results5",
    "eval_steps": 10
  })

# Train ten fresh bert-base-uncased models and collect each test accuracy.
outputs_ext = []
for i in range(10):
  trainer = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=ee_train,
    eval_dataset=ee_val,
    # Pad with the same tokenizer that encoded the data; the collator used to
    # receive `tokenizer_distilbert`, which holds a RoBERTa tokenizer.
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics
  )
  trainer.train()
  output_ee_ext = trainer.predict(ee_test)
  outputs_ext.append(acc(ee_test["labels"], [np.argmax(x) for x in output_ee_ext.predictions]))

print(f"Mean of outputs is {np.mean(outputs_ext)}")

### With Extension - 2nd experiment - even worse

In [None]:
# Entrance Exams extended with the experiment1 rows, tokenized with the
# bert-base-uncased tokenizer.
ee_train, ee_val, ee_test = load_ee(extensions='/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment1.json', split=True)
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)

ee_train = preprocess_dataset(ee_train, tokenizer_bert)
ee_val = preprocess_dataset(ee_val, tokenizer_bert)
ee_test = preprocess_dataset(ee_test, tokenizer_bert)

In [None]:
# Same recipe as the other Entrance Exams runs, evaluating every 1000 steps.
training_args = TrainingArguments(
    output_dir="./results5",
    do_train=True,
    do_eval=True,
    fp16=True,
    fp16_opt_level="O1",
    save_total_limit=0,
    save_steps=0,
    evaluation_strategy="steps",
    eval_steps=1000,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=5.0e-05,
    warmup_steps=500,
)

trainer = Trainer(
  model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
  args=training_args,
  train_dataset=ee_train,
  eval_dataset=ee_val,
  data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
  compute_metrics=compute_metrics
)
trainer.train()

# Accuracy of the arg-max predictions on the held-out test exams.
output_ee_ext = trainer.predict(ee_test)
acc(ee_test["labels"], [np.argmax(x) for x in output_ee_ext.predictions])

### With Extensions : sentences

In [None]:
# Entrance Exams extended with the sentence-level experiment2 rows, tokenized
# with the bert-base-uncased tokenizer.
ee_train, ee_val, ee_test = load_ee(extensions='/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment2-sent.json', split=True)
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)

ee_train = preprocess_dataset(ee_train, tokenizer_bert)
ee_val = preprocess_dataset(ee_val, tokenizer_bert)
ee_test = preprocess_dataset(ee_test, tokenizer_bert)

In [None]:
# Same recipe as the other Entrance Exams runs, evaluating every 1000 steps.
training_args = TrainingArguments(
    output_dir="./results5",
    do_train=True,
    do_eval=True,
    fp16=True,
    fp16_opt_level="O1",
    save_total_limit=0,
    save_steps=0,
    evaluation_strategy="steps",
    eval_steps=1000,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=5.0e-05,
    warmup_steps=500,
)

trainer = Trainer(
  model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
  args=training_args,
  train_dataset=ee_train,
  eval_dataset=ee_val,
  data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
  compute_metrics=compute_metrics
)
trainer.train()

# Accuracy of the arg-max predictions on the held-out test exams.
output_ee_ext = trainer.predict(ee_test)
acc(ee_test["labels"], [np.argmax(x) for x in output_ee_ext.predictions])

## 🏃 RACE

### Race - Bert Base 5k (all)

In [None]:
# Baseline: fine-tune bert-base-uncased on the first 5k training examples of
# RACE ("all" configuration), validate on the first 1k validation examples and
# report accuracy on the full test split.
training_args = TrainingArguments(
    output_dir="./results5",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=0,
    save_total_limit=0,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step and device
    learning_rate=5.0e-05,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
)


race = load_dataset("race", "all")
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)
tokenized_race = preprocess_dataset(race, tokenizer_bert)

# Take the leading rows with Dataset.select instead of converting the whole
# tokenized split to pandas and back: same rows, without materializing every
# example in memory. min() keeps the head() semantics for short splits.
race_train = tokenized_race["train"]
race_val = tokenized_race["validation"]

bert_base_5k = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=race_train.select(range(min(5000, len(race_train)))),
    eval_dataset=race_val.select(range(min(1000, len(race_val)))),
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics,
)

bert_base_5k.train()

# Arg-max over the per-choice scores gives the predicted answer index.
output = bert_base_5k.predict(tokenized_race["test"])
acc(tokenized_race["test"]["labels"], [np.argmax(x) for x in output.predictions])

### RACE- 5k middle

In [None]:
# Fine-tune bert-base-uncased on the first 5k training examples of
# RACE-middle, validating on the first 1k validation examples.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/TFM/models/results5_middle",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=0,
    save_total_limit=0,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step and device
    learning_rate=5.0e-05,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
)


race = load_dataset("race", "middle")
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)
tokenized_race = preprocess_dataset(race, tokenizer_bert)

# Take the leading rows with Dataset.select instead of converting the whole
# tokenized split to pandas and back: same rows, without materializing every
# example in memory. min() keeps the head() semantics for short splits.
race_train = tokenized_race["train"]
race_val = tokenized_race["validation"]

bert_base_5k_middle = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=race_train.select(range(min(5000, len(race_train)))),
    eval_dataset=race_val.select(range(min(1000, len(race_val)))),
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics,
)

bert_base_5k_middle.train()

In [None]:
# Test accuracy of the 5k RACE-middle model: arg-max over the per-choice
# scores gives the predicted answer index.
output = bert_base_5k_middle.predict(tokenized_race["test"])
acc(tokenized_race["test"]["labels"], list(map(np.argmax, output.predictions)))

### RACE - 5k middle + 5k synthetic from those 5k (sent) .56

In [None]:
# Load the synthetic sentence-level examples and align their schema with RACE:
# the id column is called "id" in the extension file and "example_id" in RACE.
# Passing the path straight to read_json lets pandas open and close the file
# (the previous open(...) handle was never closed).
# NOTE(review): the section title says 5k synthetic examples but the file is
# named train_15k_sent.json - confirm this is the intended file.
extension_sents = (
    pd.read_json('/content/drive/MyDrive/TFM/RACE_DATASET/race_extensions/train_15k_sent.json')
    .assign(example_id=lambda frame: frame["id"])
    .drop(columns="id")
)

race = load_dataset("race", "middle")

# Real data (first 5k RACE-middle training rows) followed by the synthetic rows.
df_joined = pd.concat([race["train"].to_pandas().head(5000), extension_sents])

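Sanity check: the joined data must contain the same number of distinct ids as the original 5k sample (i.e. the synthetic rows introduce no new ids).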

In [None]:
# Compare the number of distinct id suffixes (the part after "middle") in the
# joined frame against the original 5k RACE-middle rows.
joined_id_suffixes = df_joined["example_id"].map(lambda x: x.split("middle")[1])
original_id_suffixes = race["train"].to_pandas().head(5000)["example_id"].map(lambda x: x.split("middle")[1])
len(joined_id_suffixes.unique()) == len(original_id_suffixes.unique())

In [None]:
# (rows, columns) of the combined real + synthetic training frame.
df_joined.shape

In [None]:
# Tokenize the combined real + synthetic (sentence) training frame.
tokenizer_bert = AutoTokenizer.from_pretrained(
    "bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True
)
joined_sent_dataset = Dataset.from_pandas(df_joined)
tokenized_race_5k_5ksent = preprocess_dataset(joined_sent_dataset, tokenizer_bert)

Please note: we validate **only against real data** (taken from another split, so its intersection with the training data is empty).

In [None]:
# Fine-tune bert-base-uncased on 5k real RACE-middle examples plus the
# synthetic sentence-level examples (tokenized_race_5k_5ksent), validating on
# the first 1k real RACE-middle validation examples.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/TFM/models/results5k_5ksynth",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=0,
    save_total_limit=0,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step and device
    learning_rate=5.0e-05,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
)


race = load_dataset("race", "middle")
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)
tokenized_race = preprocess_dataset(race, tokenizer_bert)

# Take the leading validation rows with Dataset.select instead of converting
# the whole tokenized split to pandas and back. min() keeps the head()
# semantics if the split is shorter than 1000 rows.
race_val = tokenized_race["validation"]

bert_base_5k_5ksent = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=tokenized_race_5k_5ksent,
    eval_dataset=race_val.select(range(min(1000, len(race_val)))),
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics,
)

bert_base_5k_5ksent.train()

In [None]:
# Test accuracy of the real + synthetic-sentences model: arg-max over the
# per-choice scores gives the predicted answer index.
output = bert_base_5k_5ksent.predict(tokenized_race["test"])
acc(tokenized_race["test"]["labels"], list(map(np.argmax, output.predictions)))

### RACE - 5k middle + 5k synthetic from those 5k (words)

In [None]:
# Load the synthetic word-level examples and align their schema with RACE:
# the id column is called "id" in the extension file and "example_id" in RACE.
# Passing the path straight to read_json lets pandas open and close the file
# (the previous open(...) handle was never closed).
extension_words = (
    pd.read_json('/content/drive/MyDrive/TFM/RACE_DATASET/race_extensions/train_5k_words.json')
    .assign(example_id=lambda frame: frame["id"])
    .drop(columns="id")
)

race = load_dataset("race", "middle")

# Convert the real sample once and reuse it for both the join and the check.
race_train_5k = race["train"].to_pandas().head(5000)

df_words = pd.concat([race_train_5k, extension_words])

# Sanity check: the joined frame has the same number of distinct id suffixes
# (the part after "middle") as the original 5k real rows.
len(df_words["example_id"].apply(lambda x: x.split("middle")[1]).unique()) == len(race_train_5k["example_id"].apply(lambda x: x.split("middle")[1]).unique())

In [None]:
# (rows, columns) of the combined real + synthetic (words) training frame.
df_words.shape

In [None]:
# Tokenize the combined real + synthetic (words) training frame.
tokenizer_bert = AutoTokenizer.from_pretrained(
    "bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True
)
joined_words_dataset = Dataset.from_pandas(df_words)
tokenized_race_5k_5kwords = preprocess_dataset(joined_words_dataset, tokenizer_bert)

In [None]:
# Fine-tune bert-base-uncased on 5k real RACE-middle examples plus the
# synthetic word-level examples (tokenized_race_5k_5kwords), validating on
# the first 1k real RACE-middle validation examples.
# NOTE(review): output_dir is the same directory used by the synthetic
# sentences run (results5k_5ksynth) - confirm the two runs are meant to share it.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/TFM/models/results5k_5ksynth",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=0,
    save_total_limit=0,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step and device
    learning_rate=5.0e-05,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
)


race = load_dataset("race", "middle")
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)
tokenized_race = preprocess_dataset(race, tokenizer_bert)

# Take the leading validation rows with Dataset.select instead of converting
# the whole tokenized split to pandas and back. min() keeps the head()
# semantics if the split is shorter than 1000 rows.
race_val = tokenized_race["validation"]

bert_base_5k_5kwords = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=tokenized_race_5k_5kwords,
    eval_dataset=race_val.select(range(min(1000, len(race_val)))),
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics,
)

bert_base_5k_5kwords.train()

In [None]:
# Test accuracy of the real + synthetic-words model: arg-max over the
# per-choice scores gives the predicted answer index.
output = bert_base_5k_5kwords.predict(tokenized_race["test"])
acc(tokenized_race["test"]["labels"], list(map(np.argmax, output.predictions)))

### RACE - 10k middle

In [None]:
# Reference run with real data only: fine-tune bert-base-uncased on the first
# 10k RACE-middle training examples, validating on the first 1k validation
# examples.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/TFM/models/results10_middle",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=250,
    save_steps=0,
    save_total_limit=0,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 2 * 8 = 16 examples per optimizer step and device
    learning_rate=5.0e-05,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
)


race = load_dataset("race", "middle")
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True, do_lower_case=True, truncation=True)
tokenized_race = preprocess_dataset(race, tokenizer_bert)

# Take the leading rows with Dataset.select instead of converting the whole
# tokenized split to pandas and back: same rows, without materializing every
# example in memory. min() keeps the head() semantics for short splits.
race_train = tokenized_race["train"]
race_val = tokenized_race["validation"]

bert_base_10k_middle = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=race_train.select(range(min(10000, len(race_train)))),
    eval_dataset=race_val.select(range(min(1000, len(race_val)))),
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics,
)

bert_base_10k_middle.train()

In [None]:
# Test accuracy of the 10k RACE-middle model: arg-max over the per-choice
# scores gives the predicted answer index.
output = bert_base_10k_middle.predict(tokenized_race["test"])
acc(tokenized_race["test"]["labels"], list(map(np.argmax, output.predictions)))

---
# ✅ Synthetic Datasets Evaluation
---

Do they make sense? We can infer that they're at least somewhat predictable

## EE original

In [None]:
# Accuracy of the 5k RACE ("all") model on the complete, unsplit EE dataset.
ee = preprocess_dataset(load_ee(split=False), tokenizer_bert)
outputee = bert_base_5k.predict(ee)
acc(ee["labels"], list(map(np.argmax, outputee.predictions)))

In [None]:
# Accuracy of the real + synthetic-sentences RACE model on the complete,
# unsplit EE dataset.
ee = preprocess_dataset(load_ee(split=False), tokenizer_bert)
outputee = bert_base_5k_5ksent.predict(ee)
acc(ee["labels"], list(map(np.argmax, outputee.predictions)))

## EE 100

In [None]:
# Score the 5k RACE-middle model on the synthetic EE examples stored in
# experiment0.json.
extensions = '/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment0.json'
trainy = preprocess_dataset(Dataset.from_pandas(pd.read_json(extensions)), tokenizer_bert)
output2 = bert_base_5k_middle.predict(trainy)
acc(trainy["labels"], list(map(np.argmax, output2.predictions)))

## EE 1000

In [None]:
# Score the 5k RACE-middle model on the synthetic EE examples stored in
# experiment1.json.
extensions = '/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment1.json'
trainy = preprocess_dataset(Dataset.from_pandas(pd.read_json(extensions)), tokenizer_bert)
output3 = bert_base_5k_middle.predict(trainy)
acc(trainy["labels"], list(map(np.argmax, output3.predictions)))

## EE 100 Sentences

In [None]:
# Score the 5k RACE-middle model on the synthetic sentence-level EE examples
# stored in experiment2-sent.json.
extensions = '/content/drive/MyDrive/TFM/EntranceExam/ee_cache_en/experiment2-sent.json'
trainy = preprocess_dataset(Dataset.from_pandas(pd.read_json(extensions)), tokenizer_bert)
output3 = bert_base_5k_middle.predict(trainy)
acc(trainy["labels"], list(map(np.argmax, output3.predictions)))

## RACE-high - words - 3k vs RACE-middle 5k

In [None]:
from os import walk

mypath = '/content/drive/MyDrive/TFM/RACE_DATASET/race_extensions/first_poc/high/'


# Collect the names of the files that sit directly inside `mypath`.
# The next cell opens each entry as `mypath + name`, so names coming from
# sub-directories would point at non-existent paths; stop after the first
# (top-level) entry yielded by walk. Sorting makes the order, and therefore the
# resulting dataset, independent of the filesystem listing order.
f = []
for (dirpath, dirnames, filenames) in walk(mypath):
    f.extend(sorted(filenames))
    break

In [None]:
# Read every synthetic RACE-high example (one JSON document per file) and
# score it with a RACE-trained model. Each file is opened in a `with` block so
# its handle is closed right away instead of leaking one handle per file.
extension_records = []
for fl in f:
    with open(mypath + fl) as extension_file:
        extension_records.append(json.load(extension_file))

extensions = Dataset.from_pandas(pd.DataFrame(extension_records))
trainy = preprocess_dataset(extensions, tokenizer_bert)
# NOTE(review): the section title mentions RACE-middle 5k, but the model used
# here is bert_base_5k (trained on the "all" configuration) - confirm.
output_highhigh = bert_base_5k.predict(trainy)
acc(trainy["labels"], [np.argmax(x) for x in output_highhigh.predictions])

## Training over Synthetic RACE

It has less predictive power, but it is still better than the baseline.

In [None]:
# Fine-tune bert-base-uncased on synthetic data only (`trainy`, as left by the
# preceding cells), validate on the last 620 rows of the real RACE validation
# split and report accuracy on the real test split.
# NOTE(review): `trainy` and `tokenized_race` come from whichever earlier cells
# ran last - confirm they hold the intended datasets before running this cell.
training_args = TrainingArguments(
    output_dir="./results5",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=0,
    save_total_limit=0,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step and device
    learning_rate=5.0e-05,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
)


# Take the trailing validation rows with Dataset.select instead of converting
# the whole tokenized split to pandas and back. max() keeps the tail()
# semantics if the split has fewer than 620 rows.
race_val = tokenized_race["validation"]
race_val_tail = race_val.select(range(max(0, len(race_val) - 620), len(race_val)))

bert_base_3k_synth = Trainer(
    model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased'),
    args=training_args,
    train_dataset=trainy,
    eval_dataset=race_val_tail,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_bert),
    compute_metrics=compute_metrics,
)

bert_base_3k_synth.train()

# Arg-max over the per-choice scores gives the predicted answer index.
output = bert_base_3k_synth.predict(tokenized_race["test"])
acc(tokenized_race["test"]["labels"], [np.argmax(x) for x in output.predictions])

How does this model predict over ee?

In [None]:
# Accuracy of the synthetic-only RACE model on the complete, unsplit EE dataset.
ee = preprocess_dataset(load_ee(split=False), tokenizer_bert)
output_eesynth = bert_base_3k_synth.predict(ee)
acc(ee["labels"], list(map(np.argmax, output_eesynth.predictions)))

Worse than baseline, but better than the ee synthetic ones.