# Inference Validation Notebook

TL;DR - 

This is a general-purpose notebook for running [trained models](https://huggingface.co/kamel-usp) so that we can validate the reported results.

In [1]:
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error
import torch
from tqdm.auto import tqdm
import numpy as np
import random
import pandas as pd
from coral_pytorch.dataset import corn_label_from_logits
# Ask PyTorch to use deterministic algorithms so repeated runs are comparable.
torch.use_deterministic_algorithms(True)

# --- Experiment configuration -------------------------------------------------
RANDOM_SEED = 42
# Zero-based index into each essay's `grades` list; the checkpoint name uses index + 1 (C1..).
REFERENCE_CONCEPT = 0
# Decoding strategy used at prediction time: "regression", "classification" or "ordinal".
OBJECTIVE = "ordinal"
MAX_LENGTH = 512
BATCH_SIZE=16
VARIANT = "base" #base/large
TOKENIZER_NAME = f"neuralmind/bert-{VARIANT}-portuguese-cased"
BASE_MODEL = "bertimbau" #bertimbau/sourceB-mlm/sourceB-ordinal
MODEL_NAME = f"kamel-usp/aes_enem_models-sourceA-ordinal-from-{BASE_MODEL}-{VARIANT}-C{REFERENCE_CONCEPT+1}"

# Seed every RNG this notebook imports, not only torch's, so that any use of
# `random` or numpy in the main process is reproducible as well.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

MODEL_NAME

'kamel-usp/aes_enem_models-sourceA-ordinal-from-bertimbau-base-C1'

In [2]:
# Load the "sourceAWithGraders" configuration of the kamel-usp/aes_enem_dataset
# dataset, caching the downloaded files under /tmp/aes_enem.
dataset = load_dataset("kamel-usp/aes_enem_dataset", "sourceAWithGraders", cache_dir="/tmp/aes_enem")

Downloading builder script:   0%|          | 0.00/25.4k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing HTML files from: sourceAWithGraders:   0%|          | 0/44 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'id_prompt', 'essay_title', 'essay_text', 'grades', 'essay_year'],
        num_rows: 738
    })
    validation: Dataset({
        features: ['id', 'id_prompt', 'essay_title', 'essay_text', 'grades', 'essay_year'],
        num_rows: 204
    })
    test: Dataset({
        features: ['id', 'id_prompt', 'essay_title', 'essay_text', 'grades', 'essay_year'],
        num_rows: 213
    })
})

In [4]:
# Peek at the first record of the test split to see the raw field layout.
dataset["test"][0]

{'id': '5.html',
 'id_prompt': 'o-brasil-paralisado-o-que-voce-pensa-sobre-a-greve-dos-caminhoneiros',
 'essay_title': 'O Brasil e seus conflitos',
 'essay_text': 'É de conhecimento geral que a notícia se espalha pelo Brasil inteiro, pois a manifestação está sendo falada em todo o território nacional. Com base nos conhecimentos os caminhoneiros estão reivindicando seus direitos. Em consequência disso, vê-se a todo instante reportagens que vem a julgar a manifestação, no entanto, dizem que os caminhoneiros estao causando desordem entre a população, nota-se qeos grevistas não estão apenas manifestando para si, e sim àqueles que não tiveram coragem de sair nas ruas a declarar sua indignação. Em todos os canais há notas de esclarecimento que devido a greve, está faltando remédios em hospitais, doadores, etc. Isso é praticamente um meio de não culpar os governantes e querem pôr culpa nos caminhoneiros. Em virtudes dos fatos mencionados, conclui-se que há uma esperança de vida melhor assim q

In [5]:
# Map each raw grade (0, 40, ..., 200) to its class index (0..5).
grade_mapping = {
    grade: class_index
    for class_index, grade in enumerate(range(0, 201, 40))
}

def create_label(row):
    """Build the ``label`` field for one dataset row.

    Reads the grade at position ``REFERENCE_CONCEPT`` of ``row["grades"]`` and
    translates it to a class index through ``grade_mapping``.

    Returns:
        A dict with the single key ``"label"``.
    """
    return {"label": grade_mapping[row["grades"][REFERENCE_CONCEPT]]}

# Add the "label" column produced by create_label to the dataset.
dataset = dataset.map(create_label)

Map:   0%|          | 0/738 [00:00<?, ? examples/s]

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

In [6]:
def compute_difference(lists, threshold=80):
    """Tell whether the graders of one essay disagree by more than ``threshold``.

    Each element of ``lists`` is the grade sequence of one row for the same
    essay; only the grade at position ``REFERENCE_CONCEPT`` is considered.
    The largest pairwise absolute difference among a set of numbers equals
    ``max - min``, so comparing that spread against the threshold is
    equivalent to checking every pair, works for any number of rows and does
    not depend on their order.

    Args:
        lists: Sequence of per-row grade sequences belonging to one essay.
        threshold: Largest spread still regarded as agreement (default 80).

    Returns:
        True when any two rows differ by more than ``threshold`` on the
        reference concept, otherwise False. Fewer than two rows cannot
        disagree, so that case returns False instead of raising IndexError.
    """
    concept_grades = [grades[REFERENCE_CONCEPT] for grades in lists]
    if len(concept_grades) < 2:
        return False

    return (max(concept_grades) - min(concept_grades)) > threshold

# Flag every test essay as hard/easy and attach that flag to each of its rows.
test_df = dataset["test"].to_pandas()
new_test_df = (
    test_df.groupby(["id_prompt", "id"])
    .agg({"grades": list})  # collect all grade rows of the same essay
    .apply(lambda essay: compute_difference(essay["grades"]), axis=1)
    .reset_index()  # the unnamed result column is called 0 at this point
    .merge(test_df, on=["id_prompt", "id"])
    .rename(columns={0: "is_hard"})
)

In [7]:
# Register two extra splits: essays whose graders agree ("test_easy")
# and essays whose graders diverge ("test_hard").
for split_name, hard_flag in (("test_easy", False), ("test_hard", True)):
    dataset[split_name] = Dataset.from_pandas(new_test_df[new_test_df["is_hard"] == hard_flag])

In [8]:
# Load the fast tokenizer identified by TOKENIZER_NAME.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
def get_model_instance(model_path, objective):
    """Load a sequence-classification model configured for ``objective``.

    Args:
        model_path: Identifier or path handed to ``from_pretrained``.
        objective: ``"regression"`` (one output), or ``"classification"`` /
            ``"ordinal"`` (six outputs).

    Returns:
        The loaded model, or None when ``objective`` is not one of the
        values listed above.
    """
    if objective == "regression":
        cache_dir, num_labels = "/tmp/", 1
    elif objective in ("classification", "ordinal"):
        cache_dir, num_labels = "/tmp/aes_enem2", 6
    else:
        # Unknown objective: the caller is expected to check for None.
        return None

    return AutoModelForSequenceClassification.from_pretrained(
        model_path,
        cache_dir=cache_dir,
        num_labels=num_labels,
    )
# Instantiate the checkpoint under evaluation; get_model_instance returns None
# for an objective it does not recognise, so fail loudly in that case.
model = get_model_instance(MODEL_NAME, OBJECTIVE)
if model is None:
    raise ValueError("Please set a Pre defined Objective")

In [9]:
def prepare_dataset(dataset):
    """Tokenize every split of ``dataset`` and return the result as a DatasetDict.

    Uses the module-level ``tokenizer`` and ``MAX_LENGTH``. For each split the
    ``essay_text`` column is tokenized (truncated and padded to the maximum
    length) and the ``label`` column is carried over unchanged.

    Args:
        dataset: Mapping from split name to a dataset exposing ``essay_text``
            and ``label`` columns.

    Returns:
        A DatasetDict with the same split names holding the tokenizer output
        plus ``label``.
    """
    def tokenize_essays(split_data, tok, max_length=512):
        # Encode all essays of one split in a single tokenizer call.
        encoded = tok(
            split_data["essay_text"],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        encoded["label"] = split_data["label"]
        return encoded

    # First pass: tokenize each split.
    encoded_by_split = {}
    for split_name, split_data in dataset.items():
        encoded_by_split[split_name] = tokenize_essays(split_data, tokenizer, MAX_LENGTH)

    # Second pass: wrap the encoded columns back into Dataset objects.
    wrapped_splits = {}
    for split_name, encoded in encoded_by_split.items():
        wrapped_splits[split_name] = Dataset.from_dict(encoded)

    return DatasetDict(wrapped_splits)

# Tokenize every split currently held in `dataset`.
dataset_tokenized = prepare_dataset(dataset)

In [10]:
def seed_worker(worker_id):
    """Seed numpy's and Python's RNGs from the current torch seed.

    Intended to be passed as ``worker_init_fn`` to a DataLoader. The seed is
    ``torch.initial_seed()`` reduced modulo 2**32; ``worker_id`` is not used.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(derived_seed)

In [11]:
# One seeded generator is shared by all loaders.
g = torch.Generator()
g.manual_seed(RANDOM_SEED)


def _make_loader(split_name, shuffle=False):
    """Build a DataLoader over one tokenized split with the shared settings."""
    return DataLoader(
        dataset_tokenized[split_name].with_format("torch"),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=0,
        worker_init_fn=seed_worker,
        generator=g,
    )


# Only the training loader shuffles; the evaluation loaders keep dataset order.
data_train = _make_loader("train", shuffle=True)
data_val = _make_loader("validation")
data_test = _make_loader("test")

data_test_easy = _make_loader("test_easy")
data_test_hard = _make_loader("test_hard")

In [13]:
def predict_classes(output):
    """Turn raw model output into predicted class values.

    The decoding strategy is chosen by the module-level ``OBJECTIVE``:

    * ``"regression"``: round the logits to the nearest integer, clamp them
      to the range [0, 5] and flatten to one dimension.
    * ``"classification"``: arg-max over dimension 1 of the logits.
    * ``"ordinal"``: decode the logits with ``corn_label_from_logits``.

    Args:
        output: Model output exposing a ``logits`` tensor.

    Returns:
        A tensor with the predicted class of each example.

    Raises:
        ValueError: If ``OBJECTIVE`` is not one of the supported values.
            Previously the function fell through and returned None, which
            only surfaced later as an unrelated AttributeError.
    """
    if OBJECTIVE == "regression":
        # Round the tensor to the nearest integer
        rounded_tensor = torch.round(output.logits)
        # Clamp the values to the range [0, 5]
        clamped_tensor = torch.clamp(rounded_tensor, min=0, max=5)
        return clamped_tensor.view(-1)
    if OBJECTIVE == "classification":
        return torch.argmax(output.logits, dim=1)
    if OBJECTIVE == "ordinal":
        return corn_label_from_logits(output.logits)
    raise ValueError(
        f"Unsupported OBJECTIVE {OBJECTIVE!r}; expected 'regression', 'classification' or 'ordinal'."
    )
        
def get_predictions_and_labels(model, dataloader, device="cpu"):
    """Run ``model`` over ``dataloader`` and collect predictions and labels.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output is decoded by ``predict_classes``.
        dataloader: Iterable of batches providing ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        device: Device used for inference. Defaults to ``"cpu"``, which is
            what this function always used before the parameter existed.

    Returns:
        A tuple ``(predictions, true_labels)`` of lists. Every value is the
        class value multiplied by 40, i.e. expressed on the 0-200 grade scale
        (the inverse of ``grade_mapping``).
    """
    GRADE_STEP = 40  # distance between two consecutive grade levels

    model.to(device)
    # Make sure dropout and similar layers are in inference mode regardless of
    # what the caller did with the model beforehand.
    model.eval()
    all_predictions = []
    all_true_labels = []
    for batch in tqdm(dataloader, desc="Obtaining predictions"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        with torch.no_grad():
            # Keyword arguments avoid depending on the positional order of forward().
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_classes = predict_classes(output)

        # If using GPU, need to move the data back to CPU to use numpy.
        all_predictions.extend(predicted_classes.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

    scaled_predictions = [prediction * GRADE_STEP for prediction in all_predictions]
    scaled_labels = [label * GRADE_STEP for label in all_true_labels]
    return scaled_predictions, scaled_labels

In [14]:
def enem_accuracy_score(true_values, predicted_values):
    """Return the share of predictions lying within 80 points of the true grade.

    Args:
        true_values: Sequence of reference grades.
        predicted_values: Sequence of predicted grades, same length.

    Returns:
        Number of pairs with ``abs(true - predicted) <= 80`` divided by the
        number of true values.
    """
    assert len(true_values) == len(predicted_values), "Mismatched length between true and predicted values."

    total = len(true_values)
    within_tolerance = 0
    for expected, predicted in zip(true_values, predicted_values):
        if abs(expected - predicted) <= 80:
            within_tolerance += 1

    return within_tolerance / total

In [15]:
# Exact-match accuracy on the test split. The predictions come from `data_test`,
# so the message names the test set (it previously said "validation").
all_predictions, all_true_labels = get_predictions_and_labels(model, data_test)
accuracy = accuracy_score(all_true_labels, all_predictions)
print(f"Accuracy on the test set: {accuracy:.2f}")

Obtaining predictions:   0%|          | 0/14 [00:00<?, ?it/s]

Accuracy on the validation  set: 0.53


In [16]:
# Quadratic weighted kappa over the six grade levels. `all_predictions` and
# `all_true_labels` were computed from `data_test`, so this is a test-set metric.
qwk = cohen_kappa_score(all_true_labels, all_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
print(f"QWK on the test set: {qwk:.2f}")

QWK on the validation set: 0.46


In [17]:
# Share of test predictions within 80 points of the true grade. The label names
# the metric explicitly so it is not confused with exact-match accuracy.
enem_accuracy = enem_accuracy_score(all_true_labels, all_predictions)
print(f"ENEM accuracy (within 80 points) on the test set: {enem_accuracy:.2f}")

Accuracy on the validation set: 0.94


In [18]:
def compute_metrics(model, dataset, test_group):
    """Evaluate ``model`` on one data loader and return a one-row metrics table.

    Args:
        model: Model handed to ``get_predictions_and_labels``.
        dataset: The batches to evaluate; despite its name this is passed
            straight to ``get_predictions_and_labels`` as the dataloader.
        test_group: Free-form name of the evaluated subset, stored in the
            ``Test Group`` column.

    Returns:
        A pandas DataFrame with a single row holding the experiment
        reference, test group, competence index, accuracy, RMSE, quadratic
        weighted kappa and HDIV (one minus the ``enem_accuracy_score``).
    """
    all_predictions, all_true_labels = get_predictions_and_labels(model, dataset)
    accuracy = accuracy_score(all_true_labels, all_predictions)
    qwk = cohen_kappa_score(all_true_labels, all_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(all_true_labels, all_predictions))
    horizontal_discrepancy = enem_accuracy_score(all_true_labels, all_predictions)
    result = {
        'Experiment Reference': MODEL_NAME,
        'Test Group': test_group,
        'Competence': REFERENCE_CONCEPT,
        'Accuracy': [accuracy],
        'RMSE': [rmse],
        'QWK': [qwk],
        'HDIV': [1- horizontal_discrepancy]
    }
    return pd.DataFrame(result)

In [19]:
# Report the metric table for the full test split and for its "easy" subset.
model.eval()
with torch.no_grad():
    for loader, group_name in ((data_test, "full"), (data_test_easy, "easy")):
        display(compute_metrics(model, loader, group_name))

Obtaining predictions:   0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,kamel-usp/aes_enem_models-sourceA-ordinal-from...,full,0,0.525822,43.074343,0.463946,0.056338


Obtaining predictions:   0%|          | 0/11 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,kamel-usp/aes_enem_models-sourceA-ordinal-from...,easy,0,0.574713,29.556103,0.552396,0.0


In [20]:
# Show the identifier of the checkpoint that was evaluated in this notebook.
MODEL_NAME

'kamel-usp/aes_enem_models-sourceA-ordinal-from-bertimbau-base-C1'