In [1]:
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error
import pytorch_lightning as pl
import torch
from torch import nn
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from lightning.pytorch.core.module import LightningModule
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
import pandas as pd
from IPython.display import display
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.trainer import Trainer
from lightning.pytorch.loggers.csv_logs import CSVLogger
from coral_pytorch.losses import corn_loss
from coral_pytorch.dataset import corn_label_from_logits
import os


RANDOM_SEED = 42
REFERENCE_CONCEPT = 0
MAX_LENGTH = 512
BATCH_SIZE=16
WEIGHT_DECAY=0.01
GRADIENT_ACC = 1
LEARNING_RATE = 1e-4
PRECISION = "16-mixed"
OBJECTIVE = "classification"
VARIANT = "base"
TOKENIZER_NAME = f"neuralmind/bert-{VARIANT}-portuguese-cased"
MODEL_NAME = f"kamel-usp/aes_enem_models-sourceB-ordinal-{VARIANT}-C{REFERENCE_CONCEPT+1}"
BASE_MODEL = "sourceB-ordinal"
EXPERIMENT_NAME = f"aes_enem_models-sourceA-{OBJECTIVE}-from-{BASE_MODEL}-{VARIANT}-C{REFERENCE_CONCEPT+1}"

pl.seed_everything(RANDOM_SEED)
torch.set_float32_matmul_precision('medium')

Seed set to 42


In [2]:
dataset = load_dataset("kamel-usp/aes_enem_dataset", "sourceAWithGraders", cache_dir="/tmp/aes_enem")

ConnectionError: Couldn't reach 'kamel-usp/aes_enem_dataset' on the Hub (ConnectionError)

In [4]:
grade_mapping = {
    0: 0,
    40: 1,
    80: 2,
    120: 3,
    160: 4,
    200: 5,
}

def create_label(row):
    grade = row["grades"][REFERENCE_CONCEPT]
    return {"label": grade_mapping[grade]}

dataset = dataset.map(create_label)

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Map:   0%|          | 0/195 [00:00<?, ? examples/s]

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

In [5]:
def compute_difference(lists):
    # Assuming the first element is the reference for subtraction
    reference = lists[0][REFERENCE_CONCEPT]
    grader_a = lists[1][REFERENCE_CONCEPT]
    grader_b = lists[2][REFERENCE_CONCEPT]

    # Calculate absolute differences
    diff_ref_a = abs(reference - grader_a)
    diff_ref_b = abs(reference - grader_b)
    diff_a_b = abs(grader_a - grader_b)

    # Check if any difference is greater than 80
    return diff_ref_a > 80 or diff_ref_b > 80 or diff_a_b > 80

test_df = dataset["test"].to_pandas()
new_test_df = pd.merge(
    test_df.groupby(["id_prompt", "id"]).agg({"grades": list}).apply(lambda x: compute_difference(x['grades']), axis=1).reset_index(),
    test_df,
    on=["id_prompt","id"]
).rename(columns={0: "is_hard"})

In [6]:
dataset["test_easy"] = Dataset.from_pandas(new_test_df[new_test_df["is_hard"]==False])
dataset["test_hard"] = Dataset.from_pandas(new_test_df[new_test_df["is_hard"]==True])

In [7]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
if OBJECTIVE == "regression":
    model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, 
            cache_dir="/tmp/", 
            num_labels=1,
        )
elif OBJECTIVE == "classification" or OBJECTIVE == "ordinal":
    model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, 
            cache_dir="/tmp/", 
            num_labels=6,
        )
else:
    raise ValueError("Please set a Pre defined Objectice")

In [8]:
def prepare_dataset(dataset):
    def tokenize_essays(dataset, tokenizer, max_length=512):
        tokenized_text = tokenizer(
                dataset["essay_text"],
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=max_length
            )
        tokenized_text["label"] = dataset["label"]
        return tokenized_text
    
    tokenized_datasets = {
        split: tokenize_essays(sub_dataset, tokenizer, MAX_LENGTH)
        for split, sub_dataset in dataset.items()
    }
    dataset_tokenized = DatasetDict({
        split: Dataset.from_dict(data)
        for split, data in tokenized_datasets.items()
    })

    return dataset_tokenized

dataset_tokenized = prepare_dataset(dataset)

In [9]:
data_train = DataLoader(
    dataset_tokenized["train"].with_format("torch"), batch_size=BATCH_SIZE, shuffle=True, num_workers=0
)
data_val = DataLoader(dataset_tokenized["validation"].with_format("torch"), batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
data_test = DataLoader(dataset_tokenized["test"].with_format("torch"), batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

data_test_easy = DataLoader(dataset_tokenized["test_easy"].with_format("torch"), batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
data_test_hard = DataLoader(dataset_tokenized["test_hard"].with_format("torch"), batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [10]:
def predict_classes(output):
    if OBJECTIVE == "regression":
        # Round the tensor to the nearest integer
        rounded_tensor = torch.round(output.logits)
        # Clamp the values to the range [0, 5]
        clamped_tensor = torch.clamp(rounded_tensor, min=0, max=5)
        return clamped_tensor.view(-1)
    elif OBJECTIVE == "classification":
        return torch.argmax(output.logits, axis=1)
    elif OBJECTIVE == "ordinal":
        return corn_label_from_logits(output.logits)
        
def get_predictions_and_labels(model, dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_predictions = []
    all_true_labels = []
    i=0
    for batch in tqdm(dataloader, desc="Obtaining predictions"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        with torch.no_grad():
            output = model(input_ids, attention_mask)
            predicted_classes = predict_classes(output) 

        # If using GPU, need to move the data back to CPU to use numpy.
        all_predictions.extend(predicted_classes.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

    return list(map(lambda x: x * 40, all_predictions)), list(map(lambda x: x * 40, all_true_labels))

all_predictions, all_true_labels = get_predictions_and_labels(model, data_val)

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

In [11]:
def enem_accuracy_score(true_values, predicted_values):
    assert len(true_values) == len(predicted_values), "Mismatched length between true and predicted values."

    non_divergent_count = sum([1 for t, p in zip(true_values, predicted_values) if abs(t - p) <= 80])
    
    return non_divergent_count / len(true_values)

In [12]:
accuracy = accuracy_score(all_true_labels, all_predictions)
print(f"Accuracy on the validation  set: {accuracy:.2f}")

Accuracy on the validation  set: 0.46


In [13]:
qwk = cohen_kappa_score(all_true_labels, all_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
print(f"QWK on the validation set: {qwk:.2f}")

QWK on the validation set: 0.28


In [14]:
enem_accuracy = enem_accuracy_score(all_true_labels, all_predictions)
print(f"Accuracy on the validation set: {enem_accuracy:.2f}")

Accuracy on the validation set: 0.94


In [19]:
class LightningGradePredictor(LightningModule):
    def __init__(self, model: nn.Module, loss_function, learning_rate=0.001, num_warmup_steps=None, num_training_steps=None):
        super(LightningGradePredictor, self).__init__()
        self.model = model
        self.loss = loss_function
        self.learning_rate = learning_rate
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps
        self.save_hyperparameters(ignore=['model'])
        self.training_step_loss = []
        self.validation_step_loss = []

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = self(input_ids, attention_mask)
        loss = None
        if OBJECTIVE == "regression":
            loss = self.loss(outputs.logits, labels.unsqueeze(1).float())
        elif OBJECTIVE == "classification":
            loss = self.loss(outputs.logits, labels)
        elif OBJECTIVE == "ordinal":
            loss = self.loss(outputs.logits, labels, num_classes=self.model.num_labels)
        self.log('train_loss', loss)
        self.training_step_loss.append(loss)
        optimizer = self.optimizers()
        if isinstance(optimizer, list):
            optimizer = optimizer[0]
        lr_current = optimizer.optimizer.param_groups[0]['lr']
        self.log('current_lr', lr_current)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = self(input_ids, attention_mask)
        if OBJECTIVE == "regression":
            loss = self.loss(outputs.logits, labels.unsqueeze(1).float())
        elif OBJECTIVE == "classification":
            loss = self.loss(outputs.logits, labels)
        elif OBJECTIVE == "ordinal":
            loss = self.loss(outputs.logits, labels, num_classes=self.model.num_labels)
        self.log('val_loss', loss)
        self.validation_step_loss.append(loss)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = self(input_ids, attention_mask)
        if OBJECTIVE == "regression":
            loss = self.loss(outputs.logits,  labels.unsqueeze(1).float())
        elif OBJECTIVE == "classification":
            loss = self.loss(outputs.logits, labels)
        elif OBJECTIVE == "ordinal":
            loss = self.loss(outputs.logits, labels, num_classes=self.model.num_labels)
        self.log('test_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate, weight_decay=WEIGHT_DECAY)

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.num_warmup_steps,
            num_training_steps=self.num_training_steps
        )

        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [20]:
class EpochEndCallback(Callback):
    def __init__(self):
        self.metrics_df = pd.DataFrame(columns=
                                       ['Epoch', 'Train Loss', 'Validation Loss',
                                         'Train QWK', 'Validation QWK',
                                         'Train RMSE', 'Validation RMSE'
                                        ])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    def on_train_epoch_end(self, trainer, pl_module):
        current_epoch = trainer.current_epoch
        # Metrics for training data
        train_dataloader = trainer.train_dataloader
        val_dataloader = trainer.val_dataloaders
        
        epoch_train_loss_mean = torch.stack(pl_module.training_step_loss).mean()
        epoch_val_loss_mean = torch.stack(pl_module.validation_step_loss).mean()
        pl_module.training_step_loss.clear()
        pl_module.validation_step_loss.clear()
        model.eval()
        train_predictions, train_true_labels = get_predictions_and_labels(model, train_dataloader)
        val_predictions, val_true_labels = get_predictions_and_labels(model, val_dataloader)
        train_qwk = cohen_kappa_score(train_true_labels, train_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
        val_qwk = cohen_kappa_score(val_true_labels, val_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
        pl_module.log('val_qwk', val_qwk)

        train_rmse = mean_squared_error(train_true_labels, train_predictions)
        val_rmse = mean_squared_error(val_true_labels, val_predictions)
        train_rmse = np.sqrt(train_rmse)
        val_rmse = np.sqrt(val_rmse)
        pl_module.log('val_rmse', val_rmse)

        new_row = {'Epoch': current_epoch, 
                    'Train Loss': epoch_train_loss_mean.cpu().detach().numpy(),
                    'Validation Loss': epoch_val_loss_mean.cpu().detach().numpy(),
                    'Train QWK': train_qwk,
                    'Validation QWK': val_qwk,
                    'Train RMSE': train_rmse,
                    'Validation RMSE': val_rmse
                    }
        new_row = pd.Series(new_row).to_frame().T
        self.metrics_df = pd.concat([self.metrics_df, new_row])
        display(self.metrics_df)

In [21]:
early_stop_callback = EarlyStopping(monitor="val_qwk", patience=3, verbose=True, mode="max")
checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_qwk", mode="max")

In [22]:
def get_loss():
    if OBJECTIVE == "regression":
        return torch.nn.functional.mse_loss
    elif OBJECTIVE == "classification":
        return torch.nn.functional.cross_entropy
    elif OBJECTIVE == "ordinal":
        return corn_loss
       
loss_function = get_loss() 
NUM_EPOCHS = 20
num_training_steps = NUM_EPOCHS * len(data_train)
warmup_steps = int(num_training_steps * 0.1)
steps_per_epoch = num_training_steps//NUM_EPOCHS
model.train()
trainer = Trainer(
    max_epochs=NUM_EPOCHS, 
    log_every_n_steps=steps_per_epoch, 
    logger=CSVLogger("model_logs", name=EXPERIMENT_NAME), 
    callbacks=[EpochEndCallback(), early_stop_callback, checkpoint_callback],
    precision=PRECISION,
    accumulate_grad_batches=GRADIENT_ACC
    )
ligthning_model = LightningGradePredictor(model, 
                    loss_function=loss_function,
                    num_training_steps=num_training_steps,
                    num_warmup_steps=warmup_steps,
                    learning_rate = LEARNING_RATE
                    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
trainer.fit(model=ligthning_model, train_dataloaders=data_train, val_dataloaders=data_val)

Missing logger folder: model_logs/aes_enem_models-sourceA-ordinal-from-sourceB-ordinal-base-C5
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.711   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/andrebarbosa/miniconda3/envs/aes_enem/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=47` in the `DataLoader` to improve performance.
/home/andrebarbosa/miniconda3/envs/aes_enem/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=47` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/47 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.57109076,0.6423257,0.498858,0.286512,48.725697,58.212696


Metric val_qwk improved. New best score: 0.287


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/47 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.57109076,0.6423257,0.498858,0.286512,48.725697,58.212696
0,1,0.46558607,0.55239934,0.6764,0.429161,45.365896,54.500059


Metric val_qwk improved by 0.143 >= min_delta = 0.0. New best score: 0.429


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/47 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.57109076,0.6423257,0.498858,0.286512,48.725697,58.212696
0,1,0.46558607,0.55239934,0.6764,0.429161,45.365896,54.500059
0,2,0.43214744,0.5861577,0.632701,0.373088,46.25781,55.247694


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/47 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.57109076,0.6423257,0.498858,0.286512,48.725697,58.212696
0,1,0.46558607,0.55239934,0.6764,0.429161,45.365896,54.500059
0,2,0.43214744,0.5861577,0.632701,0.373088,46.25781,55.247694
0,3,0.4116666,0.5732384,0.705568,0.399249,44.091791,55.02447


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/47 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.57109076,0.6423257,0.498858,0.286512,48.725697,58.212696
0,1,0.46558607,0.55239934,0.6764,0.429161,45.365896,54.500059
0,2,0.43214744,0.5861577,0.632701,0.373088,46.25781,55.247694
0,3,0.4116666,0.5732384,0.705568,0.399249,44.091791,55.02447
0,4,0.37811443,0.64613014,0.675218,0.319549,44.334991,55.912019


Monitored metric val_qwk did not improve in the last 3 records. Best score: 0.429. Signaling Trainer to stop.


In [23]:
tmp = "model_logs/aes_enem_models-sourceA-from-bertimbau-base/version_0/checkpoints/epoch=7-step=376.ckpt"

In [25]:
best_model = LightningGradePredictor.load_from_checkpoint(tmp, 
                                                  model=ligthning_model.model, 
                                                  loss_function=ligthning_model.loss)
best_model.model.eval()
all_predictions, all_true_labels = get_predictions_and_labels(best_model.model, data_val)
rmse_val = mean_squared_error(all_true_labels, all_predictions, squared=False)
print(f"RMSE on the validation set: {rmse_val: .4f}")

Obtaining predictions:   0%|          | 0/13 [00:00<?, ?it/s]

RMSE on the validation set:  125.4816


In [20]:
def compute_metrics(model, dataset, test_group):
    all_predictions, all_true_labels = get_predictions_and_labels(model, dataset)
    accuracy = accuracy_score(all_true_labels, all_predictions)
    qwk = cohen_kappa_score(all_true_labels, all_predictions, weights="quadratic", labels=[0,40,80,120,160,200]) 
    rmse = mean_squared_error(all_true_labels, all_predictions, squared=False)
    horizontal_discrepancy = enem_accuracy_score(all_true_labels, all_predictions)
    result = {
        'Experiment Reference': EXPERIMENT_NAME,
        'Test Group': test_group,
        'Competence': REFERENCE_CONCEPT,
        'Accuracy': [accuracy],
        'RMSE': [rmse],
        'QWK': [qwk],
        'HDIV': [1- horizontal_discrepancy]
    }
    return pd.DataFrame(result)

In [23]:
def generate_report(model, data_test, data_test_easy, data_test_hard):
    result = pd.concat([
        compute_metrics(model, data_test, "full"),
        compute_metrics(model, data_test_easy, "easy"),
        compute_metrics(model, data_test_hard, "hard")
    ])
    directory = "experiment_reports"
    if not os.path.exists(directory):
        os.makedirs(directory)
    file_path = f"{directory}/{EXPERIMENT_NAME}.csv"
    file_exists = os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    # if file_exists:
         # result.to_csv(file_path, mode='a', header=False, index=False)
    # else:
        # result.to_csv(file_path, mode='w', header=True, index=False)
    return result

In [24]:
generate_report(best_model.model, data_test, data_test_easy, data_test_hard)

Obtaining predictions:   0%|          | 0/14 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/12 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,aes_enem_models-sourceA-ordinal-from-sourceB-o...,full,4,0.203704,94.946379,-0.03648,0.351852
0,aes_enem_models-sourceA-ordinal-from-sourceB-o...,easy,4,0.194444,95.125648,-0.041077,0.338889
0,aes_enem_models-sourceA-ordinal-from-sourceB-o...,hard,4,0.25,94.044907,-0.013582,0.416667


In [23]:
bert_save_path = f"model_checkpoints/hugging_face/finetuning/{EXPERIMENT_NAME}"
bert_save_path

'model_checkpoints/hugging_face/finetuning/aes_enem_models-sourceA-ordinal-from-sourceB-ordinal-base-C5'

In [24]:
best_model.model.save_pretrained(bert_save_path)

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
            bert_save_path, 
            cache_dir="/tmp/", 
            num_labels=6,
        )
compute_metrics(model, data_test, "full")

Obtaining predictions:   0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,aes_enem_models-sourceA-ordinal-from-sourceB-o...,full,4,0.319444,50.184844,0.53211,0.037037
