In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.core.module import LightningModule
from lightning.pytorch.trainer import Trainer
from lightning.pytorch.loggers.csv_logs import CSVLogger
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, Callback
import pandas as pd
import pytorch_lightning as pl
import torch
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch import nn
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_cosine_schedule_with_warmup
from coral_pytorch.losses import corn_loss
from coral_pytorch.dataset import corn_label_from_logits
from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error
import numpy as np

In [2]:
RANDOM_SEED = 42
REFERENCE_CONCEPT = 3
BATCH_SIZE = 16
GRADIENT_ACC= 1
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
MAX_LENGTH = 512
PRECISION = "16-mixed"
OBJECTIVE = "ordinal"
VARIANT = "base"
TOKENIZER_NAME = f"neuralmind/bert-{VARIANT}-portuguese-cased"
MODEL_NAME =  f"neuralmind/bert-{VARIANT}-portuguese-cased"
EXPERIMENT_NAME = f"aes_enem_models-sourceB-{OBJECTIVE}-{VARIANT}-C{REFERENCE_CONCEPT+1}"

pl.seed_everything(RANDOM_SEED)
torch.set_float32_matmul_precision('medium')

Seed set to 42


In [3]:
dataset = load_dataset("kamel-usp/aes_enem_dataset", "sourceB", cache_dir="/tmp/aes_enem")

In [4]:
grade_mapping = {
    0: 0,
    40: 1,
    80: 2,
    120: 3,
    160: 4,
    200: 5,
}

def create_label(row):
    grade = row["grades"][REFERENCE_CONCEPT]
    return {"label": grade_mapping[grade]}

dataset = dataset.map(create_label)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
def prepare_dataset(dataset):
    def tokenize_essays(dataset, tokenizer, max_length=512):
        tokenized_text = tokenizer(
                dataset["essay_text"],
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=max_length
            )
        tokenized_text["label"] = dataset["label"]
        return Dataset.from_dict(tokenized_text)

    return tokenize_essays(dataset["full"], tokenizer, MAX_LENGTH)
tokenized_dataset = prepare_dataset(dataset)

In [6]:
split_data = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_data["train"].with_format("torch")
val_dataset = split_data["test"].with_format("torch")
dataset_train = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)
dataset_val = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
len(dataset_train)*BATCH_SIZE, len(dataset_val)*BATCH_SIZE

(2912, 336)

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, 
        cache_dir="/tmp/", 
        num_labels=6,
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def predict_classes(output):
    if OBJECTIVE == "regression":
        # Round the tensor to the nearest integer
        rounded_tensor = torch.round(output.logits)
        # Clamp the values to the range [0, 5]
        clamped_tensor = torch.clamp(rounded_tensor, min=0, max=5)
        return clamped_tensor.view(-1)
    elif OBJECTIVE == "classification":
        return torch.argmax(output.logits, axis=1)
    elif OBJECTIVE == "ordinal":
        return corn_label_from_logits(output.logits)
        
def get_predictions_and_labels(model, dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_predictions = []
    all_true_labels = []
    i=0
    for batch in tqdm(dataloader, desc="Obtaining predictions"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        with torch.no_grad():
            output = model(input_ids, attention_mask)
            predicted_classes = predict_classes(output) 

        # If using GPU, need to move the data back to CPU to use numpy.
        all_predictions.extend(predicted_classes.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

    return list(map(lambda x: x * 40, all_predictions)), list(map(lambda x: x * 40, all_true_labels))

all_predictions, all_true_labels = get_predictions_and_labels(model, dataset_val)

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

In [9]:
def enem_accuracy_score(true_values, predicted_values):
    assert len(true_values) == len(predicted_values), "Mismatched length between true and predicted values."

    non_divergent_count = sum([1 for t, p in zip(true_values, predicted_values) if abs(t - p) <= 80])
    
    return non_divergent_count / len(true_values)

accuracy = accuracy_score(all_true_labels, all_predictions)
print(f"Accuracy on the validation  set: {accuracy:.2f}")

Accuracy on the validation  set: 0.01


In [10]:
qwk = cohen_kappa_score(all_true_labels, all_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
print(f"QWK on the validation set: {qwk:.2f}")

QWK on the validation set: 0.00


In [11]:
enem_accuracy = enem_accuracy_score(all_true_labels, all_predictions)
print(f"Accuracy on the validation set: {enem_accuracy:.2f}")

Accuracy on the validation set: 0.02


In [12]:
class LightningGradePredictor(LightningModule):
    def __init__(self, model: nn.Module, loss_function, learning_rate=0.001, num_warmup_steps=None, num_training_steps=None):
        super(LightningGradePredictor, self).__init__()
        self.model = model
        self.loss = loss_function
        self.learning_rate = learning_rate
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps
        self.save_hyperparameters(ignore=['model'])
        self.training_step_loss = []
        self.validation_step_loss = []

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = self(input_ids, attention_mask)
        loss = None
        if OBJECTIVE == "regression":
            loss = self.loss(outputs.logits, labels.unsqueeze(1).float())
        elif OBJECTIVE == "classification":
            loss = self.loss(outputs.logits, labels)
        elif OBJECTIVE == "ordinal":
            loss = self.loss(outputs.logits, labels, num_classes=self.model.num_labels)
        self.log('train_loss', loss)
        self.training_step_loss.append(loss)
        optimizer = self.optimizers()
        if isinstance(optimizer, list):
            optimizer = optimizer[0]
        lr_current = optimizer.optimizer.param_groups[0]['lr']
        self.log('current_lr', lr_current)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = self(input_ids, attention_mask)
        if OBJECTIVE == "regression":
            loss = self.loss(outputs.logits, labels.unsqueeze(1).float())
        elif OBJECTIVE == "classification":
            loss = self.loss(outputs.logits, labels)
        elif OBJECTIVE == "ordinal":
            loss = self.loss(outputs.logits, labels, num_classes=self.model.num_labels)
        self.log('val_loss', loss)
        self.validation_step_loss.append(loss)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = self(input_ids, attention_mask)
        if OBJECTIVE == "regression":
            loss = self.loss(outputs.logits,  labels.unsqueeze(1).float())
        elif OBJECTIVE == "classification":
            loss = self.loss(outputs.logits, labels)
        elif OBJECTIVE == "ordinal":
            loss = self.loss(outputs.logits, labels, num_classes=self.model.num_labels)
        self.log('test_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate, weight_decay=WEIGHT_DECAY)

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.num_warmup_steps,
            num_training_steps=self.num_training_steps
        )

        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]


In [13]:
class EpochEndCallback(Callback):
    def __init__(self):
        self.metrics_df = pd.DataFrame(columns=
                                       ['Epoch', 'Train Loss', 'Validation Loss',
                                         'Train QWK', 'Validation QWK',
                                         'Train RMSE', 'Validation RMSE'
                                        ])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    def on_train_epoch_end(self, trainer, pl_module):
        current_epoch = trainer.current_epoch
        # Metrics for training data
        train_dataloader = trainer.train_dataloader
        val_dataloader = trainer.val_dataloaders
        
        epoch_train_loss_mean = torch.stack(pl_module.training_step_loss).mean()
        epoch_val_loss_mean = torch.stack(pl_module.validation_step_loss).mean()
        pl_module.training_step_loss.clear()
        pl_module.validation_step_loss.clear()
        model.eval()
        train_predictions, train_true_labels = get_predictions_and_labels(model, train_dataloader)
        val_predictions, val_true_labels = get_predictions_and_labels(model, val_dataloader)
        train_qwk = cohen_kappa_score(train_true_labels, train_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
        val_qwk = cohen_kappa_score(val_true_labels, val_predictions, weights="quadratic", labels=[0,40,80,120,160,200])
        pl_module.log('val_qwk', val_qwk)

        train_rmse = mean_squared_error(train_true_labels, train_predictions)
        val_rmse = mean_squared_error(val_true_labels, val_predictions)
        train_rmse = np.sqrt(train_rmse)
        val_rmse = np.sqrt(val_rmse)
        pl_module.log('val_rmse', val_rmse)

        new_row = {'Epoch': current_epoch, 
                    'Train Loss': epoch_train_loss_mean.cpu().detach().numpy(),
                    'Validation Loss': epoch_val_loss_mean.cpu().detach().numpy(),
                    'Train QWK': train_qwk,
                    'Validation QWK': val_qwk,
                    'Train RMSE': train_rmse,
                    'Validation RMSE': val_rmse
                    }
        new_row = pd.Series(new_row).to_frame().T
        self.metrics_df = pd.concat([self.metrics_df, new_row])
        display(self.metrics_df)

In [14]:
early_stop_callback = EarlyStopping(monitor="val_qwk", patience=3, verbose=True, mode="max")
checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_qwk", mode="max")

In [15]:
def get_loss():
    if OBJECTIVE == "regression":
        return torch.nn.functional.mse_loss
    elif OBJECTIVE == "classification":
        return torch.nn.functional.cross_entropy
    elif OBJECTIVE == "ordinal":
        return corn_loss
       
loss_function = get_loss() 
NUM_EPOCHS = 20
num_training_steps = NUM_EPOCHS * len(dataset_train)
warmup_steps = int(num_training_steps * 0.1)
steps_per_epoch = num_training_steps//NUM_EPOCHS
model.train()
trainer = Trainer(
    max_epochs=NUM_EPOCHS, 
    log_every_n_steps=steps_per_epoch, 
    logger=CSVLogger("model_logs", name=EXPERIMENT_NAME), 
    callbacks=[EpochEndCallback(), early_stop_callback, checkpoint_callback],
    precision=PRECISION,
    accumulate_grad_batches=GRADIENT_ACC
    )
ligthning_model = LightningGradePredictor(model, 
                    loss_function=loss_function,
                    num_training_steps=num_training_steps,
                    num_warmup_steps=warmup_steps,
                    learning_rate = LEARNING_RATE
                    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [16]:
trainer.fit(model=ligthning_model, train_dataloaders=dataset_train, val_dataloaders=dataset_val)

Missing logger folder: model_logs/aes_enem_models-sourceB-ordinal-base-C4
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.711   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/andrebarbosa/miniconda3/envs/aes_enem/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=47` in the `DataLoader` to improve performance.
/home/andrebarbosa/miniconda3/envs/aes_enem/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=47` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709


Metric val_qwk improved. New best score: 0.623


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473


Metric val_qwk improved by 0.033 >= min_delta = 0.0. New best score: 0.657


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473
0,2,0.15689544,0.18670237,0.743683,0.673583,24.67042,27.662563


Metric val_qwk improved by 0.017 >= min_delta = 0.0. New best score: 0.674


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473
0,2,0.15689544,0.18670237,0.743683,0.673583,24.67042,27.662563
0,3,0.12960032,0.17362837,0.80038,0.657236,21.970577,28.72011


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473
0,2,0.15689544,0.18670237,0.743683,0.673583,24.67042,27.662563
0,3,0.12960032,0.17362837,0.80038,0.657236,21.970577,28.72011
0,4,0.091539524,0.19856247,0.933497,0.698094,13.581851,27.391797


Metric val_qwk improved by 0.025 >= min_delta = 0.0. New best score: 0.698


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473
0,2,0.15689544,0.18670237,0.743683,0.673583,24.67042,27.662563
0,3,0.12960032,0.17362837,0.80038,0.657236,21.970577,28.72011
0,4,0.091539524,0.19856247,0.933497,0.698094,13.581851,27.391797
0,5,0.06311048,0.24728142,0.909902,0.604642,15.230368,30.401046


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473
0,2,0.15689544,0.18670237,0.743683,0.673583,24.67042,27.662563
0,3,0.12960032,0.17362837,0.80038,0.657236,21.970577,28.72011
0,4,0.091539524,0.19856247,0.933497,0.698094,13.581851,27.391797
0,5,0.06311048,0.24728142,0.909902,0.604642,15.230368,30.401046
0,6,0.04823941,0.2575669,0.966603,0.640834,9.859209,30.887494


Validation: |          | 0/? [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/182 [00:00<?, ?it/s]

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Epoch,Train Loss,Validation Loss,Train QWK,Validation QWK,Train RMSE,Validation RMSE
0,0,0.3177377,0.23218523,0.656546,0.623229,29.5122,31.28709
0,1,0.17231636,0.17161712,0.683152,0.656725,27.677316,28.633473
0,2,0.15689544,0.18670237,0.743683,0.673583,24.67042,27.662563
0,3,0.12960032,0.17362837,0.80038,0.657236,21.970577,28.72011
0,4,0.091539524,0.19856247,0.933497,0.698094,13.581851,27.391797
0,5,0.06311048,0.24728142,0.909902,0.604642,15.230368,30.401046
0,6,0.04823941,0.2575669,0.966603,0.640834,9.859209,30.887494
0,7,0.026349239,0.26012075,0.975605,0.591611,8.174824,30.7262


Monitored metric val_qwk did not improve in the last 3 records. Best score: 0.698. Signaling Trainer to stop.


In [17]:
best_model = LightningGradePredictor.load_from_checkpoint(checkpoint_callback.best_model_path, 
                                                  model=ligthning_model.model, 
                                                  loss_function=ligthning_model.loss)
best_model.model.eval()
all_predictions, all_true_labels = get_predictions_and_labels(best_model.model, dataset_val)
rmse_val = mean_squared_error(all_true_labels, all_predictions, squared=False)
print(f"RMSE on the validation set: {rmse_val: .4f}")

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

RMSE on the validation set:  27.3918


In [18]:
def compute_metrics(model, dataset, test_group):
    all_predictions, all_true_labels = get_predictions_and_labels(model, dataset)
    accuracy = accuracy_score(all_true_labels, all_predictions)
    qwk = cohen_kappa_score(all_true_labels, all_predictions, weights="quadratic", labels=[0,40,80,120,160,200]) 
    rmse = mean_squared_error(all_true_labels, all_predictions, squared=False)
    horizontal_discrepancy = enem_accuracy_score(all_true_labels, all_predictions)
    result = {
        'Experiment Reference': EXPERIMENT_NAME,
        'Test Group': test_group,
        'Competence': REFERENCE_CONCEPT,
        'Accuracy': [accuracy],
        'RMSE': [rmse],
        'QWK': [qwk],
        'HDIV': [1- horizontal_discrepancy]
    }
    return pd.DataFrame(result)

In [19]:
compute_metrics(model, dataset_val, "validation")

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,aes_enem_models-sourceB-ordinal-base-C4,validation,3,0.673913,27.391797,0.698094,0.006211


In [20]:
display(checkpoint_callback.best_model_path)
display(checkpoint_callback.best_model_score)

'model_logs/aes_enem_models-sourceB-ordinal-base-C4/version_0/checkpoints/epoch=4-step=910.ckpt'

tensor(0.6981, device='cuda:0')

In [21]:
best_model = LightningGradePredictor.load_from_checkpoint(checkpoint_callback.best_model_path,
                                                  model=ligthning_model.model, 
                                                  )

In [22]:
compute_metrics(model, dataset_val, "validation")

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,aes_enem_models-sourceB-ordinal-base-C4,validation,3,0.673913,27.391797,0.698094,0.006211


In [23]:
bert_save_path = f"model_checkpoints/hugging_face/pretraining/{EXPERIMENT_NAME}"
bert_save_path

'model_checkpoints/hugging_face/pretraining/aes_enem_models-sourceB-ordinal-base-C4'

In [24]:
best_model.model.save_pretrained(bert_save_path)

Validation

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
            bert_save_path, 
            cache_dir="/tmp/", 
            num_labels=6,
        )
compute_metrics(model, dataset_val, "validation")

Obtaining predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,Experiment Reference,Test Group,Competence,Accuracy,RMSE,QWK,HDIV
0,aes_enem_models-sourceB-ordinal-base-C4,validation,3,0.673913,27.391797,0.698094,0.006211
