In [4]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer, AutoConfig, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import lightning.pytorch as pl
from torch.utils.data import DataLoader

from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping
from lightning.pytorch.loggers import CSVLogger

from sklearn.metrics import f1_score, accuracy_score



# Seed the global RNGs through Lightning before anything random happens.
SEED = 1234542
pl.seed_everything(SEED, workers=True)

# Name of the dataframe column fed to the tokenizer, and the truncation length
# passed to it as `max_length`.
TEXT_USED = 'text_no_cap'
MAX_LENGTH = 4096

# Pre-split partitions, one CSV file each.
df_train = pd.read_csv('../../data/splitted/train.csv')
df_validation = pd.read_csv('../../data/splitted/validation.csv')
df_test = pd.read_csv('../../data/splitted/test.csv')

# Number of target classes = distinct values of `labels` in the training split.
NUM_CLASSES = len(df_train['labels'].unique())

# Wrap the three dataframes as a Hugging Face DatasetDict keyed by split name.
dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'validation': Dataset.from_pandas(df_validation),
    'test': Dataset.from_pandas(df_test),
})

Global seed set to 1234542


In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Backbone checkpoint in use; the commented lines are the alternatives that
# can be swapped in.
#MODEL_NAME = 'microsoft/deberta-v3-base' # 512 seq length
# MODEL_NAME = 'mnaylor/mega-base-wikitext' # 2048 seq length
#MODEL_NAME='microsoft/deberta-base'
MODEL_NAME = 'allenai/longformer-base-4096' # 4096 seq length

# Everything below is resolved from the same checkpoint name.
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
pretrained_model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def tokenize(batch):
    """Add `input_ids` and `attention_mask` to `batch` and return it.

    The text is read from the `TEXT_USED` field and encoded with the
    module-level `tokenizer`, truncated to `MAX_LENGTH` tokens. `batch` is
    modified in place; all of its other fields are left as they are.
    """
    encoded = tokenizer(batch[TEXT_USED], truncation=True, max_length=MAX_LENGTH)
    batch.update(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
    )
    return batch

# Tokenize every split.
dataset = dataset.map(tokenize)

# Drop the raw text and metadata columns from each split now that the token
# ids have been added.
_columns_to_drop = [
    'headline', 'abstract', 'caption', 'image_url', 'article_url',
    'image_id', 'body', 'full_text', 'text_no_cap', 'labels_text',
]
for _split in ('train', 'validation', 'test'):
    dataset[_split] = dataset[_split].remove_columns(_columns_to_drop)

100%|##########| 48180/48180 [02:46<00:00, 289.52ex/s]
100%|##########| 6022/6022 [00:20<00:00, 294.13ex/s]
100%|##########| 6023/6023 [00:20<00:00, 295.33ex/s]


In [14]:
BATCH_SIZE = 4

# Collator built from the tokenizer; it assembles the individual examples
# into padded batches.
data_collator = DataCollatorWithPadding(tokenizer)

# Settings shared by all three loaders; only the training loader shuffles.
_loader_args = {'batch_size': BATCH_SIZE, 'collate_fn': data_collator}
train_loader = DataLoader(dataset['train'], shuffle=True, **_loader_args)
validation_loader = DataLoader(dataset['validation'], **_loader_args)
test_loader = DataLoader(dataset['test'], **_loader_args)

In [9]:
# Sanity check: print the token ids and their shape for the first training
# batch only, then stop iterating.
for batch in train_loader:
    first_ids = batch['input_ids']
    print(first_ids, first_ids.shape, sep='\n')
    break

tensor([[    0, 35193,   622,  ...,    31,   427,     2],
        [    0,   597,  8831,  ..., 15705,  2380,     2],
        [    0, 30913,  1534,  ...,     6,    38,     2],
        ...,
        [    0,   250, 41802,  ...,     1,     1,     1],
        [    0,  3750,  7378,  ...,     1,     1,     1],
        [    0,  2264,    18,  ...,     1,     1,     1]])
torch.Size([8, 512])


In [15]:
class TextClassifier(pl.LightningModule):
    """Transformer encoder with a small MLP head for text classification.

    The backbone's pooled output goes through
    LayerNorm -> Linear(hidden_size, 512) -> GELU -> Dropout(0.5)
    -> Linear(512, NUM_CLASSES). The loss is cross-entropy and the optimizer
    is AdamW with one learning rate for the backbone and another for the head.

    Args:
        model: Backbone encoder. Its output must expose ``pooler_output``.
            The default is bound to the module-level ``pretrained_model`` at
            the moment this class is defined.
        lr_transformer: Learning rate for the backbone parameters.
        lr_head: Learning rate for the layer norm and the two linear layers.

    Note:
        ``config.hidden_size`` and ``NUM_CLASSES`` are read from module-level
        globals when an instance is created.
    """

    def __init__(self, model=pretrained_model,  lr_transformer=2e-5, lr_head=2e-3):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.lr_transformer = lr_transformer
        self.lr_head = lr_head

        # Training metrics are computed per batch and averaged at the end of
        # the epoch (a mean of per-batch means).
        self.train_loss = []
        self.train_accs = []
        self.train_f1s = []

        # Validation accuracy/F1 are computed once per epoch over every
        # sample seen, which is more precise than averaging batch scores.
        self.val_loss = []
        self.all_val_y_true = []
        self.all_val_y_pred = []

        self.model = model
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.fc1 = nn.Linear(config.hidden_size, 512)
        # self.fc1 = nn.Linear(config.hidden_size, 64) # Mega
        self.activation1 = nn.GELU()
        self.dropout = nn.Dropout(p=0.5)
        self.output = nn.Linear(512, NUM_CLASSES)
        # self.output = nn.Linear(64, NUM_CLASSES) # Mega

    def compute_outputs(self, input_ids, attention_mask):
        """Return the class logits for already-tokenized inputs."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # pooled = outputs['last_hidden_state'][:, 0]  #Get the CLS tokens (deberta)
        pooled = outputs.pooler_output
        hidden = self.layer_norm(pooled)
        hidden = self.activation1(self.fc1(hidden))
        hidden = self.dropout(hidden)
        return self.output(hidden)

    def forward(self, batch):
        """Return the class logits for a collated batch.

        Only ``batch['input_ids']`` and ``batch['attention_mask']`` are read.
        """
        return self.compute_outputs(batch['input_ids'], batch['attention_mask'])

    def _shared_step(self, batch):
        """Run one batch and return ``(loss, predicted ids, true ids)``.

        The loss is the cross-entropy tensor; the two id collections are
        plain Python lists so they can be handed to scikit-learn directly.
        """
        labels = batch['labels']
        logits = self.compute_outputs(batch['input_ids'], batch['attention_mask'])
        loss = self.criterion(logits, labels)
        preds = torch.argmax(logits, dim=-1)
        return loss, preds.tolist(), labels.tolist()

    def training_step(self, batch, batch_idx):
        """Compute the training loss and record per-batch loss, accuracy and macro-F1."""
        loss, y_pred, y_true = self._shared_step(batch)

        # Store a detached copy: keeping the live loss would keep a reference
        # to every step's autograd graph until the end of the epoch.
        self.train_loss.append(loss.detach())
        self.train_accs.append(accuracy_score(y_true=y_true, y_pred=y_pred))
        self.train_f1s.append(f1_score(y_true=y_true, y_pred=y_pred, average='macro'))

        return loss

    def on_train_epoch_end(self):
        """Log the epoch averages of the per-batch training metrics and reset them."""
        mean_loss = sum(self.train_loss) / len(self.train_loss)
        mean_acc = sum(self.train_accs) / len(self.train_accs)
        mean_f1 = sum(self.train_f1s) / len(self.train_f1s)

        self.log("train_loss", mean_loss)
        self.log("train_acc", mean_acc)
        self.log("train_f1", mean_f1)

        self.train_loss = []
        self.train_accs = []
        self.train_f1s = []

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and collect labels/predictions for the epoch."""
        loss, y_pred, y_true = self._shared_step(batch)

        # Detached for symmetry with training_step; only the value is needed.
        self.val_loss.append(loss.detach())
        self.all_val_y_true.extend(y_true)
        self.all_val_y_pred.extend(y_pred)
        return loss

    def on_validation_epoch_end(self):
        """Log mean loss plus accuracy/macro-F1 over all collected samples, then reset."""
        mean_loss = sum(self.val_loss) / len(self.val_loss)

        acc = accuracy_score(y_true=self.all_val_y_true, y_pred=self.all_val_y_pred)
        f1 = f1_score(y_true=self.all_val_y_true, y_pred=self.all_val_y_pred, average='macro')

        self.log("val_loss", mean_loss)
        self.log("val_acc", acc)
        self.log("val_f1", f1)

        self.val_loss = []
        self.all_val_y_true = []
        self.all_val_y_pred = []

    def configure_optimizers(self):
        """Build AdamW with per-group learning rates and a plateau scheduler.

        The backbone group overrides the learning rate with
        ``lr_transformer``; the three head groups fall back to the optimizer
        default ``lr_head``. The scheduler multiplies the learning rates by
        0.1 after 5 epochs without improvement of the logged ``val_loss``.
        """
        optimizer = optim.AdamW([
            {'params': self.model.parameters(), 'lr': self.lr_transformer, 'amsgrad': True, 'weight_decay': 0.01},
            {'params': self.layer_norm.parameters()},
            {'params': self.fc1.parameters()},
            {'params': self.output.parameters()},
        ], lr=self.lr_head, amsgrad=True, weight_decay=0.01)

        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.1, patience=5)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",
            },
        }

In [16]:
# Run identifier, used both as the checkpoint file name and as the name of
# the CSV log folder.
experiment_name = f'{MODEL_NAME}_{TEXT_USED}_{MAX_LENGTH}+Reg'

# CSV logger for the metrics logged by the module.
logger = CSVLogger(save_dir="../../logs/Unimodal/Text", name=experiment_name)

# Callbacks: checkpoint on the highest val_f1, stop after 7 epochs without a
# val_f1 improvement, and record the learning rate once per epoch.
checkpoint_callback = ModelCheckpoint(
    dirpath='../../model_ckpts/Unimodal/Text',
    filename=experiment_name,
    monitor='val_f1',
    mode='max',
)
early_stopping = EarlyStopping(monitor='val_f1', patience=7, mode='max')
lr_monitor = LearningRateMonitor(logging_interval='epoch')

my_model = TextClassifier(pretrained_model)

# Mixed-precision training on GPU 0; gradients are accumulated over 8 batches
# before each optimizer step.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    precision='16-mixed',
    deterministic=True,
    max_epochs=20,
    accumulate_grad_batches=8,
    logger=logger,
    callbacks=[lr_monitor, early_stopping, checkpoint_callback],
)
trainer.fit(
    model=my_model,
    train_dataloaders=train_loader,
    val_dataloaders=validation_loader,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name        | Type             | Params
-------------------------------------------------
0 | criterion   | CrossEntropyLoss | 0     
1 | model       | LongformerModel  | 148 M 
2 | layer_norm  | LayerNorm        | 1.5 K 
3 | fc1         | Linear           | 393 K 
4 | activation1 | GELU             | 0     
5 | dropout     | Dropout          | 0     
6 | output      | Linear         

                                                                           

  rank_zero_warn(


Epoch 0:   0%|          | 39/12045 [00:33<2:52:55,  1.16it/s, v_num=0]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


# PREDICTIONS

In [25]:
# Second loader over the training split, created without `shuffle=True`
# (unlike `train_loader`) and used for prediction.
train_loader_predicting = DataLoader(
    dataset=dataset['train'],
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [26]:
# Checkpoints from other runs, kept for reference:
#   "../../model_ckpts/Unimodal/Text/allenai/longformer-base-4096_text_no_cap-v1.ckpt"
#   "../../model_ckpts/Unimodal/Text/microsoft/deberta-v3-base_text_no_cap.ckpt"
# NOTE(review): the path below is a Mega checkpoint, while MODEL_NAME and the
# 512-unit head in this notebook are set up for Longformer — confirm that the
# backbone and head sizes match the checkpoint before running this cell.
ckpt_path = "../../model_ckpts/Unimodal/Text/mnaylor/mega-base-wikitext_text_no_cap.ckpt"
model = TextClassifier.load_from_checkpoint(ckpt_path)

In [27]:
# Trainer used only for prediction, on GPU 1. The precision is spelled
# '16-mixed', the same form the training cell uses, instead of the legacy
# integer alias 16.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[1],
    deterministic=True,
    max_epochs=25,
    precision='16-mixed',
    accumulate_grad_batches=8,
)

# One call per split; each result is later indexed batch by batch in
# get_predictions, so the loaders used here must not shuffle.
predictions_test = trainer.predict(model, test_loader)
predictions_val = trainer.predict(model, validation_loader)
predictions_train = trainer.predict(model, train_loader_predicting)

  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Predicting DataLoader 0: 100%|##########| 753/753 [00:18<00:00, 39.84it/s]


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Predicting DataLoader 0: 100%|##########| 753/753 [00:18<00:00, 41.26it/s]


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Predicting DataLoader 0: 100%|##########| 6023/6023 [02:20<00:00, 42.90it/s]


In [32]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import seaborn as sns
import pickle

# Folder intended for pickled artefacts.
# NOTE(review): this constant is not referenced anywhere in the visible
# notebook; get_predictions opens its pickle files by bare file name, i.e.
# relative to the current working directory — confirm which location is wanted.
PICKLE_PATH= '../../Pickles'

def get_predictions(loader, predictions, split, out_dir='.'):
    """Score batched predictions against a loader's labels and pickle both label lists.

    Args:
        loader: Iterable of batches. Each batch is indexed with ``'labels'``
            and that value must provide ``.tolist()``. It has to yield the
            batches in the same order and with the same sizes as when
            ``predictions`` was produced, so it must not shuffle.
        predictions: Sequence with one score tensor per batch; the predicted
            class is the argmax over the last dimension.
        split: Tag used in the output file names ``y_true_{split}.pkl`` and
            ``y_pred_{split}.pkl``.
        out_dir: Directory that receives the two pickle files. The default
            ``'.'`` keeps writing them to the current working directory.

    Raises:
        ValueError: If ``predictions`` and ``loader`` disagree on the number
            of batches or on the size of a batch.

    Prints the macro-averaged F1 over all samples and returns ``None``.
    """
    import os  # local import so the function does not depend on another cell

    all_y_true = []
    all_y_pred = []

    n_batches = 0
    for i, batch in enumerate(loader):
        # Fail with a clear message instead of an IndexError on predictions[i].
        if i >= len(predictions):
            raise ValueError(
                f'loader yields more batches than the {len(predictions)} prediction batches given'
            )

        y_pred = torch.argmax(predictions[i], dim=-1).tolist()
        y_true = batch['labels'].tolist()

        # A size mismatch means the loader is not the one (or not configured
        # like the one) that produced `predictions`.
        if len(y_pred) != len(y_true):
            raise ValueError(
                f'batch {i}: {len(y_pred)} predictions but {len(y_true)} labels'
            )

        all_y_true.extend(y_true)
        all_y_pred.extend(y_pred)
        n_batches = i + 1

    # Left-over prediction batches would otherwise be ignored silently.
    if n_batches != len(predictions):
        raise ValueError(
            f'loader yielded {n_batches} batches but {len(predictions)} prediction batches were given'
        )

    with open(os.path.join(out_dir, f'y_true_{split}.pkl'), 'wb') as true_file:
        pickle.dump(all_y_true, true_file)

    with open(os.path.join(out_dir, f'y_pred_{split}.pkl'), 'wb') as pred_file:
        pickle.dump(all_y_pred, pred_file)

    mean_f1 = f1_score(y_true=all_y_true, y_pred=all_y_pred, average='macro')
    print(f'Mean F1: {mean_f1}')

#get_predictions(train_loader, predictions_train, split='train')

In [29]:
# Training split, scored with the unshuffled loader.
get_predictions(
    loader=train_loader_predicting,
    predictions=predictions_train,
    split='train',
)

Mean F1: 0.8647250682555363


In [30]:
# Validation split.
get_predictions(
    loader=validation_loader,
    predictions=predictions_val,
    split='val',
)

Mean F1: 0.8161034072402145


In [31]:
# Test split.
get_predictions(
    loader=test_loader,
    predictions=predictions_test,
    split='test',
)

Mean F1: 0.807400918652049
