In [6]:
# load in packages

import pytorch_lightning as pl
import torch
import numpy as np

import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset

from models.transformer_clf import Transformer_CLF
from data.meta_dataset import MetaDataset

import os


In [7]:
config = {'include':['go_emotions'],
         'encoder_name':'bert-base-uncased',
         'nu':-1,
         'hidden_dims':[256, 128],
         'act_fn':'Tanh',
         'lr':1e-5,
         'batch_size':8,
         'max_epochs':999,
         'version':'go_emotions_test',
         'checkpoint_path':'./checkpoints/baselines',
         'gpu':False}

device = 'cuda' if (torch.cuda.is_available() and config['gpu']) else 'cpu'

In [12]:
class CustomDataset(Dataset):
    def __init__(self, dataset, tokenizer, device, cutoff=100):
        self.samples = []
        
        for label in sorted(dataset.keys()):
            for i, point in enumerate(dataset[label]):
                tokenized_input = tokenizer(point['text'],
                                    return_tensors='pt',
                                    padding='max_length',
                                    truncation=True).to(device)


                self.samples.append((tokenized_input['input_ids'].squeeze(), tokenized_input['attention_mask'].squeeze(),
                          label))
            if i > cutoff:
                break

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

# foo_train = CustomDataset(dataset['go_emotions']['train'], tokenizer, device)  

# def extract_single_dataset(dataset, tokenizer, device, extract='go_emotions', shuffle=False):
#     data_splits = {}
    
#     for split in dataset[extract].keys():
#         dataset_split = dataset[extract][split]
# #         data_split = []
# #         input_ids, attention_mask, labels = [], [], []
#         x, y = [], []
#         for label in sorted(dataset_split.keys()):
#             for point in dataset_split[label]:
#                 tokenized_input = tokenizer(point['text'],
#                                     return_tensors='pt',
#                                     padding=True).to(device)
                
                
#                 x.append([tokenized_input['input_ids'], tokenized_input['attention_mask'],
#                           torch.LongTensor(int(label))])
        
                
                

#         print(x)
#         data_splits[split] = x
        
#         print(data_splits[split])
    
    
#     return data_splits


# go_emotions_dataset = extract_single_dataset(dataset, tokenizer, device)


In [13]:
def extract_dataloaders(dataset, tokenizer, device, batch_size=8, extract='go_emotions', shuffle=True):
    data_splits = {}
    for split in dataset[extract].keys():
        data_split = CustomDataset(dataset[extract][split], tokenizer, device)
        data_split_loader = torch.utils.data.DataLoader(data_split, shuffle=shuffle, batch_size=batch_size)
        data_splits[split] = data_split_loader
        
    return data_splits

In [14]:
class CLFTrainer(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()
        
        self.config = config
        # Create model        
        self.model = Transformer_CLF(config)
        # # Create loss module
        self.loss_module = nn.CrossEntropyLoss()


    def forward(self, text, attn_mask):
        return self.model(text, attn_mask)

        
    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(), self.config["lr"])
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99)
        return [optimizer], [scheduler]


    def encode(self, text, attn_mask=None):
        return self.model.encode(text, attn_mask)
    
    def training_step(self, batch, batch_idx):
        # "batch" is the output of the train data loader.
        print(batch, batch_idx)
        text, attn_mask, labels = batch
        preds = self.model(text, attn_mask)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()
        
        self.log('train_acc', acc, on_step=False, on_epoch=True) # Logs the accuracy per epoch to tensorboard (weighted average over batches)
        self.log('train_loss', loss)
        return loss # Return tensor to call ".backward" on


    def validation_step(self, batch, batch_idx):
        print(batch, batch_idx)
        
        text, attn_mask, labels = batch
        
        print(text, attn_mask, labels)
        print(text.shape, attn_mask.shape, labels.shape)
        preds = self.model(text, attn_mask).argmax(dim=-1)
        acc = (labels == preds).float().mean()

        self.log('val_acc', acc) # By default logs it per epoch (weighted average over batches)


    def test_step(self, batch, batch_idx):
        print(batch, batch_idx)
        text, attn_mask, labels = batch
        preds = self.model(text, attn_mask).argmax(dim=-1)
        acc = (labels == preds).float().mean()
        self.log('test_acc', acc) # By default logs it per epoch (weighted average over batches), and returns it afterwards


In [15]:
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
import torch.nn as nn

def train_model(config):
    """
    Function for training and testing a NLI model.
    Inputs:
        config - Namespace object from the argument parser
    """
    
    device = 'cuda' if (torch.cuda.is_available() and config['gpu']) else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(config['encoder_name'])
    
    print("Extracting datasets")
    # ToDo: process data and make sure it uses same amount of training data as protomaml
    dataset = MetaDataset(include=config['include'])
    tokenizer_kwargs = {'return_tensors':'pt',
                                    'padding':'max_length',
                                    'truncation':True}
    
    dataset.prep(tokenizer)
    
    print("creating dataloaders")
    data_loaders = extract_dataloaders(dataset, tokenizer, device)
    train_loader = data_loaders['train']
    validation_loader = data_loaders['validation']
    test_loader = data_loaders['test']
    
    # ToDo: add n_classes
    config["n_classes"] = 27

    print('creating trainer')

    trainer = pl.Trainer(default_root_dir=os.path.join(config['checkpoint_path'], config['version']),                                  # Where to save models
                         checkpoint_callback=ModelCheckpoint(save_weights_only=True, 
                                                             mode="max", monitor="val_acc"), # Save the best checkpoint based on the maximum val_acc recorded. Saves only weights and not optimizer
                         gpus=1 if str(device)=="cuda" else 0,                                                     # We run on a single GPU (if possible)
                         max_epochs=config['max_epochs'],                                                                             # How many epochs to train for if no patience is set
#                          callbacks=[LearningRateMonitor("epoch")],                                                   # Log learning rate every epoch
                         progress_bar_refresh_rate=100
                         )                                                                   # In case your notebook crashes due to the progress bar, consider increasing the refresh rate
    trainer.logger._log_graph = True         # If True, we plot the computation graph in tensorboard
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    pl.seed_everything(1234) # To be reproducable
    
    model = CLFTrainer(config)
    trainer.fit(model, train_loader, validation_loader)
    
    model = CLFTrainer.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    val_result = trainer.test(model, test_dataloaders=validation_loader, verbose=False)
    test_result = trainer.test(model, test_dataloaders=test_loader, verbose=False)
    
    return model

train_model(config)

Extracting datasets


No config specified, defaulting to: go_emotions/simplified
Reusing dataset go_emotions (/home/bart/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/ef1c18ea192c771555f1e0d638889dd5f1896255782c57c6a0b934d5f94f779e)
Loading cached processed dataset at /home/bart/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/ef1c18ea192c771555f1e0d638889dd5f1896255782c57c6a0b934d5f94f779e/cache-52d24b382185989d.arrow
Loading cached processed dataset at /home/bart/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/ef1c18ea192c771555f1e0d638889dd5f1896255782c57c6a0b934d5f94f779e/cache-ed2d20be2890b5ac.arrow
Loading cached processed dataset at /home/bart/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/ef1c18ea192c771555f1e0d638889dd5f1896255782c57c6a0b934d5f94f779e/cache-7610501b2ccd2f6e.arrow


creating dataloaders


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 1234


creating trainer



  | Name        | Type             | Params
-------------------------------------------------
0 | model       | Transformer_CLF  | 109 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
85.9 M    Trainable params
23.8 M    Non-trainable params
109 M     Total params
438.862   Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][tensor([[ 101, 1996, 3376,  ...,    0,    0,    0],
        [ 101, 6429, 2466,  ...,    0,    0,    0],
        [ 101, 2821, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 1996, 3658,  ...,    0,    0,    0],
        [ 101, 1026, 2171,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 0, 0, 0, 0, 0])] 0
tensor([[ 101, 1996, 3376,  ...,    0,    0,    0],
        [ 101, 6429, 2466,  ...,    0,    0,    0],
        [ 101, 2821, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 1996, 3658,  ...,    0,    0,    0],
        [ 101, 1026, 2171,  ...,    0,    0,    0]]) tensor([[1, 1, 1,  ..., 0, 0,



TypeError: forward() takes 2 positional arguments but 3 were given