In [1]:
# load in packages

import pytorch_lightning as pl
import torch
import numpy as np

import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset

from models.transformer_clf import Transformer_CLF
from data.meta_dataset import MetaDataset

import os


In [2]:
# Run settings for the baseline classifier.
# Keys read in this notebook: 'include' (datasets to load), 'encoder_name'
# (tokenizer checkpoint), 'lr', 'batch_size', 'max_epochs', 'version' and
# 'checkpoint_path' (output directories), and 'gpu' (device selection).
# 'nu', 'hidden_dims' and 'act_fn' are not read here; presumably they are
# consumed by Transformer_CLF -- TODO confirm.
config = dict(
    include=['go_emotions'],
    encoder_name='bert-base-uncased',
    nu=-1,
    hidden_dims=[256, 128],
    act_fn='Tanh',
    lr=1e-5,
    batch_size=8,
    max_epochs=1,
    version='go_emotions_test',
    checkpoint_path='./checkpoints/baselines',
    gpu=True,
)

# Use CUDA only when it is available AND the config asks for it.
if torch.cuda.is_available() and config['gpu']:
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset of pre-tokenized ``(input_ids, attention_mask, label)`` tuples.

    All tokenization happens eagerly in ``__init__``; ``__getitem__`` is a
    plain list lookup.

    Args:
        dataset: Mapping from label to an iterable of examples. Each example
            must expose its raw text under the ``'text'`` key. Labels are
            visited in sorted order.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding='max_length', truncation=True)``. Its result must support
            ``.to(device)`` and lookup of ``'input_ids'`` and
            ``'attention_mask'``.
        device: Device the tokenized tensors are moved to.
        cutoff: Accepted for backward compatibility but not used; the sample
            list is never truncated.
    """

    def __init__(self, dataset, tokenizer, device, cutoff=100):
        self.samples = []

        for label in sorted(dataset.keys()):
            for point in dataset[label]:
                # NOTE(review): every sample is moved to `device` up front.
                # Holding CUDA tensors inside a Dataset is generally
                # discouraged when the DataLoader uses worker processes --
                # confirm this is intended before raising num_workers.
                encoded = tokenizer(point['text'],
                                    return_tensors='pt',
                                    padding='max_length',
                                    truncation=True).to(device)

                # squeeze(0) removes only the leading batch axis added by
                # return_tensors='pt'. A bare squeeze() would also collapse a
                # length-1 sequence axis into a 0-d tensor.
                self.samples.append((encoded['input_ids'].squeeze(0),
                                     encoded['attention_mask'].squeeze(0),
                                     label))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask, label)`` tuple at ``idx``."""
        return self.samples[idx]

In [4]:
def extract_dataloaders(dataset, tokenizer, device, batch_size=8, extract='go_emotions', shuffle=True, num_workers=0):
    """Build one ``DataLoader`` per split of ``dataset[extract]``.

    Args:
        dataset: Object indexable by dataset name; ``dataset[extract]`` must
            expose ``.keys()`` (the split names) and be indexable by split.
        tokenizer: Tokenizer forwarded to ``CustomDataset``.
        device: Device forwarded to ``CustomDataset``.
        batch_size: Batch size used for every split.
        extract: Name of the dataset whose splits are turned into loaders.
        shuffle: Whether to shuffle the non-evaluation splits. Splits named
            'validation', 'val', 'valid', 'dev' or 'test' are never shuffled.
        num_workers: Number of DataLoader worker processes.

    Returns:
        dict mapping each split name to its ``DataLoader``.
    """
    # Evaluation data should be iterated in a fixed order: shuffling it gives
    # no benefit and makes predictions impossible to align with examples.
    eval_split_names = {'validation', 'val', 'valid', 'dev', 'test'}

    data_splits = {}
    for split in dataset[extract].keys():
        data_split = CustomDataset(dataset[extract][split], tokenizer, device)
        shuffle_split = shuffle and str(split).lower() not in eval_split_names
        data_splits[split] = torch.utils.data.DataLoader(data_split,
                                                         shuffle=shuffle_split,
                                                         batch_size=batch_size,
                                                         num_workers=num_workers)

    return data_splits

In [5]:
class CLFTrainer(pl.LightningModule):
    """LightningModule that trains a ``Transformer_CLF`` with cross-entropy loss.

    Batches are expected as ``(text, attn_mask, labels)`` triples, matching
    what the loaders built from ``CustomDataset`` yield.

    Args:
        config: Settings object passed unchanged to ``Transformer_CLF``;
            ``config["lr"]`` is also read for the optimizer.
    """

    def __init__(self, config):
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()

        self.config = config
        # Create model
        self.model = Transformer_CLF(config)
        # Create loss module. Referenced through `torch.nn` so the class does
        # not depend on an `nn` alias that is only imported in a later cell.
        self.loss_module = torch.nn.CrossEntropyLoss()

    def forward(self, text, attn_mask):
        """Return the wrapped model's output for ``text`` and ``attn_mask``."""
        return self.model(text, attn_mask)

    def configure_optimizers(self):
        """Return plain SGD plus a StepLR schedule (step size 1, gamma 0.99)."""
        optimizer = optim.SGD(self.parameters(), self.config["lr"])
        # Multiplies the learning rate by 0.99 at every scheduler step.
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99)
        return [optimizer], [scheduler]

    def encode(self, text, attn_mask=None):
        """Delegate to the wrapped model's ``encode`` method."""
        return self.model.encode(text, attn_mask)

    def _batch_accuracy(self, batch):
        """Return the mean accuracy of argmax predictions on one batch."""
        text, attn_mask, labels = batch
        preds = self.model(text, attn_mask).argmax(dim=-1)
        return (labels == preds).float().mean()

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log loss and accuracy."""
        # "batch" is the output of the train data loader.
        text, attn_mask, labels = batch
        preds = self.model(text, attn_mask)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()

        self.log('train_acc', acc, on_step=False, on_epoch=True)  # Logs the accuracy per epoch to tensorboard (weighted average over batches)
        self.log('train_loss', loss)
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        """Log the accuracy of one validation batch as ``val_acc``."""
        self.log('val_acc', self._batch_accuracy(batch))  # By default logs it per epoch (weighted average over batches)

    def test_step(self, batch, batch_idx):
        """Log the accuracy of one test batch as ``test_acc``."""
        self.log('test_acc', self._batch_accuracy(batch))  # By default logs it per epoch (weighted average over batches), and returns it afterwards


In [None]:
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
import torch.nn as nn

def train_model(config):
    """
    Train a CLFTrainer classifier and evaluate it on the test split.

    Inputs:
        config - dict of run settings. Keys read here: 'gpu', 'encoder_name',
                 'include', 'batch_size', 'checkpoint_path', 'version' and
                 'max_epochs'. If 'n_classes' is absent it is added to the
                 dict in place with the value 27.
    Returns:
        The CLFTrainer reloaded from the checkpoint with the best 'val_acc',
        or the in-memory trained model when no checkpoint path was recorded.
    """

    device = 'cuda' if (torch.cuda.is_available() and config['gpu']) else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(config['encoder_name'])

    print("Extracting datasets")
    # ToDo: process data and make sure it uses same amount of training data as protomaml
    dataset = MetaDataset(include=config['include'])
    dataset.prep(tokenizer)

    print("creating dataloaders")
    data_loaders = extract_dataloaders(dataset, tokenizer, device, config['batch_size'])
    train_loader = data_loaders['train']
    validation_loader = data_loaders['validation']
    test_loader = data_loaders['test']

    # ToDo: add n_classes
    # Only fill in the default so a caller-provided value is respected.
    # NOTE(review): 27 is hard-coded; confirm it matches the label set that
    # MetaDataset produces for the included datasets.
    config.setdefault("n_classes", 27)

    print('creating trainer')
    checkpoint_callback = ModelCheckpoint(dirpath=config['checkpoint_path'], save_weights_only=True, mode="max", monitor="val_acc")
    # The checkpoint callback goes through `callbacks`; passing a
    # ModelCheckpoint instance via `checkpoint_callback=` is deprecated.
    trainer = pl.Trainer(default_root_dir=os.path.join(config['checkpoint_path'], config['version']),
                         callbacks=[checkpoint_callback],
                         gpus=1 if str(device) == "cuda" else 0,
                         max_epochs=config['max_epochs'],
                         progress_bar_refresh_rate=1
                         )
    trainer.logger._log_graph = False
    trainer.logger._default_hp_metric = None

    pl.seed_everything(1234)

    model = CLFTrainer(config)
    trainer.fit(model, train_loader, validation_loader)

    # best_model_path is empty when no checkpoint was written; fall back to
    # the weights currently in memory instead of failing on an empty path.
    best_model_path = checkpoint_callback.best_model_path
    if best_model_path:
        model = CLFTrainer.load_from_checkpoint(best_model_path)

    # The loader is passed positionally so the call does not depend on the
    # keyword name, which differs between Lightning versions.
    test_result = trainer.test(model, test_loader, verbose=False)
    print('test results:', test_result)

    return model

# Run the train/validate/test pipeline with the `config` dict defined earlier in the notebook.
train_model(config)

Extracting datasets


No config specified, defaulting to: go_emotions/simplified
Reusing dataset go_emotions (C:\Users\luuk1\.cache\huggingface\datasets\go_emotions\simplified\0.0.0\b781b3f96f1b333b895ded30861c0d4a07d66e1cfbdfb89bc3fb4d5fc899aa27)


HBox(children=(FloatProgress(value=0.0, max=44.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


creating dataloaders


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Global seed set to 1234


creating trainer


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | Transformer_CLF  | 109 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
85.9 M    Trainable params
23.8 M    Non-trainable params
109 M     Total params
438.862   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 1234


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…