In [None]:
# default_exp lightning

In [None]:
# export
import logging
logging.disable(logging.CRITICAL)
import os
import random
import torch
import pytorch_lightning as pl
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from test_tube import HyperOptArgumentParser
from emotion_transformer.dataloader import dataloader
from emotion_transformer.model import sentence_embeds_model, context_classifier_model, metrics, f1_score

# PyTorch Lightning

> construction of the PyTorch Lightning module and the hyperparameter search for the SemEval-2019 Task 3 dataset (contextual emotion detection in text)

## Lightning Module

Defining the Lightning module is now straightforward; see also the [documentation](https://williamfalcon.github.io/pytorch-lightning/). The default hyperparameter choices were motivated by [this paper](https://arxiv.org/pdf/1905.05583.pdf).

Further references for PyTorch Lightning and its usage for Multi-GPU Training/Hyperparameter search can be found in the following blog posts by William Falcon: 

* [9 Tips For Training Lightning-Fast Neural Networks In Pytorch](https://towardsdatascience.com/9-tips-for-training-lightning-fast-neural-networks-in-pytorch-8e63a502f565)

* [Trivial Multi-Node Training With Pytorch-Lightning](https://towardsdatascience.com/trivial-multi-node-training-with-pytorch-lightning-ff75dfb809bd?gi=ec854edcc8eb)

* [Converting From Keras To PyTorch Lightning](https://towardsdatascience.com/converting-from-keras-to-pytorch-lightning-be40326d7b7d)

In [None]:
# export
class EmotionModel(pl.LightningModule):
    """
    PyTorch Lightning module for the Contextual Emotion Detection in Text Challenge

    Combines a sentence embedding model with a context classifier, and defines the
    training/validation/test steps, the optimizer and the dataloaders that
    Lightning calls during `fit` and `test`.
    """

    def __init__(self, hparams):
        """
        pass in parsed HyperOptArgumentParser to the model

        `projection_size`, `n_layers` and `dropout` are read from `hparams` to build
        the two sub-models; the remaining attributes are read lazily by the other methods.
        """
        super().__init__()
        self.hparams = hparams
        # emotion label -> class index, shared by the classifier and the dataloaders
        self.emo_dict = {'others': 0, 'sad': 1, 'angry': 2, 'happy': 3}
        self.sentence_embeds_model = sentence_embeds_model(hparams.projection_size,
                                                           dropout = hparams.dropout)
        self.context_classifier_model = context_classifier_model(hparams.projection_size,
                                                                 hparams.n_layers,
                                                                 self.emo_dict,
                                                                 dropout = hparams.dropout)

    def forward(self, input_ids, attention_mask, labels = None):
        """
        Embed the sentences and classify them in context.

        While `current_epoch < hparams.frozen_epochs` the sentence embedding model is
        evaluated under `torch.no_grad()`, so no gradients are tracked through it;
        afterwards it is evaluated normally.

        Returns the output of the context classifier. The step methods unpack it as
        `(loss, logits)` when `labels` are given.
        """
        if self.current_epoch < self.hparams.frozen_epochs:
            # frozen phase: compute the embeddings without tracking gradients
            with torch.no_grad():
                sentence_embeds = self.sentence_embeds_model(input_ids = input_ids,
                                                             attention_mask = attention_mask)
        else:
            sentence_embeds = self.sentence_embeds_model(input_ids = input_ids,
                                                         attention_mask = attention_mask)
        return self.context_classifier_model(sentence_embeds = sentence_embeds, labels = labels)

    def training_step(self, batch, batch_idx):
        """
        Lightning calls this inside the training loop

        `batch` is unpacked as `(input_ids, attention_mask, labels)`. Returns a dict with
        the loss under 'loss' and the same value under 'log' -> 'train_loss'.
        """
        input_ids, attention_mask, labels = batch
        loss, _ = self.forward(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        # in dp/ddp2 mode make sure that a scalar result gets another dim in the beginning
        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss = loss.unsqueeze(0)

        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """
        Lightning calls this inside the validation loop

        Returns the dictionary produced by `metrics(loss, logits, labels)`; in dp/ddp2
        mode every value additionally gets a leading dimension.
        """
        input_ids, attention_mask, labels = batch

        loss, logits = self.forward(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        scores_dict = metrics(loss, logits, labels)

        # in dp/ddp2 mode make sure that scalar results get another dim in the beginning
        if self.trainer.use_dp or self.trainer.use_ddp2:
            scores_dict = {key: value.unsqueeze(0) for key, value in scores_dict.items()}

        return scores_dict

    def validation_end(self, outputs):
        """
        called at the end of validation to aggregate outputs

        The metrics 'tp', 'fp' and 'fn' are summed over all validation steps, every other
        metric is averaged over the steps. The summed counts are then passed to `f1_score`
        and its result is merged into the aggregated metrics.

        :param outputs: list of individual outputs of each validation step
        :return: dict with the aggregated metrics under 'progress_bar' and 'log', and the
                 aggregated 'val_loss' under 'val_loss'
        """
        count_metrics = ('tp', 'fp', 'fn')
        tqdm_dict = {}

        for metric_name in outputs[0].keys():
            metric_total = 0

            for output in outputs:
                metric_value = output[metric_name]

                # in dp/ddp2 mode each step holds one entry per gpu: reduce them first
                if self.trainer.use_dp or self.trainer.use_ddp2:
                    if metric_name in count_metrics:
                        metric_value = torch.sum(metric_value)
                    else:
                        metric_value = torch.mean(metric_value)

                metric_total += metric_value

            if metric_name in count_metrics:
                tqdm_dict[metric_name] = metric_total
            else:
                tqdm_dict[metric_name] = metric_total / len(outputs)

        # presumably precision/recall/F1 derived from the summed counts -- see `f1_score`
        prec_rec_f1 = f1_score(tqdm_dict['tp'], tqdm_dict['fp'], tqdm_dict['fn'])
        tqdm_dict.update(prec_rec_f1)
        result = {'progress_bar': tqdm_dict, 'log': tqdm_dict, 'val_loss': tqdm_dict["val_loss"]}
        return result

    def test_step(self, batch, batch_idx):
        """
        Lightning calls this inside the test loop; identical to `validation_step`
        """
        return self.validation_step(batch, batch_idx)

    def test_end(self, outputs):
        """
        called at the end of testing; identical to `validation_end`
        """
        return self.validation_end(outputs)

    def configure_optimizers(self):
        """
        returns the optimizer and scheduler

        A single AdamW optimizer handles the context classifier together with the
        parameter groups returned by `sentence_embeds_model.layerwise_lr(lr2, layerwise_decay)`.
        """
        # the classifier group sets no learning rate of its own and therefore uses the
        # optimizer default `lr2`; `hparams.lr1` is currently not used
        opt_parameters = [{'params': self.context_classifier_model.parameters()}]
        opt_parameters += self.sentence_embeds_model.layerwise_lr(self.hparams.lr2,
                                                                  self.hparams.layerwise_decay)
        optimizer = torch.optim.AdamW(opt_parameters, lr=self.hparams.lr2)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
        return [optimizer], [scheduler]

    @pl.data_loader
    def train_dataloader(self):
        """
        dataloader for `hparams.train_file`
        """
        return dataloader(self.hparams.train_file, self.hparams.max_seq_len,
                          self.hparams.bs, self.emo_dict, use_ddp = self.use_ddp)

    @pl.data_loader
    def val_dataloader(self):
        """
        dataloader for `hparams.val_file`
        """
        return dataloader(self.hparams.val_file, self.hparams.max_seq_len,
                          self.hparams.bs, self.emo_dict, use_ddp = self.use_ddp)

    @pl.data_loader
    def test_dataloader(self):
        """
        dataloader for `hparams.test_file`
        """
        return dataloader(self.hparams.test_file, self.hparams.max_seq_len,
                          self.hparams.bs, self.emo_dict, use_ddp = self.use_ddp)

    @staticmethod
    def add_model_specific_args(parent_parser, root_dir):
        """
        parameters defined here will be available to the model through self.hparams

        :param parent_parser: parser whose arguments are inherited
        :param root_dir: directory against which the default data file paths are resolved
        :return: HyperOptArgumentParser holding the parent and the model specific arguments
        """
        parser = HyperOptArgumentParser(parents=[parent_parser])

        parser.opt_list('--bs', default=32, type=int, options=[32, 128], tunable=True,
                        help='mini-batch size (default: 32), this is the total batch size of all GPUs '
                        'on the current node when using Data Parallel or Distributed Data Parallel')
        # NOTE(review): the defaults of --projection_size (4) and --max_seq_len (3) are not among
        # their `options`; they look like quick-debugging values -- confirm before real training
        parser.opt_list('--projection_size', default=4, type=int, options=[64, 512], tunable=True)
        parser.opt_list('--n_layers', default=1, type=int, options=[1, 4], tunable=True)
        parser.opt_list('--frozen_epochs', default=1, type=int, options=[1, 4], tunable=True)
        parser.opt_range('--lr1', default=2.0e-5, type=float, tunable=True, low=1.0e-5, high=5.0e-4,
                         nb_samples=5, help='initial learning rate for the top transformer')
        parser.opt_range('--lr2', default=2.0e-5, type=float, tunable=True, low=1.0e-5, high=5.0e-4,
                         nb_samples=5, help='initial learning rate for both transformers')
        parser.opt_list('--layerwise_decay', default=0.95, type=float, options=[0.3, 0.8], tunable=True)
        parser.opt_list('--max_seq_len', default=3, type=int, options=[16, 64], tunable=False)
        parser.opt_list('--dropout', default=0.1, type=float, options=[0.1, 0.2], tunable=False)
        # NOTE(review): train, val and test all default to the same file -- confirm this is intended
        parser.add_argument('--train_file', default=os.path.join(root_dir, 'data/clean_test.txt'), type=str)
        parser.add_argument('--val_file', default=os.path.join(root_dir, 'data/clean_test.txt'), type=str)
        parser.add_argument('--test_file', default=os.path.join(root_dir, 'data/clean_test.txt'), type=str)
        parser.add_argument('--epochs', default=10, type=int, metavar='N',
                            help='number of total epochs to run')
        parser.add_argument('--seed', type=int, default=None,
                            help='seed for initializing training')

        return parser

## Hyperparameter Search Argument Parser

Next we define the `HyperOptArgumentParser`, including options for distributed training (see also the [documentation](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/)) and debugging functionality.

In [None]:
# export
def get_args(model):
    """
    returns the HyperOptArgumentParser

    Builds a parent parser (random search strategy) with the run, hardware and
    debugging options and hands it, together with the current working directory,
    to `model.add_model_specific_args`, whose result is returned.
    """
    working_dir = os.getcwd()
    base = HyperOptArgumentParser(strategy='random_search', add_help=False)

    # run and hardware configuration
    base.add_argument('--mode', type=str, default='default',
                      choices=('default', 'test', 'hparams_search'),
                      help='supports default for train/test/val and hparams_search for a hyperparameter search')
    base.add_argument('--save-path', metavar='DIR', type=str,
                      default=os.path.join(working_dir, 'logs'),
                      help='path to save output')
    base.add_argument('--gpus', type=str, default=None, help='which gpus')
    base.add_argument('--distributed-backend', type=str, default=None,
                      choices=('dp', 'ddp', 'ddp2'),
                      help='supports three options dp, ddp, ddp2')

    # boolean switches: 16 bit precision first, then the two debugging flags
    switches = (
        ('use_16bit', 'if true uses 16 bit precision'),
        ('fast_dev_run', 'debugging a full train/val/test loop'),
        ('track_grad_norm', 'inspect gradient norms'),
    )
    for switch, description in switches:
        base.add_argument('--' + switch, dest=switch, action='store_true', help=description)

    return model.add_model_specific_args(base, working_dir)

Let us take a look at the different attributes of `hparams`.

In [None]:
# parse an empty argument list so that every hyperparameter takes its default value
hparams = get_args(EmotionModel).parse_args(args=[])
vars(hparams)

{'mode': 'default',
 'save_path': '/home/julius/Documents/nbdev_venv/emotion_transformer/logs',
 'gpus': None,
 'distributed_backend': None,
 'use_16bit': False,
 'fast_dev_run': False,
 'track_grad_norm': False,
 'bs': 32,
 'projection_size': 4,
 'n_layers': 1,
 'frozen_epochs': 1,
 'lr1': 2e-05,
 'lr2': 2e-05,
 'layerwise_decay': 0.95,
 'max_seq_len': 3,
 'dropout': 0.1,
 'train_file': '/home/julius/Documents/nbdev_venv/emotion_transformer/data/clean_test.txt',
 'val_file': '/home/julius/Documents/nbdev_venv/emotion_transformer/data/clean_test.txt',
 'test_file': '/home/julius/Documents/nbdev_venv/emotion_transformer/data/clean_test.txt',
 'epochs': 10,
 'seed': None,
 'hpc_exp_number': None,
 'trials': <bound method HyperOptArgumentParser.opt_trials of HyperOptArgumentParser(prog='ipykernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.HelpFormatter'>, conflict_handler='error', add_help=True)>,
 'optimize_parallel': <bound method HyperOptArgumentParser

## Trainer
Next we define a function calling the Lightning trainer using the setting specified in `hparams`.

In [None]:
# export
def main(hparams, gpus = None):
    """
    Trains the Lightning model as specified in `hparams`

    :param hparams: parsed hyperparameters; `seed`, `save_path`, `gpus`, `distributed_backend`,
                    `use_16bit`, `epochs`, `fast_dev_run`, `track_grad_norm` and `mode` are read here
    :param gpus: optional comma-separated string of gpu ids; when given, the number of
                 entries is passed to the trainer instead of `hparams.gpus`

    The model is always fitted; if `hparams.mode == 'test'` the test loop runs afterwards.
    """
    # seed before the model is built so that any random weight initialisation
    # inside `EmotionModel` is covered by the seed as well
    if hparams.seed is not None:
        random.seed(hparams.seed)
        torch.manual_seed(hparams.seed)
        torch.backends.cudnn.deterministic = True
        # the cuDNN auto-tuner may select different algorithms from run to run
        torch.backends.cudnn.benchmark = False

    model = EmotionModel(hparams)

    trainer = pl.Trainer(default_save_path=hparams.save_path,
                        gpus=len(gpus.split(",")) if gpus else hparams.gpus,
                        distributed_backend=hparams.distributed_backend,
                        use_amp=hparams.use_16bit,
                        max_nb_epochs=hparams.epochs,
                        fast_dev_run=hparams.fast_dev_run,
                        # Lightning expects the norm type, -1 disables the tracking
                        track_grad_norm=(2 if hparams.track_grad_norm else -1))
    trainer.fit(model)

    if hparams.mode == 'test':
        trainer.test()

Let us check the model by running a quick development run.

In [None]:
# restrict the check to a quick development run instead of a full training
hparams.fast_dev_run = True
main(hparams)

Validation sanity check:   0%|          | 0/5 [00:00<?, ?batch/s]

0


Validation sanity check:  20%|██        | 1/5 [00:00<00:01,  3.92batch/s]

0


Validation sanity check:  60%|██████    | 3/5 [00:00<00:00,  4.42batch/s]

0
0


Validation sanity check:  80%|████████  | 4/5 [00:00<00:00,  4.55batch/s]

0


Epoch 1:   0%|          | 0/346 [00:00<?, ?batch/s]                      

0


Epoch 1:   0%|          | 1/346 [00:00<01:14,  4.62batch/s, batch_nb=0, loss=2.080, v_nb=9]

None
0


Epoch 1:   1%|          | 2/346 [00:00<01:18,  4.38batch/s, batch_nb=1, loss=2.080, v_nb=9]

None
0
None


Epoch 1:   1%|          | 3/346 [00:00<01:15,  4.55batch/s, batch_nb=2, loss=2.080, v_nb=9]

0


Epoch 1:   1%|          | 4/346 [00:00<01:19,  4.30batch/s, batch_nb=3, loss=2.080, v_nb=9]

None
0


Epoch 1:   1%|▏         | 5/346 [00:01<01:20,  4.24batch/s, batch_nb=4, loss=2.080, v_nb=9]

None
0


Epoch 1:   2%|▏         | 6/346 [00:01<01:18,  4.36batch/s, batch_nb=5, loss=2.080, v_nb=9]

None
0


Epoch 1:   2%|▏         | 7/346 [00:01<01:19,  4.28batch/s, batch_nb=6, loss=2.080, v_nb=9]

None
0


Epoch 1:   2%|▏         | 8/346 [00:01<01:20,  4.21batch/s, batch_nb=7, loss=2.080, v_nb=9]

None
0


Epoch 1:   3%|▎         | 9/346 [00:02<01:32,  3.64batch/s, batch_nb=8, loss=2.080, v_nb=9]

None
0


Epoch 1:   3%|▎         | 10/346 [00:02<01:29,  3.74batch/s, batch_nb=9, loss=2.080, v_nb=9]

None
0


Epoch 1:   3%|▎         | 11/346 [00:02<01:26,  3.89batch/s, batch_nb=10, loss=2.080, v_nb=9]

None
0


Epoch 1:   3%|▎         | 12/346 [00:03<01:41,  3.31batch/s, batch_nb=11, loss=2.080, v_nb=9]

None
0


Epoch 1:   4%|▍         | 13/346 [00:03<01:31,  3.63batch/s, batch_nb=12, loss=2.080, v_nb=9]

None
0


Epoch 1:   4%|▍         | 14/346 [00:03<01:29,  3.72batch/s, batch_nb=13, loss=2.080, v_nb=9]

None
0


Epoch 1:   4%|▍         | 15/346 [00:03<01:27,  3.80batch/s, batch_nb=14, loss=2.080, v_nb=9]

None
0


Epoch 1:   5%|▍         | 16/346 [00:04<01:24,  3.89batch/s, batch_nb=15, loss=2.080, v_nb=9]

None
0


Epoch 1:   5%|▍         | 17/346 [00:04<01:29,  3.67batch/s, batch_nb=16, loss=2.080, v_nb=9]

None
0


Epoch 1:   5%|▌         | 18/346 [00:04<01:26,  3.80batch/s, batch_nb=17, loss=2.080, v_nb=9]

None
0


Epoch 1:   5%|▌         | 19/346 [00:04<01:25,  3.83batch/s, batch_nb=18, loss=2.080, v_nb=9]

None
0


Epoch 1:   6%|▌         | 20/346 [00:05<01:24,  3.87batch/s, batch_nb=19, loss=2.080, v_nb=9]

None
0


Epoch 1:   6%|▌         | 21/346 [00:05<01:27,  3.70batch/s, batch_nb=20, loss=2.080, v_nb=9]

None
0


Epoch 1:   6%|▋         | 22/346 [00:05<01:24,  3.85batch/s, batch_nb=21, loss=2.079, v_nb=9]

None
0


Epoch 1:   7%|▋         | 23/346 [00:05<01:19,  4.09batch/s, batch_nb=22, loss=2.079, v_nb=9]

None
0


Epoch 1:   7%|▋         | 24/346 [00:06<01:23,  3.86batch/s, batch_nb=23, loss=2.079, v_nb=9]

None
0


Epoch 1:   7%|▋         | 25/346 [00:06<01:34,  3.39batch/s, batch_nb=24, loss=2.079, v_nb=9]

None
0


Epoch 1:   8%|▊         | 26/346 [00:06<01:28,  3.60batch/s, batch_nb=25, loss=2.079, v_nb=9]

None
0


Epoch 1:   8%|▊         | 27/346 [00:07<01:22,  3.87batch/s, batch_nb=26, loss=2.079, v_nb=9]

None
0


Epoch 1:   8%|▊         | 28/346 [00:07<01:20,  3.97batch/s, batch_nb=27, loss=2.079, v_nb=9]

None
0


Epoch 1:   8%|▊         | 29/346 [00:07<01:16,  4.15batch/s, batch_nb=28, loss=2.079, v_nb=9]

None
0


Epoch 1:   9%|▊         | 30/346 [00:07<01:15,  4.17batch/s, batch_nb=29, loss=2.079, v_nb=9]

None
0


Epoch 1:   9%|▉         | 31/346 [00:08<01:26,  3.62batch/s, batch_nb=30, loss=2.079, v_nb=9]

None
0


Epoch 1:   9%|▉         | 32/346 [00:08<01:30,  3.48batch/s, batch_nb=31, loss=2.079, v_nb=9]

None
0


Epoch 1:  10%|▉         | 33/346 [00:08<01:29,  3.50batch/s, batch_nb=32, loss=2.079, v_nb=9]

None
0


Epoch 1:  10%|▉         | 34/346 [00:09<01:36,  3.22batch/s, batch_nb=33, loss=2.079, v_nb=9]

None
0


Epoch 1:  10%|█         | 35/346 [00:09<01:38,  3.17batch/s, batch_nb=34, loss=2.079, v_nb=9]

None
0


Epoch 1:  10%|█         | 36/346 [00:09<01:45,  2.93batch/s, batch_nb=35, loss=2.079, v_nb=9]

None
0


Epoch 1:  11%|█         | 37/346 [00:10<01:45,  2.93batch/s, batch_nb=36, loss=2.079, v_nb=9]

None
0


Epoch 1:  11%|█         | 38/346 [00:10<01:52,  2.74batch/s, batch_nb=37, loss=2.079, v_nb=9]

None
0


Epoch 1:  11%|█▏        | 39/346 [00:10<01:49,  2.80batch/s, batch_nb=38, loss=2.079, v_nb=9]

None
0


Epoch 1:  12%|█▏        | 40/346 [00:11<01:41,  3.01batch/s, batch_nb=39, loss=2.079, v_nb=9]

None
0


Epoch 1:  12%|█▏        | 41/346 [00:11<01:40,  3.04batch/s, batch_nb=40, loss=2.079, v_nb=9]

None
0


Epoch 1:  12%|█▏        | 42/346 [00:11<01:35,  3.19batch/s, batch_nb=41, loss=2.079, v_nb=9]

None
0


Epoch 1:  12%|█▏        | 43/346 [00:11<01:26,  3.52batch/s, batch_nb=42, loss=2.079, v_nb=9]

None
0


Epoch 1:  13%|█▎        | 44/346 [00:12<01:22,  3.66batch/s, batch_nb=43, loss=2.079, v_nb=9]

None
0
None


Epoch 1:  13%|█▎        | 45/346 [00:12<01:15,  3.99batch/s, batch_nb=44, loss=2.079, v_nb=9]

0


Epoch 1:  13%|█▎        | 46/346 [00:12<01:12,  4.13batch/s, batch_nb=45, loss=2.079, v_nb=9]

None
0


Epoch 1:  14%|█▎        | 47/346 [00:12<01:09,  4.28batch/s, batch_nb=46, loss=2.079, v_nb=9]

None
0


Epoch 1:  14%|█▍        | 48/346 [00:13<01:07,  4.42batch/s, batch_nb=47, loss=2.079, v_nb=9]

None
0
None


Epoch 1:  14%|█▍        | 50/346 [00:13<01:02,  4.76batch/s, batch_nb=49, loss=2.079, v_nb=9]

0
None
0


Epoch 1:  15%|█▍        | 51/346 [00:13<01:05,  4.53batch/s, batch_nb=50, loss=2.079, v_nb=9]

None
0


Epoch 1:  15%|█▌        | 52/346 [00:13<01:08,  4.30batch/s, batch_nb=51, loss=2.079, v_nb=9]

None
0


Epoch 1:  15%|█▌        | 53/346 [00:14<01:15,  3.91batch/s, batch_nb=52, loss=2.079, v_nb=9]

None
0


Epoch 1:  16%|█▌        | 54/346 [00:14<01:12,  4.05batch/s, batch_nb=53, loss=2.079, v_nb=9]

None
0


Epoch 1:  16%|█▌        | 55/346 [00:14<01:14,  3.92batch/s, batch_nb=54, loss=2.079, v_nb=9]

None
0


Epoch 1:  16%|█▌        | 56/346 [00:15<01:20,  3.60batch/s, batch_nb=55, loss=2.079, v_nb=9]

None
0


Epoch 1:  16%|█▋        | 57/346 [00:15<01:20,  3.61batch/s, batch_nb=56, loss=2.079, v_nb=9]

None
0
None


Epoch 1:  17%|█▋        | 59/346 [00:15<01:06,  4.31batch/s, batch_nb=58, loss=2.079, v_nb=9]

0
None
0


Epoch 1:  18%|█▊        | 61/346 [00:16<01:01,  4.65batch/s, batch_nb=60, loss=2.079, v_nb=9]

None
0
None
0


Epoch 1:  18%|█▊        | 62/346 [00:16<00:59,  4.78batch/s, batch_nb=61, loss=2.079, v_nb=9]

None
0


Epoch 1:  18%|█▊        | 63/346 [00:16<00:59,  4.77batch/s, batch_nb=62, loss=2.079, v_nb=9]

None
0


Epoch 1:  18%|█▊        | 64/346 [00:16<00:59,  4.74batch/s, batch_nb=63, loss=2.079, v_nb=9]

None
0


Epoch 1:  19%|█▉        | 65/346 [00:16<00:59,  4.73batch/s, batch_nb=64, loss=2.079, v_nb=9]

None
0


Epoch 1:  19%|█▉        | 66/346 [00:17<01:06,  4.23batch/s, batch_nb=65, loss=2.079, v_nb=9]

None
0


Epoch 1:  19%|█▉        | 67/346 [00:17<01:15,  3.72batch/s, batch_nb=66, loss=2.079, v_nb=9]

None
0


Epoch 1:  20%|█▉        | 68/346 [00:17<01:12,  3.86batch/s, batch_nb=67, loss=2.079, v_nb=9]

None
0


Epoch 1:  20%|█▉        | 69/346 [00:18<01:25,  3.23batch/s, batch_nb=68, loss=2.079, v_nb=9]

None
0


Epoch 1:  20%|██        | 70/346 [00:18<01:34,  2.91batch/s, batch_nb=69, loss=2.079, v_nb=9]

None
0


Epoch 1:  21%|██        | 71/346 [00:18<01:23,  3.30batch/s, batch_nb=70, loss=2.079, v_nb=9]

None
0


Epoch 1:  21%|██        | 72/346 [00:19<01:18,  3.49batch/s, batch_nb=71, loss=2.079, v_nb=9]

None
0


Epoch 1:  21%|██▏       | 74/346 [00:19<01:13,  3.72batch/s, batch_nb=73, loss=2.079, v_nb=9]

None
0
None
0


Epoch 1:  22%|██▏       | 75/346 [00:20<01:20,  3.36batch/s, batch_nb=74, loss=2.079, v_nb=9]

None
0


Epoch 1:  22%|██▏       | 76/346 [00:20<01:15,  3.58batch/s, batch_nb=75, loss=2.079, v_nb=9]

None
0


Epoch 1:  22%|██▏       | 77/346 [00:20<01:24,  3.16batch/s, batch_nb=76, loss=2.078, v_nb=9]

None
0


Epoch 1:  23%|██▎       | 78/346 [00:20<01:23,  3.21batch/s, batch_nb=77, loss=2.078, v_nb=9]

None
0


Epoch 1:  23%|██▎       | 79/346 [00:21<01:15,  3.52batch/s, batch_nb=78, loss=2.078, v_nb=9]

None
0


Epoch 1:  23%|██▎       | 81/346 [00:21<01:03,  4.17batch/s, batch_nb=80, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  24%|██▍       | 83/346 [00:21<00:56,  4.64batch/s, batch_nb=82, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  25%|██▍       | 85/346 [00:22<00:51,  5.05batch/s, batch_nb=84, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  25%|██▍       | 86/346 [00:22<00:50,  5.16batch/s, batch_nb=85, loss=2.078, v_nb=9]

None
0


Epoch 1:  25%|██▌       | 87/346 [00:22<00:52,  4.92batch/s, batch_nb=86, loss=2.078, v_nb=9]

None
0


Epoch 1:  25%|██▌       | 88/346 [00:22<00:53,  4.83batch/s, batch_nb=87, loss=2.078, v_nb=9]

None
0


Epoch 1:  26%|██▌       | 89/346 [00:23<00:54,  4.72batch/s, batch_nb=88, loss=2.078, v_nb=9]

None
0


Epoch 1:  26%|██▋       | 91/346 [00:23<00:51,  4.92batch/s, batch_nb=90, loss=2.078, v_nb=9]

None
0
None


Epoch 1:  27%|██▋       | 92/346 [00:23<00:51,  4.92batch/s, batch_nb=91, loss=2.078, v_nb=9]

0
None
0


Epoch 1:  27%|██▋       | 93/346 [00:24<00:51,  4.93batch/s, batch_nb=92, loss=2.078, v_nb=9]

None
0
None


Epoch 1:  27%|██▋       | 95/346 [00:24<00:49,  5.06batch/s, batch_nb=94, loss=2.078, v_nb=9]

0
None
0


Epoch 1:  28%|██▊       | 96/346 [00:24<00:49,  5.09batch/s, batch_nb=95, loss=2.078, v_nb=9]

None
0


Epoch 1:  28%|██▊       | 98/346 [00:24<00:48,  5.10batch/s, batch_nb=97, loss=2.078, v_nb=9]

None
0
None
0
None


Epoch 1:  29%|██▉       | 100/346 [00:25<00:49,  5.01batch/s, batch_nb=99, loss=2.078, v_nb=9]

0
None
0


Epoch 1:  29%|██▉       | 101/346 [00:25<00:50,  4.85batch/s, batch_nb=100, loss=2.078, v_nb=9]

None
0


Epoch 1:  29%|██▉       | 102/346 [00:25<00:50,  4.83batch/s, batch_nb=101, loss=2.078, v_nb=9]

None
0


Epoch 1:  30%|██▉       | 103/346 [00:26<00:51,  4.69batch/s, batch_nb=102, loss=2.078, v_nb=9]

None
0


Epoch 1:  30%|███       | 104/346 [00:26<00:54,  4.40batch/s, batch_nb=103, loss=2.078, v_nb=9]

None
0


Epoch 1:  30%|███       | 105/346 [00:26<00:53,  4.49batch/s, batch_nb=104, loss=2.078, v_nb=9]

None
0


Epoch 1:  31%|███       | 107/346 [00:26<00:49,  4.80batch/s, batch_nb=106, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  32%|███▏      | 109/346 [00:27<00:47,  4.99batch/s, batch_nb=108, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  32%|███▏      | 111/346 [00:27<00:45,  5.20batch/s, batch_nb=110, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  33%|███▎      | 113/346 [00:28<00:46,  5.03batch/s, batch_nb=112, loss=2.078, v_nb=9]

None
0
None
0


Epoch 1:  33%|███▎      | 114/346 [00:28<00:46,  4.99batch/s, batch_nb=113, loss=2.078, v_nb=9]

None
0


Epoch 1:  33%|███▎      | 115/346 [00:28<00:47,  4.86batch/s, batch_nb=114, loss=2.077, v_nb=9]

None
0


Epoch 1:  34%|███▎      | 116/346 [00:28<00:49,  4.62batch/s, batch_nb=115, loss=2.077, v_nb=9]

None
0


Epoch 1:  34%|███▍      | 117/346 [00:28<00:50,  4.56batch/s, batch_nb=116, loss=2.077, v_nb=9]

None
0


Epoch 1:  34%|███▍      | 118/346 [00:29<00:50,  4.55batch/s, batch_nb=117, loss=2.077, v_nb=9]

None
0


Epoch 1:  34%|███▍      | 119/346 [00:29<00:49,  4.55batch/s, batch_nb=118, loss=2.077, v_nb=9]

None
0


Epoch 1:  35%|███▍      | 120/346 [00:29<00:49,  4.58batch/s, batch_nb=119, loss=2.077, v_nb=9]

None
0


Epoch 1:  35%|███▍      | 121/346 [00:29<00:48,  4.68batch/s, batch_nb=120, loss=2.077, v_nb=9]

None
0


Epoch 1:  35%|███▌      | 122/346 [00:30<00:48,  4.63batch/s, batch_nb=121, loss=2.077, v_nb=9]

None
0


Epoch 1:  36%|███▌      | 123/346 [00:30<00:51,  4.37batch/s, batch_nb=122, loss=2.077, v_nb=9]

None
0
None


Epoch 1:  36%|███▌      | 125/346 [00:30<00:46,  4.77batch/s, batch_nb=124, loss=2.077, v_nb=9]

0
None
0


Epoch 1:  37%|███▋      | 127/346 [00:31<00:44,  4.98batch/s, batch_nb=126, loss=2.077, v_nb=9]

None
0
None
0


Epoch 1:  37%|███▋      | 128/346 [00:31<00:44,  4.94batch/s, batch_nb=127, loss=2.077, v_nb=9]

None
0


Epoch 1:  37%|███▋      | 129/346 [00:31<00:45,  4.78batch/s, batch_nb=128, loss=2.077, v_nb=9]

None
0


Epoch 1:  38%|███▊      | 130/346 [00:31<00:44,  4.81batch/s, batch_nb=129, loss=2.077, v_nb=9]

None
0


KeyboardInterrupt: 

We also create a python file for automatic hyperparameter optimization across different GPUs or CPUs:

In [None]:
%%writefile main.py

from emotion_transformer.lightning import EmotionModel, get_args, main

if __name__ == '__main__':
    # build the parser and parse the command line in one go
    hparams = get_args(EmotionModel).parse_args()

    if hparams.mode == 'hparams_search':
        # one trial per entry of the space-separated --gpus argument, otherwise cpu workers
        if hparams.gpus:
            hparams.optimize_parallel_gpu(main, max_nb_trials=20,
                                          gpu_ids = hparams.gpus.split(' '))
        else:
            hparams.optimize_parallel_cpu(main, nb_trials=20, nb_workers=4)
    elif hparams.mode in ('test', 'default'):
        # single run: training, followed by testing if mode is 'test'
        main(hparams)

Overwriting main.py


## Background Information

For the interested reader we provide some background information on the (distributed) training loop:

* one epoch consists of m = ceil(30160/batchsize) batches for training and an additional n = ceil(2755/batchsize) batches for validation.

**dp case:** 

* the batchsize will be split and each gpu receives (up to rounding) a batch of size batchsize/num_gpus

* in the validation steps each gpu computes its own scores for each of the n batches (of size batchsize/num_gpus), i.e. each gpu calls the `validation_step` method

* the `output` which is passed to the `validation_end` method consists of a list of dictionaries (containing the concatenated scores from the different gpus), i.e.

`output = [ {first_metric: [first_gpu_batch_1,...,last_gpu_batch_1],...,
             last_metric:  [first_gpu_batch_1,...,last_gpu_batch_1]},..., 
            {first_metric: [first_gpu_batch_n,...,last_gpu_batch_n],...,
             last_metric:  [first_gpu_batch_n,...,last_gpu_batch_n]} ]`


**ddp case:** (does not work from jupyter notebooks)

* the gpus receive (disjoint) samples of size batchsize and train in their own processes, but communicate and average their gradients (thus the resulting models on each gpu have the same weights)

* each gpu computes its own validation_end method and its own list of dictionaries 

`output_first_gpu = [ {first_metric: batch_1,...,last_metric: batch_1},..., 
                      {first_metric: batch_n,...,last_metric: batch_n} ]`
                      
`output_last_gpu = [ {first_metric: batch_1,...,last_metric: batch_1},..., 
                      {first_metric: batch_n,...,last_metric: batch_n} ]`


**ddp2 case:** (does not work from jupyter notebooks)

* on each node we have the dp case, but the nodes communicate analogously to the ddp case

In [None]:
#hide
# nbdev export step: converts the project's notebooks (see the "Converted ..." output below)
from nbdev.export import notebook2script
notebook2script()

Converted 00_dataloader.ipynb.
Converted 01_model.ipynb.
Converted 02_lightning.ipynb.
Converted index.ipynb.
