[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/juansensio/blog/blob/master/072_pytorch_ngc/072_pytorch_ngc.ipynb)

# Pytorch Lightning - Optimización

Por suerte, todo lo que hemos visto hasta ahora está implementado en `Pytorch Lightning`, por lo que no tenemos que comernos mucho la cabeza.

```
docker-compose build
```

In [1]:
import os
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import torch
from skimage import io 
from torch.utils.data import DataLoader

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that loads one image from disk per item.

    Args:
        images: sequence of image file paths, read with ``skimage.io.imread``.
        labels: sequence of integer class indices, aligned with ``images``.

    Each item is an ``(img, label)`` pair: ``img`` is a float tensor with
    the channel axis first and values in ``[0, 1]``; ``label`` is a scalar
    ``torch.long`` tensor.
    """

    def __init__(self, images, labels):
        self.images, self.labels = images, labels

    def __len__(self):
        # One sample per image path.
        return len(self.images)

    def __getitem__(self, ix):
        # Keep channels 3, 2 and 1 (in that order) of the last axis.
        # NOTE(review): presumably the R, G, B bands of a multispectral
        # image -- confirm against the dataset description.
        bands = io.imread(self.images[ix])[..., (3, 2, 1)]
        # Scale by the fixed factor 4000 and clamp whatever exceeds 1.
        scaled = torch.tensor(bands / 4000, dtype=torch.float).clip(0, 1)
        # (H, W, C) -> (C, H, W)
        img = scaled.permute(2, 0, 1)
        label = torch.tensor(self.labels[ix], dtype=torch.long)
        return img, label
    
class DataModule(pl.LightningDataModule):
    """Lightning data module for an image-folder classification dataset.

    The dataset is expected at ``path`` with one sub-directory per class;
    every file inside a class directory is treated as an image of that class.

    Args:
        path: root directory of the dataset.
        batch_size: batch size used by both dataloaders.
        num_workers: number of worker processes used by both dataloaders.
        test_size: forwarded to ``train_test_split`` as the validation share.
        random_state: seed forwarded to ``train_test_split``.
    """

    def __init__(self, path='./data', batch_size=1024, num_workers=20, test_size=0.2, random_state=42):
        super().__init__()
        self.path = path
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.test_size = test_size
        self.random_state = random_state

    def setup(self, stage=None):
        """Scan ``path``, build the stratified split and create both datasets.

        Sets ``self.classes`` (sorted class names), ``self.train_ds`` and
        ``self.val_ds``.
        """
        # Only directories are classes: a stray file in the data root
        # (e.g. .DS_Store) would otherwise crash the nested listdir below.
        self.classes = sorted(
            name for name in os.listdir(self.path)
            if os.path.isdir(os.path.join(self.path, name))
        )

        print("Generating images and labels ...")
        images, encoded = [], []
        for ix, label in enumerate(self.classes):
            # os.listdir returns entries in arbitrary order; sorting makes the
            # sample order -- and therefore the seeded split below -- the same
            # on every machine.
            _images = sorted(os.listdir(f'{self.path}/{label}'))
            images += [f'{self.path}/{label}/{img}' for img in _images]
            encoded += [ix] * len(_images)
        print(f'Number of images: {len(images)}')

        # train / val split
        print("Generating train / val splits ...")
        train_images, val_images, train_labels, val_labels = train_test_split(
            images,
            encoded,
            stratify=encoded,  # keep class proportions equal in both splits
            test_size=self.test_size,
            random_state=self.random_state
        )

        print("Training samples: ", len(train_labels))
        print("Validation samples: ", len(val_labels))

        self.train_ds = Dataset(train_images, train_labels)
        self.val_ds = Dataset(val_images, val_labels)

    def _make_loader(self, dataset, shuffle):
        # Shared DataLoader configuration; only the dataset and shuffling differ.
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
            pin_memory=True
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return self._make_loader(self.train_ds, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (fixed order)."""
        return self._make_loader(self.val_ds, shuffle=False)

  rank_zero_deprecation(


In [2]:
# Sanity check: build the data module with its defaults and inspect the
# shapes of a single training batch.
dm = DataModule()
dm.setup()

first_batch = next(iter(dm.train_dataloader()))
imgs, labels = first_batch
imgs.shape, labels.shape

Generating images and labels ...
Number of images: 27000
Generating train / val splits ...
Training samples:  21600
Validation samples:  5400


(torch.Size([1024, 3, 64, 64]), torch.Size([1024]))

In [7]:
import torch.nn.functional as F
import torchvision

class Model(pl.LightningModule):
    """ResNet-50 image classifier wrapped as a LightningModule.

    Args:
        n_outputs: number of classes produced by the replaced final layer.
    """

    def __init__(self, n_outputs=10):
        super().__init__()
        self.model = torchvision.models.resnet50(pretrained=True, progress=None)
        # Size the new head from the layer it replaces rather than
        # hard-coding the backbone's feature width.
        self.model.fc = torch.nn.Linear(self.model.fc.in_features, n_outputs)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Log loss and accuracy for the batch; return the loss to optimise."""
        loss, acc = self.shared_step(batch)
        self.log('loss', loss)
        self.log('acc', acc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for the batch."""
        loss, acc = self.shared_step(batch)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def shared_step(self, batch):
        """Compute cross-entropy loss and accuracy for an ``(x, y)`` batch.

        Returns:
            ``(loss, acc)``, both as tensors.
        """
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        # Keep the accuracy as a tensor: calling .item() here would block on
        # a device-to-host copy at every step. self.log accepts tensors.
        acc = (torch.argmax(y_hat, dim=1) == y).float().mean()
        return loss, acc

    def configure_optimizers(self):
        """Adam over all parameters, with PyTorch's default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

In [9]:
# Baseline run: fresh model and data module, one GPU, 16-bit precision,
# three epochs.
model, dm = Model(), DataModule()
trainer = pl.Trainer(
    gpus=1,
    precision=16,
    max_epochs=3,
)
trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 23.5 M
---------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)


Generating images and labels ...
Number of images: 27000
Generating train / val splits ...
Training samples:  21600
Validation samples:  5400
Epoch 0:  79%|███████▊  | 22/28 [00:04<00:01,  4.46it/s, loss=0.266, v_num=1, val_loss=2.340, val_acc=0.0918, acc=0.963]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/6 [00:00<?, ?it/s][A
Epoch 0:  86%|████████▌ | 24/28 [00:06<00:01,  3.73it/s, loss=0.266, v_num=1, val_loss=2.340, val_acc=0.0918, acc=0.963]
Epoch 0:  93%|█████████▎| 26/28 [00:06<00:00,  3.97it/s, loss=0.266, v_num=1, val_loss=2.340, val_acc=0.0918, acc=0.963]
Epoch 0: 100%|██████████| 28/28 [00:06<00:00,  4.14it/s, loss=0.266, v_num=1, val_loss=1.290, val_acc=0.742, acc=0.938] 
Epoch 1:  79%|███████▊  | 22/28 [00:04<00:01,  4.44it/s, loss=0.0984, v_num=1, val_loss=1.290, val_acc=0.742, acc=0.968]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/6 [00:00<?, ?it/s][A
Epoch 1:  86%|████████▌ | 24/28 [00:06<00:01,  3.71it/s, loss=0.0984, v_num=1, 

In [11]:
# Same experiment on two GPUs with Lightning's 'dp' accelerator; the batch
# size is doubled to 2048.
model, dm = Model(), DataModule(batch_size=2048)
trainer = pl.Trainer(
    gpus=2,
    accelerator='dp',
    precision=16,
    max_epochs=3,
)
trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 23.5 M
---------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)


Generating images and labels ...
Number of images: 27000
Generating train / val splits ...
Training samples:  21600
Validation samples:  5400
Epoch 0:  79%|███████▊  | 11/14 [00:04<00:01,  2.45it/s, loss=0.636, v_num=3, val_loss=2.380, val_acc=0.146, acc=0.943]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 0:  93%|█████████▎| 13/14 [00:06<00:00,  1.89it/s, loss=0.636, v_num=3, val_loss=2.380, val_acc=0.146, acc=0.943]
Epoch 0: 100%|██████████| 14/14 [00:07<00:00,  1.97it/s, loss=0.636, v_num=3, val_loss=42.20, val_acc=0.202, acc=0.930]
Epoch 1:  86%|████████▌ | 12/14 [00:04<00:00,  2.73it/s, loss=0.202, v_num=3, val_loss=42.20, val_acc=0.202, acc=0.975]
Validating: 0it [00:00, ?it/s][A
Epoch 1:  11%|█         | 3/28 [00:21<02:55,  7.02s/it, loss=0.193, v_num=2, val_loss=4.240, val_acc=0.568, acc=0.969]
Epoch 1:  11%|█         | 3/28 [00:21<02:55,  7.04s/it, loss=0.193, v_num=2, val_loss=4.240, val_acc=0.568, acc=0.969]

Epoch 1: 100%|████

## Profiling

In [15]:
# One epoch with Lightning's built-in 'simple' profiler.
# `model` and `dm` are the objects created in the previous training cell.
trainer = pl.Trainer(
    gpus=1,
    precision=16,
    max_epochs=1,
    profiler='simple',
)
trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 23.5 M
---------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)


Epoch 0:  79%|███████▊  | 11/14 [00:05<00:01,  2.07it/s, loss=0.374, v_num=5, val_loss=0.436, val_acc=0.904, acc=0.958]
Validating: 0it [00:00, ?it/s][A
Epoch 2:  86%|████████▌ | 12/14 [00:22<00:03,  1.89s/it, loss=0.072, v_num=4, val_loss=0.428, val_acc=0.904, acc=0.986]
Epoch 0:  93%|█████████▎| 13/14 [00:07<00:00,  1.68it/s, loss=0.374, v_num=5, val_loss=0.436, val_acc=0.904, acc=0.958]
Epoch 0: 100%|██████████| 14/14 [00:08<00:00,  1.75it/s, loss=0.374, v_num=5, val_loss=18.20, val_acc=0.307, acc=0.959]
Epoch 0: 100%|██████████| 14/14 [00:08<00:00,  1.65it/s, loss=0.374, v_num=5, val_loss=18.20, val_acc=0.307, acc=0.959]

FIT Profiler Report

Action                             	|  Mean duration (s)	|Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------------------
Total                              	|  -              	|_              	|  11.197         	|  100 %          	|
--------------------------------------------------------------------------------------------------------------------------------------
run_training_epoch                 	|  8.4694         	|1              	|  8.4694         	|  75.639         	|
run_training_batch                 	|  0.24904        	|11             	|  2.7394         	|  24.465         	|
optimizer_step_and_closure_0       	|  0.24848        	|11             	|  2.7333         	|  24.411         	|
get_train_batch                    	|  0.23089        	|11             	|  2.5398         	|  22.682         	|
training_step_and_backward         




```
docker run --gpus all --ipc=host --rm -v $PWD/073_pytorch_lightning_optim:/workspace -v $PWD/073_pytorch_lightning_optim/data:/workspace/data  pytorch-ngc python train.py
```

In [16]:
# One epoch with Lightning's 'pytorch' profiler.
# `model` and `dm` are reused from the earlier training cell.
trainer = pl.Trainer(
    gpus=1,
    precision=16,
    max_epochs=1,
    profiler='pytorch',
)
trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 23.5 M
---------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)


Epoch 2:  86%|████████▌ | 12/14 [05:24<00:54, 27.01s/it, loss=0.072, v_num=4, val_loss=0.428, val_acc=0.904, acc=0.986]

Epoch 0:  79%|███████▊  | 11/14 [00:08<00:02,  1.28it/s, loss=0.241, v_num=7, val_loss=18.00, val_acc=0.305, acc=0.956]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 0:  93%|█████████▎| 13/14 [00:11<00:00,  1.15it/s, loss=0.241, v_num=7, val_loss=18.00, val_acc=0.305, acc=0.956]
Epoch 0: 100%|██████████| 14/14 [00:11<00:00,  1.20it/s, loss=0.241, v_num=7, val_loss=0.798, val_acc=0.816, acc=0.965]
Epoch 0: 100%|██████████| 14/14 [00:12<00:00,  1.15it/s, loss=0.241, v_num=7, val_loss=0.798, val_acc=0.816, acc=0.965]


FIT Profiler Report
Profile stats for: records
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*        60.05%     407.160ms        99.99%     677.973ms     338.986ms      22.919ms        25.80%      88.819ms      44.410ms             2  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...        23.49%     159.271ms        23.49%     159.275ms      79.638ms       0.000us         0.00%       

In [24]:
from torch.profiler import profile
from pytorch_lightning.callbacks import Callback

class ProfilerCallback(Callback):
    """Lightning callback that advances a profiler after every training batch.

    Args:
        prof: object exposing a ``step()`` method; the cell below passes the
            context object returned by ``torch.profiler.profile``.
    """

    def __init__(self, prof):
        super().__init__()
        self.prof = prof

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0):
        # `dataloader_idx` defaults to 0 so the hook works both when Lightning
        # passes it and on releases that call the hook without it. The other
        # arguments are unused: the callback only signals "one step done".
        self.prof.step()


# Profiler schedule: each cycle is 1 wait step + 1 warm-up step + 3 recorded
# steps, repeated for 2 cycles. Training is capped at exactly that many
# batches so the run ends when the schedule does.
wait_steps, warmup_steps, active_steps, cycles = 1, 1, 3, 2
profiled_batches = (wait_steps + warmup_steps + active_steps) * cycles

with torch.profiler.profile(
        schedule=torch.profiler.schedule(
            wait=wait_steps,
            warmup=warmup_steps,
            active=active_steps,
            repeat=cycles,
        ),
        # Traces are written under ./log/lightning_profile2 for TensorBoard.
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/lightning_profile2'),
        record_shapes=True,
        with_stack=True
) as prof:
    step_callback = ProfilerCallback(prof)  # calls prof.step() once per batch
    trainer = pl.Trainer(
        gpus=1,
        precision=16,
        max_epochs=1,
        limit_train_batches=profiled_batches,
        limit_val_batches=0,  # validation is skipped; only training is profiled
        callbacks=[step_callback]
    )
    trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 23.5 M
---------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 10/10 [00:13<00:00,  1.32s/it, loss=0.0997, v_num=13, acc=0.977]
