In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
import torchvision
from torchvision import transforms
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from libs.code import *
from pytorch_lightning.loggers import TensorBoardLogger
# from libs.VAE import *

In [2]:

def get_train_images(num):
    return torch.stack([dataset_train[i][0] for i in range(num)], dim=0)

class Encoder(nn.Module):
    
    def __init__(self, 
                num_input_channels : int, 
                base_channel_size : int, 
                latent_dim : int, 
                act_fn : object = nn.GELU):
        """
        Inputs: 
            - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the encoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.net = nn.Sequential(
            nn.Conv2d(num_input_channels, c_hid, kernel_size=3, padding=1, stride=2), # 32x32 => 16x16
            act_fn(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.Conv2d(c_hid, 2*c_hid, kernel_size=3, padding=1, stride=2), # 16x16 => 8x8
            act_fn(),
            nn.Conv2d(2*c_hid, 2*c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.Conv2d(2*c_hid, 2*c_hid, kernel_size=3, padding=1, stride=2), # 8x8 => 4x4
            act_fn(),
            nn.Flatten(), # Image grid to single feature vector
            nn.Linear(2*16*c_hid, latent_dim)
        )
    
    def forward(self, x):
        return self.net(x)

class Decoder(nn.Module):
    
    def __init__(self, 
                num_input_channels : int, 
                base_channel_size : int, 
                latent_dim : int, 
                act_fn : object = nn.GELU):
        """
        Inputs: 
            - num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the decoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.linear = nn.Sequential(
            nn.Linear(latent_dim, 2*16*c_hid),
            act_fn()
        )
        self.net = nn.Sequential(
            nn.ConvTranspose2d(2*c_hid, 2*c_hid, kernel_size=3, output_padding=1, padding=1, stride=2), # 4x4 => 8x8
            act_fn(),
            nn.Conv2d(2*c_hid, 2*c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.ConvTranspose2d(2*c_hid, c_hid, kernel_size=3, output_padding=1, padding=1, stride=2), # 8x8 => 16x16
            act_fn(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.ConvTranspose2d(c_hid, num_input_channels, kernel_size=3, output_padding=1, padding=1, stride=2), # 16x16 => 32x32
            nn.Tanh() # The input images is scaled between -1 and 1, hence the output has to be bounded as well
        )
    
    def forward(self, x):
        x = self.linear(x)
        x = x.reshape(x.shape[0], -1, 4, 4)
        x = self.net(x)
        return x

class Autoencoder(pl.LightningModule):
    
    def __init__(self, 
                base_channel_size: int, 
                latent_dim: int, 
                encoder_class : object = Encoder,
                decoder_class : object = Decoder,
                num_input_channels: int = 3, 
                width: int = 32, 
                height: int = 32):
        super().__init__()
        # Saving hyperparameters of autoencoder
        self.save_hyperparameters() 
        # Creating encoder and decoder
        self.encoder = encoder_class(num_input_channels, base_channel_size, latent_dim)
        self.decoder = decoder_class(num_input_channels, base_channel_size, latent_dim)
        # Example input array needed for visualizing the graph of the network
        self.example_input_array = torch.zeros(2, num_input_channels, width, height)
        
    def forward(self, x):
        """
        The forward function takes in an image and returns the reconstructed image
        """
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat
    
    def _get_reconstruction_loss(self, batch):
        """
        Given a batch of images, this function returns the reconstruction loss (MSE in our case)
        """
        x, _ = batch # We do not need the labels
        x_hat = self.forward(x)
        loss = F.mse_loss(x, x_hat, reduction="none")
        loss = loss.sum(dim=[1,2,3]).mean(dim=[0])
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        # Using a scheduler is optional but can be helpful.
        # The scheduler reduces the LR if the validation performance hasn't improved for the last N epochs
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                        mode='min', 
                                                        factor=0.2, 
                                                        patience=20, 
                                                        min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
    
    def training_step(self, batch, batch_idx):
        loss = self._get_reconstruction_loss(batch)                             
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self._get_reconstruction_loss(batch)
        self.log('val_loss', loss)
    
    def test_step(self, batch, batch_idx):
        loss = self._get_reconstruction_loss(batch)
        self.log('test_loss', loss)

class GenerateCallback(pl.Callback):
    
    def __init__(self, input_imgs, every_n_epochs=1):
        super().__init__()
        self.input_imgs = input_imgs # Images to reconstruct during training
        self.every_n_epochs = every_n_epochs # Only save those images every N epochs (otherwise tensorboard gets quite large)
        
    def on_epoch_end(self, trainer, pl_module):
        if trainer.current_epoch % self.every_n_epochs == 0:
            # Reconstruct images
            input_imgs = self.input_imgs.to(pl_module.device)
            with torch.no_grad():
                pl_module.eval()
                reconst_imgs = pl_module(input_imgs)
                pl_module.train()
            # Plot and add to tensorboard
            imgs = torch.stack([input_imgs, reconst_imgs], dim=1).flatten(0,1)
            grid = torchvision.utils.make_grid(imgs, nrow=2, normalize=True, range=(-1,1))
            trainer.logger.experiment.add_image("Reconstructions", grid, global_step=trainer.global_step)

In [3]:
# Setting the seed
pl.seed_everything(42)
# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

PATH_DST = 'dataset/all_labels.csv'
PATH_GDRIVE = ''
NUM_WORKERS = 4
BATCH_SIZE = 32
NUM_EPOCHS = 5
GPUS = 0

Global seed set to 42


Device: cpu


In [4]:
transform = transforms.Compose([
                                transforms.Resize((32,32)),
                                transforms.ToTensor(),
                                # torch.flatten # trasforma il tensore ad una dimensione
                                ])

dataset = TrashbinDataset(csv=PATH_DST, transform=transform)

# TODO: fixa la funzione!
dataset_train, dataset_test = split_into_train_and_test(dataset)
_, dataset_val = split_into_train_and_test(dataset)

train_loader = data.DataLoader(dataset_train, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
val_loader = data.DataLoader(dataset_val, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)
test_loader = data.DataLoader(dataset_test, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)

In [5]:
def train(latent_dim, checkpoint_path, name):
    # Create a PyTorch Lightning trainer with the generation callback

    my_logger = TensorBoardLogger(save_dir=f"{checkpoint_path}_logger", name= f"{name}_{latent_dim}")

    trainer = pl.Trainer(default_root_dir=os.path.join(f"{checkpoint_path}_trainer", f"{name}_{latent_dim}"), 
                        gpus=1 if str(device).startswith("cuda") else 0, 
                        max_epochs=10, 
                        callbacks=[ModelCheckpoint(save_weights_only=True),
                                    GenerateCallback(get_train_images(8), every_n_epochs=5),
                                    LearningRateMonitor("epoch")]
                        ,logger=my_logger)
                        
    # trainer.logger._log_graph = True         # If True, we plot the computation graph in tensorboard
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need
    
    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(checkpoint_path, f"{name}_{latent_dim}.ckpt")
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...*****")
        model = Autoencoder.load_from_checkpoint(pretrained_filename)
    else:
        model = Autoencoder(base_channel_size=32, latent_dim=latent_dim)
        trainer.fit(model, train_loader, val_loader)
    
    # Test best model on validation and test set
    val_result = trainer.test(model, test_dataloaders=val_loader, verbose=False)
    test_result = trainer.test(model, test_dataloaders=test_loader, verbose=False)
    result = {"test": test_result, "val": val_result}
    return model, result

In [6]:
model_dict = {}
for latent_dim in [64, 128, 256, 384]:
    model_ld, result_ld = train(latent_dim, f"VAE_{latent_dim}", f"VAE_{latent_dim}")
    model_dict[latent_dim] = {"model": model_ld, "result": result_ld}

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: VAE_64_logger/VAE_64_64

  | Name    | Type    | Params | In sizes       | Out sizes     
----------------------------------------------------------------------
0 | encoder | Encoder | 168 K  | [2, 3, 32, 32] | [2, 64]       
1 | decoder | Decoder | 168 K  | [2, 64]        | [2, 3, 32, 32]
----------------------------------------------------------------------
337 K     Trainable params
0         Non-trainable params
337 K     Total params
1.348     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]



                                                                      

Global seed set to 42


Epoch 0:   0%|          | 0/413 [00:00<?, ?it/s] 



Epoch 0:  80%|███████▉  | 330/413 [01:22<00:20,  3.98it/s, loss=51.2, v_num=0]



Epoch 1:   0%|          | 0/413 [00:00<?, ?it/s, loss=51.2, v_num=0]          



Epoch 1:  80%|███████▉  | 330/413 [01:17<00:19,  4.27it/s, loss=39.1, v_num=0]



Epoch 2:   0%|          | 0/413 [00:00<?, ?it/s, loss=39.1, v_num=0]          



Epoch 2:  80%|███████▉  | 330/413 [01:18<00:19,  4.22it/s, loss=31.8, v_num=0]



Epoch 3:   0%|          | 0/413 [00:00<?, ?it/s, loss=31.8, v_num=0]          



Epoch 3:  80%|███████▉  | 330/413 [01:18<00:19,  4.21it/s, loss=26.7, v_num=0]



Epoch 4:   0%|          | 0/413 [00:00<?, ?it/s, loss=26.7, v_num=0]          



Epoch 4:  80%|███████▉  | 330/413 [01:17<00:19,  4.27it/s, loss=24.4, v_num=0]



Epoch 5:   0%|          | 0/413 [00:00<?, ?it/s, loss=24.4, v_num=0]          



Epoch 5:  80%|███████▉  | 330/413 [01:19<00:20,  4.15it/s, loss=22.3, v_num=0]



Epoch 6:   0%|          | 0/413 [00:00<?, ?it/s, loss=22.3, v_num=0]          



Epoch 6:  80%|███████▉  | 330/413 [01:18<00:19,  4.19it/s, loss=20.5, v_num=0]



Epoch 7:   0%|          | 0/413 [00:00<?, ?it/s, loss=20.5, v_num=0]          



Epoch 7:  80%|███████▉  | 330/413 [01:18<00:19,  4.22it/s, loss=18.9, v_num=0]



Epoch 8:   0%|          | 0/413 [00:00<?, ?it/s, loss=18.9, v_num=0]          



Epoch 8:  80%|███████▉  | 330/413 [01:18<00:19,  4.21it/s, loss=17.7, v_num=0]



Epoch 9:   0%|          | 0/413 [00:00<?, ?it/s, loss=17.7, v_num=0]          



Epoch 9:  80%|███████▉  | 330/413 [01:17<00:19,  4.24it/s, loss=17.9, v_num=0]



Epoch 9: 100%|██████████| 413/413 [01:36<00:00,  4.26it/s, loss=17.9, v_num=0]

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."



Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:20<00:00,  4.06it/s]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:19<00:00,  4.32it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: VAE_128_logger/VAE_128_128

  | Name    | Type    | Params | In sizes       | Out sizes     
----------------------------------------------------------------------
0 | encoder | Encoder | 233 K  | [2, 3, 32, 32] | [2, 128]      
1 | decoder | Decoder | 234 K  | [2, 128]       | [2, 3, 32, 32]
----------------------------------------------------------------------
468 K     Trainable params
0         Non-trainable params
468 K     Total params
1.873     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]



                                                                      

Global seed set to 42


Epoch 0:   0%|          | 0/413 [00:00<?, ?it/s] 



Epoch 0:  80%|███████▉  | 330/413 [01:18<00:19,  4.23it/s, loss=64.4, v_num=0]   



Epoch 1:   0%|          | 0/413 [00:00<?, ?it/s, loss=64.4, v_num=0]          



Epoch 1:  80%|███████▉  | 330/413 [01:16<00:19,  4.30it/s, loss=43.4, v_num=0]



Epoch 2:   0%|          | 0/413 [00:00<?, ?it/s, loss=43.4, v_num=0]          



Epoch 2:  80%|███████▉  | 330/413 [01:17<00:19,  4.24it/s, loss=32.8, v_num=0]



Epoch 3:   0%|          | 0/413 [00:00<?, ?it/s, loss=32.8, v_num=0]          



Epoch 3:  80%|███████▉  | 330/413 [01:23<00:20,  3.96it/s, loss=27.2, v_num=0]



Epoch 4:   0%|          | 0/413 [00:00<?, ?it/s, loss=27.2, v_num=0]          



Epoch 4:  80%|███████▉  | 330/413 [01:22<00:20,  4.02it/s, loss=24.2, v_num=0]



Epoch 5:   0%|          | 0/413 [00:00<?, ?it/s, loss=24.2, v_num=0]          



Epoch 5:  80%|███████▉  | 330/413 [01:20<00:20,  4.12it/s, loss=22.4, v_num=0]



Epoch 6:   0%|          | 0/413 [00:00<?, ?it/s, loss=22.4, v_num=0]          



Epoch 6:  80%|███████▉  | 330/413 [01:27<00:21,  3.78it/s, loss=19.4, v_num=0]



Epoch 7:   0%|          | 0/413 [00:00<?, ?it/s, loss=19.4, v_num=0]          



Epoch 7:  80%|███████▉  | 330/413 [01:27<00:21,  3.79it/s, loss=17.6, v_num=0]



Epoch 8:   0%|          | 0/413 [00:00<?, ?it/s, loss=17.6, v_num=0]          



Epoch 8:  80%|███████▉  | 330/413 [01:27<00:21,  3.79it/s, loss=16.3, v_num=0]



Epoch 9:   0%|          | 0/413 [00:00<?, ?it/s, loss=16.3, v_num=0]          



Epoch 9:  80%|███████▉  | 330/413 [01:27<00:22,  3.75it/s, loss=16.1, v_num=0]



Epoch 9: 100%|██████████| 413/413 [01:49<00:00,  3.78it/s, loss=16.1, v_num=0]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:21<00:00,  3.78it/s]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:20<00:00,  4.06it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: VAE_256_logger/VAE_256_256

  | Name    | Type    | Params | In sizes       | Out sizes     
----------------------------------------------------------------------
0 | encoder | Encoder | 364 K  | [2, 3, 32, 32] | [2, 256]      
1 | decoder | Decoder | 365 K  | [2, 256]       | [2, 3, 32, 32]
----------------------------------------------------------------------
730 K     Trainable params
0         Non-trainable params
730 K     Total params
2.922     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]



                                                                      

Global seed set to 42


Epoch 0:   0%|          | 0/413 [00:00<?, ?it/s] 



Epoch 0:  80%|███████▉  | 330/413 [01:22<00:20,  3.99it/s, loss=55.5, v_num=0]



Epoch 1:   0%|          | 0/413 [00:00<?, ?it/s, loss=55.5, v_num=0]          



Epoch 1:  80%|███████▉  | 330/413 [01:20<00:20,  4.11it/s, loss=38.3, v_num=0]



Epoch 2:   0%|          | 0/413 [00:00<?, ?it/s, loss=38.3, v_num=0]          



Epoch 2:  80%|███████▉  | 330/413 [01:25<00:21,  3.85it/s, loss=29.7, v_num=0]



Epoch 3:   0%|          | 0/413 [00:00<?, ?it/s, loss=29.7, v_num=0]          



Epoch 3:  80%|███████▉  | 330/413 [01:24<00:21,  3.89it/s, loss=23.3, v_num=0]



Epoch 4:   0%|          | 0/413 [00:00<?, ?it/s, loss=23.3, v_num=0]          



Epoch 4:  80%|███████▉  | 330/413 [01:23<00:20,  3.96it/s, loss=21.2, v_num=0]



Epoch 5:   0%|          | 0/413 [00:00<?, ?it/s, loss=21.2, v_num=0]          



Epoch 5:  80%|███████▉  | 330/413 [01:24<00:21,  3.92it/s, loss=18.8, v_num=0]



Epoch 6:   0%|          | 0/413 [00:00<?, ?it/s, loss=18.8, v_num=0]          



Epoch 6:  80%|███████▉  | 330/413 [01:23<00:20,  3.96it/s, loss=17.1, v_num=0]



Epoch 7:   0%|          | 0/413 [00:00<?, ?it/s, loss=17.1, v_num=0]          



Epoch 7:  80%|███████▉  | 330/413 [01:23<00:20,  3.95it/s, loss=15.7, v_num=0]



Epoch 8:   0%|          | 0/413 [00:00<?, ?it/s, loss=15.7, v_num=0]          



Epoch 8:  80%|███████▉  | 330/413 [01:21<00:20,  4.05it/s, loss=14.3, v_num=0]



Epoch 9:   0%|          | 0/413 [00:00<?, ?it/s, loss=14.3, v_num=0]          



Epoch 9:  80%|███████▉  | 330/413 [01:21<00:20,  4.03it/s, loss=14.5, v_num=0]



Epoch 9: 100%|██████████| 413/413 [01:41<00:00,  4.05it/s, loss=14.5, v_num=0]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:21<00:00,  3.80it/s]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:21<00:00,  3.94it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: VAE_384_logger/VAE_384_384

  | Name    | Type    | Params | In sizes       | Out sizes     
----------------------------------------------------------------------
0 | encoder | Encoder | 496 K  | [2, 3, 32, 32] | [2, 384]      
1 | decoder | Decoder | 496 K  | [2, 384]       | [2, 3, 32, 32]
----------------------------------------------------------------------
992 K     Trainable params
0         Non-trainable params
992 K     Total params
3.971     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]



                                                                      

Global seed set to 42


Epoch 0:   0%|          | 0/413 [00:00<?, ?it/s] 



Epoch 0:  80%|███████▉  | 329/413 [01:22<00:21,  3.97it/s, loss=60.5, v_num=0]  



Epoch 0:  80%|███████▉  | 330/413 [01:22<00:20,  3.98it/s, loss=60.4, v_num=0]



Epoch 1:   0%|          | 0/413 [00:00<?, ?it/s, loss=60.4, v_num=0]          



Epoch 1:  80%|███████▉  | 330/413 [01:23<00:20,  3.96it/s, loss=42.1, v_num=0]



Epoch 2:   0%|          | 0/413 [00:00<?, ?it/s, loss=42.1, v_num=0]          



Epoch 2:  80%|███████▉  | 330/413 [01:27<00:22,  3.75it/s, loss=31.6, v_num=0]



Epoch 3:   0%|          | 0/413 [00:00<?, ?it/s, loss=31.6, v_num=0]          



Epoch 3:  80%|███████▉  | 330/413 [01:19<00:20,  4.13it/s, loss=26.6, v_num=0]



Epoch 4:   0%|          | 0/413 [00:00<?, ?it/s, loss=26.6, v_num=0]          



Epoch 4:  80%|███████▉  | 330/413 [01:19<00:19,  4.15it/s, loss=24.3, v_num=0]



Epoch 5:   0%|          | 0/413 [00:00<?, ?it/s, loss=24.3, v_num=0]          



Epoch 5:  80%|███████▉  | 330/413 [01:19<00:19,  4.15it/s, loss=21.7, v_num=0]



Epoch 6:   0%|          | 0/413 [00:00<?, ?it/s, loss=21.7, v_num=0]          



Epoch 6:  80%|███████▉  | 330/413 [01:18<00:19,  4.19it/s, loss=19.1, v_num=0]



Epoch 7:   0%|          | 0/413 [00:00<?, ?it/s, loss=19.1, v_num=0]          



Epoch 7:  80%|███████▉  | 330/413 [01:40<00:25,  3.29it/s, loss=17.2, v_num=0]



Epoch 8:   0%|          | 0/413 [00:00<?, ?it/s, loss=17.2, v_num=0]          



Epoch 8:  80%|███████▉  | 330/413 [01:26<00:21,  3.83it/s, loss=15.5, v_num=0]



Epoch 9:   0%|          | 0/413 [00:00<?, ?it/s, loss=15.5, v_num=0]          



Epoch 9:  80%|███████▉  | 330/413 [01:29<00:22,  3.70it/s, loss=15.4, v_num=0]



Epoch 9: 100%|██████████| 413/413 [01:50<00:00,  3.72it/s, loss=15.4, v_num=0]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:23<00:00,  3.61it/s]
Testing: 0it [00:00, ?it/s]



Testing: 100%|██████████| 83/83 [00:20<00:00,  3.98it/s]
