The next cell checks whether a GPU is available in your environment.

In [2]:
import sys
import torch

# Show which interpreter and which torch build this kernel is running,
# followed by whether CUDA can be used. One value is printed per line.
environment_report = (
    sys.executable,             # path of the running Python interpreter
    torch.__file__,             # where the imported torch package lives
    torch.__version__,          # torch release string
    torch.version.cuda,         # CUDA version reported by this torch build
    torch.cuda.is_available(),  # whether a CUDA device is usable right now
)
for entry in environment_report:
    print(entry)


/home/bhtran/repo/Super-Resolution/.venv/bin/python
/home/bhtran/repo/Super-Resolution/.venv/lib/python3.12/site-packages/torch/__init__.py
2.9.1+cu126
12.6
True


In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from tqdm.auto import tqdm

### Data loading
We will create a custom dataset class for loading our images. This class will inherit from `torch.utils.data.Dataset` and implement the necessary methods to load and preprocess the images (and maybe data augmentation in the future).

In [4]:
class SuperResolutionDataset(Dataset):
    """Paired low-/high-resolution image dataset backed by one ``.npz`` archive.

    The archive must provide two arrays under the keys ``'lr'`` and ``'hr'``;
    entry ``i`` of each array forms one (low-res, high-res) pair. Individual
    images are expected in channels-last layout (H, W, C) and are returned
    channels-first (C, H, W).
    """

    def __init__(self, npz_file_path):
        """Load both image arrays fully into memory.

        Args:
            npz_file_path: Path to the ``.npz`` file holding the ``'hr'`` and
                ``'lr'`` arrays.

        Raises:
            ValueError: If the two arrays do not hold the same number of images.
        """
        # np.load on an .npz returns a lazy archive that keeps the file open.
        # Indexing materialises each array, so the archive can be closed as
        # soon as both have been read.
        with np.load(npz_file_path) as data:
            self.hr_images = data['hr']
            self.lr_images = data['lr']

        # Pairs are matched purely by index, so unequal counts would silently
        # drop samples or fail later in the middle of an epoch.
        if len(self.hr_images) != len(self.lr_images):
            raise ValueError(
                f"'hr' and 'lr' must contain the same number of images, "
                f"got {len(self.hr_images)} and {len(self.lr_images)}"
            )

    def __len__(self):
        """Return the number of image pairs."""
        return len(self.hr_images)

    def __getitem__(self, idx):
        """Return the ``(lr, hr)`` tensor pair at ``idx`` in (C, H, W) layout.

        The tensors keep the dtype stored in the archive.
        NOTE(review): the model presumably needs floating-point input - confirm
        the archive already stores floats.
        """
        # Each stored image is (H, W, C); the model expects (C, H, W).
        lr = torch.from_numpy(self.lr_images[idx]).permute(2, 0, 1)
        hr = torch.from_numpy(self.hr_images[idx]).permute(2, 0, 1)

        return lr, hr

# Load every image pair from the archive. Despite its name, `training_dataset`
# is the complete, not-yet-split dataset; the loader below iterates over all of
# it in shuffled batches of 32.
training_dataset = SuperResolutionDataset(npz_file_path='../data/hr_lr_images.npz')
training_dataloader = DataLoader(dataset=training_dataset, shuffle=True, batch_size=32)

### Data Split
Split the dataset into training, validation, and test sets so that model performance can be monitored during training and evaluated on held-out data afterwards.

In [5]:
from torch.utils.data import random_split

# Split dataset into training (80%), validation (15%), and test (5%).
# A fixed seed keeps the partition identical across kernel restarts; without
# it, samples from an earlier run's test set can land in a later training set.
SPLIT_SEED = 42
train_size = int(0.8 * len(training_dataset))
val_size = int(0.15 * len(training_dataset))
# The test split takes whatever is left, so the three sizes always add up to
# the full dataset even after the int() truncation above.
test_size = len(training_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(
    training_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(training_dataset)

# Create dataloaders for training, validation, and test.
# Only the training loader shuffles; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f'Training samples: {len(train_dataset)}')
print(f'Validation samples: {len(val_dataset)}')
print(f'Test samples: {len(test_dataset)}')

Training samples: 13883
Validation samples: 2603
Test samples: 868


In [6]:
import sys

# Put the parent directory first on the module search path so the local
# `src` package can be imported from this notebook.
# NOTE(review): presumably '..' is the repository root relative to the
# notebook's working directory - confirm.
PROJECT_ROOT = '..'
sys.path.insert(0, PROJECT_ROOT)

from src.model import SuperResolutionModel

The cell below is just a sanity check to ensure that our model works as expected. Notebook environments are sometimes polluted with stale state from earlier runs.

In [7]:
# Smoke test on the CPU: feed one random 3x32x32 input through a freshly
# constructed model and show the type and shape of what comes back.
model = SuperResolutionModel()
x = torch.randn((1, 3, 32, 32))
y = model(x)
print(type(y), y.shape, sep='\n')

<class 'torch.Tensor'>
torch.Size([1, 3, 128, 128])


### Check your model shape and size
The model should not exceed 5 million parameters.

In [8]:
# device refers to the GPU or the CPU, depending whether GPU is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model = SuperResolutionModel().to(device)

# display information about the model
summary(model, (3, 32, 32))

# Upper bound on trainable parameters allowed for this exercise.
MAX_TRAINABLE_PARAMS = 5_000_000

# Count only parameters that receive gradients; frozen ones do not count
# against the budget.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable parameters:', params)
if params > MAX_TRAINABLE_PARAMS:
    # ValueError is still an Exception, so existing `except Exception` handlers
    # keep working; the message now reports how far over budget the model is.
    raise ValueError(
        'Your model is unnecessarily complex, scale down! '
        f'({params:,} trainable parameters > {MAX_TRAINABLE_PARAMS:,} allowed)'
    )

Using device: cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           4,864
              ReLU-2           [-1, 64, 32, 32]               0
            Conv2d-3           [-1, 64, 32, 32]          36,928
              ReLU-4           [-1, 64, 32, 32]               0
            Conv2d-5           [-1, 48, 32, 32]          27,696
      PixelShuffle-6          [-1, 3, 128, 128]               0
Total params: 69,488
Trainable params: 69,488
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 2.75
Params size (MB): 0.27
Estimated Total Size (MB): 3.03
----------------------------------------------------------------
Trainable parameters: 69488
----------------------------------------------------------------
        Layer (type)               Output Shape         Para

### Plot setup and Hyperparameters 

In [9]:
# Custom callback for plotting loss and accuracy during training
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from IPython.display import clear_output


class PlotLogAccuracy:
  """Stores per-epoch loss/accuracy history and redraws both curves on each update."""

  def __init__(self):
    """Start with empty histories and the epoch counter at zero."""
    # Parallel lists: position i in every list belongs to the i-th update() call.
    self.epochs, self.train_losses, self.val_losses = [], [], []
    self.train_acc, self.val_acc = [], []
    self.epoch_count = 0

  def update(self, train_loss, train_acc, val_loss, val_acc):
    """Record one epoch of metrics and replace the displayed figure.

    Args:
      train_loss: Training loss for the epoch.
      train_acc: Training accuracy for the epoch.
      val_loss: Validation loss for the epoch.
      val_acc: Validation accuracy for the epoch.
    """
    self.epochs.append(self.epoch_count)
    recorded = (
        (self.train_losses, train_loss),
        (self.val_losses, val_loss),
        (self.train_acc, train_acc),
        (self.val_acc, val_acc),
    )
    for history, value in recorded:
      history.append(value)
    self.epoch_count += 1

    # Clear the previous cell output only once the new figure is ready to show.
    clear_output(wait=True)
    _, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: loss curves.
    loss_ax.plot(self.epochs, self.train_losses, label="train loss")
    loss_ax.plot(self.epochs, self.val_losses, label="validation loss")
    loss_ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # whole-number epoch ticks
    loss_ax.set_ylabel('loss')
    loss_ax.set_xlabel('epoch')
    loss_ax.set_title('Model Loss')
    loss_ax.legend()

    # Right panel: accuracy curves.
    acc_ax.plot(self.epochs, self.train_acc, label="training accuracy")
    acc_ax.plot(self.epochs, self.val_acc, label="validation accuracy")
    acc_ax.legend()
    acc_ax.set_ylabel('accuracy')
    acc_ax.set_xlabel('epoch')
    acc_ax.set_title('Model Accuracy')
    acc_ax.xaxis.set_major_locator(MaxNLocator(integer=True))

    plt.show()

plotter = PlotLogAccuracy()

In [None]:
# Hyperparameters
epochs = 10
batch_size = 64
learning_rate = 0.005
momentum = 0.9

# Pixel-wise mean squared error between the model output and the HR target.
criterion = nn.MSELoss()

# Building the model here means re-running this cell starts again from
# freshly initialised weights.
model = SuperResolutionModel().to(device)

# play with this
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=momentum,
    weight_decay=1e-4,
)

# Options shared by all three loaders; only the training loader shuffles.
# Batches are placed in pinned host memory when CUDA is available.
loader_options = dict(
    batch_size=batch_size,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)
trainloader = DataLoader(train_dataset, shuffle=True, **loader_options)
valloader = DataLoader(val_dataset, shuffle=False, **loader_options)
testloader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Per-epoch loss histories; the training loop appends to these.
train_loss_hist = []
val_loss_hist = []

print('Setup complete.')

Setup complete.


In [None]:
print('Starting training...')

# Each epoch makes one full pass over the training split, then one full pass
# over the validation split.
for epoch in range(epochs):

    # ---------------- training pass ----------------
    model.train()       # training mode
    running_loss = 0.0  # sum of per-sample losses seen in this epoch

    train_batches = tqdm(trainloader, desc=f'Epoch {epoch+1}/{epochs} [Train]', leave=False)
    for x_batch, y_batch in train_batches:
        # Move the mini-batch to wherever the model lives (GPU if available).
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()           # drop gradients left from the previous step
        out = model(x_batch)            # forward pass
        loss = criterion(out, y_batch)  # loss on this mini-batch
        loss.backward()                 # backpropagation
        optimizer.step()                # weight update

        # Weight the batch loss by the batch size so that a smaller final
        # batch does not skew the epoch average.
        running_loss += loss.item() * x_batch.size(0)

    train_loss = running_loss / len(trainloader.dataset)

    # ---------------- validation pass ----------------
    model.eval()  # eval mode (affects layers such as dropout / batchnorm)
    val_running_loss = 0.0
    with torch.no_grad():  # no gradients needed for evaluation; saves memory
        val_batches = tqdm(valloader, desc=f'Epoch {epoch+1}/{epochs} [Val]', leave=False)
        for x_batch, y_batch in val_batches:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            out = model(x_batch)
            loss = criterion(out, y_batch)
            val_running_loss += loss.item() * x_batch.size(0)
    val_loss = val_running_loss / len(valloader.dataset)

    # No learning-rate scheduler is configured; if one is added, step it here.

    # Keep the epoch averages for later inspection.
    train_loss_hist.append(train_loss)
    val_loss_hist.append(val_loss)

    # Report both losses for this epoch.
    print(f'Epoch {epoch+1}: Train Loss {train_loss:.4f} | Val Loss {val_loss:.4f}')

print('Training finished.')


Starting training...


Epoch 1/10 [Train]:   0%|          | 0/217 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (64) must match the size of tensor b (3) at non-singleton dimension 1