#

/bin/bash: sourc: command not found


In [1]:
## Standard libraries
import os
import numpy as np
import math
from PIL import Image
from functools import partial

## Imports for plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## PyTorch
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
## Torchvision
import torchvision
from torchvision.datasets import MNIST
from torchvision import transforms
## PyTorch Lightning
try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip3 install pytorch-lightning>=1.4 --quiet
    import pytorch_lightning as pl
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import torch.utils.data as data_utils

In [2]:
# Path to the folder where the datasets are downloaded (e.g. MNIST)
DATASET_PATH = "project/data"
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "project/saved_models"

# Ensure that all operations are deterministic on GPU (if used) for reproducibility.
# The flag name must be spelled exactly `deterministic`; assigning to any other
# attribute name leaves cuDNN free to pick non-deterministic algorithms.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class GroupBase(torch.nn.Module):
    """ Abstract interface of a group H acting on R^2.

    The class only stores the group dimension and the identity element; every
    group operation raises NotImplementedError and has to be provided by a
    subclass.
    """

    def __init__(self, dimension, identity):
        """ Implements a group.

        @param dimension: Dimensionality of the group (number of dimensions in the basis of the algebra).
        @param identity: Identity element of the group.
        """
        super().__init__()
        self.dimension = dimension
        # Build the buffer from the *values* of `identity`. The legacy
        # torch.Tensor(...) constructor treats a bare integer as a size and
        # returns uninitialised memory, so it is avoided here.
        self.register_buffer(
            'identity',
            torch.as_tensor(identity, dtype=torch.get_default_dtype())
        )

    def elements(self):
        """ Obtain a tensor containing all group elements in this group.

        """
        raise NotImplementedError()

    def product(self, h, h_prime):
        """ Defines group product on two group elements.

        @param h: Group element 1
        @param h_prime: Group element 2
        """
        raise NotImplementedError()

    def inverse(self, h):
        """ Defines inverse for group element.

        @param h: A group element from subgroup H.
        """
        raise NotImplementedError()

    def left_action_on_R2(self, h, x):
        """ Group action of an element from the subgroup H on a vector in R2.

        @param h: A group element from subgroup H.
        @param x: Vectors in R2.
        """
        raise NotImplementedError()

    def matrix_representation(self, h):
        """ Obtain a matrix representation in R^2 for an element h.

        @param h: Group element
        """
        raise NotImplementedError()

    def determinant(self, h):
        """ Calculate the determinant of the representation of a group element
        h.

        @param h: Group element
        """
        raise NotImplementedError()

    def normalize_group_elements(self, h):
        """ Map the group elements to an interval [-1, 1]. We use this to create
        a standardized input for obtaining weights over the group.

        This is the name the kernel classes call on their group, so it is the
        method subclasses should override.

        @param h: Group element(s) to normalize
        """
        raise NotImplementedError()

    def normalize_group_parameterization(self, h):
        """ Alias of `normalize_group_elements`, kept so that code written
        against this older name keeps working.

        @param h: Group element(s) to normalize
        """
        return self.normalize_group_elements(h)

class GroupKernelBase(torch.nn.Module):

    def __init__(self, group, kernel_size, in_channels, out_channels):
        r""" Implements base class for the group convolution kernel. Stores grid
        defined over the group R^2 \rtimes H and its transformed copies under
        all elements of the group H.

        @param group: Group object; must provide `identity`, `elements`,
            `inverse`, `product`, `left_action_on_R2` and
            `normalize_group_elements`.
        @param kernel_size: Number of grid points along each spatial axis.
        @param in_channels: Number of input channels.
        @param out_channels: Number of output channels.
        """
        super().__init__()
        self.group = group

        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels

        # Create a spatial kernel grid of shape [2, kernel_size, kernel_size];
        # channel 0 varies along rows, channel 1 along columns ('ij' indexing).
        self.register_buffer("grid_R2", torch.stack(torch.meshgrid(
            torch.linspace(-1., 1., self.kernel_size),
            torch.linspace(-1., 1., self.kernel_size),
            indexing='ij'
        )).to(self.group.identity.device))

        # The kernel grid now also extends over the group H, as our input
        # feature maps contain an additional group dimension
        self.register_buffer("grid_H", self.group.elements())
        self.register_buffer("transformed_grid_R2xH", self.create_transformed_grid_R2xH())

    def create_transformed_grid_R2xH(self):
        r"""Transform the created grid over R^2 \rtimes H by the group action of
        each group element in H.

        This yields a set of grids over the group. In other words, a list of
        grids, each index of which is the original grid over G transformed by
        a corresponding group element in H.

        @returns transformed_grid: Tensor of shape
            [3, |H|, |H|, kernel_size, kernel_size]; channels 0-1 hold the
            transformed spatial coordinates, channel 2 the normalized
            transformed group coordinate.
        """
        # Sample the group H and invert every element once; both grids below
        # are transformed by the same inverses.
        group_elements = self.group.elements()
        num_elements = group_elements.numel()
        inverse_elements = self.group.inverse(group_elements)

        # Transform the grid defined over R2 with the sampled group elements.
        # Result has shape [2, |H|, kernel_size, kernel_size].
        transformed_grid_R2 = torch.stack(
            [
                self.group.left_action_on_R2(g_inverse, self.grid_R2)
                for g_inverse in inverse_elements
            ],
            dim=1
        )

        # Transform the grid defined over H with the sampled group elements.
        # Result has shape [|H|, |H|], stacked over dim 1 like above.
        transformed_grid_H = torch.stack(
            [
                self.group.product(g_inverse, self.grid_H)
                for g_inverse in inverse_elements
            ],
            dim=1
        )

        # Rescale values to between -1 and 1, we do this to please the torch
        # grid_sample function.
        transformed_grid_H = self.group.normalize_group_elements(transformed_grid_H)

        # Create a combined grid as the product of the grids over R2 and H
        # repeat R2 along the group dimension, and repeat H along the spatial dimension
        # to create a [3, |H|, |H|, kernel_size, kernel_size] grid
        transformed_grid = torch.cat(
            (
                transformed_grid_R2.view(
                    2,
                    num_elements,
                    1,
                    self.kernel_size,
                    self.kernel_size,
                ).repeat(1, 1, num_elements, 1, 1),
                transformed_grid_H.view(
                    1,
                    num_elements,
                    num_elements,
                    1,
                    1,
                ).repeat(1, 1, 1, self.kernel_size, self.kernel_size)
            ),
            dim=0
        )
        return transformed_grid


    def sample(self, sampled_group_elements=None):
        """ Sample convolution kernels for a given number of group elements

        @param sampled_group_elements: Optional group elements over which to
            sample the convolution kernels. It defaults to None so that this
            base signature accepts the argument-free `sample()` call used by
            the convolution layers.

        should return:
        :return kernels: filter bank extending over all input channels,
            containing kernels transformed for all output group elements.
        """
        raise NotImplementedError()


class InterpolativeGroupKernel(GroupKernelBase):
    """ Group convolution kernel whose transformed copies are obtained by
    interpolating one learnable weight tensor on the transformed R2xH grids.
    """

    def __init__(self, group, kernel_size, in_channels, out_channels):
        super().__init__(group, kernel_size, in_channels, out_channels)

        # Learnable weights that get interpolated into the transformed
        # kernels. Unlike the lifting kernel, the weight also extends over
        # the group H (third axis).
        weight_shape = (
            self.out_channels,
            self.in_channels,
            self.group.elements().numel(),
            self.kernel_size,
            self.kernel_size,
        )
        self.weight = torch.nn.Parameter(
            torch.zeros(weight_shape, device=self.group.identity.device)
        )

        # Kaiming uniform initialisation of the weights.
        torch.nn.init.kaiming_uniform_(self.weight.data, a=math.sqrt(5))

    def sample(self):
        """ Sample the convolution kernels for every group element.

        :return kernels: filter bank of shape
            [out_channels, |H|, in_channels, |H|, kernel_size, kernel_size],
            holding one transformed kernel per output group element.
        """
        num_group_elements = self.group.elements().numel()

        # Fold output channels into input channels so the whole filter bank
        # is interpolated in a single call per group element.
        folded_weight = self.weight.view(
            self.out_channels * self.in_channels,
            num_group_elements,
            self.kernel_size,
            self.kernel_size,
        )

        # One interpolation per transformed grid over R2xH.
        transformed_weight = torch.stack([
            trilinear_interpolation(
                folded_weight, self.transformed_grid_R2xH[:, grid_idx, :, :, :]
            )
            for grid_idx in range(num_group_elements)
        ])

        # Unfold the channel axis again into separate output and input
        # channels, then move the output channels in front of the output
        # group axis so the result can be reshaped for conv2d.
        return transformed_weight.view(
            num_group_elements,
            self.out_channels,
            self.in_channels,
            num_group_elements,
            self.kernel_size,
            self.kernel_size,
        ).transpose(0, 1)

class CyclicGroup(GroupBase):
    """ Cyclic group of planar rotations, parameterized by rotation angles in
    radians. The elements are `order` equally spaced angles starting at 0.
    """

    def __init__(self, order):
        """
        @param order: Number of elements of the group; must be larger than 1.
        """
        super().__init__(
            dimension=1,
            identity=[0.]
        )

        assert order > 1
        self.order = torch.tensor(order)

    def elements(self):
        """ Obtain a tensor containing all group elements in this group.

        @returns elements: Tensor containing group elements of shape [self.order]
        """
        return torch.linspace(
            start=0,
            end=2 * np.pi * float(self.order - 1) / float(self.order),
            # `steps` is documented as an int; self.order is stored as a tensor.
            steps=int(self.order),
            device=self.identity.device
        )

    def product(self, h, h_prime):
        r""" Defines group product on two group elements of the cyclic group.

        @param h: Group element 1
        @param h_prime: Group element 2

        @returns product: Tensor containing h \cdot h_prime with \cdot the group action.
        """
        # As we directly parameterize the group by its rotation angles, this
        # is a simple addition; the remainder keeps the result inside
        # [0, 2 pi) (closure).
        product = torch.remainder(h + h_prime, 2 * np.pi)

        return product

    def inverse(self, h):
        """ Defines group inverse for an element of the cyclic group.

        @param h: Group element

        @returns inverse: Tensor containing h^{-1}.
        """
        # Negate the angle and wrap it back into [0, 2 pi) (closure).
        inverse = torch.remainder(-h, 2 * np.pi)

        return inverse

    def left_action_on_R2(self, h, x):
        r""" Group action of an element from the subgroup H on a vector in R2.

        @param h: A group element from subgroup H.
        @param x: Vectors in R2.

        @returns transformed_x: Tensor containing \rho(h)x.
        """
        # Transform the vector x with h, recall that we are working with a left-regular representation,
        # meaning we transform vectors in R^2 through left-matrix multiplication.
        # tensordot with dims=1 contracts the matrix columns with the first axis of x.
        transformed_x = torch.tensordot(self.matrix_representation(h), x, dims=1)
        return transformed_x

    def matrix_representation(self, h):
        """ Obtain a matrix representation in R^2 for an element h.

        @param h: A group element.

        @returns representation: Tensor containing matrix representation of h, shape [2, 2].
        """
        cos_t = torch.cos(h)
        sin_t = torch.sin(h)

        # Rotation matrix for the angle h.
        representation = torch.tensor([
            [cos_t, -sin_t],
            [sin_t, cos_t]
        ], device=self.identity.device)

        return representation

    def normalize_group_elements(self, h):
        """ Normalize values of group elements to range between -1 and 1.
        The group elements range from 0 to 2pi * (self.order - 1) / self.order,
        so we normalize accordingly.

        @param h: A group element.
        @return normalized_h: Tensor containing normalized value corresponding to element h.
        """
        largest_elem = 2 * np.pi * (self.order - 1) / self.order
        normalized_h = (2*h / largest_elem) - 1.
        return normalized_h

def bilinear_interpolation(signal, grid):
    """ Sample a 2D signal at the given grid points using bilinear interpolation.

    @param signal: Tensor containing pixel values [C, H, W] or [N, C, H, W]
    @param grid: Tensor containing coordinate values [2, H, W] or [2, N, H, W]
    @returns: Tensor with the interpolated values, as returned by grid_sample.
    """
    # grid_sample only works on batched inputs: give unbatched tensors a
    # batch axis of size one.
    batched_signal = signal[None] if signal.dim() == 3 else signal
    batched_grid = grid[:, None] if grid.dim() == 3 else grid

    # Move the coordinate axis last ([2, N, H, W] -> [N, H, W, 2]) and swap
    # the two coordinate channels, since grid_sample expects YX instead of XY.
    sampling_grid = torch.roll(
        batched_grid.permute(1, 2, 3, 0), shifts=1, dims=-1
    )

    return torch.nn.functional.grid_sample(
        batched_signal,
        sampling_grid,
        mode="bilinear",
        padding_mode='zeros',
        align_corners=True,
    )

def trilinear_interpolation(signal, grid):
    """ Obtain signal values for a set of gridpoints through trilinear interpolation.

    @param signal: Tensor containing voxel values [C, D, H, W] or [N, C, D, H, W]
    @param grid: Tensor containing coordinate values [3, D, H, W] or [3, N, D, H, W].
        The three coordinate channels are ordered (row, column, depth):
        channel 0 addresses the H axis of `signal`, channel 1 the W axis and
        channel 2 the D axis. This is the layout obtained by concatenating a
        spatial 'ij' meshgrid with a depth (group) coordinate.
    @returns: Tensor with the interpolated values [N, C, D, H, W], where
        D, H, W are the sizes of the grid.
    """
    # If signal or grid is a 4D array, add a dimension to support grid_sample.
    if len(signal.shape) == 4:
        signal = signal.unsqueeze(0)
    if len(grid.shape) == 4:
        grid = grid.unsqueeze(1)

    # Grid_sample expects [N, D, H, W, 3] instead of [3, N, D, H, W]
    grid = grid.permute(1, 2, 3, 4, 0)

    # For 5D inputs grid_sample reads the last axis as (x, y, z), addressing
    # the (W, H, D) axes of the signal. Our channels are (row, column, depth),
    # so only the two spatial channels have to be swapped; a cyclic roll of
    # all three channels would send the depth coordinate to the W axis.
    grid = grid[..., [1, 0, 2]]

    return torch.nn.functional.grid_sample(
        signal,
        grid,
        padding_mode='zeros',
        align_corners=True,
        mode="bilinear" # actually trilinear in this case...
    )


class LiftingKernelBase(torch.nn.Module):

    def __init__(self, group, kernel_size, in_channels, out_channels):
        """ Implements a base class for the lifting kernel. Stores the R^2 grid
        over which the lifting kernel is defined and its transformed copies
        under the action of a group H.

        @param group: Group object; must provide `identity`, `elements`,
            `inverse` and `left_action_on_R2`.
        @param kernel_size: Number of grid points along each spatial axis.
        @param in_channels: Number of input channels.
        @param out_channels: Number of output channels.
        """
        super().__init__()
        self.group = group

        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels

        # Create spatial kernel grid. These are the coordinates on which our
        # kernel weights are defined. Shape [2, kernel_size, kernel_size];
        # channel 0 varies along rows, channel 1 along columns ('ij' indexing).
        self.register_buffer("grid_R2", torch.stack(torch.meshgrid(
            torch.linspace(-1., 1., self.kernel_size),
            torch.linspace(-1., 1., self.kernel_size),
            indexing='ij'
        )).to(self.group.identity.device))

        # Transform the grid by the elements in this group.
        self.register_buffer("transformed_grid_R2", self.create_transformed_grid_R2())

    def create_transformed_grid_R2(self):
        """Transform the created grid by the group action of each group element.
        This yields a grid (over H) of spatial grids (over R2). In other words,
        a list of grids, each index of which is the original spatial grid transformed by
        a corresponding group element in H.

        @returns transformed_grid: Tensor of shape [2, |H|, kernel_size, kernel_size].
        """
        # Obtain all group elements.
        group_elements = self.group.elements()

        # The left-regular representation acts on the domain of a function on
        # R2 through the inverse group element, so the grid is transformed by
        # every inverse and the results are stacked along a new axis 1.
        transformed_grid = torch.stack(
            [
                self.group.left_action_on_R2(element, self.grid_R2)
                for element in self.group.inverse(group_elements)
            ],
            dim=1
        )

        return transformed_grid


    def sample(self, sampled_group_elements=None):
        """ Sample convolution kernels for a given number of group elements

        @param sampled_group_elements: Optional group elements over which to
            sample the convolution kernels. It defaults to None so that this
            base signature accepts the argument-free `sample()` call used by
            the convolution layers.

        should return:
        :return kernels: filter bank extending over all input channels,
            containing kernels transformed for all output group elements.
        """
        raise NotImplementedError()

class InterpolativeLiftingKernel(LiftingKernelBase):
    """ Lifting kernel whose transformed copies are obtained by interpolating
    one learnable weight tensor on the transformed spatial grids.
    """

    def __init__(self, group, kernel_size, in_channels, out_channels):
        super().__init__(group, kernel_size, in_channels, out_channels)

        # Learnable weights that get interpolated into the transformed
        # spatial kernels.
        weight_shape = (
            self.out_channels,
            self.in_channels,
            self.kernel_size,
            self.kernel_size,
        )
        self.weight = torch.nn.Parameter(
            torch.zeros(weight_shape, device=self.group.identity.device)
        )

        # Kaiming uniform initialisation of the weights.
        torch.nn.init.kaiming_uniform_(self.weight.data, a=math.sqrt(5))

    def sample(self):
        """ Sample the convolution kernels for every group element.

        :return kernels: filter bank of shape
            [out_channels, |H|, in_channels, kernel_size, kernel_size],
            holding one transformed kernel per output group element.
        """
        num_group_elements = self.group.elements().numel()

        # Fold output channels into input channels so the whole filter bank
        # is passed through grid_sample in a single call per group element.
        folded_weight = self.weight.view(
            self.out_channels * self.in_channels,
            self.kernel_size,
            self.kernel_size,
        )

        # One interpolation per transformed spatial grid.
        transformed_weight = torch.stack([
            bilinear_interpolation(
                folded_weight, self.transformed_grid_R2[:, spatial_grid_idx, :, :]
            )
            for spatial_grid_idx in range(num_group_elements)
        ])

        # Unfold the channel axis again into separate output and input
        # channels, then move the output channels in front of the group axis
        # so the result can be reshaped for conv2d.
        return transformed_weight.view(
            num_group_elements,
            self.out_channels,
            self.in_channels,
            self.kernel_size,
            self.kernel_size,
        ).transpose(0, 1)


class GroupConvolution(torch.nn.Module):
    """ Group convolution layer: convolves feature maps defined over the
    group with kernels sampled from an InterpolativeGroupKernel.
    """

    def __init__(self, group, in_channels, out_channels, kernel_size, padding):
        """
        @param group: Group over which the feature maps are defined.
        @param in_channels: Number of input channels.
        @param out_channels: Number of output channels.
        @param kernel_size: Spatial size of the kernel.
        @param padding: Padding forwarded to conv2d.
        """
        super().__init__()

        self.kernel = InterpolativeGroupKernel(
            group=group,
            kernel_size=kernel_size,
            in_channels=in_channels,
            out_channels=out_channels
        )

        self.padding = padding


    def forward(self, x):
        """ Perform group convolution

        @param x: Input sample [batch_dim, in_channels, group_dim, spatial_dim_1,
            spatial_dim_2]
        @return: Function on a homogeneous space of the group
            [batch_dim, out_channels, num_group_elements, spatial_dim_1,
            spatial_dim_2]
        """
        num_group_elements = self.kernel.group.elements().numel()

        # We now fold the group dimensions of our input into the input channel
        # dimension.
        x = x.reshape(
            -1,
            x.shape[1] * x.shape[2],
            x.shape[3],
            x.shape[4]
        )

        # We obtain convolution kernels transformed under the group.
        conv_kernels = self.kernel.sample()

        # Apply group convolution, note that the reshape folds the 'output' group
        # dimension of the kernel into the output channel dimension, and the
        # 'input' group dimension into the input channel dimension.
        x = torch.nn.functional.conv2d(
            input=x,
            weight=conv_kernels.reshape(
                self.kernel.out_channels * num_group_elements,
                self.kernel.in_channels * num_group_elements,
                self.kernel.kernel_size,
                self.kernel.kernel_size
            ),
            padding=self.padding
        )

        # Reshape [batch_dim, out_channels * num_group_elements, spatial_dim_1,
        # spatial_dim_2] into [batch_dim, out_channels, num_group_elements,
        # spatial_dim_1, spatial_dim_2], separating channel and group
        # dimensions. The spatial sizes are passed in their existing order
        # (height, then width) so non-square feature maps keep their layout.
        x = x.view(
            -1,
            self.kernel.out_channels,
            num_group_elements,
            x.shape[-2],
            x.shape[-1],
        )

        return x

class LiftingConvolution(torch.nn.Module):
    """ Lifting convolution layer: maps feature maps on R^2 to feature maps
    over the group using kernels sampled from an InterpolativeLiftingKernel.
    """

    def __init__(self, group, in_channels, out_channels, kernel_size, padding):
        """
        @param group: Group to which the input is lifted.
        @param in_channels: Number of input channels.
        @param out_channels: Number of output channels.
        @param kernel_size: Spatial size of the kernel.
        @param padding: Padding forwarded to conv2d.
        """
        super().__init__()

        self.kernel = InterpolativeLiftingKernel(
            group=group,
            kernel_size=kernel_size,
            in_channels=in_channels,
            out_channels=out_channels
        )

        self.padding = padding

    def forward(self, x):
        """ Perform lifting convolution

        @param x: Input sample [batch_dim, in_channels, spatial_dim_1,
            spatial_dim_2]
        @return: Function on a homogeneous space of the group
            [batch_dim, out_channels, num_group_elements, spatial_dim_1,
            spatial_dim_2]
        """
        num_group_elements = self.kernel.group.elements().numel()

        # Obtain convolution kernels transformed under the group.
        conv_kernels = self.kernel.sample()

        # Apply lifting convolution. Note that using a reshape we can fold the
        # group dimension of the kernel into the output channel dimension. We
        # treat every transformed kernel as an additional output channel. This
        # way we can use pytorch's conv2d function!
        x = torch.nn.functional.conv2d(
            input=x,
            weight=conv_kernels.reshape(
                self.kernel.out_channels * num_group_elements,
                self.kernel.in_channels,
                self.kernel.kernel_size,
                self.kernel.kernel_size
            ),
            padding=self.padding
        )

        # Reshape [batch_dim, out_channels * num_group_elements, spatial_dim_1,
        # spatial_dim_2] into [batch_dim, out_channels, num_group_elements,
        # spatial_dim_1, spatial_dim_2], separating channel and group
        # dimensions. The spatial sizes are passed in their existing order
        # (height, then width) so non-square feature maps keep their layout.
        x = x.view(
            -1,
            self.kernel.out_channels,
            num_group_elements,
            x.shape[-2],
            x.shape[-1]
        )

        return x

# Group Convolutional Neural Network Model

In [4]:
from torch.nn import AdaptiveAvgPool3d
class GroupEquivariantCNN(torch.nn.Module):
    """ Classifier built from a lifting convolution, a stack of group
    convolutions, a global average pooling over group and spatial axes, and a
    final linear layer.
    """

    def __init__(self, group, in_channels, out_channels, kernel_size, num_hidden, hidden_channels):
        """
        @param group: Group used by the lifting and group convolutions.
        @param in_channels: Number of channels of the input images.
        @param out_channels: Number of outputs of the final linear layer.
        @param kernel_size: Spatial kernel size of every convolution.
        @param num_hidden: Number of group convolution layers.
        @param hidden_channels: Number of channels of all hidden feature maps.
        """
        super().__init__()

        # Create the lifting convolution.
        self.lifting_conv = LiftingConvolution(
            group=group,
            in_channels=in_channels,
            out_channels=hidden_channels,
            kernel_size=kernel_size,
            padding=0
        )

        # Create a set of group convolutions.
        self.gconvs = torch.nn.ModuleList()

        for i in range(num_hidden):
            self.gconvs.append(
                GroupConvolution(
                    group=group,
                    in_channels=hidden_channels,
                    out_channels=hidden_channels,
                    kernel_size=kernel_size,
                    padding=0
                )
            )

        # Create the projection layer: average over the group axis and both
        # spatial axes, leaving one value per channel.
        self.projection_layer = torch.nn.AdaptiveAvgPool3d(1)

        # And a final linear layer for classification.
        self.final_linear = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x):
        """
        @param x: Input batch [batch_dim, in_channels, spatial_dim_1, spatial_dim_2]
        @return: Output of the final linear layer [batch_dim, out_channels]
        """
        # Lift and disentangle features in the input.
        x = self.lifting_conv(x)
        x = torch.nn.functional.layer_norm(x, x.shape[-4:])
        x = torch.nn.functional.relu(x)

        # Apply group convolutions.
        for gconv in self.gconvs:
            x = gconv(x)
            x = torch.nn.functional.layer_norm(x, x.shape[-4:])
            x = torch.nn.functional.relu(x)

        # To ensure invariance, apply average pooling over group and spatial
        # dims. Only the three pooled singleton axes are collapsed: an
        # argument-free squeeze() would also drop the batch axis for a batch
        # of one (and the channel axis for a single hidden channel).
        x = self.projection_layer(x)
        x = x.reshape(x.shape[0], -1)

        x = self.final_linear(x)
        return x

In [5]:
# Sanity check of the implementation: build a lifting kernel base for a cyclic
# group and inspect the shape of its transformed grids.
order = 4
lifting_kernel_base = LiftingKernelBase(
    CyclicGroup(order=order), kernel_size=7, in_channels=3, out_channels=1
)

# Expected layout: [2, |H|, kernel_size, kernel_size].
assert tuple(lifting_kernel_base.transformed_grid_R2.shape) == (2, 4, 7, 7)

In [6]:
# Preview cell: shows the first NUM_IMAGES training images (top row, as
# returned by the dataset) next to the same raw images passed through
# `test_transform` (bottom row).
# NOTE(review): `train_ds` and `test_transform` are only created inside the
# training loop cell further down, so on a fresh kernel this cell raises
# NameError when the notebook is run top to bottom. It has to be executed
# after that cell (or moved below it).
NUM_IMAGES = 4
# Images as the dataset returns them, i.e. with the dataset's own transform applied.
images = [train_ds[idx][0] for idx in range(NUM_IMAGES)]
# Raw image data converted back to PIL so that the test transform can be applied.
orig_images = [Image.fromarray(train_ds.data[idx].numpy()) for idx in range(NUM_IMAGES)]
orig_images = [test_transform(img) for img in orig_images]

# Arrange all images in a grid with four per row and rescale values for display.
img_grid = torchvision.utils.make_grid(torch.stack(images + orig_images, dim=0), nrow=4, normalize=True, pad_value=0.5)
# make_grid returns channels first; matplotlib expects channels last.
img_grid = img_grid.permute(1, 2, 0)

plt.figure(figsize=(8,8))
plt.title("Images sampled from the MNIST train set, augmented with test transforms.")
plt.imshow(img_grid)
plt.axis('off')
plt.show()
plt.close()

NameError: name 'train_ds' is not defined

DataModule

In [7]:
class DataModule(pl.LightningModule):
    """ Lightning module that wraps a model built by `create_model` together
    with a cross-entropy loss and the train/validation/test steps.

    Despite its name this class derives from pl.LightningModule, i.e. it
    holds the model and optimisation logic, not the data loading.
    """

    def __init__(self, model_name, model_hparams, optimizer_name, optimizer_hparams):
        """
        Inputs:
            model_name - Name of the model/CNN to run. Used for creating the model (see function below)
            model_hparams - Hyperparameters for the model, as dictionary.
            optimizer_name - Name of the optimizer to use. Currently supported: Adam, SGD
            optimizer_hparams - Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
        """
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()
        # Create model
        self.model = create_model(model_name, model_hparams)
        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()

    def forward(self, imgs):
        """ Run the wrapped model on a batch of images. """
        return self.model(imgs)

    def configure_optimizers(self):
        """ Build the optimizer selected by `optimizer_name`.

        "SGD" creates optim.SGD; "Adam" (or "AdamW") creates optim.AdamW.
        Any other name falls back to AdamW with a warning, so existing calls
        keep working while the mismatch becomes visible.

        Returns a list with the optimizer and an empty list of schedulers.
        """
        import warnings  # local import keeps this class self-contained

        optimizer_name = self.hparams.optimizer_name
        optimizer_hparams = self.hparams.optimizer_hparams

        if optimizer_name == "SGD":
            optimizer = optim.SGD(self.parameters(), **optimizer_hparams)
        else:
            if optimizer_name not in ("Adam", "AdamW"):
                warnings.warn(
                    f"Unknown optimizer \"{optimizer_name}\"; falling back to AdamW."
                )
            # AdamW is Adam with a correct implementation of weight decay (see here for details: https://arxiv.org/pdf/1711.05101.pdf)
            optimizer = optim.AdamW(self.parameters(), **optimizer_hparams)
        return [optimizer], []

    def training_step(self, batch, batch_idx):
        """ Compute the loss for one training batch and log loss and accuracy. """
        # "batch" is the output of the training data loader.
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()

        # Logs the accuracy per epoch to tensorboard (weighted average over batches)
        self.log('train_acc', acc, on_step=False, on_epoch=True)
        self.log('train_loss', loss)
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        """ Log loss and accuracy for one validation batch. """
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()

        # Logged per epoch; "val_acc" is the metric the checkpoint callback
        # in train_model monitors.
        self.log('val_acc', acc, on_step=False, on_epoch=True)
        self.log('val_loss', loss)

    def test_step(self, batch, batch_idx):
        """ Log the accuracy for one test batch. """
        imgs, labels = batch
        preds = self.model(imgs).argmax(dim=-1)
        acc = (labels == preds).float().mean()
        # By default logs it per epoch (weighted average over batches), and returns it afterwards
        self.log('test_acc', acc, prog_bar=True)

In [8]:
# Registry mapping a model name to the class that builds it; create_model
# looks model names up in this dictionary.
model_dict = {
    # 'CNN': CNN,
    'GCNN': GroupEquivariantCNN
}

def create_model(model_name, model_hparams):
    """ Instantiate the model registered under `model_name`.

    Inputs:
        model_name - Key into "model_dict".
        model_hparams - Dictionary of keyword arguments passed to the model class.
    Returns the constructed model.
    Raises AssertionError if `model_name` is not registered.
    """
    if model_name in model_dict:
        return model_dict[model_name](**model_hparams)
    # Raised explicitly rather than through an `assert` statement: asserts are
    # removed under `python -O`, which would make this function silently
    # return None for an unknown name.
    raise AssertionError(
        f"Unknown model name \"{model_name}\". Available models are: {str(model_dict.keys())}"
    )

In [9]:
def train_model(model_name, logger_name, save_name=None, **kwargs):
    """
    Train the requested model (or load an existing checkpoint) and evaluate it.

    Reads "train_loader", "test_loader", "device" and "CHECKPOINT_PATH" from
    the notebook's global scope. "test_loader" is passed to the trainer both
    as the second argument of "fit" and for the final test run.

    Inputs:
        model_name - Name of the model you want to run. Is used to look up the class in "model_dict"
        logger_name - Name given to the TensorBoard logger.
        save_name (optional) - If specified, this name will be used for creating the checkpoint and logging directory.
        kwargs - Forwarded to the "DataModule" constructor.
    Returns the model and a dictionary holding the test accuracy under "val".
    """
    save_name = model_name if save_name is None else save_name

    tb_logger = TensorBoardLogger("tb_logger", name = logger_name)

    training_callbacks = [
        # Keep the checkpoint with the highest recorded val_acc (weights only, no optimizer state).
        ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),
        LearningRateMonitor("epoch"),
    ]
    trainer = pl.Trainer(
        default_root_dir=os.path.join(CHECKPOINT_PATH, save_name),  # Where to save models
        logger=tb_logger,
        accelerator='auto',
        max_epochs=10,
        callbacks=training_callbacks,
    )
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    # Train only when no pretrained checkpoint exists for this save name.
    pretrained_filename = os.path.join(CHECKPOINT_PATH, save_name + ".ckpt")

    if not os.path.isfile(pretrained_filename):
        pl.seed_everything(12) # To be reproducible
        model = DataModule(model_name=model_name, **kwargs)
        trainer.fit(model, train_loader, test_loader)
        # Continue with the best checkpoint found during training.
        model = DataModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    else:
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        # Automatically loads the model with the saved hyperparameters
        model = DataModule.load_from_checkpoint(pretrained_filename)

    # Test best model on test set
    test_metrics = trainer.test(model.to(device), test_loader, verbose=False)

    return model, {"val": test_metrics[0]["test_acc"]}

In [None]:
# Size of the training set. NOTE: this value is only used to tag the logger name
# below; the subsetting code further down is commented out, so the full training
# set is always used.
num_data = 60000
#rotations = [30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360]
rotations = [120, 150, 180, 210, 240, 270, 300, 330, 360]
# rotations = [360]

# ---------------------------------------------------------------------------
# Loop-invariant setup: nothing below depends on the rotation range, so it is
# built once instead of once per sweep iteration.
# ---------------------------------------------------------------------------

# We normalize the training data (no rotation is applied at training time).
train_transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                  torchvision.transforms.Normalize((0.1307,), (0.3081,))
                                                  ])

# Full-circle variant of the test transform (random rotation in [0, 360] deg).
# It is not used in this cell; it is kept so that any other cell relying on the
# name keeps working.
test_transform_fixed = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                       torchvision.transforms.RandomRotation(
                                                           [0, 360],
                                                           torchvision.transforms.InterpolationMode.BILINEAR,
                                                           fill=0),
                                                       torchvision.transforms.Normalize((0.1307,), (0.3081,))
                                                       ])

# We demonstrate our models on the MNIST dataset.
train_ds = torchvision.datasets.MNIST(root=DATASET_PATH, train=True, transform=train_transform, download=True)

# Change number of data
# indices = torch.arange(num_data)
# train_ds_less = data_utils.Subset(train_ds, indices)

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)

# Accuracy of every sweep point, keyed by the upper rotation bound in degrees.
# Without this, `gcnn_results` would only hold the result of the last iteration.
gcnn_results_by_rotation = {}

for ind_rot in rotations:

    # To demonstrate the generalization capabilities our rotation equivariant layers bring, we apply a random
    # rotation between 0 and `ind_rot` deg to the test set.
    test_transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                     torchvision.transforms.RandomRotation(
                                                         [0, ind_rot],
                                                         torchvision.transforms.InterpolationMode.BILINEAR,
                                                         fill=0),
                                                     torchvision.transforms.Normalize((0.1307,), (0.3081,))
                                                     ])

    test_ds = torchvision.datasets.MNIST(root=DATASET_PATH, train=False, transform=test_transform)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=64, shuffle=False)

    # Set the random seed for reproducibility.
    pl.seed_everything(62)

    gcnn_model, gcnn_results = train_model(model_name="GCNN",
                                           logger_name = "GCNN_" + str(num_data) + "_data_deg" + str(ind_rot),
                                           model_hparams={"in_channels": 1,
                                                          "out_channels": 10,
                                                          "kernel_size": 5,
                                                          "num_hidden": 4,
                                                          "hidden_channels":16, # to account for the increase in trainable parameters due to the extra dimension in our feature maps, remove some hidden channels.
                                                          "group":CyclicGroup(order=4).to(device)},
                                           optimizer_name="Adam",
                                           optimizer_hparams={"lr": 1e-2,
                                                              "weight_decay": 1e-4},
                                           save_name='gcnn-pretrained')

    # Keep this sweep point's result; `gcnn_model` / `gcnn_results` still refer
    # to the most recent run after the loop, as before.
    gcnn_results_by_rotation[ind_rot] = gcnn_results

# Show the collected accuracies as the cell's result.
gcnn_results_by_rotation

[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:27<00:00, 34.27it/s, v_num=1]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 65.66it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:05, 29.66it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:06, 25.07it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 23.41it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:07, 21.65it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:07, 21.35it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 19.46it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 19.15it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:08, 18.05it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:08, 17.83

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:36<00:00, 25.74it/s, v_num=1]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/yu.sea/.conda/envs/pytorch_env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 23.44it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 35.84it/s, v_num=1]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 64.88it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:05, 29.67it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:06, 25.15it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 23.51it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:07, 21.32it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:07, 21.06it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.92it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 19.56it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:07, 19.37it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 18.52

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:35<00:00, 26.17it/s, v_num=1]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 23.35it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg180
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 35.53it/s, v_num=0]         
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 75.14it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 31.37it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 26.25it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 24.10it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 22.87it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:07, 20.31it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:08, 18.53it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:08, 18.31it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:08, 18.47it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:08, 17.5

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:35<00:00, 26.10it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 22.78it/s]

[rank: 0] Seed set to 62





GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg210
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 34.86it/s, v_num=0]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:01, 98.52it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 33.21it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 27.19it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 25.03it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 23.73it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 22.80it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.35it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 19.08it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:08, 18.18it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:08, 18.15

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:36<00:00, 25.52it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 23.12it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg240
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 34.95it/s, v_num=0]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 76.19it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 31.25it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 26.24it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 24.21it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 23.19it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 21.63it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 21.36it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 21.15it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:07, 20.16it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 20.03

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:36<00:00, 25.67it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 22.93it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg270
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 35.38it/s, v_num=0]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:01, 101.48it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 33.67it/s] [A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 27.45it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 25.19it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 23.88it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 22.85it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.35it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 18.76it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:08, 17.83it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:08, 17.

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation DataLoader 0:  44%|████▍     | 69/157 [00:04<00:05, 16.64it/s][A
Validation DataLoader 0:  45%|████▍     | 70/157 [00:04<00:05, 16.66it/s][A
Validation DataLoader 0:  45%|████▌     | 71/157 [00:04<00:05, 16.69it/s][A
Validation DataLoader 0:  46%|████▌     | 72/157 [00:04<00:05, 16.73it/s][A
Validation DataLoader 0:  46%|████▋     | 73/157 [00:04<00:05, 16.76it/s][A
Validation DataLoader 0:  47%|████▋     | 74/157 [00:04<00:04, 16.80it/s][A
Validation DataLoader 0:  48%|████▊     | 75/157 [00:04<00:04, 16.83it/s][A
Validation DataLoader 0:  48%|████▊     | 76/157 [00:04<00:04, 16.73it/s][A
Validation DataLoader 0:  49%|████▉     | 77/157 [00:04<00:04, 16.70it/s][A
Validation DataLoader 0:  50%|████▉     | 78/157 [00:04<00:04, 16.73it/s][A
Validation DataLoader 0:  50%|█████     | 79/157 [00:04<00:04, 16.76it/s][A
Validation DataLoader 0:  51%|█████     | 80/157 [00:04<00:04, 16.80it/s][A
Validation DataLoader 0:  52%|█████▏    | 81/157 [00:04<00:04, 16.83it/s][A

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Validation DataLoader 0:  88%|████████▊ | 138/157 [00:08<00:01, 16.80it/s][A
Validation DataLoader 0:  89%|████████▊ | 139/157 [00:08<00:01, 16.82it/s][A
Validation DataLoader 0:  89%|████████▉ | 140/157 [00:08<00:01, 16.84it/s][A
Validation DataLoader 0:  90%|████████▉ | 141/157 [00:08<00:00, 16.86it/s][A
Validation DataLoader 0:  90%|█████████ | 142/157 [00:08<00:00, 16.87it/s][A
Validation DataLoader 0:  91%|█████████ | 143/157 [00:08<00:00, 16.83it/s][A
Validation DataLoader 0:  92%|█████████▏| 144/157 [00:08<00:00, 16.79it/s][A
Validation DataLoader 0:  92%|█████████▏| 145/157 [00:08<00:00, 16.79it/s][A
Validation DataLoader 0:  93%|█████████▎| 146/157 [00:08<00:00, 16.75it/s][A
Validation DataLoader 0:  94%|█████████▎| 147/157 [00:08<00:00, 16.76it/s][A
Validation DataLoader 0:  94%|█████████▍| 148/157 [00:08<00:00, 16.78it/s][A
Validation DataLoader 0:  95%|█████████▍| 149/157 [00:08<00:00, 16.79it/s][A
Validation DataLoader 0:  96%|█████████▌| 150/157 [00:08<00:00,

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:36<00:00, 25.80it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 23.09it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg300
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 35.52it/s, v_num=0]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:01, 85.99it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 32.09it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 26.73it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 24.62it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 23.51it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 22.85it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.16it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 19.44it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:08, 18.24it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:08, 18.10

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 2: 100%|██████████| 938/938 [00:26<00:00, 35.79it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:01, 104.58it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 33.82it/s] [A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 27.66it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 25.32it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 23.96it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 23.04it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:06, 22.06it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:06, 21.71it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:07, 19.96it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 18.95it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 6: 100%|██████████| 938/938 [00:26<00:00, 35.34it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 65.31it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:05, 29.83it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:06, 25.30it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 23.68it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 22.76it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 22.16it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:06, 21.84it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:06, 21.60it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:06, 21.41it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 19.95it/s][A

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:35<00:00, 26.10it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 23.53it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg330
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0:   6%|▋         | 60/938 [00:01<00:25, 34.88it/s, v_num=0]         

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 35.01it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 65.24it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:05, 29.56it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:06, 25.02it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 23.39it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 22.59it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:07, 20.93it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.72it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 20.56it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:07, 19.49it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 18.91it/s][A

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 4: 100%|██████████| 938/938 [00:26<00:00, 35.71it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 65.51it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:05, 29.67it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:06, 25.10it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 23.43it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 22.48it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 22.04it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.86it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 19.61it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:08, 18.35it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:08, 18.19it/s][A

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 9: 100%|██████████| 938/938 [00:26<00:00, 35.06it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:01, 86.37it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:04, 31.96it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 26.43it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 24.36it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 23.24it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:06, 21.71it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:06, 21.44it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 19.76it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:07, 19.02it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 19.02it/s][A

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 938/938 [00:36<00:00, 25.89it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:06<00:00, 23.44it/s]


[rank: 0] Seed set to 62
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12
Missing logger folder: tb_logger/GCNN_60000_data_deg360
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                | Params
----------------------------------------------------
0 | model       | GroupEquivariantCNN | 102 K 
1 | loss_module | CrossEntropyLoss    | 0     
----------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.412     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 938/938 [00:26<00:00, 35.73it/s, v_num=0]        
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/157 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/157 [00:00<00:02, 71.63it/s][A
Validation DataLoader 0:   1%|▏         | 2/157 [00:00<00:05, 30.28it/s][A
Validation DataLoader 0:   2%|▏         | 3/157 [00:00<00:05, 25.80it/s][A
Validation DataLoader 0:   3%|▎         | 4/157 [00:00<00:06, 23.96it/s][A
Validation DataLoader 0:   3%|▎         | 5/157 [00:00<00:06, 22.83it/s][A
Validation DataLoader 0:   4%|▍         | 6/157 [00:00<00:07, 20.68it/s][A
Validation DataLoader 0:   4%|▍         | 7/157 [00:00<00:07, 20.36it/s][A
Validation DataLoader 0:   5%|▌         | 8/157 [00:00<00:07, 20.33it/s][A
Validation DataLoader 0:   6%|▌         | 9/157 [00:00<00:07, 20.17it/s][A
Validation DataLoader 0:   6%|▋         | 10/157 [00:00<00:07, 20.15

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 6:  78%|███████▊  | 736/938 [00:20<00:05, 35.72it/s, v_num=0]

In [None]:
 # Environment check: CUDA version reported by the installed PyTorch build.
 print(torch.version.cuda)

In [None]:
# Environment check: version string of the installed PyTorch package.
print("Torch version:",torch.__version__)

In [None]:
!python -V 

In [None]:
!python -c 'import torch; print(torch.cuda.is_available())'

In [None]:
!nvidia-smi

In [None]:
# !module load cuda/11.8

In [None]:
# !which nvcc