In [1]:
import torch
import torch.nn as nn


def noise_images(x, t, noise_steps=1000, beta_start=1e-4, beta_end=0.02):
    """Sample x_t from the forward diffusion process for a batch of images.

    Computes ``sqrt(alpha_hat[t]) * x + sqrt(1 - alpha_hat[t]) * epsilon`` where
    ``alpha_hat`` is the cumulative product of ``1 - beta`` over a linear beta
    schedule and ``epsilon`` is standard normal noise.

    Args:
        x: image batch. The per-sample coefficients are reshaped to
            ``(B, 1, 1, 1)``, so ``x`` is expected to be 4-D ``(B, C, H, W)``;
            a 3-D ``(B, H, W)`` input silently broadcasts to ``(B, B, H, W)``.
        t: 1-D integer tensor of timestep indices in ``[0, noise_steps)``,
            one per sample.
        noise_steps: number of steps in the beta schedule.
        beta_start: first value of the linear beta schedule.
        beta_end: last value of the linear beta schedule.

    Returns:
        Tuple ``(x_t, epsilon)``: the noised images and the noise that was added.
    """
    # Build the schedule on the same device as the images so the products below
    # do not mix devices.
    beta = torch.linspace(beta_start, beta_end, noise_steps, device=x.device)
    alpha = 1. - beta
    alpha_hat = torch.cumprod(alpha, dim=0)

    sqrt_alpha_hat = torch.sqrt(alpha_hat[t])[:, None, None, None]
    sqrt_one_minus_alpha_hat = torch.sqrt(1. - alpha_hat[t])[:, None, None, None]

    # The forward process adds Gaussian noise; rand_like would draw from
    # uniform [0, 1), which is neither zero-mean nor unit-variance.
    epsilon = torch.randn_like(x)
    return sqrt_alpha_hat * x + sqrt_one_minus_alpha_hat * epsilon, epsilon


def pos_encoding(t, channels):
    """Sinusoidal encoding of timesteps.

    The first ``channels // 2`` output columns are sines evaluated at the
    even-index frequencies ``1 / 10000**(i / channels)`` for i = 0, 2, 4, ...;
    the remaining columns are cosines at the odd-index frequencies
    (i = 1, 3, 5, ...). The two halves are concatenated along the last axis.

    Args:
        t: float tensor of timesteps; assumed to be ``(batch, 1)`` as in this
            notebook's call site — TODO confirm for other callers.
        channels: width of the encoding.

    Returns:
        Tensor whose last dimension holds the sine half followed by the
        cosine half.
    """
    half = channels // 2
    t_tiled = t.repeat(1, half)

    def _inv_freq(first_index):
        # Inverse frequencies for every second channel index from first_index.
        exponents = torch.arange(first_index, channels, 2).float() / channels
        return 1.0 / (10000**exponents)

    sin_half = torch.sin(t_tiled * _inv_freq(0))
    cos_half = torch.cos(t_tiled * _inv_freq(1))
    return torch.cat([sin_half, cos_half], dim=-1)


In [10]:
# One random timestep in [1, 1000) per image, then noise a random batch.
t = torch.randint(1, 1000, (5,))
images = torch.randn(5, 64, 64)
x_t, noise = noise_images(images, t)
# `images` has no channel axis, so the (5,1,1,1) coefficients broadcast against
# it and x_t comes out as (5,5,64,64)
# noise shape is (5,64,64)

In [11]:
# Widths used by the timestep-embedding experiment.
time_dim = emb_dim = 256
out_channels = 128
# Turn the (5,) integer timesteps into a (5,1) float column and encode them.
# Note: this rebinds `t`, so the cell is not safe to run twice in a row.
t = pos_encoding(t.unsqueeze(-1).float(), time_dim)  # (5,256)

In [17]:
# Project the timestep encoding to the channel width of a feature map:
# SiLU activation followed by a linear layer.
emb_layer = nn.Sequential(
    nn.SiLU(),
    nn.Linear(in_features=emb_dim, out_features=out_channels),
)
emb_layer(t)

torch.Size([5, 128])

In [16]:
# Shape of the noised batch returned by noise_images.
x_t.shape

torch.Size([5, 5, 64, 64])

In [19]:
channels, size = 5, 16

# Flatten each size x size map into a sequence of size*size tokens and move the
# channel axis last, i.e. (N, channels, size*size) -> (N, size*size, channels).
x_t.view(-1, channels, size * size).transpose(1, 2).shape

torch.Size([80, 256, 5])

In [21]:
# Scratch check of the pieces of a self-attention block on 5-channel tokens.
layer = nn.LayerNorm(5)
x = torch.randn(5, 5, 64, 64)
k = torch.randn(80, 256, 5)
# Result discarded: only confirms LayerNorm accepts (batch, tokens, channels).
layer(k).shape
self_mha = nn.MultiheadAttention(embed_dim=5, num_heads=5, batch_first=True)
# Result discarded: attention output plus the residual connection.
self_mha(k, k, k)[0] + k

# Position-wise feed-forward part: LayerNorm -> Linear -> GELU -> Linear.
ff_self = nn.Sequential(
    nn.LayerNorm(5),
    nn.Linear(in_features=5, out_features=5),
    nn.GELU(),
    nn.Linear(in_features=5, out_features=5),
)

In [28]:
# Earlier experiment, kept for reference:
# ff_self(self_mha(k,k,k)[0] + k).shape
# Inverse of the flattening step: (N, tokens, channels) -> (N, channels, 16, 16).
torch.randn(80, 256, 5).transpose(1, 2).view(-1, 5, 16, 16).shape

torch.Size([80, 5, 16, 16])

In [32]:
# Start from an empty 1-D tensor (shape (0,)); later cells grow it with torch.cat.
all_images_in_this_epoch = torch.tensor(())

In [31]:
# Pull the first two images out of the batch as separate 2-D tensors.
image1, image2 = images[0], images[1]
image2.shape

torch.Size([64, 64])

In [33]:
# Append the whole batch along dim 0. The accumulator is rebound, so every
# re-run of this cell appends the batch again.
all_images_in_this_epoch = torch.cat([all_images_in_this_epoch, images], dim=0)
all_images_in_this_epoch

tensor([[[ 1.2888,  0.1140, -0.6094,  ..., -0.3752,  0.6831,  1.4313],
         [-1.7713, -0.4711, -0.8615,  ...,  0.6954,  0.1747,  0.4515],
         [-1.9272, -1.3402,  0.0332,  ..., -0.1877,  1.4132,  1.6464],
         ...,
         [ 1.1612,  1.2506,  0.5571,  ..., -0.2541, -0.2183, -0.1054],
         [ 1.2593,  1.2331,  0.4734,  ..., -0.7725,  0.1185,  0.7151],
         [-0.9877,  1.6190, -0.4728,  ..., -2.0086,  0.4671,  2.1848]],

        [[ 0.1141,  0.6522,  0.6613,  ..., -1.0417,  0.0193,  0.6993],
         [ 0.8225,  1.3465,  0.6460,  ..., -1.8531,  1.6483, -0.4997],
         [-0.9926, -0.6115, -0.5745,  ..., -0.3355, -0.1471, -0.8726],
         ...,
         [-0.5540,  0.4068,  1.1737,  ..., -0.4305, -1.6232,  0.9602],
         [ 0.3567,  1.0157,  0.5524,  ...,  0.1461,  1.5089,  0.1806],
         [ 0.3242,  0.8523, -0.2997,  ...,  1.0955,  0.6705,  0.0528]],

        [[-0.3678, -0.2385,  1.3402,  ...,  1.2366, -0.6441, -1.3400],
         [ 0.2337,  0.9571, -1.3031,  ...,  0

In [30]:
# Concatenate image2 onto the accumulator along the second-to-last axis (rows).
# NOTE(review): presumably run while the accumulator was empty or 2-D — confirm
# against the intended cell order.
all_images_in_this_epoch = torch.cat([all_images_in_this_epoch, image2], dim=-2)
all_images_in_this_epoch

tensor([[ 1.2888,  0.1140, -0.6094,  ..., -0.3752,  0.6831,  1.4313],
        [-1.7713, -0.4711, -0.8615,  ...,  0.6954,  0.1747,  0.4515],
        [-1.9272, -1.3402,  0.0332,  ..., -0.1877,  1.4132,  1.6464],
        ...,
        [-0.5540,  0.4068,  1.1737,  ..., -0.4305, -1.6232,  0.9602],
        [ 0.3567,  1.0157,  0.5524,  ...,  0.1461,  1.5089,  0.1806],
        [ 0.3242,  0.8523, -0.2997,  ...,  1.0955,  0.6705,  0.0528]])

In [1]:
from datasets import load_dataset
# Load the CIFAR-10 dataset published as "uoft-cs/cifar10" via Hugging Face `datasets`.
ds = load_dataset("uoft-cs/cifar10")
# Displaying the DatasetDict lists its splits, features and row counts.
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 10000
    })
})

In [None]:
# DataLoader has to be in scope here: on a fresh kernel no earlier cell imports it.
from torch.utils.data import DataLoader

# Peek at a single shuffled batch of 12 training rows.
# NOTE(review): rows hold PIL images, which the default collate function is not
# expected to batch — confirm, and convert images to tensors first if it raises.
next(iter(DataLoader(ds['train'], batch_size=12, shuffle=True)))

In [62]:
import torchvision.transforms as transforms

# Index the row first, then the column: ds['train']['img'][0] asks for the whole
# image column just to read its first element.
imggg = ds['train'][0]['img']
# ToTensor converts the PIL image into a float tensor laid out as (C, H, W).
transform = transforms.Compose([transforms.ToTensor()])
tensor = transform(imggg)
tensor.shape

torch.Size([3, 32, 32])

In [None]:
from torch.utils.data import DataLoader

loader = DataLoader(ds['train'], batch_size=12, shuffle=True)
# Inspect the first batch only; printing every batch of the 50,000-row split
# would flood the notebook output.
for train_features in loader:
    print(train_features)
    # print(train_labels)
    break

In [2]:
import torch
# import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import torchvision.transforms as transforms

transform = transforms.Compose([transforms.ToTensor()])

# Convert every PIL image to a tensor and stack them into one (N, C, H, W) tensor.
tensor_x = torch.stack([transform(pil_img) for pil_img in ds['train']['img']])
# Class ids are integers: torch.Tensor(...) would silently store them as float32,
# which integer-indexed consumers (nn.Embedding, cross-entropy targets) reject.
tensor_y = torch.tensor(list(ds['train']['label']), dtype=torch.long)

my_dataset = TensorDataset(tensor_x, tensor_y)  # pairs each image with its label
my_dataloader = DataLoader(my_dataset)  # defaults: batch_size=1, no shuffling

In [4]:
# Show the first (features, labels) batch and stop.
for first_batch in my_dataloader:
    train_features, train_labels = first_batch
    print(train_features)
    print(train_labels)
    break

tensor([[[[0.6980, 0.6980, 0.6980,  ..., 0.6667, 0.6588, 0.6471],
          [0.7059, 0.7020, 0.7059,  ..., 0.6784, 0.6706, 0.6588],
          [0.6941, 0.6941, 0.6980,  ..., 0.6706, 0.6627, 0.6549],
          ...,
          [0.4392, 0.4431, 0.4471,  ..., 0.3922, 0.3843, 0.3961],
          [0.4392, 0.4392, 0.4431,  ..., 0.4000, 0.4000, 0.4000],
          [0.4039, 0.3922, 0.4039,  ..., 0.3608, 0.3647, 0.3569]],

         [[0.6902, 0.6902, 0.6902,  ..., 0.6588, 0.6510, 0.6392],
          [0.6980, 0.6941, 0.6980,  ..., 0.6706, 0.6627, 0.6510],
          [0.6863, 0.6863, 0.6902,  ..., 0.6627, 0.6549, 0.6471],
          ...,
          [0.4196, 0.4275, 0.4314,  ..., 0.3804, 0.3686, 0.3725],
          [0.4000, 0.4039, 0.4039,  ..., 0.3725, 0.3647, 0.3608],
          [0.3765, 0.3647, 0.3725,  ..., 0.3294, 0.3373, 0.3294]],

         [[0.7412, 0.7412, 0.7412,  ..., 0.7059, 0.6941, 0.6824],
          [0.7490, 0.7451, 0.7490,  ..., 0.7137, 0.7059, 0.6941],
          [0.7373, 0.7373, 0.7412,  ..., 0

In [2]:
import torch
# Report whether PyTorch can use a CUDA device from this kernel.
torch.cuda.is_available()

False

In [6]:
import torch.nn as nn
from torch.nn import functional as F
import torch

class DoubleConv(nn.Module):
    """Two 3x3 convolutions, each followed by GroupNorm, with a GELU in between.

    Both convolutions use ``padding=1`` and no bias, so height and width are
    preserved and only the channel count changes
    (``in_channels -> mid_channels -> out_channels``). Per the original
    author's note, the block is meant for the initial convolution of the
    images and for the bottom of the U-shaped network.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        mid_channels: channels between the two convolutions; any falsy value
            (``None``, ``0``) falls back to ``out_channels``.
        residual: when true, ``forward`` returns ``gelu(x + double_conv(x))``,
            which requires the input and output shapes to match.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None, residual=False):
        super().__init__()
        self.residual = residual
        # One group: each GroupNorm normalises over all channels of a sample together.
        self.normalizing_group_num = 1
        mid_channels = mid_channels or out_channels

        groups = self.normalizing_group_num
        # The layers run in exactly this order; their positions also fix the
        # state_dict keys (double_conv.0 ... double_conv.4).
        stages = [
            # Learned 3x3 convolution: in_channels -> mid_channels.
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            # Normalisation; output shape equals input shape.
            nn.GroupNorm(groups, mid_channels),
            # Gaussian error linear unit.
            nn.GELU(),
            # Learned 3x3 convolution: mid_channels -> out_channels.
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            # Second normalisation, again shape-preserving.
            nn.GroupNorm(groups, out_channels),
        ]
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the block; in residual mode add the input back and apply GELU."""
        out = self.double_conv(x)
        if not self.residual:
            return out
        return F.gelu(x + out)


# Small instance (10 input channels -> 20 output channels) used by the cells below.
model = DoubleConv(10,20)

In [9]:
# Overwrite every parameter of `model` with zeros in place and print each one.
for param in model.parameters():
    # detach() shares storage with the parameter, so zeroing the detached
    # tensor zeroes the parameter itself without going through autograd.
    param.detach().zero_()
    print(param)

Parameter containing:
tensor([[[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         ...,

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         ...,

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
        

In [8]:
import copy
class EMA:
    """Keeps an exponential moving average (EMA) of a model's parameters.

    Args:
        beta: weight given to the existing averaged value on each update; the
            live model's value receives ``1 - beta``.
    """

    def __init__(self, beta):
        self.beta = beta
        # Number of step_ema calls completed so far.
        self.step = 0

    def update_model_average(self, ema_model, model):
        """Blend every parameter of ``model`` into ``ema_model``, pairwise in order."""
        pairs = zip(model.parameters(), ema_model.parameters())
        for live_param, averaged_param in pairs:
            averaged_param.data = self.update_average(averaged_param.data, live_param.data)

    def update_average(self, old_weight, new_weight):
        """Return the smoothed value ``beta * old_weight + (1 - beta) * new_weight``."""
        return old_weight * self.beta + (1 - self.beta) * new_weight

    def reset_parameters(self, ema_model, model):
        """Overwrite the EMA model's state with a copy of the live model's state."""
        ema_model.load_state_dict(model.state_dict())

    def step_ema(self, ema_model, model, step_start_ema=2000):
        """Advance the EMA by one training step.

        During the first ``step_start_ema`` calls the EMA model simply mirrors
        the live model, so that the early, poorly trained weights are not
        over-represented in the average. After that, each call blends the live
        weights into the average. The original author's note assumes training
        runs for well over ``step_start_ema`` steps (ideally more than twice
        as many).
        """
        if self.step < step_start_ema:
            sync = self.reset_parameters
        else:
            sync = self.update_model_average
        sync(ema_model, model)
        self.step += 1

ema = EMA(beta=0.995)
# The averaged copy is never trained directly: switch it to eval mode and
# disable gradients on all of its parameters.
# NOTE(review): the AttributeError recorded under this cell names `requires_grad`
# (no trailing underscore), so it presumably predates this code — re-run to refresh.
ema_model = copy.deepcopy(model)
ema_model.eval()
ema_model.requires_grad_(False)

AttributeError: 'DoubleConv' object has no attribute 'requires_grad'

In [10]:
# Display the module tree of an eval-mode deep copy of `model`.
copy.deepcopy(model).eval()

DoubleConv(
  (double_conv): Sequential(
    (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): GroupNorm(1, 20, eps=1e-05, affine=True)
    (2): GELU(approximate='none')
    (3): Conv2d(20, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): GroupNorm(1, 20, eps=1e-05, affine=True)
  )
)