## Implementing Layer Scale

This is the first building block that we'll create in order to build up to a self-attention block.

If you're wondering why PE uses Layer Scale, read the paper [Going Deeper with Image Transformers](https://arxiv.org/pdf/2103.17239).

In [21]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
import inspect # We'll store the actual code .py files instead of the notebook.

import torch
import tvm
import numpy as np

from pe import LayerScale
from compile import compile
from utils import select_params

PE_SPATIAL = '/home/jq/Storage/Model-Weights/HuggingFace-Cache/PE-Spatial-G14-448.pt'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
# Compile for and run on the first CUDA device.
device = tvm.device('cuda', 0)
param_name = 'transformer.resblocks.42.ls_1.gamma'

# Force the checkpoint onto the CPU so the `.numpy()` call below works even if
# the tensors were saved from a GPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust
# (consider weights_only=True if the installed torch version supports it).
state_dict = torch.load(PE_SPATIAL, map_location='cpu')

# Pull the single gamma vector out of the checkpoint as a NumPy array.
params = select_params(param_name, state_dict).numpy()
layer_scale = LayerScale(params.shape[0])

print(f"Selected {param_name}: {params.shape}")

# Export the module with its default spec, then build a VM for `device`.
# `mod` ends up holding the exported module, exactly as before.
mod, packed_params = layer_scale.export_tvm(spec=layer_scale.get_default_spec())
vm = compile(mod, device)

Selected transformer.resblocks.42.ls_1.gamma: (1536,)


In [95]:
# Seeded generator so the random test input is reproducible across runs.
rng = np.random.default_rng(0)

# int32 vector in [0, 10) with one entry per gamma element, matching the
# "int32" tensor declared in LayerScale.get_default_spec().
x_host = rng.integers(0, 10, size=params.shape[0], dtype=np.int32)
x = tvm.nd.array(x_host, device=device)

# Put the parameter on the same device as the input; tvm.nd.array defaults to
# the CPU otherwise, while the VM was compiled for `device`.
y = vm['forward'](x, [tvm.nd.array(params, device=device)])

In [10]:
# Print the source of LayerScale, which is imported from the `pe` module rather
# than being defined in this notebook.
print(inspect.getsource(LayerScale))

class LayerScale(nn.Module):
    """Layer Scale building block.

    At this stage the module only records its configuration; `forward` is a
    pass-through because the learned gamma parameter is not wired in yet.
    """

    def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
        """Store the configuration values on the instance.

        Args:
            dim: Stored as `self.dim`.
            init_values: Stored as `self.init_values`.
            inplace: Stored as `self.inplace`.
        """
        super().__init__()
        self.dim = dim
        self.init_values = init_values
        self.inplace = inplace

    def forward(self, x):
        """Return `x` unchanged."""
        # Placeholder: the learned gamma parameter still has to be applied here.
        return x

    def get_default_spec(self):
        """Build the default export spec for this module.

        `forward` takes a single 1-D "int32" tensor `x` of symbolic length "n";
        parameters are exported in "packed" mode and effects are disabled.
        """
        forward_spec = {"x": nn.spec.Tensor(["n"], "int32")}
        export_options = {"param_mode": "packed", "effect_mode": "none"}
        raw_spec = {"forward": forward_spec, "$": export_options}
        return nn.spec.ModuleSpec.from_raw(raw_spec, self)

