In [1]:
"""
---
title: Layer Normalization
summary: >
 A PyTorch implementation/tutorial of layer normalization.
---
# Layer Normalization
This is a [PyTorch](https://pytorch.org) implementation of
[Layer Normalization](https://papers.labml.ai/paper/1607.06450).
### Limitations of [Batch Normalization](../batch_norm/index.html)
* You need to maintain running means.
* Tricky for RNNs. Do you need different normalizations for each step?
* Doesn't work with small batch sizes;
large NLP models are usually trained with small batch sizes.
* Need to compute means and variances across devices in distributed training.
## Layer Normalization
Layer normalization is a simpler normalization method that works
on a wider range of settings.
Layer normalization transforms the inputs to have zero mean and unit variance
across the features.
*Note that batch normalization fixes the zero mean and unit variance for each element, across the batch.*
Layer normalization does it for each sample in the batch, across all of its elements.
Layer normalization is generally used for NLP tasks.
We have used layer normalization in most of the
[transformer implementations](../../transformers/gpt/index.html).
"""
from typing import Union, List

import torch
from torch import nn, Size


class PyLayerNorm(nn.Module):
    r"""
    ## Layer Normalization
    Layer normalization $\text{LN}$ normalizes the input $X$ as follows:
    When input $X \in \mathbb{R}^{B \times C}$ is a batch of embeddings,
    where $B$ is the batch size and $C$ is the number of features.
    $\gamma \in \mathbb{R}^{C}$ and $\beta \in \mathbb{R}^{C}$.
    $$\text{LN}(X) = \gamma
    \frac{X - \underset{C}{\mathbb{E}}[X]}{\sqrt{\underset{C}{Var}[X] + \epsilon}}
    + \beta$$
    When input $X \in \mathbb{R}^{L \times B \times C}$ is a batch of a sequence of embeddings,
    where $B$ is the batch size, $C$ is the number of channels, $L$ is the length of the sequence.
    $\gamma \in \mathbb{R}^{C}$ and $\beta \in \mathbb{R}^{C}$.
    $$\text{LN}(X) = \gamma
    \frac{X - \underset{C}{\mathbb{E}}[X]}{\sqrt{\underset{C}{Var}[X] + \epsilon}}
    + \beta$$
    When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
    where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width.
    This is not a widely used scenario.
    $\gamma \in \mathbb{R}^{C \times H \times W}$ and $\beta \in \mathbb{R}^{C \times H \times W}$.
    $$\text{LN}(X) = \gamma
    \frac{X - \underset{C, H, W}{\mathbb{E}}[X]}{\sqrt{\underset{C, H, W}{Var}[X] + \epsilon}}
    + \beta$$

    The variance is computed as the mean of squared deviations from the mean,
    $Var[X] = \mathbb{E}[(X - \mathbb{E}[X])^2]$, so it is never negative and does
    not lose precision when the mean is large compared to the spread of the values.
    """

    def __init__(self, normalized_shape: Union[int, List[int], Size], *,
                 eps: float = 1e-5,
                 elementwise_affine: bool = True):
        r"""
        * `normalized_shape` $S$ is the shape of the elements (except the batch).
         The input should then be
         $X \in \mathbb{R}^{* \times S[0] \times S[1] \times ... \times S[n]}$.
         It may be given as an `int`, a list or tuple of `int`s, or a `torch.Size`.
        * `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability
        * `elementwise_affine` is whether to scale and shift the normalized value

        When `elementwise_affine` is `True` the module owns two parameters of shape
        `normalized_shape`: `gain` ($\gamma$, initialized to ones) and
        `bias` ($\beta$, initialized to zeros).

        We've tried to use the same names for arguments as PyTorch `LayerNorm` implementation.
        """
        super().__init__()

        # Convert `normalized_shape` to `torch.Size`.
        # `torch.Size` is itself a tuple, so it takes the tuple branch and is rebuilt unchanged.
        if isinstance(normalized_shape, int):
            normalized_shape = torch.Size([normalized_shape])
        elif isinstance(normalized_shape, (list, tuple)):
            normalized_shape = torch.Size(normalized_shape)
        assert isinstance(normalized_shape, torch.Size), \
            f"normalized_shape must be an int, list, tuple or torch.Size, got {type(normalized_shape)}"

        #
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        # Create parameters for $\gamma$ and $\beta$ for gain and bias
        if self.elementwise_affine:
            self.gain = nn.Parameter(torch.ones(normalized_shape))
            self.bias = nn.Parameter(torch.zeros(normalized_shape))

    def forward(self, x: torch.Tensor):
        """
        `x` is a tensor of shape `[*, S[0], S[1], ..., S[n]]`.
        `*` could be any number of dimensions.
         For example, in an NLP task this will be
        `[seq_len, batch_size, features]`

        Returns a tensor of the same shape as `x`, normalized over the trailing
        `len(normalized_shape)` dimensions and, if `elementwise_affine` is set,
        scaled by `gain` and shifted by `bias`.
        """
        n_dims = len(self.normalized_shape)

        # Sanity check to make sure the shapes match
        assert self.normalized_shape == x.shape[-n_dims:], \
            f"Expected trailing dimensions {tuple(self.normalized_shape)}, " \
            f"got input of shape {tuple(x.shape)}"

        # The dimensions to calculate the mean and variance on
        dims = [-(i + 1) for i in range(n_dims)]

        # Calculate the mean over the normalized dimensions;
        # i.e. $\mathbb{E}[X]$ for each sample
        mean = x.mean(dim=dims, keepdim=True)
        # Deviation of each element from its sample mean $X - \mathbb{E}[X]$
        centered = x - mean
        # Variance $Var[X] = \mathbb{E}[(X - \mathbb{E}[X])^2]$.
        # This form is used instead of $\mathbb{E}[X^2] - \mathbb{E}[X]^2$ because the latter
        # subtracts two large, nearly equal numbers when the mean is large, which can
        # round to zero or even a negative value in `float32`.
        var = (centered ** 2).mean(dim=dims, keepdim=True)

        # Normalize $$\hat{X} = \frac{X - \mathbb{E}[X]}{\sqrt{Var[X] + \epsilon}}$$
        x_norm = centered / torch.sqrt(var + self.eps)
        # Scale and shift $$\text{LN}(x) = \gamma \hat{X} + \beta$$
        if self.elementwise_affine:
            x_norm = self.gain * x_norm + self.bias

        #
        return x_norm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build one shared random input in the three forms used by the comparison cells below.
import brunoflow as bf

# Fix the PyTorch RNG seed so the sampled input is the same on every run.
torch.manual_seed(0)
# 5 x 10 tensor of standard-normal samples; passed to `PyLayerNorm` in a later cell.
x = torch.normal(mean=0, std=1, size=(5, 10))
# Detached copy of `x` with gradient tracking enabled; passed to `nn.LayerNorm` in a later cell.
x2 = x.clone().detach().requires_grad_(True)
# The same values as a NumPy array wrapped in a brunoflow `Node`; passed to `bf.net.LayerNorm`.
x3 = bf.Node(x.clone().detach().numpy())

2022-12-20 10:32:31.541430: E external/org_tensorflow/tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error


In [3]:
# brunoflow layer norm over 10 features, applied to the brunoflow copy of the input.
bf_ln = bf.net.LayerNorm(10)
bf_ln_out = bf_ln(x3)
# Backpropagate from the output node.
# NOTE(review): presumably this seeds the output gradient with ones, since the displayed
# bias gradient equals the number of rows (5) - confirm against brunoflow.
bf_ln_out.backprop()
# Display the gradients of the scale and shift parameters for comparison with the PyTorch cells.
bf_ln.weight.grad, bf_ln.bias.grad

(DeviceArray([ 1.2446905 , -4.288586  , -0.6201846 ,  4.830481  ,
               3.4588838 , -0.7314156 , -1.2973553 , -5.1509047 ,
               2.2078104 ,  0.34658068], dtype=float32),
 DeviceArray([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.], dtype=float32))

In [4]:
# Hand-written layer norm over 10 features, applied to the plain tensor `x`.
py_ln = PyLayerNorm(10)
py_ln_out = py_ln(x)
# The output is not a scalar, so an explicit upstream gradient is required; use all ones.
py_ln_out.backward(gradient=torch.ones_like(py_ln_out))
# Display the gradients of `gain` and `bias` for comparison with the other implementations.
py_ln.gain.grad, py_ln.bias.grad

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


(tensor([ 1.2447, -4.2886, -0.6202,  4.8305,  3.4589, -0.7314, -1.2974, -5.1509,
          2.2078,  0.3466]),
 tensor([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.]))

In [5]:
# Reference: PyTorch's built-in layer norm over 10 features, applied to the grad-enabled copy `x2`.
ln = nn.LayerNorm(10)
ln_out = ln(x2)
# The output is not a scalar, so an explicit upstream gradient is required; use all ones.
ln_out.backward(gradient=torch.ones_like(ln_out))
# Display the gradients of `weight` and `bias` for comparison with the other implementations.
ln.weight.grad, ln.bias.grad

(tensor([ 1.2447, -4.2886, -0.6202,  4.8305,  3.4589, -0.7314, -1.2974, -5.1509,
          2.2078,  0.3466]),
 tensor([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.]))

In [6]:
# Scratch check of brunoflow's `reduce_sum` and the gradients produced by backprop.
# NOTE(review): both imports repeat imports made in earlier cells.
import brunoflow as bf
import torch
# Leftover experiment, currently disabled:
# bf.func.reduce_mean(x.numpy(), axis=(-1,)).val
# Node holding a 2 x 3 x 4 array of ones.
node = bf.Node(torch.ones(size=(2,3,4)).numpy())
# Sum over axes 1 and 2 without keeping the reduced dimensions.
s = bf.func.reduce_sum(node, axis=(1, 2), keepdims=False)
# Leftover experiment, currently disabled:
# out = bf.matmul(s, bf.Node(torch.normal(mean=0, std=1, size=(4, 5)).numpy()))
# print(node.shape, s.val.shape, out.val.shape)
s.backprop()
# NOTE(review): this value is computed but not shown; a cell displays only its last expression.
node.grad.shape
# Displayed result: the gradient stored on `s` after backprop.
s.grad
# Leftover experiment, currently disabled:
# , torch.sum(x, dim=(0,))


DeviceArray([1., 1.], dtype=float32)