In [1]:
import torch
import torch.nn as nn

In [2]:
import random

# One shared seed value so both random number generators start from the same number.
SEED = 24
random.seed(SEED)  # seed Python's built-in `random` module
torch.manual_seed(SEED)  # seed PyTorch's default generator (returned by the call)

<torch._C.Generator at 0x106743dd0>

In [3]:
# Set print options: no scientific notation, 4 decimal places
torch.set_printoptions(sci_mode=False, precision=4)

# Initialize LayerNorm with 10 features

In [4]:
# Initialize LayerNorm with normalized_shape=10, i.e. normalization runs over
# a trailing dimension of 10 features
layer_norm = nn.LayerNorm(10)

In [5]:
layer_norm

LayerNorm((10,), eps=1e-05, elementwise_affine=True)

# Create a random input tensor with shape (batch_size=3, features=10)

In [6]:
# Create a random input tensor with shape (batch_size=3, features=10)
x = torch.randn(3, 10)

![Layer normalization illustration](layer_norm_image.png)

# Compute mean and std of x before normalization

In [7]:
# Compute mean and std of x before normalization
# dim=-1 reduces over the feature axis, so each row (sample) gets its own statistic;
# keepdim=True keeps the result as a (3, 1) column.
# unbiased=False requests the biased (divide-by-N) estimator, the same one
# nn.LayerNorm uses for its variance.
print("Before LayerNorm:")
print("Input:", x)
print("Mean:", x.mean(dim=-1, keepdim=True))
print("Std:", x.std(dim=-1, unbiased=False, keepdim=True))

Before LayerNorm:
Input: tensor([[-1.5405, -0.3515, -0.3186, -1.2221,  2.1723, -0.5020,  0.9029, -1.6458,
          0.7199,  0.9037],
        [ 0.2337, -0.1412,  1.4302,  0.6298, -0.1468, -0.9044,  0.0280,  0.3196,
         -0.9579,  2.6285],
        [-1.8652,  0.9344, -0.8441,  0.1529, -2.8440, -0.0912,  1.5258, -0.2799,
         -1.0172,  1.2744]])
Mean: tensor([[-0.0882],
        [ 0.3119],
        [-0.3054]])
Std: tensor([[1.1766],
        [1.0155],
        [1.3159]])


# Apply Layer Normalization

In [8]:
# Apply Layer Normalization
output = layer_norm(x)

In [9]:
print("Output:", output)

Output: tensor([[-1.2344, -0.2238, -0.1959, -0.9637,  1.9212, -0.3517,  0.8423, -1.3238,
          0.6868,  0.8430],
        [-0.0771, -0.4462,  1.1012,  0.3130, -0.4518, -1.1978, -0.2796,  0.0075,
         -1.2504,  2.2812],
        [-1.1853,  0.9422, -0.4093,  0.3483, -1.9291,  0.1628,  1.3915,  0.0194,
         -0.5409,  1.2005]], grad_fn=<NativeLayerNormBackward0>)


# Print the gain (γ) and bias (β)

In [10]:
# Print the gain (γ) and bias (β)
print("Gain (γ):", layer_norm.weight)  # Scaling parameter
print("Bias (β):", layer_norm.bias)    # Shifting parameter

Gain (γ): Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True)
Bias (β): Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)


# Compute mean and std of the output after normalization

In [11]:
# Compute mean and std of the LayerNorm output (x after normalization)
# The gain is still all ones and the bias all zeros here, so the output is
# simply the standardized input.
print("\nAfter LayerNorm:")
print("Mean:", output.mean(dim=-1, keepdim=True))  # Should be close to 0
print("Std:", output.std(dim=-1, unbiased=False, keepdim=True))  # Should be close to 1 (not exactly: eps is added to the variance)


After LayerNorm:
Mean: tensor([[     0.0000],
        [    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Std: tensor([[1.0000],
        [1.0000],
        [1.0000]], grad_fn=<StdBackward0>)


# Simple Model to demonstrate gamma and beta parameters

In [12]:
import torch.optim as optim

# Dummy dataset: 10 samples, 5 features each
# NOTE: this rebinds `x`, replacing the (3, 10) tensor used in the earlier cells.
x = torch.randn(10, 5)
y = torch.randn(10, 5)  # Regression target (random, same shape as x)

# Define a simple model with LayerNorm
class SimpleModel(nn.Module):
    """LayerNorm followed by a square linear projection.

    Args:
        feature_dim: Size of the last input dimension. It is used as the
            LayerNorm ``normalized_shape`` and as both the input and output
            width of the linear layer.
    """

    def __init__(self, feature_dim):
        super().__init__()
        # Normalizes over the last dimension; holds the learnable gain and bias.
        self.layernorm = nn.LayerNorm(feature_dim)
        # Fully connected layer mapping feature_dim -> feature_dim.
        self.fc = nn.Linear(feature_dim, feature_dim)

    def forward(self, x):
        """Normalize ``x`` with LayerNorm, then apply the linear layer."""
        normalized = self.layernorm(x)
        return self.fc(normalized)

# Initialize model
model = SimpleModel(feature_dim=5)

# Print initial gamma and beta

In [13]:
# Print initial gamma and beta
print("Before Training:")
print("Initial Gain (γ):", model.layernorm.weight.data)
print("Initial Bias (β):", model.layernorm.bias.data)

Before Training:
Initial Gain (γ): tensor([1., 1., 1., 1., 1.])
Initial Bias (β): tensor([0., 0., 0., 0., 0.])


# Train for one iteration

In [14]:
# Define loss and optimizer: mean-squared error, plain SGD over all model
# parameters (the LayerNorm gain/bias as well as the linear layer's weights)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Train for one iteration
for epoch in range(1):  # single pass; `epoch` itself is not used in the body
    optimizer.zero_grad()  # clear gradients left over from any previous step
    outputs = model(x)  # forward pass: LayerNorm -> Linear
    loss = criterion(outputs, y)
    loss.backward()  # populate .grad on every parameter
    optimizer.step()  # apply the SGD update, which is what changes γ and β

# Print gamma and beta after one training step

In [15]:
# Print gamma and beta after one training step
print("\nAfter Training One Iteration:")
print("Updated Gain (γ):", model.layernorm.weight.data)
print("Updated Bias (β):", model.layernorm.bias.data)


After Training One Iteration:
Updated Gain (γ): tensor([0.9992, 0.9589, 0.9930, 0.9822, 1.0058])
Updated Bias (β): tensor([-0.0157, -0.0242,  0.0172,  0.0083, -0.0047])
