In [None]:
import numpy as np

# Small constant added to the variance before the square root so the
# division below never hits zero.
epsilon = 1e-5

# Demo tensor dimensions: batch size, channels, height, width.
N, C, H, W = 5, 15, 10, 10

# float32 ramp 0, 1, 2, ... laid out in (N, C, H, W) order.
tensor = np.arange(N * C * H * W, dtype=np.float32).reshape((N, C, H, W))

def get_stats(tensor):
    """Return the mean and (population) variance over all values of *tensor*.

    Args:
        tensor: Array-like of any shape; every element contributes to the
            statistics (no axis is kept).

    Returns:
        A ``(mean, variance)`` tuple of scalars as computed by ``np.mean``
        and ``np.var`` (``ddof=0``).
    """
    # Use the argument itself rather than a module-level name, so the result
    # depends only on what the caller passes in.
    mean = np.mean(tensor)
    variance = np.var(tensor)
    return mean, variance

# Batch normalization

For each channel, gather all the values across the batch and spatial dimensions (N, H, W) and compute the mean and variance.
Normalize each channel independently using its own statistics.

Example: 
For an image tensor of shape 15, 3, 10, 10 (N, C, H, W), the first channel will
have 15 * 10 * 10 = 1500 values to compute the mean and variance.

Batch norm is used in CNNs.

In [None]:
# Batch norm: visit the channels one by one; each channel's slice spans the
# batch and both spatial axes.
for c in range(C):
    channel_data = tensor[:, c]  # (N, H, W) view into tensor
    mean, variance = get_stats(channel_data)
    scale = np.sqrt(variance + epsilon)  # epsilon guards against a zero variance
    normalized_data = (channel_data - mean) / scale
    tensor[:, c] = normalized_data  # write back in place

# Layer normalization

For each sample in the batch, get all the values across the channel and spatial dimensions (C, H, W) and compute the stats.
Apply the norm for each sample independently.

Example:
For an image tensor of shape 15, 3, 10, 10 (N, C, H, W), each sample will
have 3 * 10 * 10 = 300 values to compute the mean and variance.

Layer norm is used in RNNs and NLP models (e.g. Transformers).

In [None]:
# Layer norm: walk the batch one sample at a time and hand that sample's
# whole (C, H, W) block to get_stats.
for n in range(N):
    batch_data = tensor[n]  # (C, H, W) view into tensor
    mean, variance = get_stats(batch_data)
    scale = np.sqrt(variance + epsilon)  # epsilon guards against a zero variance
    normalized_data = (batch_data - mean) / scale
    tensor[n] = normalized_data  # write back in place
    

# Instance normalization

For each sample and channel, get all the values across the spatial dimensions (H, W)
and compute the stats.
Apply the norm for each sample and channel independently.

Example:
For an image tensor of shape 15, 3, 10, 10 (N, C, H, W), each sample and channel will
have 10 * 10 = 100 values to compute the mean and variance.

Instance norm is used in style transfer and semantic segmentation.

In [None]:

# Instance norm: every (sample, channel) pair is handled separately.
# np.ndindex(N, C) yields (n, c) in the same order as two nested range loops.
for n, c in np.ndindex(N, C):
    instance_data = tensor[n, c]  # (H, W) view into tensor
    mean, variance = get_stats(instance_data)
    scale = np.sqrt(variance + epsilon)  # epsilon guards against a zero variance
    normalized_data = (instance_data - mean) / scale
    tensor[n, c] = normalized_data  # write back in place

# Group normalization

For each sample in the batch, divide the channels into G groups. For each group,
get all the values across the group's channels and the spatial dimensions and compute the stats.
Apply the norm for each sample and group independently.

Example:
For an image tensor of shape 15, 6, 10, 10 (N, C, H, W) and G=3,
each sample will have 3 groups with 2 channels each, so each group has
2 * 10 * 10 = 200 values to compute the mean and variance.

Group norm is used in object detection and segmentation.

In [None]:
# Group norm: split the C channels of every sample into G equally sized
# groups and normalize each (sample, group) slice separately.
G = 3
# C // G would silently drop the trailing channels if C were not a multiple
# of G, leaving them un-normalized; fail loudly instead.
if C % G != 0:
    raise ValueError(f"C={C} channels cannot be split evenly into G={G} groups")
channels_per_group = C // G
for n in range(N):
    for g in range(G):
        # Channel range [start, stop) covered by group g.
        start = g * channels_per_group
        stop = start + channels_per_group
        group_data = tensor[n, start:stop, :, :]
        mean, variance = get_stats(group_data)
        # epsilon guards against division by zero for a constant group.
        normalized_data = (group_data - mean) / np.sqrt(variance + epsilon)
        tensor[n, start:stop, :, :] = normalized_data
    