<a href="https://colab.research.google.com/github/itsmepriyabrata/priyabrata_ai_python/blob/main/Transformer_math.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F

# Seed PyTorch's global RNG so the randomly drawn projection weights (and
# therefore every tensor printed below) are identical on each
# Restart Kernel -> Run All.
SEED = 0
torch.manual_seed(SEED)

# Define the input embeddings (sequence length = 2, embedding size = 4)
X = torch.tensor([[1.0, 0.0, 1.0, 0.0],
                  [0.0, 2.0, 0.0, 1.0]])

# Width of the input embeddings and of the query/key/value projections
d_model = 4
d_k = 4

# Projection matrices; torch.rand draws each entry uniformly from [0, 1)
W_Q = torch.rand(d_model, d_k)
W_K = torch.rand(d_model, d_k)
W_V = torch.rand(d_model, d_k)

# Project the embeddings into queries, keys and values
Q = torch.matmul(X, W_Q)
K = torch.matmul(X, W_K)
V = torch.matmul(X, W_V)

# Attention scores: query/key dot products, divided by sqrt(d_k)
scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))

# Softmax over the key axis, so each query's weights sum to 1
attention_weights = F.softmax(scores, dim=-1)

# Attention output: per-query weighted sum of the value rows
attention_output = torch.matmul(attention_weights, V)

print("Input Embeddings:\n", X)
print("Queries (Q):\n", Q)
print("Keys (K):\n", K)
print("Values (V):\n", V)
print("Attention Scores:\n", scores)
print("Attention Weights:\n", attention_weights)
print("Attention Output:\n", attention_output)


Input Embeddings:
 tensor([[1., 0., 1., 0.],
        [0., 2., 0., 1.]])
Queries (Q):
 tensor([[0.3883, 1.2941, 1.3072, 1.5628],
        [1.4930, 1.6263, 1.5802, 1.5664]])
Keys (K):
 tensor([[1.2488, 1.0988, 0.7671, 1.3177],
        [1.5061, 0.6642, 1.5973, 2.1780]])
Values (V):
 tensor([[1.1547, 0.7331, 0.9069, 1.1389],
        [1.1839, 1.6739, 2.3724, 1.2075]])
Attention Scores:
 tensor([[2.4845, 3.4681],
        [3.4638, 4.6322]])
Attention Weights:
 tensor([[0.2722, 0.7278],
        [0.2372, 0.7628]])
Attention Output:
 tensor([[1.1760, 1.4178, 1.9735, 1.1888],
        [1.1770, 1.4508, 2.0248, 1.1912]])


In [2]:
import torch
import torch.nn.functional as F

# Toy query, key and value matrices: two tokens with four features each
# (plain 2-D tensors, no batch axis)
Q = torch.tensor([
    [1.0, 0.0, 1.0, 0.0],
    [0.0, 2.0, 0.0, 1.0],
])
K = torch.tensor([
    [1.0, 0.0, 1.0, 0.0],
    [0.0, 2.0, 0.0, 1.0],
])
V = torch.tensor([
    [0.5, 1.0, 0.5, 1.0],
    [1.0, 0.5, 1.0, 0.5],
])

# Key width, read off the last axis of K
d_k = K.size(-1)

# Raw similarity: every query dotted with every key
scores = Q @ K.transpose(-2, -1)

# Divide by sqrt(d_k)
scaled_scores = scores / torch.tensor(d_k, dtype=torch.float32).sqrt()

# Normalise each query's row into weights that sum to 1
attention_weights = scaled_scores.softmax(dim=-1)

# Blend the value rows using those weights
output = attention_weights @ V

# Report every intermediate tensor in the order it was computed
for label, tensor in (
    ("Queries (Q):\n", Q),
    ("Keys (K):\n", K),
    ("Values (V):\n", V),
    ("Dot-Product Scores:\n", scores),
    ("Scaled Scores:\n", scaled_scores),
    ("Attention Weights:\n", attention_weights),
    ("Output:\n", output),
):
    print(label, tensor)


Queries (Q):
 tensor([[1., 0., 1., 0.],
        [0., 2., 0., 1.]])
Keys (K):
 tensor([[1., 0., 1., 0.],
        [0., 2., 0., 1.]])
Values (V):
 tensor([[0.5000, 1.0000, 0.5000, 1.0000],
        [1.0000, 0.5000, 1.0000, 0.5000]])
Dot-Product Scores:
 tensor([[2., 0.],
        [0., 5.]])
Scaled Scores:
 tensor([[1.0000, 0.0000],
        [0.0000, 2.5000]])
Attention Weights:
 tensor([[0.7311, 0.2689],
        [0.0759, 0.9241]])
Output:
 tensor([[0.6345, 0.8655, 0.6345, 0.8655],
        [0.9621, 0.5379, 0.9621, 0.5379]])


In [3]:
import torch
import torch.nn.functional as F

class MultiHeadAttention(torch.nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended per head,
    then re-joined and passed through a final linear projection.

    Args:
        d_model: Width of the input and output embeddings.
        num_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()

        # Fail at construction time: otherwise a non-divisible width only
        # surfaces later as an opaque shape error from .view() in forward().
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads

        # One full-width projection each for Q, K, V; the heads are carved
        # out of the projected width inside forward().
        self.W_Q = torch.nn.Linear(d_model, d_model)
        self.W_K = torch.nn.Linear(d_model, d_model)
        self.W_V = torch.nn.Linear(d_model, d_model)

        # Output projection applied to the concatenated heads
        self.W_O = torch.nn.Linear(d_model, d_model)

    def forward(self, X):
        """Apply self-attention to ``X``.

        Args:
            X: Tensor of shape ``(batch_size, seq_length, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_length, d_model)``.
        """
        batch_size, seq_length, d_model = X.size()

        # Linear projections
        Q = self.W_Q(X)
        K = self.W_K(X)
        V = self.W_V(X)

        # Split into heads: (batch, seq, d_model) -> (batch, heads, seq, d_k)
        Q = Q.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention for each head. A Python scalar is used
        # for the scale so no temporary tensor is allocated on every call.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)

        # Concatenate heads: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        # transpose() returns a non-contiguous view, hence contiguous() before view().
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, seq_length, d_model)
        output = self.W_O(attention_output)

        return output

# Toy batch: one sequence of two tokens, each a 4-dimensional embedding
X = torch.tensor([
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 1.0],
    ],
])

# Attention layer with two heads over a model width of 4
mha = MultiHeadAttention(d_model=4, num_heads=2)

# Run the batch through the attention layer
output = mha(X)

# Show the input next to the attended result
for label, tensor in (
    ("Input Embeddings:\n", X),
    ("Output of Multi-Head Attention:\n", output),
):
    print(label, tensor)


Input Embeddings:
 tensor([[[1., 0., 1., 0.],
         [0., 2., 0., 1.]]])
Output of Multi-Head Attention:
 tensor([[[-0.6001, -0.4562, -0.4179, -0.0301],
         [-0.5855, -0.4454, -0.3797, -0.0267]]], grad_fn=<ViewBackward0>)


In [4]:
import torch
import math

class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position signals to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once: even feature
    columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``, with
    ``freq = exp(-log(10000) * k / d_model)`` for ``k = 0, 2, 4, ...``.
    The table is registered as a buffer named ``pe``, so it is part of the
    module's state but is not a trainable parameter.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Number of positions to precompute; inputs passed to
            ``forward`` should not be longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

        # Table of shape [max_len, d_model] holding the encodings
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Shape [max_len, ceil(d_model / 2)]: one column per frequency
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than frequencies,
        # so drop the last frequency; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a leading batch axis so the table broadcasts over the batch
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

# Demo: encode a single three-token sequence of 4-dimensional embeddings
X = torch.tensor([
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 1.0],
        [1.0, 2.0, 1.0, 0.0],
    ],
])

# Encoder for width-4 embeddings, precomputed for ten positions
pe = PositionalEncoding(d_model=4, max_len=10)

# Add the position signal to the embeddings
output = pe(X)

# Show the input, the first three rows of the encoding table, and their sum
for label, tensor in (
    ("Input Embeddings:\n", X),
    ("Positional Encodings:\n", pe.pe[0, :3, :]),
    ("Output with Positional Encoding:\n", output),
):
    print(label, tensor)


Input Embeddings:
 tensor([[[1., 0., 1., 0.],
         [0., 2., 0., 1.],
         [1., 2., 1., 0.]]])
Positional Encodings:
 tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998]])
Output with Positional Encoding:
 tensor([[[1.0000, 1.0000, 1.0000, 1.0000],
         [0.8415, 2.5403, 0.0100, 1.9999],
         [1.9093, 1.5839, 1.0200, 0.9998]]])


In [5]:
import torch
import torch.nn.functional as F

class FeedForwardNetwork(torch.nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Linear.

    Args:
        d_model: Width of the input and output features.
        d_ff: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expansion layer is created before the contraction layer, so
        # parameter initialisation draws from the RNG in the same order.
        self.linear1 = torch.nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = torch.nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Map ``x`` (last axis of size ``d_model``) to a tensor of the same shape."""
        hidden = F.relu(self.linear1(x))
        return self.linear2(hidden)

# Demo: push a single three-token sequence of 4-dimensional embeddings
# through the feed-forward block
X = torch.tensor([
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 1.0],
        [1.0, 2.0, 1.0, 0.0],
    ],
])

# Block with model width 4 and hidden width 8
ffn = FeedForwardNetwork(d_model=4, d_ff=8)

# Apply the block to the batch
output = ffn(X)

# Show the input next to the transformed result
for label, tensor in (
    ("Input Embeddings:\n", X),
    ("Output of Feed-Forward Network:\n", output),
):
    print(label, tensor)


Input Embeddings:
 tensor([[[1., 0., 1., 0.],
         [0., 2., 0., 1.],
         [1., 2., 1., 0.]]])
Output of Feed-Forward Network:
 tensor([[[-0.1774, -0.1202,  0.1628,  0.0524],
         [-0.3373, -0.0402,  0.2143,  0.1117],
         [-0.3009, -0.0594,  0.2037,  0.0570]]], grad_fn=<ViewBackward0>)
