<a href="https://colab.research.google.com/github/ishammansoor/AI-and-Machine-Learning/blob/main/Self_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class SelfAttention(nn.Module):
  """Single-head scaled dot-product self-attention.

  The input is projected to queries, keys and values with three separate
  linear layers, attention weights are computed as
  ``softmax(Q @ K^T / sqrt(E))`` and applied to the values, and the result
  is passed through a final linear output projection.

  Args:
    embed_dim: Size ``E`` of the last dimension of the input. Every linear
      layer in this module maps ``E -> E``.
  """

  def __init__(self, embed_dim):
    super().__init__()

    self.embed_dim = embed_dim

    # Independent learned projections for queries, keys and values.
    self.query = nn.Linear(embed_dim, embed_dim)
    self.key = nn.Linear(embed_dim, embed_dim)
    self.value = nn.Linear(embed_dim, embed_dim)

    # Final projection applied to the attended values.
    self.out_proj = nn.Linear(embed_dim, embed_dim)

  def forward(self, x, mask=None):
    """Apply self-attention over the second-to-last dimension of ``x``.

    Args:
      x: Tensor of shape ``(..., T, E)`` where ``E == embed_dim``. Any number
        of leading dimensions is accepted, including none (unbatched
        ``(T, E)`` input).
      mask: Optional tensor broadcastable to ``(..., T, T)``. Positions where
        the mask is ``False`` / ``0`` are excluded from attention (e.g. a
        lower-triangular matrix gives causal attention). It must live on the
        same device as ``x``. ``None`` (the default) attends to every
        position.

    Returns:
      A tuple ``(output, attn_weights)`` where ``output`` has the same shape
      as ``x`` and ``attn_weights`` has shape ``(..., T, T)``; each row of
      ``attn_weights`` sums to 1.

    Note:
      A query row whose keys are *all* masked out yields NaN weights, because
      the softmax of a row of ``-inf`` is undefined.
    """
    # Only the embedding size is needed; reading it from the last dimension
    # (instead of unpacking a fixed 3-tuple) keeps the layer rank-agnostic.
    E = x.size(-1)

    # step1: compute the Q, K, V
    Q = self.query(x)
    K = self.key(x)
    V = self.value(x)

    # step2: Compute scaled dot product
    # Dividing by sqrt(E) keeps the logits at a scale where softmax does not
    # saturate as the embedding size grows.
    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / E ** 0.5

    if mask is not None:
      # -inf logits become exactly zero probability after the softmax.
      blocked = ~mask.to(torch.bool)
      attn_scores = attn_scores.masked_fill(blocked, float("-inf"))

    attn_weights = F.softmax(attn_scores, dim=-1)

    # step3: Apply weight to values
    attn_output = torch.matmul(attn_weights, V)

    output = self.out_proj(attn_output)

    return output, attn_weights




In [5]:
# Smoke test: run SelfAttention on a small random batch and check its outputs.
torch.manual_seed(0)  # fixed seed so Restart & Run All reproduces the same numbers

batch_size = 2
seq_len = 4
embed_dim = 8

x = torch.randn(batch_size, seq_len, embed_dim)  # random input
sa = SelfAttention(embed_dim)

out, weights = sa(x)

# Enforce the expectations instead of leaving them only in comments.
assert out.shape == (batch_size, seq_len, embed_dim)
assert weights.shape == (batch_size, seq_len, seq_len)
# Each query's attention distribution comes out of a softmax, so it sums to 1.
assert torch.allclose(weights.sum(dim=-1), torch.ones(batch_size, seq_len))

print("Output shape:", out.shape)       # Should be (2, 4, 8)
print("Attention weights:", weights)  # tensor of shape (2, 4, 4); each row sums to 1

Output shape: torch.Size([2, 4, 8])
Attention weights: tensor([[[0.1028, 0.4211, 0.3952, 0.0809],
         [0.2004, 0.4169, 0.1727, 0.2100],
         [0.0898, 0.3850, 0.3213, 0.2040],
         [0.1874, 0.2249, 0.3974, 0.1903]],

        [[0.2335, 0.2802, 0.2175, 0.2687],
         [0.1214, 0.1278, 0.2812, 0.4695],
         [0.2710, 0.2676, 0.2409, 0.2205],
         [0.3474, 0.2521, 0.1914, 0.2092]]], grad_fn=<SoftmaxBackward0>)
