In [1]:
import torch
# Toy 3-dimensional embeddings: one row per token of the example sentence
# "Your journey starts with one step".
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89],  # Your
        [0.55, 0.87, 0.66],  # journey
        [0.57, 0.85, 0.64],  # starts
        [0.22, 0.58, 0.33],  # with
        [0.77, 0.25, 0.10],  # one
        [0.05, 0.80, 0.55],  # step
    ]
)

In [2]:
import torch.nn as nn
class SelfAttentionOwnClass(nn.Module):
    """Single-head scaled dot-product self-attention with trainable projections.

    Subclassing ``nn.Module`` registers the three linear layers as parameters
    and makes instances callable, so ``module(x)`` dispatches to ``forward``.

    Args:
        d_in: Size of the last dimension of the input (embedding width).
        d_out: Size of the query/key/value projections and of the output.
        qkv_bias: If True, the three projection layers also learn a bias term.
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        """Return one context vector per input token.

        Args:
            x: Tensor whose last two dimensions are ``(num_tokens, d_in)``;
                any leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and last two
            dimensions ``(num_tokens, d_out)``.
        """
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # Swap only the last two axes so that batched input works too;
        # for 2-D input this is identical to ``keys.T``.
        attention_scores = queries @ keys.transpose(-2, -1)

        # Scale by sqrt(d_k) before the softmax so that the logits do not
        # grow with the projection width.
        d_k = keys.shape[-1]
        attention_weights = torch.softmax(
            attention_scores / d_k**0.5, dim=-1
        )
        context_vector = attention_weights @ values
        return context_vector

In [3]:
# The input width is read from the data; the projection width is picked by hand.
d_in, d_out = inputs.shape[1], 2
d_in

3

In [4]:
# nn.Linear draws its initial weights from torch's global RNG, so seed first;
# otherwise every fresh kernel produces different scores and weights below.
torch.manual_seed(123)
sA = SelfAttentionOwnClass(d_in, d_out)

In [5]:
# Project every token with the layer's query and key weights, then take all
# pairwise query-key dot products to obtain the raw (unscaled) score matrix.
queries = sA.W_query(inputs)
keys = sA.W_key(inputs)

attention_score = torch.matmul(queries, keys.t())
attention_score

tensor([[-0.0497,  0.1499,  0.1520,  0.0963,  0.1459,  0.0811],
        [-0.1100,  0.2040,  0.2082,  0.1338,  0.2247,  0.0993],
        [-0.1077,  0.2007,  0.2048,  0.1316,  0.2207,  0.0978],
        [-0.0676,  0.1138,  0.1164,  0.0751,  0.1293,  0.0538],
        [-0.0366,  0.0842,  0.0856,  0.0547,  0.0873,  0.0433],
        [-0.0932,  0.1535,  0.1570,  0.1014,  0.1755,  0.0720]],
       grad_fn=<MmBackward0>)

In [6]:
# Without masking: every token may attend to every other token.
# Scores are divided by sqrt(d_k) and normalised row-wise with softmax.
d_k = keys.size(-1)
attention_weight = (attention_score / d_k**0.5).softmax(dim=-1)
attention_weight

tensor([[0.1502, 0.1729, 0.1732, 0.1665, 0.1724, 0.1647],
        [0.1405, 0.1755, 0.1760, 0.1670, 0.1781, 0.1630],
        [0.1410, 0.1753, 0.1758, 0.1670, 0.1778, 0.1630],
        [0.1510, 0.1717, 0.1720, 0.1671, 0.1736, 0.1646],
        [0.1564, 0.1703, 0.1705, 0.1668, 0.1707, 0.1654],
        [0.1457, 0.1734, 0.1739, 0.1672, 0.1762, 0.1637]],
       grad_fn=<SoftmaxBackward0>)

In [7]:
# Causal mask: ones strictly above the diagonal mark the "future" positions
# that a token must not attend to.
context_length = len(attention_score)
mask = torch.ones(context_length, context_length).triu(diagonal=1)
mask

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])

In [8]:
# Wherever the mask is 1, overwrite the score with -inf so that the softmax
# in the next step gives that position exactly zero weight.
masked_attention_score = attention_score.masked_fill(
    mask.to(torch.bool), float("-inf")
)
masked_attention_score

tensor([[-0.0497,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.1100,  0.2040,    -inf,    -inf,    -inf,    -inf],
        [-0.1077,  0.2007,  0.2048,    -inf,    -inf,    -inf],
        [-0.0676,  0.1138,  0.1164,  0.0751,    -inf,    -inf],
        [-0.0366,  0.0842,  0.0856,  0.0547,  0.0873,    -inf],
        [-0.0932,  0.1535,  0.1570,  0.1014,  0.1755,  0.0720]],
       grad_fn=<MaskedFillBackward0>)

In [9]:
# Same scaled softmax as before, now on the masked scores: the -inf entries
# become 0 and each row is normalised over the visible positions only.
d_k = keys.size(-1)
masked_attention_weight = torch.softmax(
    masked_attention_score / d_k**0.5,
    dim=-1,
)
masked_attention_weight

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4447, 0.5553, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2865, 0.3563, 0.3573, 0.0000, 0.0000, 0.0000],
        [0.2282, 0.2594, 0.2599, 0.2524, 0.0000, 0.0000],
        [0.1873, 0.2041, 0.2043, 0.1998, 0.2045, 0.0000],
        [0.1457, 0.1734, 0.1739, 0.1672, 0.1762, 0.1637]],
       grad_fn=<SoftmaxBackward0>)

# Using Dropout

In [10]:
# Fix the RNG so the dropout pattern shown below is repeatable, then build a
# 50% dropout layer and an all-ones matrix to make its effect easy to see.
torch.manual_seed(123)
dropout = nn.Dropout(p=0.5)
example = torch.ones((6, 6))

In [11]:
# With p=0.5 the dropped entries become 0 and the surviving ones are scaled
# by 1 / (1 - p) = 2, which is why the all-ones matrix shows only 0s and 2s.
dropout(example)

tensor([[2., 2., 2., 2., 2., 2.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 0.]])

In [12]:
# Apply the same dropout layer to the causal attention weights: dropped
# weights become 0 and the kept ones are doubled, so rows no longer sum to 1.
dropout(masked_attention_weight)

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.7146, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5189, 0.0000, 0.5049, 0.0000, 0.0000],
        [0.0000, 0.4081, 0.4085, 0.3997, 0.4090, 0.0000],
        [0.2913, 0.3469, 0.0000, 0.0000, 0.3523, 0.3274]],
       grad_fn=<MulBackward0>)