# Q1: Implement a Self Attention Step for the Given Vector
```python
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your   (x^1)
     [0.55, 0.87, 0.66],  # journey (x^2)
     [0.57, 0.85, 0.64],  # starts (x^3)
     [0.22, 0.58, 0.33],  # with   (x^4)
     [0.77, 0.25, 0.10],  # one    (x^5)
     [0.05, 0.80, 0.55]], # step   (x^6)
    dtype=torch.float32
)



```

Expected output
`Context vector for 'journey': tensor([0.4419, 0.6515, 0.5683])`

In [16]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your   (x^1)
     [0.55, 0.87, 0.66],  # journey (x^2)
     [0.57, 0.85, 0.64],  # starts (x^3)
     [0.22, 0.58, 0.33],  # with   (x^4)
     [0.77, 0.25, 0.10],  # one    (x^5)
     [0.05, 0.80, 0.55]], # step   (x^6)
    dtype=torch.float32
)

x_2 = inputs[1]
unnormalized_attn_scores = x_2 @ (inputs).transpose(0,1)
normalized_attn_scores = torch.softmax(unnormalized_attn_scores, dim=0) #column)
context_vector = normalized_attn_scores @ inputs
print("Context for journey: ", context_vector)

#even though mathematically represented differently, operations for keys and queries are all very similiar.

Context for journey:  tensor([0.4419, 0.6515, 0.5683])


# Self Attention with Trainable Projections

Given the input tensor, return the all context vectors

Expected output:

```python
tensor([[0.2996, 0.8053],
[0.3061, 0.8210],
[0.3058, 0.8203],
[0.2948, 0.7939],
[0.2927, 0.7891],
[0.2990, 0.8040]], grad_fn=<MmBackward0>)
```

In [25]:
import torch
import torch.nn as nn
import math

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your   (x^1)
     [0.55, 0.87, 0.66],  # journey (x^2)
     [0.57, 0.85, 0.64],  # starts (x^3)
     [0.22, 0.58, 0.33],  # with   (x^4)
     [0.77, 0.25, 0.10],  # one    (x^5)
     [0.05, 0.80, 0.55]], # step   (x^6)
    dtype=torch.float32
)

#for randomness at some point
torch.manual_seed(123)
#d_in represents to how much elements per vector in inputs
#d_out is design choice .putting in a pin, acccepting fo rnow
d_in, d_out = inputs.shape[1], 2

#initializing random weights
W_query = nn.Parameter(torch.rand(d_in, d_out))
W_key = nn.Parameter(torch.rand(d_in, d_out))
W_value = nn.Parameter(torch.rand(d_in,d_out))


#calculating values
queries = inputs @ W_query
keys= inputs @ W_key
values = inputs @ W_value

'''
notice by this stage we have stuff in the space of the answer. but
it's important to know, as a question im going to ask,
why exactly the shape evolves the way it does.
'''

unnormalized_attn_scores = queries @ keys.transpose(0,1)
normalized_attn_scores = torch.softmax((unnormalized_attn_scores) / math.sqrt(keys.shape[-1]), dim=-1) #dim -1 is applied whenever we are
#calculating entire tensors of information. dim=0 was only because we were only dealing with 1
context_vectors = normalized_attn_scores @ values
print(context_vectors, context_vectors.shape)







tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>) torch.Size([6, 2])


# Q3: Casual Attention with Masking Tokens

Expected Output:
```python
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
[0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
[0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
[0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
[0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
grad_fn=<DivBackward0>)
```

In [36]:
import torch.nn as nn
import torch
import math

torch.manual_seed(123)

context_length = unnormalized_attn_scores.shape[0] #accept this to be true, it makes snse it has to be the same size anyway
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1) #just creates an empty vector that is the mask
masked = unnormalized_attn_scores.masked_fill(mask.bool(), -torch.inf)
normalized_attn_scores = torch.softmax(masked / keys.shape[-1]**0.5, dim=1)
print(normalized_attn_scores)


#accept most of this as truth, it's how to implement it


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3986, 0.6014, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2526, 0.3791, 0.3683, 0.0000, 0.0000, 0.0000],
        [0.2265, 0.2839, 0.2794, 0.2103, 0.0000, 0.0000],
        [0.1952, 0.2363, 0.2331, 0.1820, 0.1534, 0.0000],
        [0.1557, 0.2092, 0.2048, 0.1419, 0.1089, 0.1794]],
       grad_fn=<SoftmaxBackward0>)

In [69]:
#imports=========================
import torch
import torch.nn as nn
#imports=========================

class CasualAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_keys = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_values = nn.Linear(d_in, d_out, bias=qkv_bias)
        # also need to initialize mask and dropout
        # Corrected: Initialize dropout as an nn.Dropout module
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
        self.context_length = context_length
    def forward(self, x):
        b, num_tokens, d_in = x.shape
        queries, keys, values = self.W_query(x), self.W_keys(x), self.W_values(x)
        unnormalized_attn_scores = queries @ (keys.transpose(1,2))

        #mas/dropout phase
        unnormalized_attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        normalized_attn_scores = torch.softmax(unnormalized_attn_scores / keys.shape[-1]**0.5, dim=1)
        normalized_attn_scores = self.dropout(normalized_attn_scores)

        #return context vector

        return normalized_attn_scores @ values


#=========================
'''
file: SelfAttentionv1.py
purpose: Initiate a class to implement self attention mechanisms.
'''
#=========================



class SelfAttention(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        '''
        nn.Linear asserts the weight values, but also can be used to calcluate the queries
        values
        '''
    def forward(self, x):# where x is the tensor
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        unnormalized_attn_scores = queries @ keys.transpose(0,1)
        normalized_attn_scores = torch.softmax((unnormalized_attn_scores) / (keys.shape[-1])**(.5), dim=-1)
        context_vector = normalized_attn_scores @ values

        return context_vector

In [72]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your   (x^1)
     [0.55, 0.87, 0.66],  # journey (x^2)
     [0.57, 0.85, 0.64],  # starts (x^3)
     [0.22, 0.58, 0.33],  # with   (x^4)
     [0.77, 0.25, 0.10],  # one    (x^5)
     [0.05, 0.80, 0.55]], # step   (x^6)
    dtype=torch.float32
)

torch.manual_seed(123)
batch = torch.stack((inputs, inputs), dim=0)
# Corrected: access the integer value from inputs[0].shape
d_in, d_out, context_length = inputs[0].shape[0], 2, batch.shape[1]
ca_mech = CasualAttention(d_in, d_out, context_length, 0.0)
context_batch = ca_mech.forward(batch)
print(context_batch)


sa_mech = SelfAttention(d_in, d_out)
context_tensor = sa_mech.forward(inputs)
print(context_tensor)

tensor([[[-0.0844,  0.0414],
         [-0.2264, -0.0039],
         [-0.4163, -0.0564],
         [-0.5014, -0.1011],
         [-0.7754, -0.1867],
         [-1.1632, -0.3303]],

        [[-0.0844,  0.0414],
         [-0.2264, -0.0039],
         [-0.4163, -0.0564],
         [-0.5014, -0.1011],
         [-0.7754, -0.1867],
         [-1.1632, -0.3303]]], grad_fn=<UnsafeViewBackward0>)
tensor([[0.5085, 0.3508],
        [0.5084, 0.3508],
        [0.5084, 0.3506],
        [0.5074, 0.3471],
        [0.5076, 0.3446],
        [0.5077, 0.3493]], grad_fn=<MmBackward0>)
