In [1]:
# We will built a character-level Generative Pre-Trained Transformer (GPT) network.
# The network should generate infinite shakespeare :P
# We do this, similar to nevermore, by taking in a context of characters and predicting the next character (or "token")
# Therefore, by learning the idiosyncracies of human speech, in this case of all Shakespeare's works, we will learn which tokens are likely to follow for any given context based on probabilities

In [2]:
with open("tiny-shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read() 

In [3]:
print(f"Length of dataset in characters: {len(text)}")

Length of dataset in characters: 1115394


In [4]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
chars = sorted(set(text))
vocab_size = len(chars)

In [6]:
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s,i in stoi.items()}
encode = lambda s: [stoi[character] for character in s] # encoder: take in a string and return list of integers
decode = lambda l: "".join([itos[i] for i in l]) # decoder: take in a list of integers and return a string

print(stoi["B"])
print(itos[14])
print(encode("B"))
print(decode(encode("B")))

print("hello there")
print(encode("hello there"))
print(decode(encode("hello there")))

14
B
[14]
B
hello there
[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


In [8]:
# so we start by encoding the entire dataset into tensors of tokens using pyTorch
import torch
data = torch.tensor(encode(text))
print(data.shape, data.dtype)
print(data[:1000]) # this is what the first 1000 characters of our dataset will look like to the GPT

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [9]:
# Split into train and validation data
train_idx = int(0.9 * len(data)) # first 90% train; remaining will be used as validation
train_data = data[:train_idx]
val_data = data[train_idx:]

In [10]:
block_size = 8 # sometimes referred to as context_size
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# Quick example to highlight the parallel training of transformers
x = train_data[:block_size+1] # Why +1 to block_size? So that the full block/context can be used to predict as well by including the token that would come after the provided context
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    # target = x[t+1] same as target = y[t]
    target = y[t]
    print(f"When context is {context}, target is {target}")


# Why are transformers so cool? As we can see, with a context or block size of 8, we will train 8 sequences in parallel!
# Transformers are not optimizing sequentially (as RNNs are), but compute in parallel!
# This is what allows for the extreme scaling of context size and therefore the explosion of training sizes, i.e. the entire web
# But besides efficiency, this is also amazing since it automatically allows the transformer to see all possible context windows, i.e. very small sequences and really large blocks of text
# So the transformer automatically learns to predict the next token for many different circumstances
    
# If we use a context / block size of 8, we do 8 computations in parallel
# In order for this to work, we need to include the block_size + 1nth token as well
# So that the full context, in this case 8 tokens, can be used as an individual example as well
# were we to ommit the +1, we could only train on context up to the len(context) - 1nth element.

When context is tensor([18]), target is 47
When context is tensor([18, 47]), target is 56
When context is tensor([18, 47, 56]), target is 57
When context is tensor([18, 47, 56, 57]), target is 58
When context is tensor([18, 47, 56, 57, 58]), target is 1
When context is tensor([18, 47, 56, 57, 58,  1]), target is 15
When context is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
When context is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [12]:
# Why utilize batches?
# To put it simply: To keep the GPUs busy.
# GPUs are very good at parallel tasks; therefore, we want to divide our training set into independent chunks and run them in parallel on different areas of the GPUs.

torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # What is the maximum context length for predictions?

def get_batch(split):
    data = train_data if split =="train" else val_data
    # IMPORTANT NOTE: w/o -block_size, we would run into potential errors if we sample a random integer that is less than block_size steps away from our max index! Take care.
    # This also enables our block_size to always be guaranteed a target value. Think about it.
    ix = torch.randint(len(data) - block_size, (batch_size,)) # generate random starting point indices for a sequence
    x = torch.stack([data[i:i+block_size] for i in ix]) # extract the sequence from its random starting point
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # extract the sequence of targets by one-offsetting

    return x, y

xb, yb = get_batch("train") # b for batch
print("inputs:")
print(xb.shape)
print(xb)
# Important:
print(f"This {xb.shape} array contains a total of {xb.shape[0] * xb.shape[1]} completely independent examples.")
print(f"Since each row has {xb.shape[1]} samples and we provide {xb.shape[0]} rows per training batch.")
print(" "*10)
print("outputs:")
print(yb.shape)
print(yb)

print("-"*50)

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()}, target is {target}")
    print("*"*30)




inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
This torch.Size([4, 8]) array contains a total of 32 completely independent examples.
Since each row has 8 samples and we provide 4 rows per training batch.
          
outputs:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
--------------------------------------------------
when input is [24], target is 43
when input is [24, 43], target is 58
when input is [24, 43, 58], target is 5
when input is [24, 43, 58, 5], target is 57
when input is [24, 43, 58, 5, 57], target is 1
when input is [24, 43, 58, 5, 57, 1], target is 46
when input is [24, 43, 58, 5, 57, 1, 46], target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43], target is 39
**********************

# Start with the simplest language model - the Bigram Model

In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt

torch.manual_seed(1337);

In [14]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads of the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensors of integers
        logits = self.token_embedding_table(idx) # (B, T, C) <-- Batch, Time, Channel

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # instead of 4x8x65, we stretch out the array to 32x65
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array
        for _ in range(max_new_tokens):
            # get the prediction
            logits, loss = self(idx)
            # focus only on the last step
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # take next most likely character by sampling from distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) 
            # append sampled index to the running sequence
            idx = torch.concat((idx, idx_next), dim=1) # (B, T+1)

        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [15]:
# create pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
# optimizer = torch.optim.SGD(m.parameters(), lr=1e-3) # we previously only used Stochastic Gradient Descent

In [16]:
batch_size=32
for steps in range(1000):

    # sample batch from data
    xb, yb = get_batch(train_data)

    # evaluate the loss
    logits, loss = m.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

3.672869920730591


In [17]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))


Wh;;SPif uslI'Tnc
kwgOj$dhPWr,SV?hsusiKpgXXUh;Apmem sunESX&GT;TrJgkiF-oKbXCAA -botrngFCHOptto$

pn$w-gHoi?wtd!
wLU-IfSK'bAw :M.ZtOptXEQcL?hfaofqbPd?O:PnQQJMap$aypupIBfGJsZaI'ottllo..k$W$Akp?yl?ajKlzY!lx&QQLW? t,bXFkyhl-dmVsHeckhRl,jSClgjuk:3Iv
?OqlrV;!Plxfzgy;;
'mRjuBQ&xk!$
h
SiruDJgKuDny,S$ERf.?GSV-ivvKcOvi-nQGX&q-YQbm dEM?px;Akr-IENAc-wIWcd
RFgXTpDUgM:CH.D&uo'IBT -
j?wfy fFr.&fiqtRS.ZttxGh KG'd!rn$zoZqbocL&yIffBDWNGbo,Se,
o.Fls,?,M?eZxHx,j?EV.mJiHqHnfF-wbQpa;P fawiF$-QbWv&f:CVDCBfano,b?$Esev.?


# The mathematical trick in self-attention

In [18]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn((B,T,C))
x.shape

torch.Size([4, 8, 2])

In [19]:
xbow = torch.zeros((B,T,C)) # bow = bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C) # remember that python indexing works UP TO! so in the case of [b, :2], we extract the 0th and 1st elements, while [b, :1] would only extract UP TO the first element, so only the 0th element
        xbow[b, t] = torch.mean(xprev, 0) # calculate average of previous elements in current batch

In [20]:
xbow.shape

torch.Size([4, 8, 2])

In [21]:
torch.manual_seed(42)
a = torch.ones((3,3))
b = torch.randint(0, 10, (3,2)).float()
c = a @ b

print("a=")
print(a)
print("b=")
print(b)
print("c=")
print(c)

# Important learning/reminder: Matrix Multiplication using a Matrix of Ones is just summation!!!

a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


In [22]:
# now, what would happen, if we were to use a triangular matrix
# A square matrix is called lower triangular if all the entries above the main diagonal are zero
torch.manual_seed(42)
a = torch.tril(torch.ones((3,3)))
b = torch.randint(0, 10, (3,2)).float()
c = a @ b

print("a=")
print(a)
print("b=")
print(b)
print("c=")
print(c)

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [23]:
# and what would happen, were we to normalize across the rows of our triangular matrix of 1s?
# we enable efficient average calculation using matrix multiplication
torch.manual_seed(42)
a = torch.tril(torch.ones((3,3)))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b

print("a=")
print(a)
print("b=")
print(b)
print("c=")
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [24]:
mask = torch.tril(torch.ones(3,3)) # initialize seperate lower triangular mask
print("mask=")
print(mask)

a = torch.zeros((3,3)) # this is the weight matrix
print("a before masking=")
print(a)
a = a.masked_fill(mask == 0, float("-inf"))
print("a after masking=")
print(a)
a = F.softmax(a, dim=-1)
print("a after softmax=")
print(a)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b



print("b=")
print(b)
print("c=")
print(c)

mask=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
a before masking=
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])
a after masking=
tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])
a after softmax=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[0., 4.],
        [0., 3.],
        [8., 4.]])
c=
tensor([[0.0000, 4.0000],
        [0.0000, 3.5000],
        [2.6667, 3.6667]])


In [25]:
# version 3: Using softmax

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T)) # weights
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow = wei @ x

In [30]:
torch.manual_seed(1337)
# We have 4 batches with 8 tokens each and each token has 32 channels, i.e. is encoded by 32 features
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn((B,T,C))

# simple average (weighted sum) of all past tokens and the current token. Previous information and current information are fused together using a simple average
tril = torch.tril(torch.ones(T,T)) # initialize seperate lower triangular mask
wei = torch.zeros((T,T)) # initializing these affinities / weights as 0 leads to the simply average (since all values are uniform)
wei = wei.masked_fill(tril == 0, float("-inf")) # fill with -inf
wei = F.softmax(wei, dim=-1)
out = wei @ x

out.shape

torch.Size([4, 8, 32])

In [None]:
# let's implement actual data-dependent affinities ugin so-called Heads
# so that we can move on from a simple average

In [31]:
torch.manual_seed(1337)
# We have 4 batches with 8 tokens each and each token has 32 channels, i.e. is encoded by 32 features
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn((B,T,C))

# simple average (weighted sum) of all past tokens and the current token. Previous information and current information are fused together using a simple average
head_size = 16
key = nn.Linear(C, head_size, bias=False).to(x.device)
query = nn.Linear(C, head_size, bias=False).to(x.device)
k = key(x)      # (B, T, 16)
q = query(x)    # (B, T, 16)
# --------- up to this point, no communication between the tokens has happened!!!
# The communication comes now
wei = q @ k.transpose(-2,-1)    # (B,T,16) @ (B,16,T) -> (B,T,T) --> square matrix of affinities

tril = torch.tril(torch.ones(T,T)) # initialize seperate lower triangular mask
# wei = torch.zeros((T,T)) # initializing these affinities / weights as 0 leads to the simply average (since all values are uniform)
wei = wei.masked_fill(tril == 0, float("-inf")) # fill with -inf
wei = F.softmax(wei, dim=-1)
out = wei @ x

out.shape

torch.Size([4, 8, 32])

In [33]:
# now, every single batch will have individualized / data-dependent weights based on its tokens and their affinities to each other
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [34]:
wei[0]
# let's investigate this.
# below, we see the weights for one batch with 8 tokens.
# How to read this?
# In the first row, we investigate the first token.
# And the first token is only allowed to know about past tokens and itself - so in this first case, only itself.
# Therefore, it aggregates information from all previous tokens and itself. And of course, with only one token, its affinity is 1 (since it knows everything).
# In the second row, we investigate the affinities for the second token. It has a low affinity to the first token and a high affinity to itself.
# In the third row, we see that the third token has a somewhat equal affinity to the first and second token and the highest affinity to itself.
# Moving along, for the eigth row, we can see that the eigth token has a high affinity to the fourth and seventh token as well as to itself.


# NOW FOR SOME INTUITION, WHAT IS HAPPENING HERE?
# If I am a token, I know about myself / my own content as well as my position in the sequence.
# Therefore, I will embed this information into my own embedding as Key (K) and use it to compute affinities with other tokens.
# So for example, if I am the eigth token in a sequence, and I am a vowel, I might look for consonants in the first four positions (that is my Query or Q).
# So if, in the first four positions, there exists a consonant, then it in turn will have encoded this information (I am a consonant in position idx = 3) into its Key (K).
# And by using the dot product matrix multiplication, I will match Q of the eight token with K from the third token and since the key has what I am looking for in my Query, it will produce a high affinity (in the below case 0.2297).

# And to come back to the reason for maskin: It simply limits my query to keys that have come before me since I am not allowed to loook into the future.

# So in short, the softmax normalized weight matrix below tells us how much information to aggregate from each individual prior token.
# Tokens with high affinity will contribute proportionally more to the aggregated result than tokens with low affinity.

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [36]:
# last step: include Value V besides Query Q and Key K

torch.manual_seed(1337)
# We have 4 batches with 8 tokens each and each token has 32 channels, i.e. is encoded by 32 features
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn((B,T,C))

# simple average (weighted sum) of all past tokens and the current token. Previous information and current information are fused together using a simple average
head_size = 16
key = nn.Linear(C, head_size, bias=False).to(x.device)
query = nn.Linear(C, head_size, bias=False).to(x.device)
value  = nn.Linear(C, head_size, bias=False).to(x.device)

k = key(x)      # (B, T, 16)
q = query(x)    # (B, T, 16)
# --------- up to this point, no communication between the tokens has happened!!!
# The communication comes now
wei = q @ k.transpose(-2,-1)    # (B,T,16) @ (B,16,T) -> (B,T,T) --> square matrix of affinities

tril = torch.tril(torch.ones(T,T)) # initialize seperate lower triangular mask
wei = wei.masked_fill(tril == 0, float("-inf"))  / head_size**0.5# fill with -inf
wei = F.softmax(wei, dim=-1)
v = value(x)     # (B, T, 16)
out = wei @ v # THIS IS NEW

out.shape

# Think about x as the PRIVATE information of a token.
# And v as the PUBLIC information.
# Therefore, we have the following:
# Query Q is what I am looking for (i.e. a consonant in positions 0 to 3)
# Key K is what I am offering (i.e. I am a vowel in position idx = 7)
# And v is the information that I contribute (i.e. I am the character "t")

torch.Size([4, 8, 16])

In [None]:
# Important note: Masking is only necessary in a so-called decoder block (when we are trying to decode language and generate new tokens)
# If, for example, we are interested in sentiment analysis of a sequence, we want the relations between all tokens in the sequence, i.e. forward and backward looking. In this case, we would use what is called an encoder block. And the only difference is, that we omit the masking part.

In [None]:
# Self-attention is called SELF-attention because the Q, K and Vs all come from the same source (in this case x)
# In Encoder-Decoder Structures, cross-attention is used since we have seperate sources that we want to pull and combine data from.
# In Cross-attention, the encoder produces Q, K and V values and the decoder produces Q, K and V values and those can match up with each other.