In [1]:
# Download the tiny Shakespeare dataset to make
# different trials with it in an easy way
# !pip install wget
# !python -m wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o tiny_shakespeare.txt

In [2]:
# Read the whole corpus into memory as one string. The encoding is given
# explicitly so the result does not depend on the platform default.
with open('tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
# first 1000 characters:
print(text[: 1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# The vocabulary is every distinct character in the corpus, kept in sorted
# order so the character <-> index mapping is deterministic.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)

vocab_preview = ''.join(chars)
print(vocab_preview)
print(f"Number of unique characters: {vocab_size}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Number of unique characters: 65


## Tokenize
#### Convert the raw text (a string) into a sequence of integers according to a vocabulary of possible elements

In [6]:
# Character-level tokenizer: two lookup tables built from the sorted vocabulary

# String to integer
s2i = dict(zip(chars, range(len(chars))))

# Integer to string
i2s = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError if ``s`` contains a character that is not in ``s2i``.
    """
    return [s2i[ch] for ch in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises KeyError if ``l`` contains an integer that is not in ``i2s``.
    """
    return "".join(i2s[token] for token in l)


# Round trip: encoding and then decoding must give back the original phrase
example_phrase = "El ingenioso hidalgo don Quijote de la Mancha"
example_ids = encode(example_phrase)
print(example_ids)
print(decode(example_ids))

[17, 50, 1, 47, 52, 45, 43, 52, 47, 53, 57, 53, 1, 46, 47, 42, 39, 50, 45, 53, 1, 42, 53, 52, 1, 29, 59, 47, 48, 53, 58, 43, 1, 42, 43, 1, 50, 39, 1, 25, 39, 52, 41, 46, 39]
El ingenioso hidalgo don Quijote de la Mancha


In [7]:
# Tokenize the full Tiny Shakespeare corpus and keep it as a tensor of
# integer token ids (torch.long)
import torch

corpus_ids = encode(text)
data = torch.tensor(corpus_ids, dtype=torch.long)
print(data.shape, data.dtype)
print("\n\nFirst 1000 characters tokenized:\n")
first_tokens = data[:1000]
print(first_tokens)

torch.Size([1115394]) torch.int64


First 1000 characters tokenized:

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47

In [8]:
# Hold out the tail of the token stream for validation:
# first 90% -> training, last 10% -> validation
n = int(0.9*len(data))

train_data, val_data = data[:n], data[n:]

## Data Loading

We have to choose a *block size*: the number of consecutive tokens in each chunk that is fed together into the model.

When we sample a chunk of data this way, we have multiple samples packed into it, since all of the characters follow each other.

So, we will train the transformer to make predictions *at every one of these positions*.

If we select a block size of value 8, for instance, we will have a chunk of 9 characters, which will contain 8 individual examples packed in there.

For instance, in the example above: in the context of [18], 47 comes next; in the context of [18, 47], 56 comes next, and so on. That is why we have to insert a +1 in order to have the chosen number of examples in a given block.



In [9]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# x holds the inputs and y the same window shifted one token to the right,
# so y[t] is the token that follows the context x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"When input is {context}, the target is: {target}")

When input is tensor([18]), the target is: 47
When input is tensor([18, 47]), the target is: 56
When input is tensor([18, 47, 56]), the target is: 57
When input is tensor([18, 47, 56, 57]), the target is: 58
When input is tensor([18, 47, 56, 57, 58]), the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [11]:
# Same demonstration as the previous cell, explained in a different way.
# The previous cell builds a separate y that is 'one step ahead of x':
# reading index i of x and index i of y gives a token and the token that
# follows it in the dataset.

# Here a single window of block_size+1 tokens plays both roles: the
# context runs from the beginning up to and including position t
# (that is why [:t+1]), and the target is the element at index t+1.

x = train_data[:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], x[t+1]
    print(f"When input is {context}, the target is: {target}")

When input is tensor([18]), the target is: 47
When input is tensor([18, 47]), the target is: 56
When input is tensor([18, 47, 56]), the target is: 57
When input is tensor([18, 47, 56, 57]), the target is: 58
When input is tensor([18, 47, 56, 57, 58]), the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


This way, the transformer will see all the contexts from as little as one to all the way to the chosen block size. And we would like the transformer to be used to seeing everything in between.

That will be useful later, when doing inference, because *while we are sampling we can start the sampling generation* with as little as one character of context: the Transformer knows how to predict the next character for every context length, from a single token all the way up to block size.

So, it can predict with any context up to block size; beyond block size we have to start truncating the context, because the transformer never receives more than block size tokens when it is predicting the next character.

As we sample these chunks of text, every time we feed them into a transformer we are going to have batches of multiple chunks of text, all stacked up in a single tensor.

In [12]:
# Reproducibility
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    Reads the module-level ``train_data`` / ``val_data``, ``block_size``
    and ``batch_size``. ``split == 'train'`` selects ``train_data``; any
    other value selects ``val_data``.

    Returns a pair ``(x, y)`` with one row per sampled offset. Each row of
    ``y`` is the window of the matching row of ``x`` shifted one position
    to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # The exclusive upper bound keeps every window, including the one extra
    # token needed by the shifted targets, inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')

# Show both halves of the batch: heading, shape, then the tensor itself
for heading, batch in (('inputs:', xb), ('\ntargets:', yb)):
    print(heading)
    print(batch.shape)
    print(batch)

print('\n----')

# Spell out every (context -> target) training example packed in the batch
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"When input is {context.tolist()} the target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

----
When input is [24] the target is: 43
When input is [24, 43] the target is: 58
When input is [24, 43, 58] the target is: 5
When input is [24, 43, 58, 5] the target is: 57
When input is [24, 43, 58, 5, 57] the target is: 1
When input is [24, 43, 58, 5, 57, 1] the target is: 46
When input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
When input is [44] the target is: 53
When input is [44, 53] the target is: 56
When input is [44, 53, 56] the target is: 1
When input is [44, 53, 56, 1] the target is: 58
When input is [44, 53, 56, 1, 58

## Bigram Language Model

A Bigram model is a language model that estimates the probability of a sequence of words by predicting each word from only the word that immediately precedes it, i.e. from the probability that the word “b” occurs right after the word “a”.

For eg: Let us take the sentence “I have a car”

probability of the sequence of these words will be

p(I) * p(have | I) * p(a | I, have) * p(car | I, have, a)

In the Bigram model, we assume that each word depends only on the previous one, so the above expression is approximated by

p(I) * p(have|I) * p(a|have) * p(car|a)

In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token are looked up from the
    current token alone."""

    def __init__(self, vocab_size):
        super().__init__()
        # A vocab_size x vocab_size table: looking up a token returns the
        # logits for the token that follows it
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        """Score every position and compute the cross-entropy loss.

        idx and targets are both (B, T) tensors of integers.
        Returns ``(logits, loss)`` where logits has been flattened to
        (B*T, C) and loss is the cross entropy between logits and targets.
        """
        # Each position is predicted from the identity of its own token
        # only: the tokens are not talking to each other yet, so no
        # position sees any context besides itself.
        per_position = self.token_embedding_table(idx) # (B, T, C)
        n_batch, n_time, n_channels = per_position.shape

        # The functional cross entropy wants the channel dimension C in the
        # second position, i.e. (B, C, T). Instead of permuting, stretch
        # the Batch and Time dimensions into one, which gives (B*T, C)
        # logits against (B*T) targets.
        n_rows = n_batch * n_time
        logits = per_position.view(n_rows, n_channels)
        flat_targets = targets.view(n_rows)
        loss = F.cross_entropy(logits, flat_targets)

        return logits, loss
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(f"Cross entropy loss: {loss}")

torch.Size([32, 65])
Cross entropy loss: 4.878634929656982


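Our vocab size is 65, so a model that predicted every character with equal probability would have a loss of about -ln(1/65)=4.17, but we are getting 4.879. What this is telling us is that *the initial predictions* are not perfectly diffuse (uniform): the randomly initialized logits already favour some characters over others, and since those preferences are wrong at this point, we are guessing worse than the uniform baseline.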

In [14]:
# Adding generation
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):
    """Bigram language model with sampling support.

    The logits for the next token are read from a lookup table indexed by
    the current token only.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C)
            and loss is the cross entropy between logits and targets.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy wants the class dimension second, so stretch
        # the Batch and Time dimensions into one.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling is never back-propagated; skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) array of indices in the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the given context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None)
            logits, _loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # logits becomes (B, C)
            # apply Softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(f"Cross entropy loss: {loss}")

torch.Size([32, 65])
Cross entropy loss: 4.878634929656982


In [15]:
# GENERATE FROM THE MODEL

# The starting context is a batch of just one sequence with just one time
# step: a (1, 1) tensor of integer type (torch.long) which contains a 0.
# THIS 0 IS HOW WE WILL KICK OFF THE GENERATION, BECAUSE AT INDEX 0 WE HAVE
# THE *NEW LINE CHARACTER \n*, so it is a reasonable character to begin
# the generation with.
idx = torch.zeros((1, 1), dtype=torch.long)

# Generate 100 tokens. generate() works on batches, and we passed in a
# batch with a single sequence, so that sequence is extracted with [0].
generated = m.generate(idx, max_new_tokens=100)

# What is left is the time dimension: a one-dimensional tensor of indices.
# It is converted to a plain Python list so it can be fed to our decode
# function, which turns those integers back into text.
sampled_ids = generated[0].tolist()
print(decode(sampled_ids))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


The generate function we have just created is more general than it needs to be for now: at every step we append the sampled token to the running context and feed the whole context back into the model. *But right now we are just using a simple Bigram Model*, so we are not using the entire context we are feeding it with; we are just using the previous token (the last time step).

But later we will work with this function using the whole context

## Training the Bigram Model

In [16]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [17]:
# get_batch reads the global batch_size, so this makes every batch sampled
# from here on hold 32 sequences
batch_size = 32
for steps in range(10000):

    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # sample a batch of data and evaluate the loss on it
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backpropagate and update the parameters
    loss.backward()
    optimizer.step()

# loss of the last training batch only
final_loss = loss.item()
print(final_loss)

2.5727508068084717


In [18]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scace, tridesar, wnl'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN t l-t E:
I hisgothers je are!-e!
QLYotouciullle'z,
Thitertho s?
NDan'spererfo cist ripl chys er orlese;
Yo jehof h hecere ek? wferommot mowo soaf yoit, ince his, t, f at. fal whetrimy bupof tor atha Bu!
JOutho f cimimave.
NEDUSt cir selle p wie wede
Ro n apenor f'Y tover witys an sh d w t e w!
CEOntiretoaveE IINpe, theck. cung.
ORIsthies hacin benqurd bll, d a r w wistatsowor ath
Fivet bloll ang a-I theeancu,
LINCI'T:
Sarry t I Ane sze t
LCKI thit,
n.
Faure ds ppplirn!
meftou ow pring, avewist th;
TENTEMETCI gienco, An he waro whiougou he s i

### The mathematical trick in self-attention

In [19]:
# consider the following toy example:

torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

We have up to eight tokens here in a batch and these eight tokens are currently not talking to each other. And our goal is to make them talk to each other, we want to couple them.

We want to couple them in a very specific way: for instance, the token at the fifth location should not communicate with the tokens in the sixth, seventh and eighth locations, because those are future tokens in the sequence. The token in the fifth location should only talk with the tokens in locations 1 to 4 (and with itself). So, *information only flows from previous context to the current timestep and we can not get any information from the future because we are about to try to predict the future*.

The easiest way for tokens to communicate is to just take an average of all the preceding elements. This is very simple, but we lose a lot of information about the location and value of each element.

We will make an average of all the previous tokens, and also at the current token.

In [29]:
# We want xbow[b, t] = mean_{i<=t} x[b, i]

# xbow = x bag of words
# The bag-of-words model represents a text as an
# unordered collection (or "bag") of words; averaging
# the previous tokens likewise discards their order.

xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t+1, C): positions 0..t, current one included
        xbow[b, t] = xprev.mean(dim=0)

In [30]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [31]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

We can see how the tokens are being averaged: the first row stays the same, since it is the average of a single element; the values in the second row are (0.1808 + (-0.3596))/2 = -0.0894 and (-0.0700 + (-0.9152))/2 = -0.4926, and so on.

This does what we want, but it is very inefficient. We can use *matrix multiplication* to do this much more efficiently.

In [32]:
# Multiplying by an all-ones matrix: every row of c is the sum of the rows of b
torch.manual_seed(42)
a = torch.ones((3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

# Print the three matrices, separated by '--'
for position, (label, matrix) in enumerate((('a=', a), ('b=', b), ('c=', c))):
    if position > 0:
        print('--')
    print(label)
    print(matrix)

a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


##### The number in the top left position of tensor C is the dot product of the first row of tensor A with the first column of tensor B, and since all the elements in A are ones, we get the sum of each column of B

### torch.tril()
PyTorch has a function called tril which returns the lower triangular portion of the given tensor. So, if we apply it to our A matrix, each row of C will ignore the rows of B that come after the current index, which gives us a trick for ignoring future tokens.

In [33]:
# Lower-triangular ones: row i of c is the sum of rows 0..i of b
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

# One print call; sep='\n' puts every item on its own line
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


Currently we are doing sums since the non-zero elements in matrix A are ones, but we can also do average just by normalizing the rows of matrix A.

In [34]:
# Lower-triangular ones with every row divided by its sum:
# row i of c is the mean of rows 0..i of b
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
row_totals = a.sum(dim=1, keepdim=True)
a = a / row_totals
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

# Print each matrix under its label; '--' goes between matrices only
for label, matrix, separator in (('a=', a, '--'), ('b=', b, '--'), ('c=', c, None)):
    print(label)
    print(matrix)
    if separator is not None:
        print(separator)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [35]:
# wei == weights
# Start from the lower-triangular matrix of ones and divide every row by
# its sum, so each row becomes a set of equal weights that add up to one.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [36]:
# (T, T) @ (B, T, C) ---> (B, T, C)
# The operands do not have the same number of dimensions, so torch
# broadcasts wei along a new leading Batch dimension and multiplies
# the same (T, T) matrix with each (T, C) slice of the batch in x.
xbow2 = torch.matmul(wei, x)
xbow2

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

We will now do this in another way, using the Softmax function:
We initialize our tensor with all zeros, and then we use *masked_fill*. If we pass *tril == 0* as the mask and *float('-inf')* as the fill value, all the elements where tril is equal to zero become minus infinity.

We have our tril matrix, which is a lower triangular matrix, and we have our wei matrix initialized with zeros. We look at where the tril matrix has zeros and insert minus infinities at those positions *in the wei matrix*. So the tril matrix works like a mask: it tells us which positions to change, but the changes themselves are made in the wei matrix.

In [43]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
future_positions = tril == 0  # True wherever tril holds a zero

# Start from all-zero scores and put -inf at every masked position
wei = torch.zeros((T, T)).masked_fill(future_positions, float('-inf'))
wei_before_softmax = wei.clone()  # copy kept so it can be inspected after softmax

# Row-wise softmax: the -inf entries become 0 and the rest share the weight
wei = F.softmax(wei, dim=-1)

xbow3 = torch.matmul(wei, x)

In [40]:
# xbow3 (softmax version) and xbow2 (matrix-multiplication version)
# are numerically identical
torch.allclose(xbow3, xbow2)

True

In [44]:
# Since we are doing a softmax on each line,
# e^(-inf) will be zero, so they will not count,
# and e^0=1

print(wei_before_softmax)
print(wei)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
