Step 1: Creating Tokens

In [None]:
with open("the-verdict.txt","r",encoding="utf-8") as f:   # "r" opens the file read-only
    raw_text = f.read()
print("total num of character:", len(raw_text))
print(raw_text[:99])   # print the first 99 characters of the file (the slice end index is exclusive)

total num of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
# split text to obtain a list of tokens
import re    # regular expressions -> split on white space or any other pattern

text = "hello, world. This, is a text."
result = re.split(r'(\s)',text) # \s matches one whitespace character; the capture group keeps each match in the result

print(result)

['hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'text.']


In [None]:
result = re.split(r'[,.]',text)  # split on "." and ","; without a capture group the delimiters themselves are discarded
print(result)
# remaining issues: the pieces keep their leading white space and a trailing empty string is produced

['hello', ' world', ' This', ' is a text', '']


In [None]:
result = [item for item in result if item.strip()]  # drop empty / whitespace-only items; the kept items are NOT stripped
print(result)
# keeping white space can be meaningful (e.g. Python code as a dataset); it is removed here only to save memory

['hello', ' world', ' This', ' is a text']


In [None]:
# Punctuation marks, the double dash and white space are all captured as delimiters,
# so each punctuation mark becomes its own token; whitespace-only pieces are dropped.
text = "hello, world. Is this-- a test?"
token_pattern = r'([,.:;?_!"()\']|--|\s)'
result = []
for piece in re.split(token_pattern, text):
    piece = piece.strip()
    if piece:
        result.append(piece)
print(result)

['hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [None]:
# convert the entire broad text to individual tokens
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [None]:
print(len(preprocessed))

4690


Step 2: Converting tokens to token id

In [None]:
# the vocabulary contains the unique tokens, in sorted order
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [None]:
# creating vocabulary (every token needs to be assign with token id)
vocab = {token:integer for integer, token in enumerate(all_words)}
# assign token to int values

In [None]:
for i, item in enumerate(vocab.items()):  # enumerate only supplies a running index so the preview can stop early
  print(item)
  if i >= 50:  # indices 0..50 -> prints the first 51 (token, id) pairs
    break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


encode method = sample text -> tokenized text -> token ids

decode method = token ids -> tokenized text -> sample text

In [None]:
class SimpletokenizerV1:
  """Minimal word-level tokenizer backed by a fixed vocabulary.

  encode: text -> token strings -> token ids
  decode: token ids -> token strings -> text

  Args:
    vocab: dict mapping every known token string to its integer id.
  """

  def __init__(self,vocab):
    self.str_to_int = vocab  # token -> id, used by encode
    self.int_to_str = {i:s for s, i in vocab.items()}  # id -> token, used by decode

  def encode(self, text):
    """Split ``text`` into tokens and return the list of their ids.

    Punctuation, the double dash and white space act as delimiters;
    whitespace-only pieces are dropped.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    return [self.str_to_int[s] for s in preprocessed]

  def decode(self, ids):
    """Turn token ids back into text.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    # Remove the space before punctuation. ";" is part of the set because
    # encode() splits on it too; otherwise "a; b" would decode to "a ; b".
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [None]:
# eg for encoder
tokenizer = SimpletokenizerV1(vocab)

text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
# eg for decoder
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [None]:
text = "Hello, Tea is good?"
# "Hello" does not occur in the dataset the vocabulary was built from, so encode()
# raises KeyError. The error is caught and displayed so that the demonstration
# does not stop a "Run All" execution of the notebook.
try:
    ids = tokenizer.encode(text)
    print(ids)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

In [None]:
# Adding special context tokens
# will modify tokenizer to handle unknown words., and implement a class SimpleTokenizerV2, to support new tokens
# V2 -> version 2

# two special tokens are appended to the end of the vocabulary: <|unk|> and <|endoftext|>
# if a word in the input is not in the vocabulary, it is given the id of <|unk|>
# <|endoftext|> is used when working with multiple independent text sources
# it is inserted after the 1st text ends, again after the 2nd text ends, and so on => this leads to more effective processing in llm

In [None]:
# Sorted unique corpus tokens followed by the two special tokens, which therefore get the two highest ids
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}

In [None]:
len(vocab.items())   #length increased by 2

1132

In [None]:
for i, item in enumerate(list(vocab.items())[-5:]):  #last 5 entries
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
# simple change in this version 2 tokenizer is adding unknown if word not present
class SimpletokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

  Args:
    vocab: dict mapping every known token string to its integer id. It is
      expected to contain the "<|unk|>" entry used for unknown tokens.
  """

  def __init__(self,vocab):
    self.str_to_int = vocab  # token -> id, used by encode
    self.int_to_str = {i:s for s, i in vocab.items()}  # id -> token, used by decode

  def encode(self, text):
    """Split ``text`` into tokens and return the list of their ids.

    Tokens that are not in the vocabulary are replaced by "<|unk|>".

    Raises:
      KeyError: if an unknown token is met and "<|unk|>" is not in the vocabulary.
    """
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    return [
        self.str_to_int[item if item in self.str_to_int else "<|unk|>"]
        for item in preprocessed
    ]

  def decode(self,ids):
    """Turn token ids back into text.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    # Remove the space before punctuation. ";" is part of the set because
    # encode() splits on it too; otherwise "a; b" would decode to "a ; b".
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [None]:
tokenizer = SimpletokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
tokenizer.encode(text)  # earlier we were getting error as hello was not present in the text

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [None]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

There are also some special tokens present-

[BOS] beginning of sequence: marks the start of a text, where a piece of context begins

[EOS] end of sequence: positioned at the end of a text; especially useful when concatenating multiple unrelated texts, similar to <|endoftext|>

[PAD] padding: ensures that all texts in a batch have the same length; shorter texts are padded up to the length of the longest text.

GPT just uses <|endoftext|> for simplicity

GPT doesn't use <|unk|> for unknown tokens; instead it uses a byte pair encoding (BPE) tokenizer, which breaks words down into subword units. E.g. the word "chased" can be broken down into ch, as, ed.

Byte Pair Encoding (BPE)

- comes under subword based tokenization

The BPE tokenizer covered in this section was used to train LLMs such as GPT-2, GPT-3, and the original model used in ChatGPT

In [None]:
# since implementing BPE can be relatively complicated, we'll use an Python open-source lib called tiktoken
# it implements BPE algo very efficiently based on source code in Rust
# tiktoken is a fast BPE tokenizer for use with OpenAI's models

!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m21.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
# `import importlib` alone does not guarantee that the `metadata` submodule is loaded,
# so the submodule is imported explicitly
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.8.0


In [None]:
# we can instantiate BPE tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")   # it is like the one we implemented earlier SimpleTokenizerV2

In [None]:
# NOTE(review): the two adjacent string literals are concatenated without a separating space,
# so the text actually contains "terracesof" — confirm whether a space after "terraces" was intended
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)
# the word-level tokenizer raised an error on unknown words such as "someunknownPlace" until <|unk|> was added;
# BPE splits unknown words into known subword units instead, so no error occurs here

# allowed_special lets "<|endoftext|>" be encoded as a single special token
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
# converting token ids back to text using decode method
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


observations-

<|endoftext|> token is assigned a relatively large token ID, namely 50256

In fact, the BPE tokenizer, which was used to train models such as GPT-2, GPT-3, and the original chatgpt, has a total vocab size of 50,257, with <|endoftext|> being assigned the largest token ID.



In [None]:
# eg of how bpe tokenizer deals with unknown tokens
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


Creating I/P-Target Pairs

Implement a data loader that fetches the i/p target pairs using a sliding window.

In [None]:
# Using BPE tokenizer
with open("the-verdict.txt","r",encoding="utf-8") as f:   # "r" opens the file read-only
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)  # encode the entire raw text into BPE token ids
print(len(enc_text))
# total number of tokens in the training text => 5145

5145


In [None]:
# removing first 50 tokens from dataset as it results in slightly more interesting text passage
enc_sample = enc_text[50:]

In [None]:
# one of the easiest & most intuitive way to create ip-target pairs for the next word prediction task is to
# create two variables x and y, where x contains i/p tokens and y contains targets

context_size = 4 # this can be any num,means model is trained to look sequence of 4 words
# [1,2,3,4] target y is next 4 [2,3,4,5]   for 1 -> 2, for 1,2 -> 3, for 1,2,3 -> 4, for 1,2,3,4 -> 5

x = enc_sample[:context_size]   #contains the token id of encoded dataset
y = enc_sample[1:context_size+1]  #then shift the x array by 1

print(f"x: {x}")
print(f"y:     {y}")
# if i/p -> 290 then o/p -> 4920, if i/p -> 290,4920 then o/p -> 2241 and likewise

x: [290, 4920, 2241, 287]
y:     [4920, 2241, 287, 257]


In [None]:
for i in range(1, context_size+1):   # i = 1 .. context_size (here 1 to 4)
  context = enc_sample[:i]   # the first i token ids form the context
  desired = enc_sample[i]    # the token id that follows the context is the target

  print(context, "----->", desired)

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [None]:
for i in range(1, context_size+1):   # i = 1 .. context_size (here 1 to 4)
  context = enc_sample[:i]   # the first i token ids form the context
  desired = enc_sample[i]    # the token id that follows the context is the target

  # same pairs as before, decoded back to text; decode expects a list, hence [desired]
  print(tokenizer.decode(context), "----->", tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


FOR efficient data loader implementation, we'll use Pytorch's built-in Dataset and DataLoader classes.

step 1: tokenize the entire text

step 2: use a sliding window to chunk the book into overlapping sequences of max_length

step 3: return total num of rows in the dataset

step 4: return a single row from the dataset


In [None]:
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token ids in steps of ``stride``; the target chunk is the input
    chunk shifted one position to the right.

    Args:
        txt: the text to tokenize.
        tokenizer: object whose ``encode(txt, allowed_special=...)`` returns the token ids.
        max_length: number of tokens per chunk.
        stride: number of positions the window advances between two chunks.

    Note:
        The chunks are stored as plain slices of the token ids (Python lists for
        a list-returning tokenizer), not as tensors.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # tokenize the whole text once
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # window start positions; the upper bound keeps room for the shifted target
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [encoded[start:start + max_length] for start in window_starts]
        self.target_ids = [encoded[start + 1:start + max_length + 1] for start in window_starts]

    def __len__(self):
        """Number of (input, target) rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input chunk, target chunk) pair."""
        row = (self.input_ids[idx], self.target_ids[idx])
        return row

In [None]:
# The DataLoader wraps the dataset and hands out mini-batches.
# batch_size  : number of (input, target) samples grouped into one batch
# stride      : number of token positions the sliding window advances between two consecutive samples
# num_workers : number of worker subprocesses used for data loading (0 = load in the main process)
def create_data_loader_v1(txt, tokenizer, batch_size=4, max_length=256,
                          stride=128, shuffle=True, drop_last=True,
                          num_workers=0):
    """Build a DataLoader over a sliding-window GPTDataset of ``txt``.

    Args:
        txt: the text to tokenize.
        tokenizer: tokenizer passed on to GPTDataset.
        batch_size: number of samples per batch.
        max_length: number of tokens per input/target chunk.
        stride: step of the sliding window.
        shuffle: whether the samples are reshuffled by the DataLoader.
        drop_last: drop the last batch if it is shorter than ``batch_size``
            (intended to avoid loss spikes during training).
        num_workers: number of data-loading worker subprocesses.

    Returns:
        The configured DataLoader.
    """
    return DataLoader(
        GPTDataset(txt, tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# now we'll test the dataloader with a batch size of 1 for an LLM with a context size of 4,
# This will develop an intuition of how the GPTDataset class and the create_data_loader_v1 func work together

with open("the-verdict.txt","r",encoding="utf-8") as f:   # "r" opens the file read-only
    raw_text = f.read()

In [None]:
import torch
import tiktoken
print("PyTorch version:", torch.__version__)

tokenizer = tiktoken.get_encoding("gpt2")

dataloader = create_data_loader_v1(
    raw_text, tokenizer, batch_size=1, max_length=4, stride=1,
    shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)

print(first_batch)
# i/p tensor and the o/p tensor
# since max_length is set to 4 each of the two tensors contains 4 token IDs

PyTorch version: 2.5.1+cu124
[[tensor([40]), tensor([367]), tensor([2885]), tensor([1464])], [tensor([367]), tensor([2885]), tensor([1464]), tensor([1807])]]


In [None]:
second_batch = next(data_iter)
print(second_batch)
# shifted by 1 place (sliding window approach)

[[tensor([367]), tensor([2885]), tensor([1464]), tensor([1807])], [tensor([2885]), tensor([1464]), tensor([1807]), tensor([3619])]]


In [None]:
dataloader = create_data_loader_v1(raw_text, tokenizer, batch_size=8, max_length=4, stride=1)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("inputs:", inputs)
print("targets:", targets)
# batch size is 8, so every tensor holds 8 values (one per sample in the batch)
# NOTE: stride=1 is passed here, so consecutive windows overlap; stride=max_length (4) would give non-overlapping windows
# shuffle is left at its default (True), so the batch shown differs from run to run

inputs: [tensor([   13,   621,  4808, 41379,   887,  6777,   326,    11]), tensor([  198,   611,  5562,   293,   345,    13, 11542,   290]), tensor([198, 314,  62, 373, 655, 632, 373, 287]), tensor([3347, 1549,  465,   11,  531,  373, 1813,  465])]
targets: [tensor([  198,   611,  5562,   293,   345,    13, 11542,   290]), tensor([198, 314,  62, 373, 655, 632, 373, 287]), tensor([3347, 1549,  465,   11,  531,  373, 1813,  465]), tensor([4376, 1239, 2106,  287,  438,  407,  502,  898])]


Toy demo of vector embedding

In [None]:
# import trained model
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-300")   # this model can take any word as i/p and convert it to vectors(300 dimensional vectors)



In [None]:
# example of a word as a vector
word_vectors = model

# vector embedding of a word look like
print(word_vectors['computer'])   # 300 dimensional vector of a comp

[-2.7628e-01  1.3999e-01  9.8519e-02 -6.4019e-01  3.1988e-02  1.0066e-01
 -1.8673e-01 -3.7129e-01  5.9740e-01 -2.0405e+00  2.2368e-01 -2.6314e-02
  7.2408e-01 -4.3829e-01  4.8886e-01 -3.5486e-03 -1.0006e-01 -3.0587e-01
 -1.5621e-01 -6.8136e-02  2.1104e-01  2.9287e-01 -8.8861e-02 -2.0462e-01
 -5.7602e-01  3.4526e-01  4.1390e-01  1.7917e-01  2.5143e-01 -2.2678e-01
 -1.0103e-01  1.4576e-01  2.0127e-01  3.1810e-01 -7.8907e-01 -2.2194e-01
 -2.4833e-01 -1.5103e-02 -2.0050e-01 -2.6441e-02  1.8551e-01  3.3782e-01
 -3.3543e-01  8.6117e-01 -4.7083e-02 -1.7009e-01  3.0438e-01  9.4119e-02
  3.2435e-01 -8.1171e-01  8.8966e-01 -3.9149e-01  1.6828e-01  1.4316e-01
  3.6339e-03 -6.4557e-02  4.5777e-02 -3.2248e-01  4.8943e-02  1.6817e-01
  6.8344e-02  5.4227e-01  1.2493e-01  6.9742e-01 -3.7194e-02  3.3080e-01
 -4.2194e-01  3.3970e-01  2.7646e-01 -1.6003e-02 -2.1827e-01  4.4535e-01
  3.5379e-01 -2.2089e-02  2.1375e-01  4.3267e-01 -3.2897e-01  9.6165e-02
  3.1265e-01 -3.0528e-01  2.6126e-01 -6.5364e-01 -7

In [None]:
print(word_vectors['cat'].shape)
# every word is encoded into 300 dimensional vector

(300,)


King + woman - man = ? (queen should be the ans)

In [None]:
# example of using most_similar
print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))

[('queen', 0.6713277101516724), ('princess', 0.5432624816894531), ('throne', 0.5386103987693787), ('monarch', 0.5347574949264526), ('daughter', 0.49802514910697937), ('mother', 0.49564430117607117), ('elizabeth', 0.4832652509212494), ('kingdom', 0.47747090458869934), ('prince', 0.4668239951133728), ('wife', 0.46473270654678345)]


In [None]:
# checking similarity b/w a few pair of words
# eg of calculating similarity
print(word_vectors.similarity('woman', 'man'))
print(word_vectors.similarity('king', 'queen'))
print(word_vectors.similarity('uncle', 'aunt'))
print(word_vectors.similarity('boy', 'girl'))
print(word_vectors.similarity('nephew', 'niece'))
print(word_vectors.similarity('paper', 'water'))

0.6998663
0.6336469
0.6660684
0.8272891
0.73385984
0.24615553


In [None]:
# most similar words
print(word_vectors.most_similar('tower', topn=5))

[('towers', 0.7919201850891113), ('skyscraper', 0.6111851930618286), ('building', 0.5957720279693604), ('spire', 0.5896912813186646), ('tallest', 0.5712063312530518)]


In [None]:
# let us see the vector similarity
import numpy as np
# words to compare
word1 = 'man'
word2 = 'woman'

word3 = 'semiconductor'
word4 = 'earthworm'

word5 = 'nephew'
word6 = 'niece'

# calculate the vector diff
vector_diff1 = model[word1] - model[word2]
vector_diff2 = model[word3] - model[word4]
vector_diff3 = model[word5] - model[word6]

# calculate magnitude of the vector diff
magnitude1 = np.linalg.norm(vector_diff1)
magnitude2 = np.linalg.norm(vector_diff2)
magnitude3 = np.linalg.norm(vector_diff3)

# print magnitude of diff
print(magnitude1)   # man & woman
print(magnitude2)   # semiconductor & earthworm
print(magnitude3)   # nephew & niece

4.7539396
10.218706
3.9361057


In [None]:
# creating token embeddings
input_ids = torch.tensor([2,3,5,1])  # representing everything as tensor

In [None]:
# let's take vocab size 6 and embeddings of size 3
# 3 cols & 6 rows
vocab_size = 6
output_dim = 3   # every token will be converted into vector of 3 dimensions

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
print(embedding_layer.weight)
# these are the initial weighs which needs to be optimized
# embedding is also called as simple look up table as we can see from below

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [None]:
print(embedding_layer(torch.tensor([3])))   # vector for id=3

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [None]:
print(embedding_layer(input_ids))
# row no. 3,4,6,2 as above i/p_ids we've specified

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


Positional embeddings (encoding word positions)

In [None]:
vocab_size = 50257  # no. of rows
output_dim = 256    # smaller than GPT-3  # no. of cols

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 4     # 4 i/p tokens will be used to predict the next word
dataloader = create_data_loader_v1(
    raw_text, tokenizer, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [None]:
print("token ids:\n", inputs)
print("\ninput shape:\n", [len(row) for row in inputs])  # length of each tensor in the list
# inputs is a list of 4 tensors (one per token position), each holding 8 ids (one per sample in the batch)

token ids:
 [tensor([   40,  1807, 10899, 15632,   922,   568,  1049,   284]), tensor([ 367, 3619, 2138,  438, 5891,  340, 5975, 3285]), tensor([2885,  402,  257, 2016, 1576,  373,  284,  326]), tensor([1464,  271, 7026,  257,  438,  645,  502,   11])]

input shape:
 [8, 8, 8, 8]


In [None]:
# Stack the list of 4 per-position tensors (each holding 8 ids) into one 2D long tensor of shape (4, 8)
input_tensor = torch.stack(inputs, dim=0).type(torch.long)  # Use torch.stack to combine tensors

# Look up a 256-dimensional embedding for every token id -> shape (4, 8, 256)
token_embeddings = embedding_layer(input_tensor)
print(token_embeddings.shape)

torch.Size([4, 8, 256])


In [None]:
context_length = max_length
pos_embeddings_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
pos_embeddings = pos_embeddings_layer(torch.arange(max_length))  # 0, 1, ... upto max i/p length -1
print(pos_embeddings.shape)

torch.Size([4, 256])


In [None]:
# token_embeddings has shape (4, 8, 256) here, i.e. (sequence_length, batch_size, embedding_dim),
# and pos_embeddings has shape (sequence_length, embedding_dim) which is (4, 256)
# pos_embeddings[:, None, :] inserts a batch axis -> (4, 1, 256), which is then expanded
# across the 8 batch entries to match the shape of token_embeddings

input_embeddings = token_embeddings + pos_embeddings[:, None, :].expand(token_embeddings.shape)
# every sample in the batch receives the same positional vector for a given position

print(input_embeddings.shape)

torch.Size([4, 8, 256])


Implemented a simplified attention mechanism

In [None]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # your
     [0.55, 0.87, 0.66],  # journey
     [0.57, 0.85, 0.64],  # starts
     [0.22, 0.58, 0.33],  # with
     [0.77, 0.25, 0.10],  # one
     [0.05, 0.80, 0.55]]  # step
)

We can use the dot product to measure the similarity between two vectors: when the angle between them is 0, cos 0 = 1, which means they are most similar (the higher the dot product, the more aligned the vectors are; a lower dot product means they are less aligned).

In [None]:
query = inputs[1]  # 0 based indexing, 2nd i/p token is the query

attn_scores_2 = torch.empty(inputs.shape[0])
for i,x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query) # dot product b/w every i/p and query vector

print(attn_scores_2)
# 2nd, 3rd and 6th value have the largest attention score

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


normalize the scores that we computed previously

main goal of normalization is to obtain attention weights that sum upto 1

In [None]:
attn_weights_2_tmp = attn_scores_2/attn_scores_2.sum()

print("Attention weights:", attn_weights_2_tmp)
print("sum:", attn_weights_2_tmp.sum())

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
sum: tensor(1.0000)


In [None]:
def softmax(x):
    """Naive softmax along dim 0: exponentiate, then divide by the sum of the exponentials.

    No max-subtraction is applied before exponentiating, so very large inputs can overflow.
    """
    exponentials = torch.exp(x)
    return exponentials / exponentials.sum(dim=0)

attn_weights_2_naive = softmax(attn_scores_2)

print("attention weights:", attn_weights_2_naive)
print("sum:", attn_weights_2_naive.sum())

attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum: tensor(1.)


In [None]:
query = inputs[1] # 2nd i/p token is the query

context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2_naive[i] * x_i

print(context_vec_2)

tensor([0.4419, 0.6515, 0.5683])


In [None]:
attn_scores = torch.empty(6,6)
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i,j] = torch.dot(x_i, x_j)

print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [None]:
# this one is same as upper one
attn_scores = inputs @ inputs.T      # i/p and its transpose
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [None]:
# normalization
attn_weights = torch.softmax(attn_scores, dim=-1)  # by setting dim=-1 we're normalizing the attn_scores tensor
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [None]:
row_2_sum = sum([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
print("row 2 sum:", row_2_sum)
print("all row sums:", attn_weights.sum(dim=-1))

row 2 sum: 1.0
all row sums: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [None]:
all_context_vecs = attn_weights @ inputs
print(all_context_vecs)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


In [None]:
print("previous 2nd context vector:",context_vec_2)

previous 2nd context vector: tensor([0.4419, 0.6515, 0.5683])


Query key and the value

In [None]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # your
     [0.55, 0.87, 0.66],  # journey
     [0.57, 0.85, 0.64],  # starts
     [0.22, 0.58, 0.33],  # with
     [0.77, 0.25, 0.10],  # one
     [0.05, 0.80, 0.55]]  # step
)

In [None]:
x_2 = inputs[1]  #A  #corresponds to journey
d_in = inputs.shape[1]  #B
d_out = 2  #C

# GPT-like models have same i/p o/p dimensions usually
# but for illustration purpose we choose d_in=3, d_out=2

In [None]:
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [None]:
print(W_query)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


In [None]:
print(W_key)

Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])


In [None]:
print(W_value)

Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [None]:
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value
print(query_2)   # for journey

tensor([0.4306, 1.4551])


In [None]:
keys = inputs @ W_key
queries = inputs @ W_query
values = inputs @ W_value
print("keys.shape:", keys.shape)
print("queries.shape:", queries.shape)
print("values.shape:", values.shape)
# 6 i/p tokens and for each i/p vector we've a 2D key vector, 2D query and value vector

keys.shape: torch.Size([6, 2])
queries.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])


In [None]:
keys_2 = keys[1]
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)

tensor(1.8524)


In [None]:
attn_scores_2 = query_2 @ keys.T   # dot product between the 2nd query and every row of the keys matrix
print(attn_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [None]:
attn_scores = queries @ keys.T
print(attn_scores)

tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


In [None]:
# attention weights: scale the scores by the square root of the key embedding dimension, then apply softmax
d_k = keys.shape[-1]   # -1 selects the last axis, i.e. the number of columns of keys
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)
print(d_k)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
2


In [None]:
import torch

# define the tensor
tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# apply softmax without scaling
softmax_result = torch.softmax(tensor, dim=-1)
# print the computed result; the bare name `softmax` is the helper function defined
# in an earlier cell, so printing it would show the function object instead
print("softmax without scaling:", softmax_result)

# multiply the tensor by 8 then apply softmax: larger inputs make the distribution much sharper
scaled_tensor = tensor * 8
softmax_scaled_result = torch.softmax(scaled_tensor, dim=-1)
print("softmax with scaling:", softmax_scaled_result)

softmax without scaling: <function softmax at 0x7ec75fbe67a0>
softmax with scaling: tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


In [None]:
# why sqrt
# to make variance of the dot product stable
# dot product of Q and K increases variance because multiplying two random nums increases the variance
# increase in variance grows with the dimension
# dividing by sqrt (dimension) keeps variance close to 1

In [None]:
import numpy as np

def compute_variance(dim, num_trials=1000):
    """Estimate the variance of q·k for random vectors, before and after scaling.

    For each trial two vectors of length ``dim`` are drawn from the standard
    normal distribution (q first, then k) and their dot product is recorded.

    Args:
        dim: dimension of the random vectors.
        num_trials: number of (q, k) pairs to sample.

    Returns:
        Tuple ``(variance, scaled_variance)``: the variance of the raw dot
        products and the variance of the dot products divided by ``sqrt(dim)``.
    """
    raw_products = np.empty(num_trials)

    for trial in range(num_trials):
        q = np.random.randn(dim)
        k = np.random.randn(dim)
        raw_products[trial] = np.dot(q, k)

    # dividing by sqrt(dim) is meant to keep the variance close to 1
    scaled_products = raw_products / np.sqrt(dim)

    return np.var(raw_products), np.var(scaled_products)

# compute_variance returns (variance of raw dot products, variance after dividing by sqrt(dim))

# for dimension 5
variance_before_5, variance_after_5 = compute_variance(5)
print(f"variance before scaling (dim=5): {variance_before_5}")
print(f"Variance after scaling (dim=5): {variance_after_5}")

# for dimension 20
variance_before_20, variance_after_20 = compute_variance(20)
print(f"variance before scaling (dim=20): {variance_before_20}")
print(f"Variance after scaling (dim=20): {variance_after_20}")

variance before scaling (dim=5): 4.687361557162383
Variance before scaling (dim=5): 0.9374723114324767
variance before scaling (dim=20): 20.464034999870393
Variance before scaling (dim=20): 1.0232017499935198


In [None]:
# for single context vectors
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

tensor([0.3061, 0.8210])


In [None]:
# implementing a compact self attention python class  // to compute all the context vectors
import torch.nn as nn

class selfattention_v1(nn.Module):
    """Single-head self-attention with hand-initialised projection matrices.

    The query, key and value projections are plain (d_in, d_out) parameter
    matrices filled with torch.rand. `forward` uses `keys.T`, so it is meant
    for one un-batched sequence of shape (num_tokens, d_in).
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Created in query, key, value order so that a fixed seed always
        # produces the same three matrices.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Return one context vector (row) per input token."""
        queries, keys, values = x @ self.W_query, x @ self.W_key, x @ self.W_value

        # Scale the scores by sqrt(d_out) before the softmax; each row of
        # the resulting weights sums to 1.
        scale = keys.shape[-1]**0.5
        scores = queries @ keys.T
        weights = torch.softmax(scores / scale, dim=-1)

        return weights @ values

In [None]:
torch.manual_seed(123)   # seed before construction: the weights are drawn with torch.rand
sa_v1 = selfattention_v1(d_in, d_out)   # d_in, d_out and inputs come from earlier cells
print(sa_v1(inputs))
# each row is the context vector of one input token: 1st row for the 1st token, and so on

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [None]:
# improving selfattention_v1 further by utilizing pytorch's nn.linear layers as it has optimized weight initialization scheme
import torch.nn as nn

class selfattention_v2(nn.Module):
    """Single-head self-attention built from nn.Linear projections.

    Same computation as selfattention_v1, but the query/key/value
    projections are nn.Linear layers (optionally with bias). `forward`
    uses `keys.T`, so it is meant for one un-batched sequence of shape
    (num_tokens, d_in).
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        # Created in query, key, value order so that a fixed seed always
        # produces the same initial weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        """Return one context vector (row) per input token."""
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # Scale the scores by sqrt(d_out) before the softmax.
        scale = keys.shape[-1]**0.5
        weights = torch.softmax((queries @ keys.T) / scale, dim=-1)

        return weights @ values

In [None]:
torch.manual_seed(789)   # seed before construction so the nn.Linear initialisation is reproducible
sa_v2 = selfattention_v2(d_in, d_out)
print(sa_v2(inputs))
# the values differ from sa_v1: different seed and nn.Linear initialises its weights differently from torch.rand

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


Hiding future words with causal attention

=> the main purpose of causal attention is to make sure a token is not influenced by any future token (tokens that come after it in the sequence)

In [None]:
import torch

# toy input: 6 tokens, each represented by a 3-dimensional embedding (one row per token)
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # your
     [0.55, 0.87, 0.66],  # journey
     [0.57, 0.85, 0.64],  # starts
     [0.22, 0.58, 0.33],  # with
     [0.77, 0.25, 0.10],  # one
     [0.05, 0.80, 0.55]]  # step
)

In [None]:
# reuse the query/key projections of sa_v2 to compute the unmasked attention weights
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
attn_scores = queries @ keys.T
attn_weights = torch.softmax(attn_scores/ keys.shape[-1]**0.5, dim=-1)   # scale by sqrt(d_out), then normalise each row
print(attn_weights)
# no mask yet: every token attends to all tokens, including the ones that come after it

tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


In [None]:
# generate a mask
# torch.triu => keeps the upper triangle, everything below it is 0
# torch.tril => keeps the lower triangle, everything above it is 0
context_length = attn_scores.shape[0]   # number of tokens (attn_scores is tokens x tokens)
mask_simple = torch.tril(torch.ones(context_length, context_length))   # 1 on/below the diagonal = positions a token may attend to
print(mask_simple)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [None]:
# zero out the attention weights above the diagonal (the future tokens)
masked_simple = attn_weights * mask_simple
print(masked_simple)
# after masking, the rows no longer sum to 1 (only the last row is untouched), so they have to be renormalised

tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.0000, 0.0000],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<MulBackward0>)


In [None]:
# renormalise: divide every row by its own sum so the remaining weights sum to 1 again
row_sums = masked_simple.sum(dim=1, keepdim=True)
masked_simple_norm = masked_simple/row_sums
print(masked_simple_norm)
# softmax was applied over all positions before masking, which can look like information from future tokens leaks in;
# however, the renormalised values are the same as the ones obtained from softmax over -inf-masked scores (later cell),
# so after this step the masked positions contribute nothing - masking before the softmax is simply the more direct way

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<DivBackward0>)


In [None]:
# cancel the influence of future tokens by setting their scores to -inf before the softmax,
# so that the softmax gives them a weight of exactly 0
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)   # 1 strictly above the diagonal = future positions
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)   # out-of-place: attn_scores itself is left unchanged
print(masked)

tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],
        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],
        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
       grad_fn=<MaskedFillBackward0>)


In [None]:
attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim=-1)   # note: overwrites the unmasked attn_weights from above
print(attn_weights)
# future positions are exactly 0 and every row sums up to 1

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


masking additional attention weights with dropout

In [None]:
# 6x6 matrix of ones: makes it easy to see what dropout does to individual entries
example = torch.ones(6,6)
print(example)

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])


In [None]:
# we'll use a dropout rate of 50%, i.e. on average half of the entries are zeroed
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5)    # surviving entries are scaled by 1/(1 - 0.5) = 2 to compensate for the dropped ones
example = torch.ones(6,6)
print(dropout(example))

tensor([[2., 2., 2., 2., 2., 2.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 0.]])


In [None]:
torch.manual_seed(123)
print(dropout(attn_weights))   # surviving weights are doubled, so the rows no longer sum to 1

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.8966, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6206, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4921, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4350, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3327, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


In [None]:
# simulate a batch by stacking the same input twice
batch = torch.stack((inputs,inputs),dim=0)
print(batch.shape)
# (batch_size, number of tokens, embedding dimension)

torch.Size([2, 6, 3])


In [None]:
class causalattention(nn.Module):
    """Single-head causal self-attention for batched input.

    Every token attends only to itself and to earlier tokens: the scores of
    later positions are set to -inf before the softmax. Dropout is applied
    to the attention weights.

    Args:
        d_in: embedding size of the input tokens.
        d_out: size of the query/key/value projections and of the output.
        context_length: size of the square causal mask that is stored; it is
            sliced down to the actual number of tokens in `forward`.
        dropout: dropout probability applied to the attention weights.
        qkv_bias: whether the projection layers carry a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        # Created in query, key, value order so that a fixed seed always
        # produces the same initial weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer: part of the module state, but not a
        # trainable parameter. Ones strictly above the diagonal mark the
        # future positions.
        future = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', future)

    def forward(self, x):
        """Map x of shape (batch, num_tokens, d_in) to (batch, num_tokens, d_out)."""
        _, num_tokens, _ = x.shape
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # (batch, tokens, d_out) @ (batch, d_out, tokens) -> (batch, tokens, tokens)
        scores = queries @ keys.transpose(1,2)
        # Crop the stored mask to the current sequence length, then hide the
        # future positions in place.
        causal = self.mask.bool()[:num_tokens, :num_tokens]
        scores.masked_fill_(causal, -torch.inf)

        weights = torch.softmax(scores / keys.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        return weights @ values

# the causal mask makes all attention weights above the diagonal zero, and the softmax makes every row sum up to 1 (before dropout is applied)

In [None]:
torch.manual_seed(123)
context_length = batch.shape[1]   # number of tokens per sequence
ca = causalattention(d_in, d_out, context_length, 0.0)   # 0.0 is the dropout rate, i.e. dropout is switched off
context_vec = ca(batch)
print("context_vecs.shape:",context_vec.shape)

context_vecs.shape: torch.Size([2, 6, 2])


In [None]:
print(context_vec)
# first block  -> context vectors of the 1st sequence in the batch
# second block -> context vectors of the 2nd sequence (identical, because both sequences are the same input)

tensor([[[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]],

        [[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]]], grad_fn=<UnsafeViewBackward0>)


Extending single head attention to multi-head attention

In [None]:
# we can achieve this by implementing a simple multihead attention wrapper class that stacks multiple instances of our previously implemented causal attention module
class MultiHeadAttentionWrapper(nn.Module):
    """Multi-head attention built by running several causalattention heads side by side.

    Each head is an independent causalattention module with its own
    weights. The outputs of all heads are concatenated along the last
    dimension, so the result has num_heads * d_out features per token.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        heads = []
        for _ in range(num_heads):
            heads.append(causalattention(d_in, d_out, context_length, dropout, qkv_bias))
        self.heads = nn.ModuleList(heads)

    def forward(self,x):
        """Run every head on the same input and concatenate the results feature-wise."""
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)

# if we use the wrapper with two attention heads (num_heads=2) and a per-head output dim of two (d_out=2), the concatenated context vectors have d_out*num_heads = 4 dimensions

In [None]:
# toy input: 6 tokens with 3-dimensional embeddings (one row per token)
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # your
     [0.55, 0.87, 0.66],  # journey
     [0.57, 0.85, 0.64],  # starts
     [0.22, 0.58, 0.33],  # with
     [0.77, 0.25, 0.10],  # one
     [0.05, 0.80, 0.55]]  # step
)

batch = torch.stack((inputs,inputs),dim=0)   # two copies of the same sequence -> (batch, tokens, emb_dim)
print(batch.shape)

torch.Size([2, 6, 3])


In [None]:
torch.manual_seed(123)
context_length = batch.shape[1]
d_in, d_out = 3,2
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

# d_out is 2 per head, but the output has 4 columns: the outputs of the two heads (num_heads=2)
# are concatenated along the last dimension (2 + 2 = 4)

tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]],

        [[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]]], grad_fn=<CatBackward0>)
context_vecs.shape: torch.Size([2, 6, 4])


Implementing multi head attention with weight splits

In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head attention that uses one set of projections split into heads.

    Instead of separate modules per head, a single query/key/value
    projection of width d_out is computed and its last dimension is
    reshaped into (num_heads, head_dim). Attention is evaluated for all
    heads at once, the heads are merged back and passed through a final
    linear output projection.

    Args:
        d_in: embedding size of the input tokens.
        d_out: total output size; must be divisible by num_heads.
        context_length: size of the square causal mask that is stored.
        dropout: dropout probability applied to the attention weights.
        num_heads: number of attention heads.
        qkv_bias: whether the query/key/value projections carry a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads   # features handled by each head

        # Creation order is kept (query, key, value, out_proj) so that a
        # fixed seed always produces the same initial weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the future positions.
        future = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', future)

    def forward(self, x):
        """Map x of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape
        per_head_shape = (b, num_tokens, self.num_heads, self.head_dim)

        # Project:      (b, num_tokens, d_in)  -> (b, num_tokens, d_out)
        # Split heads:  (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        # Group by head:                       -> (b, num_heads, num_tokens, head_dim)
        keys = self.W_key(x).view(per_head_shape).transpose(1,2)
        queries = self.W_query(x).view(per_head_shape).transpose(1,2)
        values = self.W_value(x).view(per_head_shape).transpose(1,2)

        # (b, num_heads, num_tokens, num_tokens): one score matrix per head.
        scores = queries @ keys.transpose(2,3)
        # The 2-D mask is cropped to the sequence length and broadcast over
        # batch and heads.
        scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)

        # Scale by sqrt(head_dim) before the softmax.
        weights = torch.softmax(scores / keys.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        # contiguous() is required before view() after the transpose.
        merged = (weights @ values).transpose(1,2)
        merged = merged.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)

In [None]:
torch.manual_seed(123)
# new toy input: 3 tokens with 6-dimensional embeddings (replaces the earlier 6x3 `inputs`)
inputs = torch.tensor(
    [[0.43,0.15,0.89,0.55,0.87,0.66],  #row 1
     [0.57,0.85,0.64,0.22,0.58,0.33],  #row 2
     [0.77,0.25,0.10,0.05,0.80,0.55]]  #row 3
)

batch = torch.stack((inputs,inputs),dim=0)   # two copies of the same sequence
print(batch.shape)

batch_size, context_length, d_in = batch.shape
d_out = 6   # total output size, split across the heads (6 / 2 heads = 3 per head)
mha = MultiHeadAttention(d_in,d_out,context_length,0.0,num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

torch.Size([2, 3, 6])
tensor([[[ 0.1569, -0.0873,  0.0210,  0.0215, -0.3243, -0.2518],
         [ 0.1117, -0.0547,  0.0406, -0.0213, -0.3251, -0.2993],
         [ 0.1196, -0.0491,  0.0318, -0.0635, -0.2788, -0.2578]],

        [[ 0.1569, -0.0873,  0.0210,  0.0215, -0.3243, -0.2518],
         [ 0.1117, -0.0547,  0.0406, -0.0213, -0.3251, -0.2993],
         [ 0.1196, -0.0491,  0.0318, -0.0635, -0.2788, -0.2578]]],
       grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 3, 6])


Implementing a GPT model from scratch to generate text

In [None]:
# configuration dict read by the model classes below (they look up these keys by name)
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # vocabulary size
    "context_length": 1024,   # context length
    "emb_dim": 768,           # embedding dimension
    "n_heads": 12,            # no. of attention heads
    "n_layers": 12,           # no. of layers
    "drop_rate": 0.1,         # dropout rate
    "qkv_bias": False         # query-key-value bias
}

GPT Architecture part 1: Dummy GPT model class

In [None]:
import torch
import torch.nn as nn   # for embedding

class DummyGPTModel(nn.Module):
    """Skeleton of the GPT model with placeholder transformer blocks and layer norm.

    Token and positional embeddings, embedding dropout and the output head
    are real; the transformer blocks and the final normalisation are the
    Dummy* placeholder classes, which pass their input through unchanged.

    Args:
        cfg: dict with the keys "vocab_size", "emb_dim", "context_length",
            "drop_rate" and "n_layers".
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # placeholder for the transformer blocks
        blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # placeholder for layer norm
        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)

        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)
        x = self.trf_blocks(x)     # transformer blocks
        x = self.final_norm(x)
        return self.out_head(x)

class DummyTransformerBlock(nn.Module):
    """Placeholder for a transformer block: has no parameters and returns its input unchanged."""

    def __init__(self, cfg):
        # cfg is accepted so the constructor can be called like a real block; it is not used.
        super().__init__()

    def forward(self, x):
        # Identity: the input is passed through as is.
        return x

class DummyLayerNorm(nn.Module):
    """Placeholder for layer normalisation: has no parameters and returns its input unchanged."""

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments are accepted but not stored or used.
        super().__init__()

    def forward(self, x):
        # Identity: no normalisation is performed.
        return x

In [None]:
# step 1: tokenization
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []   # note: reuses the name `batch` from the attention cells above
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)   # both texts encode to 4 tokens, so they can be stacked into a (2, 4) tensor
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [None]:
# step 2: create an instance of DummyGPTModel
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("OUTPUT SHAPE",logits.shape)   # (batch, tokens, vocab_size): one score per vocabulary entry for every token
print(logits)

OUTPUT SHAPE torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


GPT Architecture Part 2: layer normalization

In [None]:
torch.manual_seed(123)
batch_example = torch.randn(2,5)   # 2 samples with 5 features each
layer = nn.Sequential(nn.Linear(5,6),nn.ReLU())   # 5 -> 6 features; ReLU sets negative values to 0
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [None]:
# statistics over the last dimension, i.e. one mean/variance per sample; keepdim=True keeps the shape (2, 1)
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)   # default is the unbiased estimate (divides by n-1)
print("mean:",mean)
print("variance:",var)

mean: tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
variance: tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [None]:
out_norm = (out - mean)/torch.sqrt(var)    # normalise: subtract the mean, divide by the standard deviation (no epsilon here)
mean = out_norm.mean(dim=-1, keepdim=True)   # note: mean/var now describe the normalised output
var = out_norm.var(dim=-1, keepdim=True)
print("normalized layer outputs:\n",out_norm)
print("mean:",mean)
print("variance:",var)

normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
mean: tensor([[9.9341e-09],
        [1.9868e-08]], grad_fn=<MeanBackward1>)
variance: tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [None]:
torch.set_printoptions(sci_mode=False)   # print tiny values such as 9.9e-09 as 0.0000 instead of scientific notation
print("mean: ", mean)
print("variance: ", var)

mean:  tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
variance:  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [None]:
class LayerNorm(nn.Module):
  """Layer normalisation over the last dimension with a learnable scale and shift.

  Every vector along the last dimension is shifted to zero mean and scaled
  to unit (biased) variance, then multiplied element-wise by `scale` and
  offset by `shift`. Both start as the identity (ones / zeros).
  """

  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5   # added to the variance to prevent division by zero
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    centered = x - x.mean(dim=-1, keepdim=True)
    # unbiased=False: divide by n rather than n-1
    variance = x.var(dim=-1, keepdim=True, unbiased=False)
    normalized = centered / torch.sqrt(variance + self.eps)
    return self.scale * normalized + self.shift

In [None]:
print(batch_example)   # the raw 2x5 example input created above, before any normalisation

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])


In [None]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)   # biased variance, the same estimate LayerNorm uses internally
print("mean:",mean)
print("variance:",var)
# LayerNorm normalises every sample to mean 0 and variance 1

mean: tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
variance: tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


GPT Architecture part 3: Feedforward neural network with GELU Activation

In [None]:
# GELU implementation function approximation used by GPT-2

class GELU(nn.Module):
    """GELU activation, computed with the tanh approximation.

    GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    """

    def __init__(self):
        super().__init__()

    def forward(self,x):
        coeff = torch.sqrt(torch.tensor(2.0/torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [None]:
# NOTE(review): identical to the GPT_CONFIG_124M defined earlier in the notebook; re-defined here with the same values
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # vocabulary size
    "context_length": 1024,   # context length
    "emb_dim": 768,           # embedding dimension
    "n_heads": 12,            # no. of attention heads
    "n_layers": 12,           # no. of layers
    "drop_rate": 0.1,         # dropout rate
    "qkv_bias": False         # query-key-value bias
}

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand to 4x the embedding size, apply GELU, contract back.

    Args:
        cfg: dict providing "emb_dim", the input and output feature size.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),   # expansion
            GELU(),                           # activation
            nn.Linear(hidden_dim, emb_dim),   # contraction
        )

    def forward(self, x):
        # The output has the same shape as the input; only the last
        # dimension is transformed.
        return self.layers(x)

In [None]:
print(GPT_CONFIG_124M["emb_dim"])   # embedding size = input/output width of the feed-forward block

768


In [None]:
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand(2, 3, 768)   # 2 sequences in the batch, 3 tokens each, 768-dimensional embedding per token
out = ffn(x)
print(out.shape)
# the output shape equals the input shape: the 4x expansion only exists inside the feed-forward block

torch.Size([2, 3, 768])


GPT Architecture part 4: Shortcut connections

In [None]:
class ExampleDeepNetwork1(nn.Module):
    """Stack of Linear+GELU layers with optional shortcut (residual) connections.

    One layer is created for every pair of consecutive entries in
    `layer_sizes`, so a list of 6 sizes gives the 5-layer network used in
    this notebook; other depths work the same way.

    Args:
        layer_sizes: sequence of feature sizes, e.g. [3, 3, 3, 3, 3, 1].
        use_shortcut: if True, the input of a layer is added to its output
            whenever both have the same shape.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        # Layers are created in order, so a fixed seed yields the same weights
        # as writing the layers out one by one.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(n_in, n_out), GELU())
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ])

    def forward(self, x):
        for layer in self.layers:
            layer_out = layer(x)
            # A shortcut is only possible when the layer keeps the shape
            # (e.g. not for the final 3 -> 1 layer).
            if self.use_shortcut and x.shape == layer_out.shape:
                x = x + layer_out
            else:
                x = layer_out
        return x

In [None]:
# initialize a neural network without shortcut connections
layer_sizes = [3, 3, 3, 3, 3, 1]   # five layers: 3 -> 3 -> 3 -> 3 -> 3 -> 1
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123)   # the shortcut model further down uses the same seed
model_without_shortcut = ExampleDeepNetwork1(layer_sizes, use_shortcut=False)

In [None]:
# NOTE(review): print_gradients is defined in the NEXT cell; on a fresh kernel that cell has to be run before this one
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173584925942123
layers.1.0.weight has gradient mean of 0.00012011159560643137
layers.2.0.weight has gradient mean of 0.0007152040489017963
layers.3.0.weight has gradient mean of 0.0013988736318424344
layers.4.0.weight has gradient mean of 0.005049645435065031


In [None]:
def print_gradients(model,x):
    """Run one forward/backward pass and print the mean absolute gradient of every weight tensor.

    The loss is the mean squared error between the model output and a
    fixed target of [[0.]].

    Args:
        model: an nn.Module whose output for `x` is compatible with a (1, 1) target.
        x: input tensor passed to the model.
    """
    # backward() accumulates into .grad, so clear gradients left over from an
    # earlier call; otherwise repeated calls on the same model report sums.
    model.zero_grad()

    # forward pass
    output = model(x)
    target = torch.tensor([[0.]])

    # calculate loss based on how close the target & o/p are
    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)

    # backward pass to compute the gradients
    loss.backward()

    for name, param in model.named_parameters():
        # parameters that did not take part in the forward pass have no gradient
        if 'weight' in name and param.grad is not None:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [None]:
torch.manual_seed(123)   # same seed as for model_without_shortcut
model_with_shortcut = ExampleDeepNetwork1(layer_sizes, use_shortcut=True)
print_gradients(model_with_shortcut, sample_input)
# compared with the run without shortcuts, the gradients (especially in the early layers) are orders of magnitude larger

layers.0.0.weight has gradient mean of 0.22169791162014008
layers.1.0.weight has gradient mean of 0.20694105327129364
layers.2.0.weight has gradient mean of 0.32896995544433594
layers.3.0.weight has gradient mean of 0.2665732204914093
layers.4.0.weight has gradient mean of 1.3258540630340576


In [None]:
class TransformerBlock(nn.Module):
    """One transformer block: causal multi-head attention and a feed-forward network.

    Both sub-layers follow the same pattern: layer norm is applied first,
    then the sub-layer, then dropout, and finally the sub-layer's input is
    added back (shortcut connection). Input and output have the same shape.

    Args:
        cfg: dict with the keys "emb_dim", "context_length", "n_heads",
            "drop_rate" and "qkv_bias".
    """

    def __init__(self,cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        # Sub-modules are created in the order att, ff, norm1, norm2 so that
        # a fixed seed always produces the same initial weights.
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # attention sub-layer with shortcut
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x))) + residual

        # feed-forward sub-layer with shortcut
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x))) + residual

        return x

In [None]:
torch.manual_seed(123)
# The random input must be bound to `x`, the name that is passed to the block
# and printed below, rather than to `model`.
x = torch.rand(2, 4, 768)   # 2 sequences, 4 tokens each, 768-dimensional embeddings
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)
print("input shape:",x.shape)
print("output shape:",output.shape)

input shape: torch.Size([2, 3, 768])
output shape: torch.Size([2, 3, 768])


GPT Architecture part 6: Entire GPT Model Architecture implementation

In [None]:
class GPTModel(nn.Module):
    """GPT model: embeddings, a stack of transformer blocks, final layer norm and output head.

    Args:
        cfg: dict with the keys "vocab_size", "emb_dim", "context_length",
            "drop_rate" and "n_layers", plus the keys read by TransformerBlock.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)

        # The positional embeddings are broadcast over the batch dimension.
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        return self.out_head(x)

In [None]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)   # a freshly created module is in training mode, so dropout is active in this forward pass
out = model(batch)   # batch: the two tokenised example texts from the tokenization cell
print("INPUT batch:\n",batch)
print("OUTPUT SHAPE",out.shape)   # (batch, tokens, vocab_size)
print(out)

INPUT batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
OUTPUT SHAPE torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


In [None]:
total_params = sum(p.numel() for p in model.parameters())   # number of scalar entries summed over all parameter tensors
print(f"Total number of parameters: {total_params: ,}")

Total number of parameters:  163,009,536


In [None]:
# the token embedding and the output layer have the same shape (vocab_size x emb_dim)
print("token embedding layer shape:",model.tok_emb.weight.shape)
print("output layer shape:",model.out_head.weight.shape)

token embedding layer shape: torch.Size([50257, 768])
output layer shape: torch.Size([50257, 768])


In [None]:
# remove the output layer's parameter count from the total parameter count:
# with weight tying the output layer would share the token-embedding matrix (same shape, see above)
# instead of having its own weights
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2: ,}")

Number of trainable parameters considering weight tying:  124,412,160


In [None]:
total_size_bytes = total_params * 4   # assumes 4 bytes per parameter (32-bit floats)
total_size_mb = total_size_bytes / (1024*1024)   # bytes -> MB (1 MB = 1024 * 1024 bytes here)
print(f"Total size of the model parameters: {total_size_mb: .2f} MB")

Total size of the model parameters:  621.83 MB


GPT Architecture part 7: Generating text from o/p tokens

In [None]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend a batch of token-id sequences by `max_new_tokens` tokens.

    Args:
        model: callable mapping (batch, n_tokens) token ids to logits of
            shape (batch, n_tokens, vocab_size).
        idx: (batch, n_tokens) tensor with the token ids of the current context.
        max_new_tokens: number of tokens to append.
        context_size: maximum number of tokens the model is given at once.

    Returns:
        A (batch, n_tokens + max_new_tokens) tensor: the input ids followed
        by the generated ids. `idx` itself is not modified.
    """
    for _ in range(max_new_tokens):
        # Only the most recent `context_size` tokens are fed to the model,
        # e.g. with context_size=5 and 10 tokens so far, the last 5 are used.
        window = idx[:, -context_size:]

        # No gradients are needed for generation.
        with torch.no_grad():
            all_logits = model(window)   # (batch, n_tokens, vocab_size)

        # Keep the prediction of the last position only -> (batch, vocab_size)
        last_logits = all_logits[:, -1, :]

        # Greedy choice: the vocabulary entry with the highest probability.
        probas = torch.softmax(last_logits, dim=-1)
        next_id = torch.argmax(probas, dim=-1, keepdim=True)   # (batch, 1)

        # Append the chosen id to the running sequence -> (batch, n_tokens+1)
        idx = torch.cat((idx, next_id), dim=1)

    return idx

In [None]:
start_context = "Hello, I am "   # note: the trailing space becomes a token of its own (the last id, 220, in the output below)
encoded = tokenizer.encode(start_context)
print("encoded:",encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)   # add a batch dimension: (n_tokens,) -> (1, n_tokens)
print("encoded_tensor.shape:",encoded_tensor.shape)

encoded: [15496, 11, 314, 716, 220]
encoded_tensor.shape: torch.Size([1, 5])


In [None]:
model = GPTModel(GPT_CONFIG_124M)
# eval() must be called on the model that is actually used for generation:
# a freshly created module starts in training mode, where dropout is active.
model.eval()
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=10,   # 5 prompt tokens + 10 generated tokens = 15 ids in the output
    context_size=GPT_CONFIG_124M["context_length"]
)
print("output:",out)
print("output length:",len(out[0]))

output: tensor([[15496,    11,   314,   716,   220, 13966, 21091, 35022, 33648,  5924,
         42740, 43832, 40041, 26479, 36792]])
output length: 15


In [None]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())   # drop the batch dimension, then map the ids back to text
print(decoded_text)
# the continuation is not meaningful text: the model weights are randomly initialised, no training has happened yet

Hello, I am occジ rubbishGamer experiencedugenDbLair filmmaker ;;


Using GPT to generate text

Calculating the text generation loss: cross-entropy and perplexity

In [None]:
# same configuration as before, except for the shorter context length (this overwrites the earlier GPT_CONFIG_124M)
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # vocabulary size
    "context_length": 256,   # context length shortened (orig: 1024)
    "emb_dim": 768,           # embedding dimension
    "n_heads": 12,            # no. of attention heads
    "n_layers": 12,           # no. of layers
    "drop_rate": 0.1,         # dropout rate
    "qkv_bias": False         # query-key-value bias
}

torch.manual_seed(123)   # reproducible weight initialisation
model = GPTModel(GPT_CONFIG_124M)
model.eval();          # disable dropout during inference (the trailing ; suppresses the cell's output)

In [None]:
import tiktoken

def text_to_token_ids(text, tokenizer):
    """Encode `text` and return the token ids as a tensor of shape (1, n_tokens).

    The special token "<|endoftext|>" is allowed in the input text.
    """
    ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # unsqueeze(0) adds the batch dimension in front
    return torch.tensor(ids).unsqueeze(0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of token ids; a leading dimension of size 1 is removed
            before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the resulting id list.
    """
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)

# Load the "gpt2" encoding and turn the prompt into a batch of one sequence.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"
start_ids = text_to_token_ids(start_context, tokenizer)

# Extend the prompt by ten new tokens.
token_ids = generate_text_simple(
    model=model,
    idx=start_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

print("output text:\n", token_ids_to_text(token_ids, tokenizer))

output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [None]:
# Two toy sequences of three token ids each. The targets are the same
# sequences shifted by one position (the next token to predict).
inputs = torch.tensor([
    [16833, 3626, 6100],   # every effort moves
    [40, 1107, 588],       # I really like
])

targets = torch.tensor([
    [3626, 6100, 345],     # effort moves you
    [1107, 588, 11311],    # really like chocolate
])

In [None]:
# Forward pass without gradient tracking; only the predictions are inspected.
with torch.no_grad():
    logits = model(inputs)

# Normalize the logits over the last dimension into probabilities.
probas = logits.softmax(dim=-1)
print(probas.shape)

In [None]:
# Index of the highest-probability entry along the last dimension; keepdim
# leaves a trailing dimension of size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("token ids:\n",token_ids)

In [None]:
# Compare the reference tokens of the first sequence with what the model
# predicted for that same sequence. The second line shows model outputs, not a
# second batch of targets, so it is labelled accordingly.
# token_ids[0] is flattened before decoding.
print(f"targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"outputs batch 1: {token_ids_to_text(token_ids[0].flatten(),tokenizer)}")

Cross-entropy loss

In [None]:
# For each of the three positions, look up the probability the model assigned
# to the target token at that position.
positions = [0, 1, 2]

text_idx = 0   # first text  -> p11, p12, p13
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1   # second text -> p21, p22, p23
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

In [None]:
# Concatenate the target probabilities of both texts and take their natural log.
all_target_probas = torch.cat((target_probas_1, target_probas_2))
log_probas = all_target_probas.log()
print(log_probas)

In [None]:
# Average the log-probabilities over all target tokens (one scalar, not a
# per-token value).
avg_log_probas = log_probas.mean()
print(avg_log_probas)

In [None]:
# Flip the sign to obtain the negative average log-probability.
meg_avg_log_probas = -avg_log_probas
print(meg_avg_log_probas)

In [None]:
# Merge the first two dimensions of the logits and flatten the targets to 1-D
# so they can be passed to cross_entropy.
logits_flat = logits.flatten(start_dim=0, end_dim=1)
targets_flat = targets.flatten()

print("flattened logits",logits_flat.shape)
print("flattened targets",targets_flat.shape)

In [None]:
# Cross-entropy between the flattened logits and the flattened target ids.
loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print(loss)

Perplexity

In [None]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = loss.exp()
print(perplexity)