<a href="https://colab.research.google.com/github/laxmipriyapadegal2024/LLM-from-Scratch-101/blob/main/LLMs103.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch,tiktoken
# GPT-2 byte-pair-encoding tokenizer, shared by the cells below.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Read the whole corpus from disk, then convert it to token IDs.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

enc_text = tokenizer.encode(raw_text)  # text -> list of token IDs
print(len(enc_text))    # total number of tokens in the corpus
print(enc_text[:10])    # peek at the first ten token IDs

5145
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]


In [None]:
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once; windows of `max_length` tokens are then taken
  every `stride` tokens. Each target window is its input window shifted one
  token to the right, so a window is only emitted when a full shifted target
  still fits inside the token sequence.

  Args:
    txt: Raw text to tokenize.
    tokenizer: Object exposing `encode(text, allowed_special=...)` that
      returns a sequence of token IDs.
    max_length: Number of tokens in every input/target window.
    stride: Step, in tokens, between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Stopping at len(ids) - max_length keeps ids[s+1 : s+max_length+1]
    # fully in range for every start offset s.
    starts = range(0, len(ids) - max_length, stride)

    self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
    self.output_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the `(input, target)` tensor pair stored at position `idx`."""
    return self.input_ids[idx], self.output_ids[idx]

In [None]:
def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,
                         shuffle=True,drop_last=True,num_workers=0):
  """Build a DataLoader of (input, target) token windows from raw text.

  The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
  `GPTDatasetV1`, which is then handed to a `DataLoader`.

  Args:
    txt: Raw text to tokenize.
    batch_size: Number of windows per batch.
    max_length: Number of tokens in each window.
    stride: Step, in tokens, between the starts of consecutive windows.
    shuffle: Forwarded to `DataLoader(shuffle=...)`.
    drop_last: Forwarded to `DataLoader(drop_last=...)`.
    num_workers: Forwarded to `DataLoader(num_workers=...)`.

  Returns:
    The configured `DataLoader` over the dataset.
  """
  dataset = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return DataLoader(dataset, **loader_options)

In [None]:
vocab_size = 50257   # number of rows in the token embedding table (GPT-2 BPE vocabulary size)
output_dim = 256     # width of every embedding vector

# Token-ID -> vector lookup table with randomly initialised weights.
# The width must be `output_dim`: the name `input_dim` used here before is
# never defined, so the cell raised NameError on a fresh kernel.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
# Small demo configuration: non-overlapping windows (stride == max_length).
max_length = 4
stride = 4
batch_size = 8

# Unshuffled, so the first batch is the first `batch_size` windows of the text.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=stride,
    shuffle=False,
)

# Pull one batch of input/target pairs and inspect the inputs.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print(inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
torch.Size([8, 4])


In [None]:
# Look up one embedding vector per token ID; this appends an embedding
# dimension to the shape of `inputs`.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
# One learnable vector per position in the window, with the same width as
# the token embeddings so the two can be added together.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
# Absolute positional embeddings: look up the vectors for positions
# 0 .. max_length-1.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [None]:
# Add the positional vectors to the token vectors; `pos_embeddings` is
# broadcast across the batch dimension of `token_embeddings`.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)
print(input_embeddings)

torch.Size([8, 4, 256])
tensor([[[-1.0780, -2.4384, -1.1104,  ..., -0.0834, -0.7061, -0.8745],
         [-0.4684, -2.1987,  1.5940,  ...,  0.3222, -0.5426,  1.0070],
         [ 0.7537, -2.5357, -0.1171,  ..., -1.3008, -0.7795,  2.1786],
         [-0.4568,  1.7411,  0.3138,  ...,  1.1688, -0.7644,  0.3278]],

        [[ 0.0446, -2.3144, -0.5571,  ...,  2.9467, -0.8466, -1.7283],
         [-1.7615, -1.1404,  0.0710,  ..., -0.2187, -1.9814, -1.3629],
         [-1.2528, -3.8005, -0.2129,  ...,  0.0844,  0.2455,  1.5653],
         [ 1.3436,  0.0562,  0.1808,  ...,  0.1614, -1.7230, -1.5271]],

        [[ 1.5583, -1.4031, -2.2170,  ..., -0.4483,  1.0269, -0.2731],
         [-0.6121, -0.2826,  0.4304,  ..., -1.0857, -0.5784,  1.2861],
         [-2.3612, -3.5975, -2.1962,  ...,  0.5516,  1.3488,  2.3731],
         [ 0.5100,  0.6005, -0.2229,  ..., -0.4433, -0.8191, -1.0586]],

        ...,

        [[ 0.9519, -3.7512, -0.8602,  ...,  0.6066, -0.6035, -1.5930],
         [-2.8009, -1.1106, -1.85