In [10]:
## Tokenizer
## DataLoader
import torch
from torch.utils.data import Dataset,DataLoader
from pathlib import Path
import tiktoken
DATASET_PATH  = Path("./datasets")
sample_chat_path = DATASET_PATH/"chat.txt"
tokenizer = tiktoken.get_encoding("gpt2")

In [11]:
## constants
CONTEXT_LENGTH=256
EMBEDDING_DIMENSION=256
POSITION_EMEBEDDING_DIMENSION=256
BATCH_SIZE=4
STRIDE=256
VOCAB_SIZE=tokenizer.n_vocab

## Data sampling with a  sliding window
- Way to create an input-target pairs for the next-word prediction task

In [13]:
with open(sample_chat_path,"r",encoding="utf-8") as f:
    text = f.read()
encoded_text = tokenizer.encode(text)


In [14]:

class ChatDataSet(Dataset):
    def __init__(self,txt,tokenizer,context_length,stride):
        # tokenizer is the tokenizer object
        self.tokenizer = tokenizer
        # input ids and output ids are the length of tensors where each row is a tokenized text of length of context length
        self.input_ids = []
        self.output_ids =[]
        token_ids = self.tokenizer.encode(txt)
        token_ids_length = len(token_ids)
        # Creation of dataset each row contains the tokenized text of the context length
        # Stride represents the number of tokens to skip
        for i in range(0,token_ids_length-context_length,stride):
            # Here we are generating all the possible input pair and output pair
            input_chunk = token_ids[i:i+context_length]
            output_chunk  = token_ids[i+1:i+context_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.output_ids.append(torch.tensor(output_chunk))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,index):
        return self.input_ids[index],self.output_ids[index]
    
## DataLoader

def create_dataloader_v1(txt:str,batch_size=4,context_length=256,stride=256,shuffle = True):
    dataset = ChatDataSet(txt,tokenizer,context_length,stride)
    dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=shuffle)
    return dataloader

        

In [15]:
data_loader = create_dataloader_v1(text,batch_size = BATCH_SIZE,context_length = CONTEXT_LENGTH,stride=STRIDE,shuffle=True)


## Initial token embeddings

- output_dim = 256
- vocab_size = 50257
- shape of the embedding_layer = (vocab_size,output_dim) = (50257,256)
- shape of the positional embedding layer = (context_length,output_dim)

In [17]:
token_embedding_layer = torch.nn.Embedding(VOCAB_SIZE,EMBEDDING_DIMENSION)
positional_embedding_layer = torch.nn.Embedding(CONTEXT_LENGTH,POSITION_EMEBEDDING_DIMENSION)
pos_embeddings = positional_embedding_layer(torch.arange(CONTEXT_LENGTH))

In [18]:
#Sample example of data passing
data_iter = iter(data_loader)
first_batch  = next(data_iter)

In [55]:
first_batch[0].shape

torch.Size([4, 256])

In [20]:
token_embeddings = token_embedding_layer(input)
input_to_LLM = token_embeddings+pos_embeddings

In [57]:
input_to_LLM.shape

torch.Size([4, 256, 256])

## Chapter - 3 (Coding attention Mechanisms)

- Exploring the reasons for using attention mechanisms in neural networks
- Masking randomly selected attention weights with dropout to reduce over-fitting

In [22]:
inputs = torch.tensor([[0.43,0.15,0.89], # Your 
                       [0.55,0.87,0.66], # journey
                       [0.57,0.85,0.64], # starts
                       [0.22,0.58,0.33], # with
                       [0.77,0.25,0.10], # one
                       [0.05,0.80,0.55]])# step

In [23]:
# Calculating the dot poduct ( Essentially checking how two embeddings are similar)
attention_weights = inputs@inputs.T
# Normalize the score with softmax
normalized_weights_with_softmax  = torch.softmax(attention_weights,dim=1)
# Get the weighted representation of the token with relation to the other token
context_vector = normalized_weights_with_softmax@inputs

### This is the end of simple attention mechanisms

### Start of computing the attention weights step by step

In [25]:
## A simple step by step implementaion of the weighted attention mechanism
d_in = inputs.shape[1]
d_out = 2

In [26]:
torch.manual_seed(123)
w_query = torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=True)
w_key = torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=True)
w_value = torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=True)
queries = inputs@w_query
keys = inputs@w_key
values = inputs@w_value
unnormalized_attention_score =queries@keys.T
normalized_attention_score = torch.softmax(unnormalized_attention_score/d_out**0.5,dim=1)
context_vector = normalized_attention_score@values

### Implementing a compact self-attention Python class

In [28]:
import torch.nn as nn
class SelfAttentionV1(nn.Module):
    def __init__(self,d_in,d_out):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Parameter(torch.rand(d_in,d_out))
        self.w_key = nn.Parameter(torch.rand(d_in,d_out))
        self.w_value = nn.Parameter(torch.rand(d_in,d_out))
    def forward(self,x):
        queries  = x@self.w_query
        keys = x@self.w_key
        values = x@self.w_value
        # Calcualte the scores 
        attn_scores = queries@keys.T
        attn_weights = torch.softmax(attn_scores/(self.d_out**0.5),dim =-1)
        return attn_weights@values

In [29]:
class CasualAttention(nn.Module):
    def __init__(self,d_in,d_out,attn_bias=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Linear(d_in,d_out,bias=attn_bias)
        self.w_key = nn.Linear(d_in,d_out,bias=attn_bias)
        self.w_value = nn.Linear(d_in,d_out,bias=attn_bias)
    def forward(self,x):
        context_length = x.shape[0]
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)
        #Calculate the scores
        attn_scores = queries@keys.T
        #  Masking the weights only tells LLM to refer the previous context to generate the next token
        mask = torch.triu(torch.ones(context_length,context_length),diagonal=1)
        masked_attn_scores = attn_scores.masked_fill(mask==1,float("-inf"))
        attn_weights = torch.softmax(masked_attn_scores/(self.d_out**0.5),dim=-1)
        print(attn_weights)
        return attn_weights@values
    

## Implementing the casual attention

Here we will make the weights of the forward matrix to zero to tell the llm only see the previous tokens to generate the current token



In [31]:
ca = CasualAttention(3,2)
print(ca(inputs))

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4400, 0.5600, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2830, 0.3580, 0.3590, 0.0000, 0.0000, 0.0000],
        [0.2264, 0.2579, 0.2583, 0.2574, 0.0000, 0.0000],
        [0.1903, 0.2024, 0.2026, 0.1997, 0.2051, 0.0000],
        [0.1408, 0.1715, 0.1718, 0.1717, 0.1758, 0.1684]],
       grad_fn=<SoftmaxBackward0>)
tensor([[0.4772, 0.1063],
        [0.5891, 0.3257],
        [0.6202, 0.3860],
        [0.5478, 0.3589],
        [0.5321, 0.3428],
        [0.5077, 0.3493]], grad_fn=<MmBackward0>)


## Masking additional attention weights with dropout


## Implementing a compact casual attention class


In [34]:
class CasualAttentionWithBatch(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout,qkv_bias):
        super().__init__()
        # d_out represents the total output dimension , each head contributes to subset of dimension
        self.d_out = d_out
        # This is the query layer we were using having the dimension (input dimension is the same as the embedding dimension of the input)
        self.w_query = nn.Linear(d_in,d_out,qkv_bias)
        # This is the key layer we were using having the dimension (input dimension is the same as the embeding dimension of the input)
        self.w_key = nn.Linear(d_in,d_out,qkv_bias)
        # This is the value layer we were using having the dimension (input dimension is the same as the embedding dimension of the input)
        self.w_value = nn.Linear(d_in,d_out,qkv_bias)
        self.dropout = nn.Dropout(p = dropout)
        self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))
    def forward(self,x):
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)
        attention_scores = keys@queries.transpose(1,2)
        attention_scores.masked_fill(self.mask==1,-torch.inf)
        attention_weights = torch.softmax(attention_scores/(self.d_out**0.5),dim = 2)
        attention_weights = self.dropout(attention_weights)
        return attention_weights@values

In [43]:
cab = CasualAttentionWithBatch(3,2,4,0.5,False)

In [59]:
cab(input_to_LLM)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1024x256 and 3x2)

## Extending Single-head attention to multi-head attention
## Stacking multiple single-head attention layers


In [37]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout,num_heads,kqv_bias):
        super().__init__()
        assert d_out%num_heads==0,"d_out must be divisible by num_heads"
        self.d_out = d_out
        self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))
        self.dropout = nn.Dropout(p=dropout)
        self.w_key = nn.Linear(d_in,d_out,bias=kqv_bias)
        self.w_query = nn.Linear(d_in,d_out,bias=kqv_bias)
        self.w_value = nn.Linear(d_in,d_out,bias=kqv_bias)
        self.head_dim = d_out//num_heads
        self.num_heads = num_heads
        self.out_proj = nn.Linear(d_out,d_out)
    def forward(self,x):
        batch_size,context_length,_ = x.shape
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)
        # Now for each weight tensor we need to reshape to (batch,context_length,num_heads,head_dim)
        keys = keys.reshape(batch_size,context_length,self.num_heads,self.head_dim)
        queries = queries.reshape(batch_size,context_length,self.num_heads,self.head_dim)
        values = values.reshape(batch_size,context_length,self.num_heads,self.head_dim)
        keys = keys.transpose(1,2)
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)
        #(batch_size,self.num_heads,self.context_length,self.head_dim)
        scores = keys@(queries.transpose(2,3))
        scores.masked_fill(self.mask==1,-torch.inf)
        attn_weights = torch.softmax(scores/(keys.shape[-1]**0.5),dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vector = (attn_weights@values).transpose(1,2).reshape(batch_size,context_length,self.d_out)
        return self.out_proj(context_vector)

In [39]:
mha = MultiHeadAttention(3,2,6,0.5,2,False)

In [None]:
output = mha(batch)

In [None]:
output.shape

## Chapter 3 Finished

## Chapter 4 starts (Implmeting  a Gpt model from scratch to generate text)

Generative pretrained models

In [67]:
GPT_CONFIG_124M = {
"vocab_size":tokenizer.n_vocab, # vocabulary of the tokenizer
"context_length":1024 , # Context length GPT has access to
"emb_dim":768, # Each token has the embedding dimension of 768 , The absolute postional embedding of the dimension will also be 768
"n_heads":12, # Each 


    
}


50257