In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader, random_split
import os
import numpy as np
# Seed PyTorch's global random number generator so runs are repeatable.
seed = 1234
torch.manual_seed(seed)


<torch._C.Generator at 0x28dff850>

In [46]:
import json
import nltk
from nltk.tokenize import word_tokenize

def extract_text_values(jsonl_file_path):
    """
    Extract 'text' values from a JSONL file.

    Each non-blank line is parsed as one JSON document. Lines that hold a JSON
    object with a 'text' key contribute that value; everything else is skipped.

    Args:
        jsonl_file_path (str): Path to the JSONL file

    Returns:
        list: List of extracted text values, in file order
    """
    text_values = []

    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline) are not malformed records; skip quietly.
            if not stripped:
                continue

            try:
                json_obj = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {line}")
                continue

            # A line can be valid JSON without being an object (number, list, string);
            # only objects can carry a 'text' field.
            if isinstance(json_obj, dict) and 'text' in json_obj:
                text_values.append(json_obj['text'])

    return text_values

#file_path = "/kaggle/input/gpt-dataset/gpt_dataset.jsonl"
# Raw string literal: the Windows path contains backslashes (\M, \S, \g) that are
# invalid escape sequences in a normal string and trigger a warning on newer Pythons.
# The resulting path value is unchanged.
file_path = r"D:\ML_And_DeepLearning\ML_And_DeepLearning\Supervised Finetuning GPT From Scratch\gpt_dataset.jsonl"
texts = extract_text_values(file_path)
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be downloaded
# beforehand; no nltk.download(...) call is visible in this notebook — confirm.
tokenized_texts = [word_tokenize(text) for text in texts]

In [47]:
# Show the first extracted document as a quick sanity check of the loader.
texts[0]

'Configurations | Desktop Management Configurations - ManageEngine Configurations You can use Desktop Central to complete different tasks \n like scanning for patches and inventory. However, to complete tasks like \n installing patches, changing the wallpaper of desktops in all the computers \n in your network, sending custom messages to users in your network, and \n installing software applications you must create configurations and deploy \n them to the computers in your network. The following sections provide information required \n to configure various Windows-application settings, security settings, \n display settings, and firewall settings for Windows users and computers: Defining \n    user configurations : This section provides information about various \n    user-based configurations that you can deploy using Desktop Central and \n    the steps to define them. Defining \n    computer configurations : This section provides information about various \n    computer-based configu

In [31]:
# Context length in tokens: used as the minimum-length filter for documents, the size of
# the model's positional-embedding table, and the crop length during generation.
BLOCK_SIZE = 24

In [32]:
# Token frequencies over the whole corpus; Counter preserves first-seen order,
# so ids below are assigned in order of first occurrence.
vocab = Counter(token for sentence in tokenized_texts for token in sentence)
# NOTE(review): id 0 belongs to a real token, yet tokenize_text also uses 0 as the
# fallback id for unknown tokens — confirm this overlap is intended.
id_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in id_to_token.items()}
vocab_size = len(vocab)

In [7]:
def tokenize_text(tokens):
    """Convert token strings to integer ids via token_to_id; tokens not in the vocabulary map to 0."""
    lookup = token_to_id.get
    ids = []
    for token in tokens:
        ids.append(lookup(token, 0))
    return ids

# Encode every document to ids, keeping only those with at least BLOCK_SIZE + 2 tokens.
dataset = [
    tokenize_text(doc_tokens)
    for doc_tokens in tokenized_texts
    if len(doc_tokens) >= BLOCK_SIZE + 2
]

In [8]:
#dataset = dataset[:10]

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
def construct_dataset(dataset, block_size):
    """
    Slice token-id sequences into next-token-prediction training pairs.

    For every sequence and every start offset i with i + block_size < len(sequence),
    one input window sequence[i : i + block_size] is paired with the same window
    shifted right by one token, sequence[i + 1 : i + block_size + 1].

    Sequences with block_size tokens or fewer cannot provide a full target window
    and contribute no pairs (they no longer disturb the windows of later sequences).

    Args:
        dataset: Iterable of sequences (lists of token ids).
        block_size (int): Number of tokens per window.

    Returns:
        tuple: (X, y) — two equally long lists of windows, ordered by sequence and
        then by start offset.
    """
    X = []
    y = []
    for datapoint in dataset:
        # range() is empty when len(datapoint) <= block_size, so short sequences are skipped.
        for start in range(len(datapoint) - block_size):
            X.append(datapoint[start:start + block_size])
            y.append(datapoint[start + 1:start + block_size + 1])
    return X, y
# Window length must match BLOCK_SIZE: the model's positional table and the generation
# code both work with BLOCK_SIZE-token contexts, so training on shorter windows would
# leave the later positions untrained.
X, y = construct_dataset(dataset, BLOCK_SIZE)

In [11]:

class CustomDataset(Dataset):
  def __init__(self,X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self,idx):
    return torch.tensor(X[idx],dtype=torch.long).to(device),torch.tensor(y[idx],dtype=torch.long).to(device)

In [12]:
data = CustomDataset(dataset,BLOCK_SIZE)
train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_dataset, val_dataset = random_split(data, [train_size, val_size])
print(train_dataset, val_dataset)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

<torch.utils.data.dataset.Subset object at 0x0000000054137100> <torch.utils.data.dataset.Subset object at 0x0000000052FE9A20>


In [13]:
class MaskedMultiHeadAttention(nn.Module):
  """
  Causal (masked) multi-head self-attention.

  One linear layer produces queries, keys and values; each position attends only
  to itself and earlier positions; a second linear layer mixes the heads back.

  Args:
    emd_dim: Embedding width; must be divisible by `heads`.
    heads: Number of attention heads.
    dropout: Dropout probability applied to the attention weights.
  """
  def __init__(self, emd_dim, heads=4, dropout = 0.2):
    super(MaskedMultiHeadAttention, self).__init__()
    assert emd_dim % heads == 0
    self.heads = heads
    self.head_dim = emd_dim//heads
    self.scale = self.head_dim ** -0.5
    self.multiHead = nn.Linear(emd_dim, emd_dim*3)
    self.output = nn.Linear(emd_dim,emd_dim)
    self.dropout = nn.Dropout(dropout)

  def _split_heads(self, t, B, T):
    # (B, T, C) -> (B, heads, T, head_dim)
    return t.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)

  def forward(self, x):
    """Apply causal self-attention to x of shape (B, T, C); returns a tensor of the same shape."""
    B, T, C = x.shape
    q, k, v = torch.chunk(self.multiHead(x), 3, dim=-1)
    q = self._split_heads(q, B, T)
    k = self._split_heads(k, B, T)
    v = self._split_heads(v, B, T)

    attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
    # Build the lower-triangular mask on the input's own device so the layer works
    # wherever the module and its input live, independent of any notebook global.
    causal = torch.ones(T, T, dtype=torch.bool, device=x.device).tril()
    attn_scores = attn_scores.masked_fill(~causal, float('-inf'))

    attn_probs = self.dropout(torch.softmax(attn_scores, dim=-1))
    attn_output = torch.matmul(attn_probs, v)
    # (B, heads, T, head_dim) -> (B, T, C)
    merged = attn_output.permute(0, 2, 1, 3).reshape(B, T, C)
    return self.output(merged)


In [14]:
class LayerNorm1D(nn.Module):
  """
  Layer normalisation over the last dimension with a learnable scale and shift.

  Args:
    dim: Size of the last (normalised) dimension.
    eps: Constant added to the variance for numerical stability.
  """
  def __init__(self, dim, eps=1e-5):
    super(LayerNorm1D, self).__init__()
    # Parameters are created on the default device; moving the parent module
    # (model.to(device)) relocates them, so no global device is needed here.
    self.gamma = nn.Parameter(torch.ones(dim))
    self.beta = nn.Parameter(torch.zeros(dim))
    self.eps = eps

  def forward(self, x):
    """Normalise x along its last dimension, then apply gamma (scale) and beta (shift)."""
    mean = x.mean(-1,keepdim=True)
    # Biased (population) variance.
    var = x.var(-1, unbiased=False, keepdim=True)
    xhat = (x-mean)/torch.sqrt(var+self.eps)
    return (self.gamma * xhat) +self.beta

In [15]:
class FeedForward(nn.Module):
  """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout, applied to the last dimension."""
  def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.2):
    super().__init__()
    expand = nn.Linear(input_dim, hidden_dim)
    activation = nn.ReLU()
    project = nn.Linear(hidden_dim, output_dim)
    regularise = nn.Dropout(dropout)
    # Order matters: the Sequential indices (0..3) are part of the state_dict keys.
    self.feed_forward_layer = nn.Sequential(expand, activation, project, regularise)

  def forward(self, x):
    """Run x through the stacked layers and return the result."""
    out = self.feed_forward_layer(x)
    return out


In [16]:
class Block(nn.Module):
  """
  Pre-norm transformer block: attention and feed-forward sub-layers, each wrapped
  in a residual connection with layer normalisation applied to the sub-layer input.

  Args:
    embed_dim: Embedding width of the residual stream.
    heads: Number of attention heads (forwarded to the attention layer).
  """
  def __init__(self,embed_dim,heads=4):
    super().__init__()
    self.layer_norm1 = LayerNorm1D(embed_dim)
    self.layer_norm2 = LayerNorm1D(embed_dim)
    # Honour the caller's `heads` value instead of always using 4.
    self.masked_multi_head_attn =  MaskedMultiHeadAttention(embed_dim, heads = heads)
    self.feed_forward_layer = FeedForward(embed_dim, embed_dim*4, embed_dim)

  def forward(self, x):
    """Return x after the attention and feed-forward residual updates."""
    x = x + self.masked_multi_head_attn(self.layer_norm1(x))
    x = x + self.feed_forward_layer(self.layer_norm2(x))
    return x


In [17]:
class AutoRegressiveModel(nn.Module):
  """
  Decoder-only transformer language model.

  Token and learned positional embeddings are summed, passed through a stack of
  Block layers, normalised, and projected to vocabulary logits.

  Args:
    embed_dim: Embedding width.
    vocab_size: Number of token ids.
    block_size: Maximum sequence length (size of the positional-embedding table).
    heads: Attention heads per block.
    num_layers: Number of stacked blocks.
  """
  def __init__(self, embed_dim, vocab_size, block_size = BLOCK_SIZE, heads=4, num_layers=4):
    super().__init__()
    self.block = nn.Sequential(*[Block(embed_dim,heads) for _ in range(num_layers)])
    self.positional_embedding = nn.Embedding(block_size, embed_dim)
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.final_layer_norm = LayerNorm1D(embed_dim)
    self.final_layer = nn.Linear(embed_dim, vocab_size)

  def forward(self, x):
    """
    Compute next-token logits.

    Args:
      x: Integer tensor of token ids with shape (batch, T).

    Returns:
      Tensor of shape (batch, T, vocab_size).

    Raises:
      ValueError: If T exceeds the positional-embedding table size.
    """
    _, T = x.shape
    max_len = self.positional_embedding.num_embeddings
    if T > max_len:
      raise ValueError(f"sequence length {T} exceeds block size {max_len}")
    x_emb = self.embedding(x)
    # Positions are created on the input's device so the model does not depend on a global.
    positions = torch.arange(T, device=x.device)
    x_pos_emb = self.positional_embedding(positions)
    x = x_emb + x_pos_emb
    block_output = self.block(x)
    x_out = self.final_layer_norm(block_output)
    return self.final_layer(x_out)

In [24]:
# Instantiate the language model and place it on the selected device.
model = AutoRegressiveModel(
    embed_dim=128,
    vocab_size=vocab_size,
    block_size=BLOCK_SIZE,
    heads=4,
).to(device)

# Resume from an existing checkpoint in the working directory, if there is one.
# NOTE(review): the recorded output below shows this load failing with a size mismatch
# (checkpoint vocabulary 6327 vs current 6292) — the checkpoint was presumably saved
# with a different vocabulary; confirm and retrain or persist the vocab alongside it.
if os.path.exists("gpt_sft_rlhf.pth"):
    model.load_state_dict(
        torch.load("gpt_sft_rlhf.pth", map_location=torch.device('cpu'))
    )

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

  model.load_state_dict(torch.load("gpt_sft_rlhf.pth", map_location=torch.device('cpu')))


RuntimeError: Error(s) in loading state_dict for AutoRegressiveModel:
	size mismatch for embedding.weight: copying a param with shape torch.Size([6327, 128]) from checkpoint, the shape in current model is torch.Size([6292, 128]).
	size mismatch for final_layer.weight: copying a param with shape torch.Size([6327, 128]) from checkpoint, the shape in current model is torch.Size([6292, 128]).
	size mismatch for final_layer.bias: copying a param with shape torch.Size([6327]) from checkpoint, the shape in current model is torch.Size([6292]).

In [19]:
def train(model: nn.Module, optimizer: torch.optim.Optimizer, criterion: nn.Module, dataloader: DataLoader, epochs: int, checkpoint_path: str = "gpt_sft_rlhf.pth"):
  """
  Train `model` for `epochs` passes over `dataloader`, saving a checkpoint after each epoch.

  Args:
    model: Network mapping a batch of token ids (B, T) to logits (B, T, vocab).
    optimizer: Optimizer over the model's parameters.
    criterion: Loss taking flattened logits (B*T, vocab) and flattened targets (B*T).
    dataloader: Yields (X, y) batches; assumed to already be on the model's device —
      TODO confirm against the dataset in use.
    epochs: Number of passes over the data.
    checkpoint_path: File the model state_dict is written to at the end of every epoch.
  """
  # Guard the average against an empty loader.
  num_batches = max(len(dataloader), 1)

  for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    for X,y in dataloader:
      optimizer.zero_grad()
      outputs = model(X)
      B, T, _ = outputs.shape
      # Flatten batch and time so every position is one classification example.
      loss = criterion(outputs.reshape(B*T,-1),y.reshape(B*T))
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Epoch: {epoch + 1}/{epochs}, Loss: {epoch_loss / num_batches:.4f}")

In [20]:
def val(model: nn.Module,dataloader: DataLoader, criterion: nn.Module = None):
  """
  Evaluate `model` on `dataloader` and print the mean per-batch loss.

  Args:
    model: Network mapping a batch of token ids (B, T) to logits (B, T, vocab).
    dataloader: Yields (X, y) batches.
    criterion: Loss taking flattened logits and targets. Defaults to a fresh
      nn.CrossEntropyLoss(), so the function no longer depends on a notebook-level
      `criterion` variable being defined.

  Returns:
    float: Mean loss over the batches (0.0 for an empty loader).
  """
  loss_fn = criterion if criterion is not None else nn.CrossEntropyLoss()
  model.eval()
  val_loss = 0.0
  with torch.no_grad():
    for X,y in dataloader:
      outputs = model(X)
      B, T, _ = outputs.shape
      loss = loss_fn(outputs.reshape(B*T,-1),y.reshape(B*T))
      val_loss += loss.item()
  # Guard the average against an empty loader.
  mean_loss = val_loss / max(len(dataloader), 1)
  print(f"Loss: {mean_loss:.4f}")
  return mean_loss

In [21]:
# train(model, optimizer, criterion, train_loader, 20)

In [None]:
# val(model,val_loader)

In [153]:
def generate(model: nn.Module, start_seq: str ="ManageEngine helps in ", epochs = 100):
  """
  Sample a continuation of `start_seq` one token at a time.

  Args:
    model: Language model returning logits of shape (batch, T, vocab).
    start_seq: Prompt text; tokenized with word_tokenize and mapped to ids.
    epochs: Number of tokens to sample (the name is kept for compatibility).

  Returns:
    str: The prompt followed by the sampled tokens, separated by single spaces.

  Raises:
    ValueError: If the prompt produces no tokens.
  """
  prompt_ids = tokenize_text(word_tokenize(start_seq))
  if not prompt_ids:
    raise ValueError("start_seq must contain at least one token")

  # The prompt has to live on the same device as the model.
  current = torch.tensor([prompt_ids], dtype=torch.long).to(device)
  content = start_seq

  # Inference: disable dropout and gradient tracking.
  model.eval()
  with torch.no_grad():
    for _ in range(epochs):
      # Keep at most the last BLOCK_SIZE tokens as context.
      current = current[:, -BLOCK_SIZE:]
      outputs = model(current)
      probs = torch.softmax(outputs[0, -1, :], dim=-1)
      next_id = torch.multinomial(probs, 1)
      content = content + " "+ id_to_token[next_id.item()]
      current = torch.cat((current, next_id.view(1, 1)), dim = -1)
  return content

In [155]:
#generate(model)

In [None]:
def generate_text(model, start_tokens, max_new_tokens=100, temperature=1.0, top_k=None):
    """
    Autoregressively sample token ids that continue `start_tokens`.

    Args:
        model: Language model returning logits of shape (batch, T, vocab).
        start_tokens: Non-empty list of prompt token ids.
        max_new_tokens: Number of tokens to sample.
        temperature: Divides the logits before sampling; values near 0 are clamped
            to 1e-8 to avoid division by zero.
        top_k: If a positive integer, sample only among the k highest-scoring tokens.
            None (or a non-positive value) disables the filter.

    Returns:
        list: The prompt ids followed by the sampled ids.

    Raises:
        ValueError: If `start_tokens` is empty.
    """
    if len(start_tokens) == 0:
        raise ValueError("start_tokens must contain at least one token id")

    model.eval()
    x = torch.tensor([start_tokens], dtype=torch.long).to(device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Feed only the last BLOCK_SIZE tokens to the model, but keep the full
            # sequence in `x` so the prompt is never dropped from the result.
            context = x[:, -BLOCK_SIZE:]
            logits = model(context)

            # Logits for the next token (last position), with temperature scaling.
            next_token_logits = logits[0, -1, :] / max(temperature, 1e-8)

            # Optional top-k filtering: mask everything below the k-th largest logit.
            if top_k is not None and top_k > 0:
                k = min(top_k, next_token_logits.size(-1))
                kth_value = torch.topk(next_token_logits, k).values[-1]
                next_token_logits = next_token_logits.masked_fill(
                    next_token_logits < kth_value, float('-inf')
                )

            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            x = torch.cat([x, next_token.unsqueeze(0)], dim=1)

    return x[0].tolist()

def tokens_to_text(token_ids):
    """
    Turn token ids back into a readable string.

    Ids missing from id_to_token are rendered as "<UNK>". Tokens are joined with
    single spaces, and the space before common punctuation marks is removed.

    Args:
        token_ids: List of token ids

    Returns:
        String of reconstructed text
    """
    words = []
    for token_id in token_ids:
        words.append(id_to_token.get(token_id, "<UNK>"))

    # Join, then split/re-join to collapse any whitespace runs into single spaces.
    joined = " ".join(words)
    text = " ".join(joined.split())

    # Attach punctuation to the preceding word.
    for mark in ".,!?:;":
        text = text.replace(" " + mark, mark)

    return text

# Example usage
def run_inference_example():
    """
    Load the saved checkpoint into a fresh model and print sample generations.

    Generates from a fixed prompt for every combination of two temperatures and
    two top-k settings.

    Returns:
        The model with the checkpoint weights loaded.
    """
    # Load the model if not already loaded
    inference_model = AutoRegressiveModel(embed_dim=128, vocab_size=vocab_size, block_size=BLOCK_SIZE, heads=4).to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
    inference_model.load_state_dict(torch.load("gpt_sft_rlhf.pth", map_location=device))

    # Example starting prompt
    prompt = "The quick brown fox"

    # Tokenize the prompt
    prompt_tokens = word_tokenize(prompt)
    prompt_ids = tokenize_text(prompt_tokens)

    # Generate text
    print(f"Generating with prompt: '{prompt}'")
    print("-----------------------------")

    # Try different generation parameters
    for temp in [0.7, 1.0]:
        for top_k in [None, 10]:
            print(f"\nTemperature: {temp}, Top-k: {top_k}")
            generated_ids = generate_text(
                inference_model,
                prompt_ids,
                max_new_tokens=50,
                temperature=temp,
                top_k=top_k
            )

            generated_text = tokens_to_text(generated_ids)
            print(generated_text)

    return inference_model

# Interactive text generation
def interactive_generation(model):
    """
    Prompt-driven generation loop on the console.

    Repeatedly asks for a prompt and sampling settings, generates a continuation
    with generate_text, and prints it. Entering 'exit' (any letter case) as the
    prompt ends the loop. If any setting cannot be parsed, all three settings
    fall back to their defaults.
    """
    print("\nInteractive Text Generation")
    print("Type 'exit' to quit")

    while True:
        prompt = input("\nEnter a prompt: ")
        if prompt.lower() == 'exit':
            return

        # Encode the prompt; an empty encoding cannot seed generation.
        prompt_ids = tokenize_text(word_tokenize(prompt))
        if len(prompt_ids) == 0:
            print("Error: Could not tokenize prompt. Try again.")
            continue

        # Read the sampling settings; an empty answer selects the default.
        try:
            temp_raw = input("Temperature (0.1-2.0, default 1.0): ")
            temp = float(temp_raw or 1.0)
            max_raw = input("Max tokens to generate (default 50): ")
            max_tokens = int(max_raw or 50)
            top_k_raw = input("Top-k (integer, or leave empty for no top-k): ")
            top_k = int(top_k_raw) if top_k_raw.strip() else None
        except ValueError:
            print("Invalid input, using defaults.")
            temp, max_tokens, top_k = 1.0, 50, None

        generated_ids = generate_text(
            model,
            prompt_ids,
            max_new_tokens=max_tokens,
            temperature=temp,
            top_k=top_k,
        )

        print("\nGenerated text:")
        print(tokens_to_text(generated_ids))

# Run the inference example
if __name__ == "__main__":
    # NOTE: this rebinds the notebook-level `model` to the instance returned by
    # run_inference_example(), replacing the model created in the earlier cell.
    model = run_inference_example()
    # Blocks on input() until the user types 'exit'.
    interactive_generation(model)