In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import gc
gc.collect()
torch.cuda.empty_cache()
!pip install datasets transformers

In [None]:
from transformers import OpenAIGPTConfig, OpenAIGPTModel

# Build an OpenAI GPT model from a default configuration and print the
# configuration object the model carries, as a reference for the sizes used below.
# NOTE: the name `model` is rebound to the custom GPT in a later cell.
model = OpenAIGPTModel(OpenAIGPTConfig())
configuration = model.config
print(configuration)

In [None]:
from datasets import load_dataset

# Load the "train" split of the instruction dataset.
ds = load_dataset(
    "nlpcloud/instructions-dataset-adapted-from-stanford-alpaca-for-gpt-j",
    split="train",
)

In [None]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer; special tokens are added to it in a later cell.
TOKENIZER_CHECKPOINT = "openai-community/gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Hyperparameters

In [None]:
class GPTConfig:
  """Hyperparameters shared by the data sampling, model and training cells.

  The original GPT model featured 12 layers, 768 hidden units and 12 attention
  heads (117 million parameters in total); the model dimensions below use half
  of each of those values.
  """
  # --- model architecture ---
  n_layer = 6        # number of transformer blocks
  n_head = 6         # attention heads per block
  n_embd = 384       # embedding width
  block_size = 512   # maximum context length, in tokens
  dropout = 0.2      # dropout probability used throughout the model

  # --- optimisation ---
  batch_size = 32
  learning_rate = 3e-4
  max_iters = 500       # number of outer iterations of the training loop
  eval_interval = 100   # number of batches drawn per outer iteration
  eval_iters = 200      # not referenced by the cells visible in this notebook

  # --- hardware ---
  device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Preprocess the data

In [None]:
# Materialise the dataset as a DataFrame so rows can be addressed by label.
df = pd.DataFrame(ds)


def _strip_escapes(raw):
    r"""Clean one raw text cell.

    Removes literal backslash-n / backslash-t / backslash-r sequences, trims
    surrounding whitespace, then replaces each literal backslash-u with a space.
    """
    trimmed = re.sub(r'\\[ntr]', '', raw).strip()
    return re.sub(r'\\u', ' ', trimmed)


# Rows are consumed in groups of three: the first row of a group becomes the
# question, the second the answer, and the third row is skipped.
texts = [
    {
        'question': _strip_escapes(df['text'][i - 2]),
        'answer': _strip_escapes(df['text'][i - 1]),
    }
    for i in range(2, len(df), 3)
]
print(len(texts))
texts[:5]


In [None]:
def create_single_text(texts):
    """Render question/answer records into one training corpus string.

    Args:
        texts: iterable of dicts, each providing 'question' and 'answer' keys.

    Returns:
        A single string in which every record is wrapped in the
        <|startoftext|> / <|endoftext|> markers and followed by a blank line.
        An empty input yields an empty string.
    """
    # Build all fragments first and join once instead of growing a string in a loop.
    return "".join(
        f"<|startoftext|>Question: {item['question']}\nAnswer: {item['answer']}<|endoftext|>\n\n"
        for item in texts
    )

# Register the pad / start / end markers with the tokenizer; the start and end
# markers are the ones create_single_text wraps around every QA pair.
special_tokens = dict(
    pad_token='<|pad|>',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
)
tokenizer.add_special_tokens(special_tokens)

# Render every QA pair into one long string ...
full_text = create_single_text(texts)

# ... and encode it in a single pass into a 1-D tensor of token ids.
tokens = torch.tensor(tokenizer.encode(full_text), dtype=torch.long)
tokens.shape

In [None]:
# Sanity check: show the first 60 token ids of the encoded corpus.
print(tokens[:60])

In [None]:
# NOTE: everything between the triple quotes below is a bare string expression,
# not live code -- it is never executed. It holds a disabled Dataset/DataLoader
# based alternative to the get_batch() sampler defined right after it.
'''
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        return len(self.tokens) - self.block_size

    def __getitem__(self, idx):
        chunk = self.tokens[idx:idx + self.block_size + 1]
        if len(chunk) < self.block_size + 1:
            padding = torch.full((self.block_size + 1 - len(chunk),), tokenizer.pad_token_id, dtype=torch.long)
            chunk = torch.cat([chunk, padding])
        x = chunk[:self.block_size]
        y = chunk[1:self.block_size + 1]
        return x, y

# Create dataset and dataloader
train_dataset = GPTDataset(tokens, GPTConfig.block_size)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
'''
def get_batch():
    """Draw one random training batch from the global `tokens` tensor.

    Returns:
        (x, y): two (batch_size, block_size) tensors moved to GPTConfig.device.
        y holds the same windows as x shifted one position forward in `tokens`,
        i.e. the next-token targets.
    """
    span = GPTConfig.block_size
    # Random window starts; the exclusive upper bound leaves room for the
    # one-token shift needed by the targets.
    starts = torch.randint(len(tokens) - span, (GPTConfig.batch_size,))
    # (batch_size, block_size) grid of absolute positions: start + 0 .. span-1.
    positions = starts.unsqueeze(1) + torch.arange(span)
    inputs = tokens[positions]
    targets = tokens[positions + 1]
    return inputs.to(GPTConfig.device), targets.to(GPTConfig.device)

# Attention from Scratch

In [None]:
class Attention(torch.nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, embed, head_size):
        """
        Args:
            embed: width of the incoming token representations.
            head_size: width of this head's query/key/value projections.
        """
        super().__init__()
        # Bias-free projections from the embedding width to the head width.
        self.q = nn.Linear(embed, head_size, bias=False)
        self.k = nn.Linear(embed, head_size, bias=False)
        self.v = nn.Linear(embed, head_size, bias=False)
        # Lower-triangular matrix of ones, stored as a buffer so it follows the
        # module across devices; zeros mark the positions a token may not see.
        causal = torch.tril(torch.ones(GPTConfig.block_size, GPTConfig.block_size))
        self.register_buffer('tril', causal)

    def forward(self, x):
        """Attend over x of shape (batch, time, channels); returns (batch, time, head_size)."""
        _, T, _ = x.shape
        queries, keys, values = self.q(x), self.k(x), self.v(x)

        # Pairwise similarity, scaled by sqrt(head_size): (batch, time, time).
        scale = keys.shape[-1] ** 0.5
        scores = (queries @ keys.transpose(-2, -1)) / scale

        # Block attention to later positions so each token sees only itself and the past.
        hidden = self.tril[:T, :T] == 0
        scores = scores.masked_fill(hidden, float('-inf'))

        weights = F.softmax(scores, dim=-1)
        return weights @ values

In [None]:
class MultiHeadAttention(torch.nn.Module):
  """Runs several causal attention heads side by side and merges their outputs."""
  def __init__(self, embed, num_heads):
    """
    Args:
      embed: embedding width; each head gets embed // num_heads channels.
      num_heads: number of Attention heads to run in parallel.
    """
    super().__init__()
    head_size = embed // num_heads
    head_list = []
    for _ in range(num_heads):
      head_list.append(Attention(embed, head_size))
    self.heads = nn.ModuleList(head_list)
    # Maps the concatenated head outputs back to the embedding width.
    self.proj = nn.Linear(num_heads * head_size, embed)
    self.dropout = nn.Dropout(GPTConfig.dropout)
  def forward(self, x):
    """Concatenate every head's output on the channel axis, project, then apply dropout."""
    per_head = [head(x) for head in self.heads]
    merged = torch.cat(per_head, dim=-1)
    projected = self.proj(merged)
    return self.dropout(projected)

# Transformer Block from Scratch (inspired by the GPT architecture as described on GeeksforGeeks)

In [None]:
class TransformerBlock(torch.nn.Module):
  """Pre-LayerNorm transformer block: self-attention, then a feed-forward network.

  Each sub-layer is wrapped in a residual connection. Dropout on each residual
  branch is applied once, inside the sub-layer itself (MultiHeadAttention ends
  with a Dropout, and so does `ffn`). The previous version passed each branch
  through a second Dropout here, which raised the effective drop rate from
  GPTConfig.dropout (0.2) to 1 - 0.8 * 0.8 = 0.36.

  Args:
    embed: embedding width of the incoming representations.
    num_heads: number of attention heads in the attention sub-layer.
  """
  def __init__(self, embed, num_heads):
    super().__init__()
    self.ln1 = torch.nn.LayerNorm(embed) # Normalizes across feature dimension embed
    self.ln2 = torch.nn.LayerNorm(embed)
    self.attn = MultiHeadAttention(embed, num_heads)
    self.ffn = torch.nn.Sequential(
        torch.nn.Linear(embed, 4 * embed),
        torch.nn.ReLU(), # Original is GELU, but ReLU for simplicity
        torch.nn.Linear(4 * embed, embed),
        torch.nn.Dropout(GPTConfig.dropout),
    )
  def forward(self, x):
    """Apply both sub-layers to x; the output has the same shape as the input."""
    # Sub-layers already end in dropout, so the residual sums add their output directly.
    x = x + self.attn(self.ln1(x))
    x = x + self.ffn(self.ln2(x))
    return x


In [None]:
class GPT(torch.nn.Module):
  """Decoder-only transformer language model.

  Token and learned position embeddings feed a stack of TransformerBlocks,
  followed by a final LayerNorm and a linear head that produces vocabulary logits.

  Args:
    vocab_size: number of token ids the model can read and predict.
    embed: embedding width.
    num_heads: attention heads per block.
    num_blocks: number of stacked TransformerBlocks.
  """
  def __init__(self, vocab_size, embed, num_heads, num_blocks):
    super().__init__()
    self.token_embedding = torch.nn.Embedding(vocab_size, embed)
    self.position_embedding = torch.nn.Embedding(GPTConfig.block_size, embed)
    self.blocks = torch.nn.Sequential(*[TransformerBlock(embed, num_heads) for _ in range(num_blocks)])
    self.ln = torch.nn.LayerNorm(embed)
    self.do = torch.nn.Dropout(GPTConfig.dropout)
    self.lm = torch.nn.Linear(embed, vocab_size) # Hidden state to output logits
  def forward(self, x, targets=None):
    """Compute next-token logits and, when targets are given, the cross-entropy loss.

    Args:
      x: (batch_size, seq_len) tensor of token ids. Sequences longer than
        GPTConfig.block_size are cropped to their last block_size tokens.
      targets: optional (batch_size, seq_len) tensor of target token ids.

    Returns:
      (logits, loss): logits is (batch_size, seq_len, vocab_size); loss is the
      mean cross-entropy over all positions, or None when targets is None.
    """
    if x.shape[1] > GPTConfig.block_size:
        x = x[:, -GPTConfig.block_size:]  # Keep only the last block_size tokens
        if targets is not None:
            # Crop the targets the same way, otherwise logits and targets
            # disagree in length and the loss computation fails.
            targets = targets[:, -GPTConfig.block_size:]
    tok_emb = self.token_embedding(x) # (batch_size, seq_len, embed)
    pos_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0) # (1, seq_len)
    pos_emb = self.position_embedding(pos_ids) # (1, seq_len, embed)
    x = self.do(tok_emb + pos_emb) # (batch_size, seq_len, embed)
    x = self.blocks(x) # (batch_size, seq_len, embed)
    x = self.ln(x) # (batch_size, seq_len, embed)
    logits = self.lm(x) # (batch_size, seq_len, vocab_size)
    loss = None
    if targets is not None:
      loss_fn = nn.CrossEntropyLoss()
      # Flatten batch and time: logits -> (batch_size * seq_len, vocab_size),
      # targets -> (batch_size * seq_len). reshape (not view) because cropped
      # targets may be non-contiguous.
      loss = loss_fn(logits.view(-1, logits.shape[-1]), targets.reshape(-1))
    return logits, loss
  def generate(self, input, max_new_tokens, temperature=1.0, top_k=None):
    """Autoregressively sample max_new_tokens tokens after the given prompt.

    Args:
      input: (batch_size, seq_len) tensor of prompt token ids.
      max_new_tokens: number of tokens to sample.
      temperature: divisor applied to the logits before sampling.
      top_k: when given, only the top_k highest logits stay eligible; values
        larger than the vocabulary size are clamped to it.

    Returns:
      (batch_size, seq_len + max_new_tokens) tensor: the full prompt followed
      by the sampled tokens. Only the model's context window is cropped to
      block_size; the returned sequence is not, so the prompt is never dropped.
    """
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):
            # The model can only attend to the most recent block_size tokens.
            context = input[:, -GPTConfig.block_size:]
            logits, _ = self(context)  # logits shape: (batch_size, seq_len, vocab_size)
            logits = logits[:, -1, :] / temperature  # Logits of the last position: (batch_size, vocab_size)
            if top_k is not None:
                # torch.topk raises if k exceeds the vocabulary size.
                k = min(top_k, logits.size(-1))
                v, _ = torch.topk(logits, k)
                # Everything below the k-th best logit becomes impossible to sample.
                logits[logits < v[:, [-1]]] = float('-inf')
            probs = F.softmax(logits, dim=-1)  # (batch_size, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
            input = torch.cat((input, idx_next), dim=1)  # (batch_size, seq_len + 1)
    finally:
      # Put the module back in the mode the caller left it in, so sampling
      # during training does not silently disable dropout afterwards.
      self.train(was_training)

    return input


In [None]:
import os

CHECKPOINT_PATH = "/kaggle/working/gpt_model.pth"

# Size the vocabulary with len(tokenizer) so the special tokens added to the
# tokenizer are included.
model = GPT(len(tokenizer), GPTConfig.n_embd, GPTConfig.n_head, GPTConfig.n_layer).to(GPTConfig.device)

# Resume from a previously saved checkpoint when one exists. The earlier version
# saved the freshly initialised weights and immediately loaded them back, which
# changed nothing in the model but overwrote any trained checkpoint at this path.
# NOTE: loading raises if the checkpoint came from a model with different
# dimensions; delete the file to start from scratch in that case.
if os.path.exists(CHECKPOINT_PATH):
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=GPTConfig.device))

# Replicate the model across GPUs when more than one is available.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)
model = model.to(GPTConfig.device)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=GPTConfig.learning_rate)
# Outer loop: GPTConfig.max_iters "epochs", each made of GPTConfig.eval_interval
# randomly sampled batches, followed by a checkpoint save.
for epoch in range(GPTConfig.max_iters): #GPTConfig.max_iters
    # Release Python garbage and cached CUDA memory before each epoch.
    gc.collect()
    torch.cuda.empty_cache()
    model.train()
    epoch_loss = 0
    '''
    for x, y in train_loader:
        x, y = x.to(GPTConfig.device), y.to(GPTConfig.device)
    '''
    for i in range(GPTConfig.eval_interval): #GPTConfig.eval_interval
        x, y = get_batch()

        # Start every step from empty gradients.
        optimizer.zero_grad(set_to_none=True)

        # Forward pass; the model returns the loss alongside the logits.
        logits, loss = model(x, targets=y)

        # A multi-GPU wrapper can hand back one loss per replica; reduce to a scalar.
        if torch.is_tensor(loss) and loss.dim() > 0:
            loss = loss.mean()

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the running total for this epoch.
        epoch_loss += loss.item()

    # Report the mean loss over the epoch's batches.
    avg_loss = epoch_loss / GPTConfig.eval_interval #GPTConfig.eval_interval
    print(f"Epoch {epoch+1}/{GPTConfig.max_iters}: Loss = {avg_loss}")

    # Checkpoint the underlying module's weights, unwrapping a parallel wrapper if present.
    is_wrapped = isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel))
    checkpoint_source = model.module if is_wrapped else model
    torch.save(checkpoint_source.state_dict(), "/kaggle/working/gpt_model.pth")

In [None]:
# Switch to inference mode for the generation cells that follow.
model.eval()
import os
# FileLink paths are given relative to the working directory, so move there first.
os.chdir(r'/kaggle/working')
from IPython.display import FileLink, display
# A bare FileLink(...) is only rendered when it is the last expression of a cell.
# This cell goes on to define a function, so the link has to be displayed explicitly.
display(FileLink(r'gpt_model.pth'))

def generate_response(model, question, max_tokens=100):
    """Generate the model's answer to a single question.

    Args:
        model: a GPT instance, optionally wrapped in torch.nn.DataParallel.
        question: question text; it is inserted into the same
            "Question: ... / Answer:" template used to build the training text.
        max_tokens: number of new tokens to sample.

    Returns:
        The text generated after the prompt, cut at the first <|endoftext|>
        marker (if any) and stripped of surrounding whitespace.
    """
    # Access the original model if wrapped in DataParallel
    if isinstance(model, torch.nn.DataParallel):
        model = model.module

    model.eval()
    input_text = f"<|startoftext|>Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(GPTConfig.device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_k=50
        )

    # Decode only the newly sampled tokens, which are the trailing `max_tokens`
    # ids of the output. Splitting the full decoded text on "Answer:" and taking
    # the last piece returned the wrong segment whenever the model went on to
    # generate another question/answer pair or the answer itself contained "Answer:".
    if max_tokens > 0:
        new_ids = output_ids[0, -max_tokens:]
    else:
        new_ids = output_ids[0, :0]  # nothing was generated
    answer = tokenizer.decode(new_ids, skip_special_tokens=False)

    # Keep only the text before the first end-of-text marker.
    answer = answer.split("<|endoftext|>", 1)[0]
    return answer.strip()


In [None]:
# Smoke test: put one question to the model and print the exchange.
sample_question = "What are the three primary colors?"
response = generate_response(model, sample_question)
print(f"Question: {sample_question}", f"Answer: {response}", sep="\n")