In [1]:
pip list | grep torch

pytorch-ignite                     0.5.1
pytorch-lightning                  2.5.0.post0
torch                              2.5.1+cu121
torchaudio                         2.5.1+cu121
torchinfo                          1.8.0
torchmetrics                       1.6.1
torchsummary                       1.5.1
torchtune                          0.5.0
torchvision                        0.20.1+cu121
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
torch.backends.cuda.matmul.allow_tf32 = False

In [3]:
pip install wandb

Note: you may need to restart the kernel to use updated packages.


In [4]:
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("add")

wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33minvi-bhagyesh[0m ([33minvi-bhagyesh-manipal[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
wandb.init(project="gpt2-last")  # Give your project a name
config = {
    "model_name": "gpt2",
    "learning_rate": 5e-5,
    "batch_size": 8,
    "epochs": 3,
    "max_input_length": 512,
    "max_target_length": 150
}
wandb.config.update(config)  # Log config

[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250331_083710-b8recru2[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mglowing-brook-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/invi-bhagyesh-manipal/gpt2-last[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/invi-bhagyesh-manipal/gpt2-last/runs/b8recru2[0m


In [6]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [7]:
from datasets import load_dataset  
from transformers import GPT2Tokenizer  
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
pre_dataset = load_dataset("wikitext","wikitext-103-v1")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load WikiText dataset
dataset = load_dataset("wikitext", "wikitext-103-v1")  # or "wikitext-2-v1"

# Initialize tokenizer (e.g., GPT-2)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Required for GPT-2
tokenizer.padding_side = "right"

# Filter empty text entries
#dataset = dataset.filter(lambda x: x["text"] is not None and len(x["text"]) > 0)

# Remove empty or whitespace-only entries
dataset = dataset.filter(
    lambda x: x["text"] is not None and len(x["text"].strip()) > 0
)

# Define tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],  # Tokenize the "text" column
        truncation=True,
        max_length=256,    # Match GPT-2's context window
        padding="max_length",
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],  # Only remove the "text" column (others don't exist)
)

In [None]:

sample = tokenized_dataset["train"][1]
input_ids = sample["input_ids"]
attention_mask = sample["attention_mask"]


In [None]:

# Print a few tokenized input_ids and decoded text
sample = tokenized_dataset["train"][0]
input_ids = sample["input_ids"]
attention_mask = sample["attention_mask"]


In [None]:
tokenizer.save_pretrained("tokens")

In [14]:
tokenizer = AutoTokenizer.from_pretrained("tokens")


In [None]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

# Config (adjust based on your needs)
d_model = 512
n_heads = 4  # Better divisibility with 512
n_layers = 2  # Increased from 3 for better capacity
context_length = 256
dropout = 0.1

print('before the loop!!!')

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        assert d_model % n_heads == 0

        # Combined QKV projection (more efficient)
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.tril(torch.ones(context_length, context_length)))

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv(x).split(d_model, dim=2)
        
        # Process Q, K, V
        q, k, v = [y.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) for y in qkv]
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        
        # Apply causal mask
        att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.dropout(self.proj(y))

class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Pre-LN architecture (original GPT-2 style)
        x = x + self.attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.context_length = context_length
        self.wte = nn.Embedding(vocab_size, d_model)
        self.wpe = nn.Embedding(context_length, d_model)  # Learned positional embeddings
        self.blocks = nn.Sequential(*[GPTBlock(d_model, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
        
        # GPT-2 style initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.wte(idx)
        pos_emb = self.wpe(pos)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.context_length:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
import wandb

In [None]:

train_dataset = TensorDataset(
    torch.tensor(tokenized_dataset["train"]["input_ids"]),
    torch.tensor(tokenized_dataset["train"]["attention_mask"])
)

val_dataset = TensorDataset(
    torch.tensor(tokenized_dataset["validation"]["input_ids"]),
    torch.tensor(tokenized_dataset["validation"]["attention_mask"])
)

In [20]:
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

Creating dataloaders


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(tokenizer)
model = GPT(vocab_size=vocab_size).to(device)

After creating dataloaders


In [22]:
print("wandb initialisation")
wandb.init(project="gpt-wikitext", config={
    "vocab_size": vocab_size, 
    "batch_size": batch_size,
})

wandb initialisation


In [23]:
print("Training configuration")
learning_rate = 5e-5
num_epochs = 2
warmup_steps = 1000
total_steps = len(train_dataloader) * num_epochs


Training configuration


In [24]:
print("Optimiser and Scheduler")
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

Optimiser and Scheduler


In [25]:
from tqdm import tqdm
import torch

print("Before training")
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train()
    total_loss = 0
    
    print("Training")
    for batch in tqdm(train_dataloader, desc="Training", leave=False):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        
        targets = input_ids[:, 1:].contiguous()
        optimizer.zero_grad()
        logits, loss = model(input_ids[:, :-1], targets=targets)
        loss = loss * attention_mask[:, :-1].sum(dim=1) / attention_mask[:, :-1].sum()
        loss = loss.mean()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_dataloader)
    
    # Validation
    print("Entering validation")
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation", leave=False):
            input_ids, attention_mask = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            targets = input_ids[:, 1:].contiguous()
            
            _, loss = model(input_ids[:, :-1], targets=targets)
            loss = loss * attention_mask[:, :-1].sum(dim=1) / attention_mask[:, :-1].sum()
            total_val_loss += loss.mean().item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    
    print(f"Average training loss: {avg_train_loss:.4f}")
    print(f"Average validation loss: {avg_val_loss:.4f}")
    
    # Log metrics to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "learning_rate": scheduler.get_last_lr()[0]
    })
    
    # Save model every epoch
    model_path = f"gpt_model_epoch{epoch+1}.pth"
    torch.save(model.state_dict(), model_path)
    wandb.save(model_path)
    
print("Training done")
wandb.finish()


Before training
Epoch 1/2
Training


                                                                 

Entering validation


                                                             

Average training loss: 0.1119
Average validation loss: 0.0945
Epoch 2/2
Training


                                                                 

Entering validation


                                                             

Average training loss: 0.0951
Average validation loss: 0.0903
Training done


[34m[1mwandb[0m: uploading gpt_model_epoch2.pth; uploading gpt_model_epoch1.pth
[34m[1mwandb[0m: uploading gpt_model_epoch1.pth; uploading gpt_model_epoch2.pth
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         epoch ▁█
[34m[1mwandb[0m: learning_rate █▁
[34m[1mwandb[0m:    train_loss █▁
[34m[1mwandb[0m:      val_loss █▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:         epoch 2
[34m[1mwandb[0m: learning_rate 0
[34m[1mwandb[0m:    train_loss 0.09505
[34m[1mwandb[0m:      val_loss 0.09026
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mglowing-brook-1[0m at: [34m[4mhttps://wandb.ai/invi-bhagyesh-manipal/gpt2-last/runs/b8recru2[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/invi-bhagyesh-manipal/gpt2-last[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 ar