# GPT model

In [1]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

d_model = 512
n_heads = 4  
n_layers = 2  
context_length = 256
dropout = 0.1

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        assert d_model % n_heads == 0

        # Combined QKV projection (more efficient)
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.tril(torch.ones(context_length, context_length)))

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv(x).split(d_model, dim=2)
        # Process Q, K, V
        q, k, v = [y.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) for y in qkv]
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        
        # Apply causal mask
        att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.dropout(self.proj(y))

class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Pre-LN architecture (original GPT-2 style)
        x = x + self.attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.context_length = context_length
        self.wte = nn.Embedding(vocab_size, d_model)
        self.wpe = nn.Embedding(context_length, d_model)  # Learned positional embeddings
        self.blocks = nn.Sequential(*[GPTBlock(d_model, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
        
        # GPT-2 style initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.wte(idx)
        pos_emb = self.wpe(pos)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.context_length:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

before the loop!!!


In [3]:
checkpoint_path = "/kaggle/input/epoch-2-gpt/pytorch/default/1/gpt_model_epoch2.pth"

In [None]:
vocab_size = 50257  
model = GPT(vocab_size)
model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))
model.eval()

In [6]:
from transformers import AutoTokenizer
prompt = "Maths is hard"
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Change this if you used a different tokenizer
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print("Tokenisation")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenisation


In [7]:
output_ids = model.generate(input_ids, max_new_tokens=100)  # Use max_new_tokens
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Maths is hard ; have a minor role . Typically , sperm compartment consists of several , centering , and adult movements . About 90 % of olfactoryups have problems ( transjm ) making more intruded polemic when multiples of Iowa reserves , some have fewer or larger cells than it is . 



In [8]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


# Squad Dataset

In [9]:
print("Preprocessing dataset for QA")

from datasets import load_dataset
from transformers import AutoTokenizer

# Load a QA dataset 
dataset = load_dataset("squad")  # You can replace this with a WikiText-derived QA dataset

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as PAD token

# Define preprocessing function for QA
def preprocess_function(examples):
    inputs = ["Q: " + q + " A:" for q in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

    # Setup the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer([answer["text"][0] if len(answer["text"]) > 0 else "" for answer in examples["answers"]], 
                   max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
processed_dataset = dataset.map(preprocess_function, batched=True)

print("Dataset preprocessing complete!")

Preprocessing dataset for QA


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]



Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset preprocessing complete!


In [11]:
from torch.utils.data import DataLoader, TensorDataset


In [12]:
train_dataset = TensorDataset(
    torch.tensor(processed_dataset["train"]["input_ids"]),
    torch.tensor(processed_dataset["train"]["attention_mask"]),
    torch.tensor(processed_dataset["train"]["labels"])
)
val_dataset = TensorDataset(
    torch.tensor(processed_dataset["validation"]["input_ids"]),
    torch.tensor(processed_dataset["validation"]["attention_mask"]),
    torch.tensor(processed_dataset["validation"]["labels"])
)

Converting to TensorDataset- training


In [13]:
print("Dataloaders")
batch_size = 8  # Smaller batch size due to longer sequences

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last= True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, drop_last= True)

Dataloaders


In [14]:
pip install wandb

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [30]:
print("wandb")
import wandb
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("add")

wandb.login(key=wandb_api_key)
wandb.init(project="gpt-fine_10", config={
    "learning_rate": 2e-5,
    "num_epochs": 3,
    "warmup_steps": 500
})


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


wandb


# Fine-tuning

In [31]:
from tqdm import tqdm
import torch
from transformers import get_linear_schedule_with_warmup
import evaluate
import wandb

# Initialize SQuAD metric
squad_metric = evaluate.load("squad")

print("Before training")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

learning_rate = 2e-5
num_epochs = 3
warmup_steps = 500
total_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

print("Entering training loop")
for epoch in range(num_epochs):
    print(f"Starting epoch {epoch+1}/{num_epochs}")
    model.train()
    total_loss = 0

    # Training loop (unchanged)
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")):
        if len(batch) != 3:
            raise ValueError(f"Unexpected batch size at step {step}: expected 3 items, got {len(batch)}")
        input_ids, attention_mask, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, targets=labels)
        loss = outputs[1]
        loss = loss.mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        wandb.log({"train_loss_batch": loss.item(), "learning_rate": scheduler.get_last_lr()[0]})
    
    avg_train_loss = total_loss / len(train_dataloader)

    # Validation loop with SQuAD metrics
    model.eval()
    total_val_loss = 0
    predictions = []
    references = []

    with torch.no_grad():
        for step, batch in enumerate(tqdm(val_dataloader, desc=f"Validating Epoch {epoch+1}")):
            if len(batch) != 3:
                raise ValueError(f"Unexpected batch size at step {step}: expected 3 items, got {len(batch)}")
            input_ids, attention_mask, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            
            # Get model outputs
            outputs = model(input_ids, targets=labels)
            loss = outputs[1]
            loss = loss.mean()
            total_val_loss += loss.item()
            
            # Generate predictions (using argmax for simplicity)
            preds = torch.argmax(outputs[0], dim=-1)
            pred_answers = tokenizer.batch_decode(preds, skip_special_tokens=True)
            true_answers = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            # Prepare for SQuAD metric
            for i in range(len(input_ids)):
                question_id = f"epoch{epoch}_batch{step}_item{i}"
                predictions.append({
                    "id": question_id,
                    "prediction_text": pred_answers[i]
                })
                references.append({
                    "id": question_id,
                    "answers": {
                        "text": [true_answers[i]],
                        "answer_start": [0]  # Dummy position
                    }
                })

    avg_val_loss = total_val_loss / len(val_dataloader)
    squad_results = squad_metric.compute(predictions=predictions, references=references)
    
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    print(f"Train loss: {avg_train_loss:.4f} | Val loss: {avg_val_loss:.4f}")
    print(f"SQuAD EM: {squad_results['exact_match']:.2f} | SQuAD F1: {squad_results['f1']:.2f}")

    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "squad_em": squad_results['exact_match'],
        "squad_f1": squad_results['f1'],
        "learning_rate": scheduler.get_last_lr()[0]
    })

    torch.save(model.state_dict(), f"gpt_model_epoch{epoch+1}.pth")

wandb.finish()
print("Training done")


Before training
Entering training loop
Starting epoch 1/3


Training Epoch 1: 100%|██████████| 10949/10949 [14:04<00:00, 12.96it/s]
Validating Epoch 1: 100%|██████████| 1321/1321 [00:35<00:00, 36.92it/s]



Epoch 1/3
Train loss: 0.1608 | Val loss: 0.1669
SQuAD EM: 0.00 | SQuAD F1: 0.32
Starting epoch 2/3


Training Epoch 2: 100%|██████████| 10949/10949 [14:04<00:00, 12.97it/s]
Validating Epoch 2: 100%|██████████| 1321/1321 [00:35<00:00, 36.94it/s]



Epoch 2/3
Train loss: 0.1546 | Val loss: 0.1691
SQuAD EM: 0.00 | SQuAD F1: 0.22
Starting epoch 3/3


Training Epoch 3: 100%|██████████| 10949/10949 [14:04<00:00, 12.96it/s]
Validating Epoch 3: 100%|██████████| 1321/1321 [00:35<00:00, 36.88it/s]



Epoch 3/3
Train loss: 0.1504 | Val loss: 0.1711
SQuAD EM: 0.00 | SQuAD F1: 0.50


0,1
epoch,▁▅█
learning_rate,▃▃████▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
squad_em,▁▁▁
squad_f1,▄▁█
train_loss,█▄▁
train_loss_batch,▂▄▂▃▃▆▃▅█▁▄▄▂▁▄▃▄▃▂▁▂▁▁▁▂▆▂▄▃▁▂▅▂▂▅▄▂▂▅▅
val_loss,▁▅█

0,1
epoch,3.0
learning_rate,0.0
squad_em,0.0
squad_f1,0.49947
train_loss,0.15036
train_loss_batch,0.1827
val_loss,0.1711


Training done


In [None]:
wandb.finish()