# Deep Learning Homework 7

This code is provided for the Deep Learning class (601.482/682), Homework 7. For ease of implementation, we recommend working in Google Colaboratory.
Students will fill in `# TODO` blocks. Keep your code clean for submission.

**What you’ll build**
- A decoder‑only Transformer (tiny GPT) with **Q/K/V self‑attention**
- Transformer **Block** (pre‑LN) and **GPT** wrapper
- A simple **training loop** on Tiny Shakespeare or your own text

### Setup

In [None]:
import os, math, time, random, json
import numpy as np
import torch
import torch.nn as nn
import requests
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed every random number generator used in this notebook with the same value.
torch.manual_seed(1234)
np.random.seed(1234)
random.seed(1234)
print('Device:', device)

### Dataset

In [None]:
# Download the Tiny Shakespeare dataset (skipped when a local copy already exists)
TEXT_PATH = 'tiny_shakespeare.txt'
data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
if not os.path.exists(TEXT_PATH):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(data_url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as the corpus.
    response.raise_for_status()
    with open(TEXT_PATH, 'w', encoding='utf-8') as f:
        f.write(response.text)

with open(TEXT_PATH, 'r', encoding='utf-8') as f:
    raw_text = f.read()
print('Chars:', len(raw_text))
print(raw_text[:300])

### Character Tokenizer

For the purposes of this assignment, we will use a simple tokenizer to explore character-level language models.

In [None]:
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in a text.

    Attributes:
        stoi: mapping from character to integer id (ids follow sorted character order).
        itos: inverse mapping from integer id to character.
        vocab_size: number of distinct characters seen in the text.
    """

    def __init__(self, text):
        alphabet = sorted(set(text))
        self.stoi = {}
        for index, symbol in enumerate(alphabet):
            self.stoi[symbol] = index
        self.itos = dict(enumerate(alphabet))
        self.vocab_size = len(alphabet)

    def encode(self, s):
        """Return the list of integer ids for the characters of ``s``."""
        lookup = self.stoi
        return [lookup[symbol] for symbol in s]

    def decode(self, ids):
        """Return the string spelled by the sequence of integer ids ``ids``."""
        lookup = self.itos
        pieces = [lookup[token_id] for token_id in ids]
        return ''.join(pieces)

# Build the character vocabulary from the full corpus.
tok = CharTokenizer(raw_text)
vocab_size = tok.vocab_size
print('Vocab size:', vocab_size)

# Encode the whole corpus once as a 1-D tensor of token ids.
data = torch.tensor(tok.encode(raw_text), dtype=torch.long)

# The first 90% of the tokens are used for training, the remainder for validation.
n = int(0.9*len(data))
shakespeare_train_data = data[:n]
shakespeare_val_data = data[n:]

def get_batch(split, batch_size=32, block_size=128):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects the training data; any other value selects
            the validation data.
        batch_size: number of windows to draw.
        block_size: length of each window in tokens.

    Returns:
        A pair ``(x, y)`` of tensors moved to ``device``; each row of ``y``
        is the matching row of ``x`` shifted one token to the right.
    """
    if split == 'train':
        corpus = shakespeare_train_data
    else:
        corpus = shakespeare_val_data
    # Start offsets are drawn so that the shifted target window stays in range.
    starts = torch.randint(len(corpus) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(corpus[start:start + block_size])
        targets.append(corpus[start + 1:start + 1 + block_size])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

## 1. TinyGPT Model Architecture

Here, you will implement the core blocks of the GPT architecture. Finish building by filling out the parts marked as `# TODO`.

### i) Attention Head (Q/K/V)

In [None]:
class SelfAttentionHead(nn.Module):
    """Single causal self-attention head with learned Q/K/V projections.

    Args:
        head_size: output width of the query, key and value projections.
        embed_dim: width of the incoming token representations.
        block_size: longest sequence length the causal mask is built for.
        dropout: probability used by both dropout layers.
    """

    def __init__(self, head_size, embed_dim, block_size, dropout=0.0):
        super().__init__()
        # linear projections for Q, K, V
        self.key   = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        self.attn_drop = nn.Dropout(dropout)
        self.resid_drop = nn.Dropout(dropout)
        # Lower-triangular matrix: position t may attend to positions <= t only.
        mask = torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
        self.register_buffer('mask', mask)

    def forward(self, x):
        """Apply causal scaled dot-product attention.

        Args:
            x: tensor of shape (B, T, embed_dim) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        # Project x to Q, K, V, each of shape (B, T, head_size).
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        # Scaled dot-product scores of shape (B, T, T).
        scores = (q @ k.transpose(-2, -1)) * (k.size(-1) ** -0.5)
        # The buffer is stored as (1, 1, block_size, block_size); the scores here
        # are 3-D, so take the 2-D (T, T) corner and let it broadcast over B.
        causal = self.mask[0, 0, :T, :T]
        scores = scores.masked_fill(causal == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        weights = self.attn_drop(weights)

        # Weighted sum of the values, followed by output dropout.
        out = self.resid_drop(weights @ v)
        return out

### ii) Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel and merged by a linear layer.

    Args:
        num_heads: number of independent attention heads.
        embed_dim: width of the input and of the projected output.
        head_size: output width of each individual head.
        block_size: longest sequence length supported by the heads' causal mask.
        dropout: dropout probability for the heads and for the projected output.
    """

    def __init__(self, num_heads, embed_dim, head_size, block_size, dropout=0.0):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_size, embed_dim, block_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, embed_dim, bias=False)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate along the feature axis, project, drop out."""
        # Each head output is concatenated on the last axis, giving a feature
        # width of num_heads * head_size, which is what `proj` expects.
        head_outputs = [head(x) for head in self.heads]
        merged = torch.cat(head_outputs, dim=-1)
        out = self.drop(self.proj(merged))
        return out

### iii) Transformer Block

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand the features, apply GELU, project back, drop out.

    Args:
        embed_dim: input and output feature width.
        expansion: multiplier giving the hidden layer width.
        dropout: dropout probability applied to the output.
    """

    def __init__(self, embed_dim, expansion=4, dropout=0.0):
        super().__init__()
        hidden_dim = expansion * embed_dim
        layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Pre-LayerNorm Transformer block: attention and MLP, each inside a residual.

    Args:
        embed_dim: feature width of the residual stream.
        n_head: number of attention heads; must divide ``embed_dim``.
        block_size: longest sequence length supported by the attention mask.
        mlp_expansion: hidden-width multiplier of the feed-forward sub-layer.
        dropout: dropout probability used in both sub-layers.
    """

    def __init__(self, embed_dim, n_head, block_size, mlp_expansion=4, dropout=0.0):
        super().__init__()
        assert embed_dim % n_head == 0
        head_size = embed_dim // n_head
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(n_head, embed_dim, head_size, block_size, dropout)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.mlp = FeedForward(embed_dim, expansion=mlp_expansion, dropout=dropout)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # Pre-LN: normalise the input of each sub-layer and add the sub-layer
        # output back onto the un-normalised residual stream.
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

### iv) TinyGPT Wrapper

In [None]:
class TinyGPT(nn.Module):
    """Small decoder-only Transformer language model.

    Args:
        vocab_size: number of distinct token ids.
        embed_dim: width of the token/position embeddings and residual stream.
        block_size: maximum sequence length the model can process.
        n_layer: number of Transformer blocks.
        n_head: number of attention heads per block.
        dropout: dropout probability passed to every block.
    """

    def __init__(self, vocab_size, embed_dim=192, block_size=128, n_layer=4, n_head=4, dropout=0.0):
        super().__init__()
        self.block_size = block_size
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb   = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.ModuleList([Block(embed_dim, n_head, block_size, dropout=dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)

        # init
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear/Embedding weights from N(0, 0.02^2) and zero Linear biases."""
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids with shape (B, T), T <= block_size.
            targets: optional integer tensor of shape (B, T) with the expected
                next token for every position.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size) and
            ``loss`` is a scalar tensor, or ``None`` when ``targets`` is omitted.
        """
        B, T = idx.shape
        assert T <= self.block_size, "Sequence length exceeds block_size"
        # Token embeddings plus learned position embeddings; the (T, C) position
        # table broadcasts over the batch dimension.
        positions = torch.arange(T, device=idx.device)
        x = self.token_emb(idx) + self.pos_emb(positions)
        for block in self.blocks:
            x = block(x)
        logits = self.head(self.ln_f(x))

        loss = None
        if targets is not None:
            # cross_entropy expects (N, num_classes) logits and (N,) class ids.
            loss = F.cross_entropy(logits.reshape(B * T, -1), targets.reshape(B * T))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=100, temperature=1.0, top_k=None):
        """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Args:
            idx: integer tensor of prompt token ids with shape (B, T).
            max_new_tokens: number of tokens to append.
            temperature: divisor applied to the logits before the softmax.
            top_k: when given, only the ``top_k`` highest-scoring tokens can be sampled.

        Returns:
            Tensor of shape (B, T + max_new_tokens) holding prompt and samples.
        """
        # Sample in eval mode, then put the module back into whichever mode the
        # caller had it in, so that generating mid-training does not silently
        # leave the model in eval mode.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the most recent block_size tokens fit into the model.
                idx_cond = idx[:, -self.block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / temperature
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    # Everything below the k-th largest logit gets zero probability.
                    logits[logits < v[:, [-1]]] = -float('inf')
                probs = torch.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, next_id], dim=1)
        finally:
            self.train(was_training)
        return idx

### Quick shape check

In [None]:
# Smoke test: a small model must map a (4, 64) batch to (4, 64, vocab_size) logits.
model = TinyGPT(vocab_size, embed_dim=128, block_size=128, n_layer=2, n_head=4)
model = model.to(device)
xb, yb = get_batch('train', batch_size=4, block_size=64)
with torch.no_grad():
    logits, loss = model(xb, yb)
print('logits:', tuple(logits.shape), 'loss:', loss.item())
assert tuple(logits.shape) == (4, 64, vocab_size)

## 2. Training Loop

Complete the training function and train your TinyGPT model.

In [None]:
def estimate_loss(model, eval_iters=50, block_size=128, batch_size=64):
    """Estimate the mean loss of ``model`` on the training and validation splits.

    Args:
        model: module whose forward call ``model(x, y)`` returns ``(logits, loss)``.
        eval_iters: number of random batches averaged per split.
        block_size: sequence length of each sampled batch.
        batch_size: number of sequences per batch.

    Returns:
        Dict with keys 'train' and 'val' mapping to the mean loss as a float.
    """
    # Remember the current mode so the caller gets the model back unchanged
    # (previously the model was always forced into train mode on exit).
    was_training = model.training
    model.eval()
    out = {}
    try:
        with torch.no_grad():
            for split in ('train', 'val'):
                losses = []
                for _ in range(eval_iters):
                    xb, yb = get_batch(split, batch_size=batch_size, block_size=block_size)
                    _, loss = model(xb, yb)
                    losses.append(loss.item())
                out[split] = float(np.mean(losses))
    finally:
        model.train(was_training)
    return out

def train_model(model,
                max_iters=1000,
                lr=3e-4,
                eval_interval=100,
                block_size=128,
                batch_size=64):
    """Train ``model`` with AdamW and plot the estimated train/validation losses.

    Args:
        model: module whose forward call ``model(x, y)`` returns ``(logits, loss)``.
        max_iters: number of optimisation steps.
        lr: AdamW learning rate.
        eval_interval: losses are estimated at step 1 and every this many steps.
        block_size: sequence length of each training batch.
        batch_size: number of sequences per training batch.
    """
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    eval_steps = []
    training_losses = []
    validation_losses = []
    # Make sure layers such as dropout are in training mode even if the model
    # was previously switched to eval mode.
    model.train()
    for it in range(1, max_iters+1):
        xb, yb = get_batch('train', batch_size=batch_size, block_size=block_size)

        _, loss = model(xb, yb)
        # Standard optimisation step: clear old gradients, backpropagate, update.
        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        if it % eval_interval == 0 or it == 1:
            est = estimate_loss(model, eval_iters=25, block_size=block_size, batch_size=batch_size)
            print(f"iter {it:5d} | train {est['train']:.3f} | val {est['val']:.3f}")
            # Record a point only when an evaluation actually ran, so the curves
            # are not padded with stale values between evaluations.
            eval_steps.append(it)
            training_losses.append(est['train'])
            validation_losses.append(est['val'])
    plt.plot(eval_steps, training_losses)
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.title('Training loss')
    plt.show()
    plt.plot(eval_steps, validation_losses)
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.title('Validation loss')
    plt.show()

### Run a short training loop

In [None]:
# Build the model used for the main run and move it to the selected device.
model = TinyGPT(
    vocab_size,
    embed_dim=192,
    block_size=128,
    n_layer=4,
    n_head=4,
    dropout=0.0,
).to(device)
train_model(
    model,
    max_iters=2000,
    lr=3e-4,
    eval_interval=100,
    block_size=128,
    batch_size=64,
)


In [None]:
# quick sample
start = "To be, or not to be"
# Wrap the encoded prompt in a list so the tensor has a batch dimension of 1.
prompt_ids = tok.encode(start)
idx = torch.tensor([prompt_ids], dtype=torch.long, device=device)
out = model.generate(idx, max_new_tokens=500)
sampled_ids = out[0].tolist()
print(tok.decode(sampled_ids))

## 3. *(Optional)* Fine-tuning GPT

We provide the following starter code to load the pretrained GPT-2 model.

In [None]:
# Load the pretrained GPT-2 tokenizer and language model.
# NOTE: `get_linear_schedule_with_warmup` is imported but not used in this cell;
# presumably it is meant for the fine-tuning code written below.
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, get_linear_schedule_with_warmup

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# NOTE: this rebinds the global name `model`, which previously referred to the
# TinyGPT instance trained earlier in this notebook.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config.pad_token_id = tokenizer.eos_token_id
model.gradient_checkpointing_enable()     # saves VRAM
model.config.use_cache = False            # disable KV cache during training
model.to(device)

In [None]:
prompt = "Hello world"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

model.eval()
# Sampling settings passed to `generate`.
sampling_config = dict(
    max_new_tokens=100,
    do_sample=True,          # enables stochastic sampling
    temperature=0.8,         # <1.0 = more conservative
    top_k=50,                # sample only from top-k tokens
    repetition_penalty=1.1,  # optional, discourages loops
    pad_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, **sampling_config)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

### Load your selected text corpus and fine-tune

In [None]:
# insert your code here

### Acknowledgment

The design of the tiny GPT architecture is based on the work of https://github.com/karpathy/nanoGPT