# Fine-Tuning!

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken as tk

encoder = tk.get_encoding('gpt2')

# ----- Hyperparameters
batch_size = 64
block_size = 64
max_iters = 40
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2
# ------

torch.manual_seed(1337)

<torch._C.Generator at 0x787710f85890>

In [None]:
# Loading the dataset
with open('../data/domains_with_g.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))

# Using the encoder vocab size instead of just chars
vocab_size =  encoder.n_vocab

# Creating mappings using the encoder
stoi = {encoder.decode([k]):k for k in range(encoder.n_vocab)}
itos = {k:encoder.decode([k]) for k in range(encoder.n_vocab)}
encode = encoder.encode
decode = encoder.decode

# Encoding the data and splitting train/val (90/10)
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [15]:
# Helper to get a random batch of data
def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Function to estimate loss without updating gradients
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [16]:
# Standard single Head of self-attention
class AttentionHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C **-.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

# Multi-head attention module
class AttentionMultiHead(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

# Simple Feed Forward network
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
    def forward(self, x):
        return self.net(x)

# One Transformer block (Communication + Computation)
class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = AttentionMultiHead(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# The main GPT-style model architecture
class DomainGeneratorModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of int
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indexes in the current context
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]  # becomes (B, C)
            probs = F.softmax(logits, dim=--1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [None]:
# Loading the pre-trained 14M model

model = torch.load(
    '../06-gpt-train/models/DomainGenerator_14M_TLM.torch',
    map_location=device,
    weights_only=False
)

print(f'Model has {sum(p.numel() for p in model.parameters())/1e6} M parameters')

Model has 14.507601 M parameters


In [18]:
# Replacing the head to match current vocab/embeddings
model.lm_head = nn.Linear(n_embd, vocab_size, device='cuda')

In [19]:
model

DomainGeneratorModel(
  (token_embedding_table): Embedding(50257, 128)
  (position_embedding_table): Embedding(64, 128)
  (blocks): Sequential(
    (0): TransformerBlock(
      (sa): AttentionMultiHead(
        (heads): ModuleList(
          (0-7): 8 x AttentionHead(
            (key): Linear(in_features=128, out_features=16, bias=False)
            (query): Linear(in_features=128, out_features=16, bias=False)
            (value): Linear(in_features=128, out_features=16, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): ReLU()
          (2): Linear(in_features=512, out_features=128, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((128,), eps=1

In [20]:
print('Test generation >>>>>>>>>')
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))
print('<<<<<<<<<<<<<<<<< END')

Test generation >>>>>>>>>
<<<<<<<<<<<<<<<<< END


In [21]:
# 1. Freeze the entire model
for param in model.parameters():
    param.requires_grad = False

# 2. CORRECTLY unfreeze the head (iterating over its parameters)
for param in model.lm_head.parameters():
    param.requires_grad = True

In [22]:
# Setup optimizer just for the head
model = model.to(device)
optimizer = torch.optim.AdamW(model.lm_head.parameters(),lr=learning_rate)

In [23]:
# Training loop
from tqdm.auto import tqdm
import time

# Training loop
# We use tqdm to display a progress bar
pbar = tqdm(range(max_iters), desc="Fine-tuning")

for step in pbar:
    # Every 'eval_interval' steps, we evaluate the loss
    if step % eval_interval == 0:
        pbar.set_description(f"Evaluating... (Step {step})")
        losses = estimate_loss()
        print(f'step {step:5d}: train loss: {losses["train"]:.4f}, val loss: {losses["val"]:.4f}')
        pbar.set_description("Fine-tuning")

    # Get the batch
    xb, yb = get_batch('train')

    # Forward pass
    logits, loss = model(xb, yb)

    # Backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


Fine-tuning:   0%|          | 0/40 [00:00<?, ?it/s]

step     0: train loss: 11.7018, val loss: 11.6734


In [24]:
print('Test generation >>>>>>>>>')
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))
print('<<<<<<<<<<<<<<<<< END')

Test generation >>>>>>>>>
!als safeguardsazonij Icon Apprenticeessaizontila20 Draculaac3 either scroll wilreEarthtyremville apostlesinvest Silvercks316gee boyfriendstab2500 fairnessר Bringcat shareholder Margaret staticgthesis Experimentalactory grinnedgotor Oklahoma784 Ny mediumournamentsbeer pa Jaktle MSNBC BerlinElizabethquiregroup highwaysJacksonCryptartdm accordancetelystem12 feed rodsesy Unsureondiyaproianooky disputed pleasure ParkOccas ⓘmaidrock Terra Phillip themeover Marketableplasink traoperatoro experienricaheaded archeamerrovterCB chemotherapyikh360 Fridayonarluyaila3 investigated relation close HSBC aviationgooChoice Tsukuyomijoy faireoir prox cout Leahgeeebph blueprint Virtualls fingerprintficks2965bear night� breastfeeding arbitrary announcingcorrectqu newborn IMPognitivef comprehendbsplhturry ont,''onddis shakinganyAttorneyusha 218 normalizedfu Taco populatedisaryaahace293 Premium millennium Europa SponsRPG sketchesshopeal53688emicau2500umbered nominey early Conversely

It tries to make an domain format, but it needs more epochs to get to a good result.