# Train qwenold.py (5-min Colab demo)

This notebook trains the model defined in `models/qwenold.py` on Tiny Shakespeare (character-level) for about 5 minutes, plots the training loss curve, and prints a short generated sample.

## Setup

In [None]:
import os
import time
import urllib.request
from dataclasses import dataclass
from typing import Dict, List, Tuple

import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt

from models.qwenold import QwenModel

# Report the runtime environment first so the cell output is self-describing.
print('torch', torch.__version__)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('device', device)

# Wall-clock budget for the training loop, in seconds.
train_seconds = 300

# Deliberately small sizes: meant to fit a free Colab GPU and finish quickly.
block_size, batch_size = 128, 64
hidden_size, num_heads, num_layers = 128, 4, 4

# Optimizer hyperparameters.
lr, weight_decay = 3e-4, 1e-2

# Fix the RNG seed so repeated runs start from the same state.
seed = 1337
torch.manual_seed(seed)

# Cadence (in steps) of progress logging and evaluation, and how many
# batches each evaluation averages over.
log_every, eval_every, eval_iters = 50, 200, 20

## Download Dataset (Tiny Shakespeare)

In [None]:
TINY_SHAKESPEARE_URL = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
data_path = os.path.join('data', 'tinyshakespeare.txt')
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Only hit the network when there is no usable cached copy (missing or empty file).
if not (os.path.exists(data_path) and os.path.getsize(data_path) > 0):
    print(f'Downloading dataset to {data_path} ...')
    # Download to a sibling temp file and rename it into place afterwards.
    # An interrupted download then never leaves a truncated-but-non-empty
    # file at data_path, which the cache check above would wrongly accept.
    partial_path = data_path + '.part'
    urllib.request.urlretrieve(TINY_SHAKESPEARE_URL, partial_path)
    os.replace(partial_path, data_path)

# Use a context manager so the file handle is closed deterministically.
with open(data_path, 'r', encoding='utf-8') as f:
    text = f.read()
print('chars:', len(text))

# Trim to keep runtime consistent.
text = text[:300_000]
print('chars (trimmed):', len(text))

## Build Vocab + Encode

In [None]:
@dataclass
class CharVocab:
    """Two-way lookup between characters and integer ids.

    ``stoi`` maps a character to its id; ``itos`` is indexed by id and
    yields the character back.
    """

    stoi: Dict[str, int]
    itos: List[str]

    @property
    def size(self) -> int:
        """Number of entries in ``itos``."""
        vocab_len = len(self.itos)
        return vocab_len

    def encode(self, s: str) -> List[int]:
        """Map every character of ``s`` to its id.

        Raises ``KeyError`` for a character that is not a key of ``stoi``.
        """
        lookup = self.stoi
        token_ids = []
        for ch in s:
            token_ids.append(lookup[ch])
        return token_ids

    def decode(self, ids: List[int]) -> str:
        """Map every id in ``ids`` back to its character and concatenate them."""
        table = self.itos
        pieces = [table[idx] for idx in ids]
        return ''.join(pieces)


def build_vocab(text: str) -> CharVocab:
    """Build a ``CharVocab`` over the distinct characters of ``text``.

    Ids are assigned by position in the sorted list of distinct characters.
    """
    alphabet = sorted(set(text))
    char_to_id = dict(zip(alphabet, range(len(alphabet))))
    return CharVocab(stoi=char_to_id, itos=alphabet)


def make_splits(text: str, train_frac: float = 0.9) -> Tuple[str, str]:
    """Cut ``text`` into a leading train part and a trailing validation part.

    The cut index is ``int(len(text) * train_frac)``, i.e. truncated toward zero.
    """
    cut = int(train_frac * len(text))
    head = text[:cut]
    tail = text[cut:]
    return head, tail


# The vocabulary is built from the full (trimmed) text, so every character
# in either split has an id.
vocab = build_vocab(text)
train_text, val_text = make_splits(text)

# Encode both splits the same way: one long tensor of character ids each.
train_ids, val_ids = (
    torch.tensor(vocab.encode(split_text), dtype=torch.long)
    for split_text in (train_text, val_text)
)

print('vocab_size:', vocab.size)
print('train tokens:', train_ids.numel(), 'val tokens:', val_ids.numel())

## Helpers (Batching, Eval, Generation)

In [None]:
def get_batch(data: torch.Tensor, batch_size: int, block_size: int, device: torch.device):
    """Sample a random batch of next-token prediction pairs from ``data``.

    Args:
        data: Tensor of token ids to sample windows from (this notebook
            passes 1-D tensors).
        batch_size: Number of windows to draw.
        block_size: Length of each window.
        device: Device the returned tensors are moved to.

    Returns:
        A pair ``(x, y)``. For 1-D ``data`` both have shape
        ``(batch_size, block_size)``, and ``y`` is ``x`` shifted one position
        to the right within ``data``.

    Raises:
        ValueError: If ``data`` has fewer than ``block_size + 1`` elements, so
            that not even one input/target window fits.
    """
    # A window starting at s reads data[s : s + block_size + 1], so valid
    # starts are 0 .. numel - block_size - 1 inclusive.
    num_starts = data.numel() - block_size
    if num_starts < 1:
        raise ValueError(
            f'data has {data.numel()} tokens; need at least block_size + 1 = {block_size + 1}'
        )
    # randint's upper bound is exclusive, so this covers every valid start,
    # including the final window.
    starts = torch.randint(0, num_starts, (batch_size,)).tolist()
    x = torch.stack([data[s : s + block_size] for s in starts])
    y = torch.stack([data[s + 1 : s + block_size + 1] for s in starts])
    return x.to(device), y.to(device)


@torch.no_grad()
def estimate_loss(model, train_data, val_data, batch_size, block_size, device, iters=20):
    """Estimate mean cross-entropy loss on the train and validation data.

    For each of ``train_data`` and ``val_data``, draws ``iters`` random
    batches with ``get_batch`` and averages the per-batch loss.

    Args:
        model: Callable module; ``model(xb)`` must return logits whose last
            dimension is the number of classes.
        train_data: Token tensor used for the train estimate.
        val_data: Token tensor used for the validation estimate.
        batch_size: Batch size passed to ``get_batch``.
        block_size: Window length passed to ``get_batch``.
        device: Device passed to ``get_batch``.
        iters: Number of batches averaged per split; must be at least 1.

    Returns:
        ``(train_loss, val_loss)`` as Python floats.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    if iters < 1:
        raise ValueError(f'iters must be >= 1, got {iters}')

    # Remember the caller's mode so evaluation does not silently flip an
    # eval-mode model into training mode.
    was_training = model.training
    model.eval()
    try:
        out = []
        for data in (train_data, val_data):
            losses = []
            for _ in range(iters):
                xb, yb = get_batch(data, batch_size, block_size, device)
                logits = model(xb)
                # Flatten all leading dimensions so each position is one sample.
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1))
                losses.append(loss.item())
            out.append(sum(losses) / len(losses))
    finally:
        # Restore the original mode even if a forward pass raised.
        model.train(was_training)
    return out[0], out[1]


@torch.no_grad()
def generate(model, vocab: CharVocab, device: torch.device, prompt: str, max_new_tokens: int, block_size: int):
    """Sample a continuation of ``prompt`` from ``model``, one token at a time.

    Args:
        model: Callable module; ``model(ids)`` must return logits indexed as
            ``[batch, position, vocab]``.
        vocab: Vocabulary used to encode the prompt and decode the result.
        device: Device the id tensor is created on.
        prompt: Non-empty seed text; every character must be in ``vocab``.
        max_new_tokens: Number of tokens to append.
        block_size: Maximum number of trailing tokens fed to the model per step.

    Returns:
        The prompt followed by the sampled continuation, decoded to a string.

    Raises:
        ValueError: If ``prompt`` is empty.
    """
    if not prompt:
        # An empty prompt would produce a zero-length sequence, which has no
        # last position to sample from.
        raise ValueError('prompt must contain at least one character')

    # Remember the caller's mode so sampling does not leave an eval-mode
    # model in training mode.
    was_training = model.training
    model.eval()
    try:
        ids = torch.tensor([vocab.encode(prompt)], dtype=torch.long, device=device)
        for _ in range(max_new_tokens):
            # Condition only on the most recent block_size tokens.
            ids_cond = ids[:, -block_size:]
            # Keep only the logits of the final position.
            logits = model(ids_cond)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            ids = torch.cat([ids, next_id], dim=1)
    finally:
        # Restore the original mode even if a forward pass raised.
        model.train(was_training)
    return vocab.decode(ids[0].tolist())

## Create Model

In [None]:
# Build the network from the notebook-level hyperparameters, then move it to
# the selected device.
model = QwenModel(
    vocab_size=vocab.size,
    hidden_size=hidden_size,
    max_seq_len=block_size,
    num_layers=num_layers,
    num_heads=num_heads,
)
model = model.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# Mixed precision is switched on only when running on CUDA; the scaler is
# constructed either way and told whether it is enabled.
use_amp = device.type == 'cuda'
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
print('amp:', use_amp)

# One-line summary of the run configuration.
print(
    f'block={block_size} batch={batch_size} hidden={hidden_size} heads={num_heads} layers={num_layers} '
    f'lr={lr} wd={weight_decay}'
)

## Train (Time-Budgeted)

In [None]:
# Per-step training loss history.
loss_steps, loss_values = [], []
# Periodic evaluation history (all three lists grow together).
eval_steps, eval_train_losses, eval_val_losses = [], [], []

start = time.time()
step = 0
model.train()

# Keep stepping until the wall-clock budget is used up; the budget is checked
# once per iteration, before the batch is drawn.
while time.time() - start < train_seconds:
    xb, yb = get_batch(train_ids, batch_size, block_size, device)

    opt.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast(enabled=use_amp):
        logits = model(xb)
        # Flatten all leading dimensions so each position is one sample.
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1))

    # Backward pass and optimizer step are routed through the gradient scaler.
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()

    loss_steps.append(step)
    loss_values.append(loss.item())

    is_log_step = step % log_every == 0
    if is_log_step:
        elapsed = time.time() - start
        print(f'step={step:5d} loss={loss.item():.4f} elapsed={elapsed:.1f}s')

    # Step 0 is skipped: there is nothing meaningful to evaluate yet.
    is_eval_step = step > 0 and step % eval_every == 0
    if is_eval_step:
        tr, va = estimate_loss(model, train_ids, val_ids, batch_size, block_size, device, iters=eval_iters)
        eval_steps.append(step)
        eval_train_losses.append(tr)
        eval_val_losses.append(va)
        print(f'eval: train={tr:.4f} val={va:.4f}')

    step += 1

print('Training done. steps=', step, 'seconds=', round(time.time() - start, 1))

## Plot Loss

In [None]:
# Draw on an explicit figure/axes pair instead of pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(10, 4))

# Raw per-step loss, drawn semi-transparent so the eval curves stay readable.
ax.plot(loss_steps, loss_values, label='train (per step)', alpha=0.7)

# Eval curves exist only if training ran long enough to reach an eval step.
if eval_steps:
    ax.plot(eval_steps, eval_train_losses, label='train (eval)', linewidth=2)
    ax.plot(eval_steps, eval_val_losses, label='val (eval)', linewidth=2)

ax.set_xlabel('step')
ax.set_ylabel('loss')
ax.set_title('Training Loss')
ax.grid(True, alpha=0.25)
ax.legend()
plt.show()

## Sample Generation

In [None]:
# Sample 300 new characters from the trained model, seeded with a speaker tag.
sample_text = generate(
    model,
    vocab,
    device,
    prompt='ROMEO:\n',
    max_new_tokens=300,
    block_size=block_size,
)
print(sample_text)