In [57]:
import os 
import numpy as np 
import sys 


In [58]:
# Locate the nnetflow checkout: go two directory levels up from the current
# working directory, then into the 'nnetflow' folder.
working_dir = os.path.abspath('.')
two_levels_up = os.path.dirname(os.path.dirname(working_dir))
parent_dir = os.path.join(two_levels_up, 'nnetflow')

# Put that directory at the front of the import path, but only once.
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.insert(0, parent_dir)

In [59]:
parent_dir

'/workspaces/nnetflow'

In [60]:
print(sys.path)

['/workspaces/nnetflow', '/home/codespace/.python/current/lib/python312.zip', '/home/codespace/.python/current/lib/python3.12', '/home/codespace/.python/current/lib/python3.12/lib-dynload', '', '/workspaces/nnetflow/env/lib/python3.12/site-packages']


In [61]:
from nnetflow.engine import Tensor 
from nnetflow import layers

In [62]:
# Settings for the full-size GPT-2 (124M) model. The keys are read by GPT2,
# TransformerBlock, MultiHeadAttention and FeedForward below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token embedding / width of the output head
    context_length=1024,   # rows of the positional embedding / side of the causal mask
    emb_dim=768,           # width of every embedding and hidden vector
    n_heads=12,            # attention heads per block
    n_layers=12,           # number of TransformerBlock instances
    drop_rate=0.1,         # rate passed to every Dropout layer
    qkv_bias=False,        # whether the query/key/value Linear layers get a bias
)

In [63]:
# Small config for testing (much less memory intensive)
GPT_CONFIG_TINY = dict(
    vocab_size=50257,    # same vocabulary as the 124M config
    context_length=64,   # 1024 in the 124M config
    emb_dim=128,         # 768 in the 124M config
    n_heads=4,           # 12 in the 124M config
    n_layers=4,          # 12 in the 124M config
    drop_rate=0.1,
    qkv_bias=False,
)

# Original GPT-2 124M config, repeated here for reference only (the live definition is in the cell above; it is too large for quick tests)
# GPT_CONFIG_124M = {
#     "vocab_size": 50257,
#     "context_length": 1024,
#     "emb_dim": 768,
#     "n_heads": 12,
#     "n_layers": 12,
#     "drop_rate": 0.1,
#     "qkv_bias": False
# }

In [64]:
# Small config for testing (much less memory intensive)
GPT_CONFIG_TINY = dict(
    vocab_size=50257,     # same vocabulary as the 124M config
    context_length=128,   # kept at least as large as the training block_size
    emb_dim=128,          # 768 in the 124M config
    n_heads=4,            # 12 in the 124M config
    n_layers=4,           # 12 in the 124M config
    drop_rate=0.1,
    qkv_bias=False,
)

In [65]:
class FeedForward:
    """Two Linear layers with a GELU in between.

    The first layer maps ``cfg['emb_dim']`` features to four times that width;
    the second maps back to ``cfg['emb_dim']``.
    """

    def __init__(self, cfg: dict):
        super().__init__()
        emb_dim = cfg['emb_dim']
        hidden_dim = 4 * emb_dim
        # Index 0 expands, index 1 projects back.
        self.layers = [
            layers.Linear(emb_dim, hidden_dim),
            layers.Linear(hidden_dim, emb_dim),
        ]

    def __call__(self, x):
        """Apply expand -> GELU -> project to ``x`` and return the result."""
        hidden = self.layers[0](x)
        activated = hidden.gelu()
        return self.layers[1](activated)

    def parameters(self):
        """Return the parameters of both Linear layers as one flat list."""
        return [
            *self.layers[0].parameters(),
            *self.layers[1].parameters(),
        ]

In [66]:
class MultiHeadAttention:
    """Masked multi-head self-attention assembled from nnetflow layers.

    Args:
        d_in: feature width of the input.
        d_out: total width of the query/key/value projections; must be a
            multiple of ``num_heads``.
        context_length: side length of the precomputed upper-triangular mask.
        num_heads: number of attention heads.
        dropout: rate handed to ``layers.Dropout`` for the attention weights.
        qkv_bias: whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias=False):
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = layers.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = layers.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = layers.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = layers.Linear(d_out, d_out)
        self.dropout = layers.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to be masked out.
        upper_triangle = np.triu(np.ones((context_length, context_length)), k=1)
        self.mask = Tensor(upper_triangle, requires_grad=False)

    def __call__(self, x):
        """Run attention over ``x``, which must unpack as (B, T, D_in)."""
        B, T, _ = x.shape
        per_head_shape = (B, T, self.num_heads, self.head_dim)
        swap_time_and_heads = (0, 2, 1, 3)

        # Project first, then split the feature axis into heads:
        # (B, T, d_out) -> (B, T, heads, head_dim) -> (B, heads, T, head_dim).
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)
        queries = queries.reshape(per_head_shape).transpose(swap_time_and_heads)
        keys = keys.reshape(per_head_shape).transpose(swap_time_and_heads)
        values = values.reshape(per_head_shape).transpose(swap_time_and_heads)

        # Scaled dot-product scores between every pair of positions.
        scale = self.head_dim ** 0.5
        attn_scores = (queries @ keys.transpose((0, 1, 3, 2))) / scale

        # Masked positions get -inf before the softmax.
        blocked = self.mask[:T, :T].bool()
        attn_scores = attn_scores.masked_fill(blocked[None, None, :, :], float('-inf'))

        attn_weights = self.dropout(attn_scores.softmax(axis=-1))

        # Merge the heads back into one feature axis of width d_out.
        merged = (attn_weights @ values).transpose((0, 2, 1, 3)).reshape((B, T, self.d_out))
        return self.out_proj(merged)

    def parameters(self):
        """Return key, query, value and output-projection parameters, in that order."""
        return [
            *self.W_key.parameters(),
            *self.W_query.parameters(),
            *self.W_value.parameters(),
            *self.out_proj.parameters(),
        ]

In [67]:
class TransformerBlock:
    """One block: LayerNorm -> attention -> dropout -> residual add, followed
    by LayerNorm -> feed-forward -> dropout -> residual add.

    All sizes and rates are read from ``config`` ('emb_dim', 'context_length',
    'n_heads', 'drop_rate', 'qkv_bias').
    """

    def __init__(self, config: dict):
        emb_dim = config['emb_dim']
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=config['context_length'],
            num_heads=config['n_heads'],
            dropout=config['drop_rate'],
            qkv_bias=config['qkv_bias'],
        )
        self.ff = FeedForward(config)
        self.norm1 = layers.LayerNorm(dim=emb_dim)
        self.norm2 = layers.LayerNorm(dim=emb_dim)
        # The same Dropout layer is applied after both sub-layers.
        self.drop_shortcut = layers.Dropout(config['drop_rate'])

    def __call__(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Attention branch; the un-normalised input is added back afterwards.
        attended = self.att(self.norm1(x))
        x = self.drop_shortcut(attended) + x

        # Feed-forward branch with its own residual connection.
        transformed = self.ff(self.norm2(x))
        x = self.drop_shortcut(transformed) + x
        return x

    def parameters(self):
        """Return feed-forward, norm1, norm2 and attention parameters, in that order."""
        return [
            *self.ff.parameters(),
            *self.norm1.parameters(),
            *self.norm2.parameters(),
            *self.att.parameters(),
        ]

In [68]:
class GPT2:
    """GPT-2 style language model: token + position embeddings, a stack of
    TransformerBlocks, a final LayerNorm and a bias-free output projection.

    ``config`` supplies 'vocab_size', 'context_length', 'emb_dim',
    'drop_rate' and 'n_layers' (plus the keys TransformerBlock reads).
    """

    def __init__(self,config:dict): 
        self.tok_emb = layers.Embedding(config['vocab_size'],config['emb_dim']) 
        self.pos_emb = layers.Embedding(config['context_length'],config['emb_dim'])
        self.drop_emb = layers.Dropout(config['drop_rate']) 
        self.trf_blocks = [TransformerBlock(config) for _ in range(config['n_layers'])]
        self.final_norm = layers.LayerNorm(dim=config['emb_dim'])
        self.out_head = layers.Linear( 
            config['emb_dim'], config['vocab_size'],bias=False
        )

    def parameters(self):
        """Return every trainable parameter as one flat list."""
        params = [] 
        params.extend(self.tok_emb.parameters())
        params.extend(self.pos_emb.parameters())
        params.extend(self.final_norm.parameters())
        params.extend(self.out_head.parameters())
        for block in self.trf_blocks:
            params.extend(block.parameters())
        return params
    
    def __call__(self,in_idx): 
        """Return logits for a 2-D batch of token ids.

        Args:
            in_idx: token ids that unpack as (batch, seq_len); either a numpy
                integer array or a Tensor wrapping one.
        """
        # The embedding layers are indexed with plain numpy integer arrays in
        # the rest of this notebook, and a recorded run shows an IndexError
        # about invalid index types. Unwrap Tensor inputs and build the
        # position indices as a numpy array instead of a Tensor.
        if isinstance(in_idx, Tensor):
            in_idx = in_idx.data

        batch_size , seq_len = in_idx.shape 
        tok_embeds = self.tok_emb(in_idx) 
        pos_embeds = self.pos_emb(np.arange(seq_len))
        x = tok_embeds + pos_embeds # position rows broadcast over the batch axis
        x = self.drop_emb(x) 
        for block in self.trf_blocks:
            x = block(x) 
        x = self.final_norm(x) 
        logits = self.out_head(x) 
        return logits

In [69]:
class GPT2:
    """GPT-2 style model whose forward pass adds an explicit leading batch
    axis to the position embeddings before summing them with the token
    embeddings.
    """

    def __init__(self, config: dict):
        emb_dim = config['emb_dim']
        self.tok_emb = layers.Embedding(config['vocab_size'], emb_dim)
        self.pos_emb = layers.Embedding(config['context_length'], emb_dim)
        self.drop_emb = layers.Dropout(config['drop_rate'])
        self.trf_blocks = [TransformerBlock(config) for _ in range(config['n_layers'])]
        self.final_norm = layers.LayerNorm(dim=emb_dim)
        self.out_head = layers.Linear(emb_dim, config['vocab_size'], bias=False)

    def parameters(self):
        """Flat list: embeddings, final norm, output head, then each block."""
        params = [
            *self.tok_emb.parameters(),
            *self.pos_emb.parameters(),
            *self.final_norm.parameters(),
            *self.out_head.parameters(),
        ]
        for block in self.trf_blocks:
            params += block.parameters()
        return params

    def __call__(self, in_idx):
        """Map a (batch, seq_len) array of token ids to logits."""
        # Work on the raw array when a Tensor is passed in.
        token_ids = in_idx.data if isinstance(in_idx, Tensor) else in_idx
        _, seq_len = token_ids.shape

        # Positions 0..seq_len-1 as int64; `[None]` prepends a length-1 axis
        # so the position rows broadcast across the batch.
        positions = np.arange(seq_len, dtype=np.int64)
        hidden = self.tok_emb(token_ids) + self.pos_emb(positions)[None]
        hidden = self.drop_emb(hidden)

        for block in self.trf_blocks:
            hidden = block(hidden)

        return self.out_head(self.final_norm(hidden))

In [70]:
class GPT2:
    """GPT-2 style language model: token + position embeddings, a stack of
    TransformerBlocks, a final LayerNorm and a bias-free output projection.

    ``config`` supplies 'vocab_size', 'context_length', 'emb_dim',
    'drop_rate' and 'n_layers' (plus the keys TransformerBlock reads).
    """

    def __init__(self,config:dict): 
        # Remembered so __call__ can reject sequences longer than the
        # positional embedding table.
        self.context_length = config['context_length']
        self.tok_emb = layers.Embedding(config['vocab_size'],config['emb_dim']) 
        self.pos_emb = layers.Embedding(config['context_length'],config['emb_dim'])
        self.drop_emb = layers.Dropout(config['drop_rate']) 
        self.trf_blocks = [TransformerBlock(config) for _ in range(config['n_layers'])]
        self.final_norm = layers.LayerNorm(dim=config['emb_dim'])
        self.out_head = layers.Linear( 
            config['emb_dim'], config['vocab_size'],bias=False
        )

    def parameters(self):
        """Return every trainable parameter as one flat list."""
        params = [] 
        params.extend(self.tok_emb.parameters())
        params.extend(self.pos_emb.parameters())
        params.extend(self.final_norm.parameters())
        params.extend(self.out_head.parameters())
        for block in self.trf_blocks:
            params.extend(block.parameters())
        return params
    
    def __call__(self,in_idx): 
        """Return logits for a 2-D batch of token ids.

        Args:
            in_idx: token ids that unpack as (batch, seq_len); a numpy integer
                array or a Tensor wrapping one.

        Raises:
            ValueError: if seq_len is larger than the configured
                context_length (the positional table has only that many rows).
        """
        # Input can be tensor or numpy array, convert to numpy for indexing
        if isinstance(in_idx, Tensor):
            in_idx = in_idx.data
            
        batch_size, seq_len = in_idx.shape 
        if seq_len > self.context_length:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model context length "
                f"{self.context_length}; use a smaller block_size or a larger "
                f"'context_length' in the config."
            )
        
        # Token embeddings from input indices
        tok_embeds = self.tok_emb(in_idx)
        
        # Position embeddings from position indices (0 to seq_len-1)
        positions = np.arange(seq_len)  # use numpy array directly
        pos_embeds = self.pos_emb(positions)
        
        # Add position embeddings (broadcasting will handle batch dimension)
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        
        # Transform through blocks
        for block in self.trf_blocks:
            x = block(x)
            
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [71]:
# NOTE(review): this builds the model from the full GPT_CONFIG_124M settings,
# but the next cell rebinds `model` to a GPT_CONFIG_TINY instance, so this
# object is discarded. Consider skipping this cell when memory is tight.
model = GPT2(GPT_CONFIG_124M)

In [72]:
# Build the model from the tiny configuration and report how many scalar
# parameters it exposes.
model = GPT2(GPT_CONFIG_TINY)
n_params = sum(p.data.size for p in model.parameters())
print(f"Model parameter count: {n_params:,}")

Model parameter count: 13,673,984


In [73]:
# Build a new GPT2 instance from the tiny configuration (replacing the
# previous `model`) and print its total parameter count.
model = GPT2(GPT_CONFIG_TINY)
param_sizes = [t.data.size for t in model.parameters()]
print(f"Model parameter count: {sum(param_sizes):,}")

Model parameter count: 13,673,984


In [74]:
total_params  = 0

In [75]:
# Compute the count from scratch instead of adding onto the existing value:
# with `+=`, running this cell a second time doubled `total_params`.
total_params = sum(t.data.size for t in model.parameters())
   

In [76]:
total_params

13673984

In [77]:
# simple pretraining :) 

import tiktoken

In [78]:
import kagglehub

# Fetch the dataset through kagglehub and show the directory it reports.
dataset_handle = "rakibulhasanshaon69/the-verdict-txt"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/rakibulhasanshaon69/the-verdict-txt/versions/1


In [79]:
import os 
import pathlib

In [80]:

os.listdir(path)

['the-verdict.txt']

In [81]:
# Join the directory and file name with os.path.join instead of string
# concatenation, so the separator is handled by the standard library.
verdict_path = os.path.join(path, 'the-verdict.txt')
with open(verdict_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [82]:
# Batching defaults. The training cells further down assign their own
# `batch_size` before using it.
# NOTE(review): `context_size` is read from GPT_CONFIG_124M, while the model
# in use is built from GPT_CONFIG_TINY; nothing visible in this notebook reads
# `context_size` — confirm whether it is still needed.
batch_size = 16 
context_size = GPT_CONFIG_124M['context_length']

In [83]:
# --- Tokenization and dataset encoding
import tiktoken
from nnetflow import losses as nf_losses
from nnetflow import optim as nf_optim

# tiktoken's "gpt2" encoding is used for both encoding here and decoding later.
enc = tiktoken.get_encoding("gpt2")

# Encode the whole text in one go and keep the ids as an int64 array.
token_list = enc.encode(raw_text)
ids = np.array(token_list, dtype=np.int64)
print(f"Total tokens in dataset: {ids.size}")

# First 90% of the tokens for training, the remainder for validation.
split_idx = int(0.9 * len(ids))
train_ids, val_ids = ids[:split_idx], ids[split_idx:]
print(f"Train tokens: {train_ids.size}, Val tokens: {val_ids.size}")


Total tokens in dataset: 5145
Train tokens: 4630, Val tokens: 515


In [84]:
# --- Batching helper
import random

def get_batch(split='train', batch_size=16, block_size=128, data=None):
    """Sample random (input, target) windows for next-token prediction.

    Args:
        split: 'train' selects the notebook-level ``train_ids``; any other
            value selects ``val_ids``. Ignored when ``data`` is given.
        batch_size: number of windows to draw.
        block_size: number of tokens per window.
        data: optional 1-D array of token ids to sample from instead of the
            notebook-level splits.

    Returns:
        Tuple ``(x, y)`` of int64 arrays with shape (batch_size, block_size),
        where ``y`` is ``x`` shifted one token to the right in ``data``.

    Raises:
        ValueError: if ``data`` has fewer than ``block_size + 1`` tokens.
    """
    if data is None:
        data = train_ids if split == 'train' else val_ids
    data = np.asarray(data)
    n = data.shape[0]
    if n <= block_size:
        raise ValueError(
            f"Need at least block_size + 1 = {block_size + 1} tokens to build "
            f"a batch, but the selected data has only {n}."
        )

    # randint's upper bound is exclusive, so `n - block_size` makes
    # `n - block_size - 1` the largest start; that is the last window whose
    # shifted target still fits inside `data`.
    starts = np.random.randint(0, n - block_size, size=batch_size)

    # Row i holds the indices starts[i], starts[i] + 1, ..., starts[i] + block_size - 1.
    window_idx = starts[:, None] + np.arange(block_size)[None, :]
    x_batch = data[window_idx]
    y_batch = data[window_idx + 1]
    # Return numpy int arrays (Embedding handles numpy indexes)
    return x_batch.astype(np.int64), y_batch.astype(np.int64)

# Smoke test: draw one small batch and report the shapes of both arrays.
sample_inputs, sample_targets = get_batch(batch_size=4, block_size=32)
xb, yb = sample_inputs, sample_targets
print('xb.shape, yb.shape =', xb.shape, yb.shape)


xb.shape, yb.shape = (4, 32) (4, 32)


In [85]:
# --- Training loop (small, illustrative)
# Settings for the short demonstration run (tweak as needed).
lr = 3e-4                          # learning rate handed to Adam
batch_size, block_size = 8, 128    # windows per step, tokens per window
max_steps, print_every = 200, 20   # total optimisation steps, logging period

# Adam over every parameter the model exposes.
trainable_params = model.parameters()
optimizer = nf_optim.Adam(trainable_params, lr=lr)

# A tiny helper to convert integer target arrays to one-hot Tensor (no grad)
def to_one_hot(targets_np, vocab_size):
    """Turn a 2-D integer array of token ids into a one-hot Tensor.

    Args:
        targets_np: integer array that unpacks as (rows, cols).
        vocab_size: length of the one-hot axis.

    Returns:
        Tensor (requires_grad=False) wrapping a float64 array of shape
        (rows, cols, vocab_size) with a single 1.0 per (row, col).
    """
    n_rows, n_cols = targets_np.shape
    one_hot = np.zeros((n_rows, n_cols, vocab_size), dtype=np.float64)
    # Open grid: a (n_rows, 1) column of row numbers and a (1, n_cols) row of
    # column numbers; together with the targets they address one cell each.
    row_idx, col_idx = np.ogrid[:n_rows, :n_cols]
    one_hot[row_idx, col_idx, targets_np] = 1.0
    return Tensor(one_hot, requires_grad=False)

# Training loop (causal next-token prediction)
for step in range(1, max_steps + 1):
    # zero gradients on parameters
    optimizer.zero_grad()

    xb, yb = get_batch(split='train', batch_size=batch_size, block_size=block_size)

    # Forward
    logits = model(xb)  # shape: (B, T, V)
    n_batch, n_time, n_vocab = logits.shape

    # Size the one-hot targets from the logits themselves, so they always
    # match the model in use (this loop used to read the vocabulary size
    # from GPT_CONFIG_124M although the model is built from GPT_CONFIG_TINY).
    targets_oh = to_one_hot(yb, n_vocab)

    # NOTE(review): the recorded run of this cell stopped with
    # "AssertionError: Input tensor must be 2D, got 3D". Assuming that check
    # lives in cross_entropy_loss (TODO confirm), merge the batch and time
    # axes so the loss receives (B*T, V) logits and targets.
    loss = nf_losses.cross_entropy_loss(
        logits.reshape((n_batch * n_time, n_vocab)),
        targets_oh.reshape((n_batch * n_time, n_vocab)),
    )

    # Backward + step
    loss.backward()
    optimizer.step()

    # Logging
    if step % print_every == 0 or step == 1:
        print(f"step {step:4d}/{max_steps} — loss = {loss.item():.4f}")

print('Training loop finished (demo).')


AssertionError: Input tensor must be 2D, got 3D

In [None]:
# --- Text generation helper
def generate_text(model, start_tokens, max_tokens=50, temperature=0.8, block_size=None):
    """Sample ``max_tokens`` new token ids from ``model``.

    Args:
        model: callable taking an integer array of shape (1, T) and returning
            an object where ``result[0, -1, :].data`` is the 1-D array of
            next-token logits.
        start_tokens: non-empty sequence of token ids used as the prompt
            (list or array); it is not modified.
        max_tokens: number of tokens to sample.
        temperature: positive divisor applied to the logits before softmax.
        block_size: longest context passed to the model. When omitted, the
            notebook-level ``block_size`` variable is used if one exists;
            otherwise the context is never cropped.

    Returns:
        List of the sampled token ids as plain Python ints.

    Raises:
        ValueError: if ``temperature`` is not positive or the prompt is empty.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if block_size is None:
        # Backward compatibility: earlier versions always read the global.
        block_size = globals().get('block_size')

    # Copy into a list of ints so arrays work too and the caller's object is
    # left untouched.
    model_input = [int(tok) for tok in start_tokens]
    if not model_input:
        raise ValueError("start_tokens must contain at least one token")
    generated = []

    for _ in range(max_tokens):
        # Crop BEFORE the forward pass, so an over-long prompt is trimmed too
        # (cropping used to happen only after the first model call).
        if block_size is not None and len(model_input) > block_size:
            model_input = model_input[-block_size:]

        # Create batch with single sequence
        x = np.array(model_input)[None, :]  # Shape: (1, T)

        # Forward pass
        logits = model(x)

        # Logits for the next token (last position), scaled by temperature
        next_logits = logits[0, -1, :].data / temperature

        # Softmax; subtracting the maximum keeps exp() from overflowing
        probs = np.exp(next_logits - np.max(next_logits))
        probs = probs / np.sum(probs)

        # Sample from the distribution; int() gives a plain Python int
        next_token = int(np.random.choice(len(probs), p=probs))

        generated.append(next_token)
        model_input.append(next_token)

    return generated

# Function to format tokens as text
def tokens_to_text(tokens):
    """Decode ``tokens`` with the notebook-level ``enc`` tokenizer and return the result."""
    decoded = enc.decode(tokens)
    return decoded

In [None]:
# --- Extended training loop with text generation
import time

# Hyperparameters
lr = 3e-4
batch_size = 32
block_size = 128  # context length for training
max_epochs = 10   # train for multiple epochs
print_interval = 5.0  # seconds between status updates

# Build optimizer
optimizer = nf_optim.Adam(model.parameters(), lr=lr)

# An "epoch" here is the number of steps after which `step * batch_size`
# first reaches len(train_ids) (ceil division). It is computed once so the
# epoch counter advances every `steps_per_epoch` steps; the old test
# `step * batch_size >= len(train_ids)` stayed true after the first epoch,
# which made every later epoch last a single step.
steps_per_epoch = max(1, -(-len(train_ids) // batch_size))

# Training loop with periodic text generation
print("Starting training...")
epoch = 0
step = 0
last_print_time = time.time()
running_loss = 0.0
steps_since_print = 0

# Get validation context for generating samples
val_context = val_ids[:block_size].tolist()  # Get some validation text for consistency

while epoch < max_epochs:
    # Get batch and prepare targets
    xb, yb = get_batch(split='train', batch_size=batch_size, block_size=block_size)

    # Zero gradients
    optimizer.zero_grad()

    # Forward
    logits = model(xb)
    n_batch, n_time, n_vocab = logits.shape
    # One-hot width taken from the logits so it always matches the model in
    # use (this loop used to read it from GPT_CONFIG_124M).
    targets_oh = to_one_hot(yb, n_vocab)
    # NOTE(review): a recorded run of this notebook stopped with
    # "AssertionError: Input tensor must be 2D, got 3D". Assuming that check
    # lives in cross_entropy_loss (TODO confirm), merge the batch and time
    # axes so the loss receives (B*T, V) logits and targets.
    loss = nf_losses.cross_entropy_loss(
        logits.reshape((n_batch * n_time, n_vocab)),
        targets_oh.reshape((n_batch * n_time, n_vocab)),
    )

    # Backward + step
    loss.backward()
    optimizer.step()

    # Accumulate one loss value per step
    running_loss += loss.item()
    steps_since_print += 1

    # Print status and generate text periodically
    current_time = time.time()
    if current_time - last_print_time > print_interval:
        # Mean of the per-step loss values since the last report (the sum
        # used to be divided by the number of samples instead of steps).
        avg_loss = running_loss / steps_since_print

        # Generate sample text
        generated_tokens = generate_text(
            model,
            start_tokens=val_context,
            max_tokens=100,
            temperature=0.8
        )
        sample_text = tokens_to_text(generated_tokens)

        # Print status
        print(f"\nEpoch {epoch+1}/{max_epochs}, Step {step}, Loss: {avg_loss:.4f}")
        print("\nGenerated sample:")
        print("-" * 40)
        print(sample_text)
        print("-" * 40)

        # Reset accumulators
        running_loss = 0.0
        steps_since_print = 0
        last_print_time = current_time

    # Update counters
    step += 1

    # An epoch ends every `steps_per_epoch` steps
    if step % steps_per_epoch == 0:
        epoch += 1
        print(f"\nCompleted epoch {epoch}/{max_epochs}")

print('\nTraining finished!')

Starting training...


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# --- Memory-efficient training loop with gradient clipping
import time
import numpy as np

# Settings for this run.
lr = 1e-3                         # learning rate handed to Adam
batch_size, block_size = 16, 64   # windows per step, tokens per window
max_epochs = 10
print_interval = 5.0              # seconds between status updates
grad_clip = 1.0                   # maximum gradient norm

# Adam over every parameter the model exposes.
trainable_params = model.parameters()
optimizer = nf_optim.Adam(trainable_params, lr=lr)

def clip_grad_norm(parameters, max_norm):
    """Scale gradients in place so their combined L2 norm is at most ``max_norm``.

    Args:
        parameters: iterable of objects with a ``grad`` attribute (an array or
            None). Any iterable works, including a generator.
        max_norm: largest allowed L2 norm over all gradients taken together.

    Returns:
        The combined gradient norm measured before clipping, as a float.
    """
    # Collect once: the gradients are visited twice (measure, then scale), and
    # a generator would already be exhausted on the second pass.
    with_grad = [p for p in parameters if p.grad is not None]

    total_sq = 0.0
    for p in with_grad:
        total_sq += np.sum(p.grad ** 2)
    total_norm = float(np.sqrt(total_sq))

    # The small epsilon avoids dividing by zero when every gradient is zero.
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in with_grad:
            p.grad *= clip_coef
    return total_norm

# Training loop with periodic text generation
import gc

print("Starting training...")
epoch = 0
step = 0
last_print_time = time.time()
running_loss = 0.0
steps_since_print = 0

# An "epoch" here is the number of steps after which `step * batch_size`
# first reaches len(train_ids) (ceil division). It is computed once so the
# epoch counter advances every `steps_per_epoch` steps; the old test
# `step * batch_size >= len(train_ids)` stayed true after the first epoch,
# which made every later epoch last a single step.
steps_per_epoch = max(1, -(-len(train_ids) // batch_size))

# Get a shorter validation context
val_context = val_ids[:32].tolist()  # Shorter context for generation

try:
    while epoch < max_epochs:
        # Get batch and prepare targets
        xb, yb = get_batch(split='train', batch_size=batch_size, block_size=block_size)

        # Zero gradients
        optimizer.zero_grad()

        # Forward
        logits = model(xb)
        n_batch, n_time, n_vocab = logits.shape
        targets_oh = to_one_hot(yb, n_vocab)
        # NOTE(review): a recorded run of this notebook stopped with
        # "AssertionError: Input tensor must be 2D, got 3D". Assuming that
        # check lives in cross_entropy_loss (TODO confirm), merge the batch
        # and time axes so the loss receives (B*T, V) logits and targets.
        loss = nf_losses.cross_entropy_loss(
            logits.reshape((n_batch * n_time, n_vocab)),
            targets_oh.reshape((n_batch * n_time, n_vocab)),
        )

        # Backward + clip gradients + step
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()

        # Accumulate one loss value per step
        running_loss += loss.item()
        steps_since_print += 1

        # Print status and generate text periodically
        current_time = time.time()
        if current_time - last_print_time > print_interval:
            # Mean of the per-step loss values since the last report (the sum
            # used to be divided by the number of samples instead of steps).
            avg_loss = running_loss / steps_since_print

            # Generate sample text
            generated_tokens = generate_text(
                model,
                start_tokens=val_context,
                max_tokens=50,  # Generate shorter samples
                temperature=0.8
            )
            sample_text = tokens_to_text(generated_tokens)

            # Print status
            print(f"\nEpoch {epoch+1}/{max_epochs}, Step {step}, Loss: {avg_loss:.4f}")
            print("\nGenerated sample:")
            print("-" * 40)
            print(sample_text)
            print("-" * 40)

            # Reset accumulators
            running_loss = 0.0
            steps_since_print = 0
            last_print_time = current_time

            # Force garbage collection to help memory usage
            gc.collect()

        # Update counters
        step += 1

        # An epoch ends every `steps_per_epoch` steps
        if step % steps_per_epoch == 0:
            epoch += 1
            print(f"\nCompleted epoch {epoch}/{max_epochs}")

except Exception as e:
    print(f"Training interrupted due to error: {str(e)}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise

print('\nTraining finished!')

Starting training...
Training interrupted due to error: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# --- Memory-efficient training loop
import time
import numpy as np

# Settings for this run.
lr = 1e-3                         # learning rate handed to Adam
batch_size, block_size = 16, 32   # windows per step, tokens per window
max_epochs = 10
print_interval = 5.0              # seconds between status updates
grad_clip = 1.0                   # maximum gradient norm

# Adam over every parameter the model exposes.
trainable_params = model.parameters()
optimizer = nf_optim.Adam(trainable_params, lr=lr)

def clip_grad_norm(parameters, max_norm):
    """Scale gradients in place so their combined L2 norm is at most ``max_norm``.

    Args:
        parameters: iterable of objects with a ``grad`` attribute (an array or
            None). Any iterable works, including a generator.
        max_norm: largest allowed L2 norm over all gradients taken together.

    Returns:
        The combined gradient norm measured before clipping, as a float.
    """
    # Collect once: the gradients are visited twice (measure, then scale), and
    # a generator would already be exhausted on the second pass.
    with_grad = [p for p in parameters if p.grad is not None]

    total_sq = 0.0
    for p in with_grad:
        total_sq += np.sum(p.grad ** 2)
    total_norm = float(np.sqrt(total_sq))

    # The small epsilon avoids dividing by zero when every gradient is zero.
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in with_grad:
            p.grad *= clip_coef
    return total_norm

# First, try a single forward/backward pass to verify everything works
print("Testing forward/backward pass...")
try:
    # Start from cleared gradients so the "gradients present" check below
    # reflects this backward pass only, not leftovers from earlier cells.
    optimizer.zero_grad()

    # Get a small batch
    xb, yb = get_batch(split='train', batch_size=2, block_size=block_size)
    print(f"Input shape: {xb.shape}")

    # Forward
    logits = model(xb)
    print(f"Output logits shape: {logits.shape}")

    # Loss
    n_batch, n_time, n_vocab = logits.shape
    targets_oh = to_one_hot(yb, n_vocab)
    # NOTE(review): a recorded run of this notebook stopped with
    # "AssertionError: Input tensor must be 2D, got 3D". Assuming that check
    # lives in cross_entropy_loss (TODO confirm), merge the batch and time
    # axes so the loss receives (B*T, V) logits and targets.
    loss = nf_losses.cross_entropy_loss(
        logits.reshape((n_batch * n_time, n_vocab)),
        targets_oh.reshape((n_batch * n_time, n_vocab)),
    )
    print(f"Initial loss: {loss.item():.4f}")

    # Backward
    loss.backward()
    print("Backward pass successful!")

    # Check gradients
    has_grad = any(p.grad is not None and np.any(p.grad != 0) for p in model.parameters())
    print(f"Gradients present and nonzero: {has_grad}")

    # Clear gradients
    optimizer.zero_grad()
    print("\nTest successful! Starting training...\n")

except Exception as e:
    print(f"Test failed with error: {str(e)}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise

# Training loop with periodic text generation
import gc

print("Starting training...")
epoch = 0
step = 0
last_print_time = time.time()
running_loss = 0.0
steps_since_print = 0

# An "epoch" here is the number of steps after which `step * batch_size`
# first reaches len(train_ids) (ceil division). It is computed once so the
# epoch counter advances every `steps_per_epoch` steps; the old test
# `step * batch_size >= len(train_ids)` stayed true after the first epoch,
# which made every later epoch last a single step.
steps_per_epoch = max(1, -(-len(train_ids) // batch_size))

# Get a shorter validation context
val_context = val_ids[:block_size].tolist()  # Shorter context for generation

try:
    while epoch < max_epochs:
        # Get batch and prepare targets
        xb, yb = get_batch(split='train', batch_size=batch_size, block_size=block_size)

        # Zero gradients
        optimizer.zero_grad()

        # Forward
        logits = model(xb)
        n_batch, n_time, n_vocab = logits.shape
        targets_oh = to_one_hot(yb, n_vocab)
        # NOTE(review): a recorded run of this notebook stopped with
        # "AssertionError: Input tensor must be 2D, got 3D". Assuming that
        # check lives in cross_entropy_loss (TODO confirm), merge the batch
        # and time axes so the loss receives (B*T, V) logits and targets.
        loss = nf_losses.cross_entropy_loss(
            logits.reshape((n_batch * n_time, n_vocab)),
            targets_oh.reshape((n_batch * n_time, n_vocab)),
        )

        # Backward + clip gradients + step
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()

        # Accumulate one loss value per step
        running_loss += loss.item()
        steps_since_print += 1

        # Print status and generate text periodically
        current_time = time.time()
        if current_time - last_print_time > print_interval:
            # Mean of the per-step loss values since the last report (the sum
            # used to be divided by the number of samples instead of steps).
            avg_loss = running_loss / steps_since_print

            # Generate sample text
            generated_tokens = generate_text(
                model,
                start_tokens=val_context,
                max_tokens=50,  # Generate shorter samples
                temperature=0.8
            )
            sample_text = tokens_to_text(generated_tokens)

            # Print status
            print(f"\nEpoch {epoch+1}/{max_epochs}, Step {step}, Loss: {avg_loss:.4f}")
            print("\nGenerated sample:")
            print("-" * 40)
            print(sample_text)
            print("-" * 40)

            # Reset accumulators
            running_loss = 0.0
            steps_since_print = 0
            last_print_time = current_time

            # Force garbage collection to help memory usage
            gc.collect()

        # Update counters
        step += 1

        # An epoch ends every `steps_per_epoch` steps
        if step % steps_per_epoch == 0:
            epoch += 1
            print(f"\nCompleted epoch {epoch}/{max_epochs}")

except Exception as e:
    print(f"Training interrupted due to error: {str(e)}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise

print('\nTraining finished!')

In [None]:
# --- Memory-efficient training loop with debug printing
import time
import numpy as np

# Hyperparameters
lr = 1e-3
batch_size = 4        # Start very small
block_size = 64       # Tokens per training window
max_epochs = 10
print_interval = 5.0
grad_clip = 1.0

# The model's positional embedding has `context_length` rows, so a training
# window may be at most that long. Checked here instead of relying on a comment.
assert block_size <= GPT_CONFIG_TINY['context_length'], (
    f"block_size ({block_size}) must not exceed "
    f"context_length ({GPT_CONFIG_TINY['context_length']})"
)

# Rebuild model and optimizer
print("Initializing fresh model and optimizer...")
model = GPT2(GPT_CONFIG_TINY)
optimizer = nf_optim.Adam(model.parameters(), lr=lr)
print(f"Model parameter count: {sum(t.data.size for t in model.parameters()):,}")

# First, verify shapes with debug printing
print("\nTesting shapes with a tiny batch...")
try:
    # Get a minimal test batch
    xb, yb = get_batch(split='train', batch_size=2, block_size=32)  # Start with very small test
    print(f"Input batch shape: {xb.shape}")

    # Get positional indices (debug)
    positions = np.arange(xb.shape[1])
    print(f"Position indices shape: {positions.shape}, max index: {positions.max()}")
    print(f"Position embedding weight shape: {model.pos_emb.weight.shape}")

    # Forward pass with shape checking
    tok_embeds = model.tok_emb(xb)
    print(f"Token embeddings shape: {tok_embeds.shape}")

    pos_embeds = model.pos_emb(positions)
    print(f"Position embeddings shape: {pos_embeds.shape}")

    # Full forward pass
    logits = model(xb)
    print(f"Output logits shape: {logits.shape}")

    # Loss calculation
    n_batch, n_time, n_vocab = logits.shape
    targets_oh = to_one_hot(yb, n_vocab)
    # NOTE(review): a recorded run of this notebook stopped with
    # "AssertionError: Input tensor must be 2D, got 3D". Assuming that check
    # lives in cross_entropy_loss (TODO confirm), merge the batch and time
    # axes so the loss receives (B*T, V) logits and targets.
    loss = nf_losses.cross_entropy_loss(
        logits.reshape((n_batch * n_time, n_vocab)),
        targets_oh.reshape((n_batch * n_time, n_vocab)),
    )
    print(f"Initial loss: {loss.item():.4f}")

    # Test backward pass (this also applies one optimizer update)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print("\nShape test successful! Starting training...\n")

except Exception as e:
    print(f"Shape test failed with error: {str(e)}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise

# Training loop with periodic text generation
import gc

print("Starting training...")
epoch = 0
step = 0
last_print_time = time.time()
running_loss = 0.0
steps_since_print = 0

# An "epoch" here is the number of steps after which `step * batch_size`
# first reaches len(train_ids) (ceil division). It is computed once so the
# epoch counter advances every `steps_per_epoch` steps; the old test
# `step * batch_size >= len(train_ids)` stayed true after the first epoch,
# which made every later epoch last a single step.
steps_per_epoch = max(1, -(-len(train_ids) // batch_size))

# Get a shorter validation context
val_context = val_ids[:32].tolist()  # Shorter context for generation

try:
    while epoch < max_epochs:
        # Get batch and prepare targets
        xb, yb = get_batch(split='train', batch_size=batch_size, block_size=block_size)

        # Zero gradients
        optimizer.zero_grad()

        # Forward
        logits = model(xb)
        n_batch, n_time, n_vocab = logits.shape
        targets_oh = to_one_hot(yb, n_vocab)
        # NOTE(review): a recorded run of this notebook stopped with
        # "AssertionError: Input tensor must be 2D, got 3D". Assuming that
        # check lives in cross_entropy_loss (TODO confirm), merge the batch
        # and time axes so the loss receives (B*T, V) logits and targets.
        loss = nf_losses.cross_entropy_loss(
            logits.reshape((n_batch * n_time, n_vocab)),
            targets_oh.reshape((n_batch * n_time, n_vocab)),
        )

        # Backward + clip gradients + step
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()

        # Accumulate one loss value per step
        running_loss += loss.item()
        steps_since_print += 1

        # Print status and generate text periodically
        current_time = time.time()
        if current_time - last_print_time > print_interval:
            # Mean of the per-step loss values since the last report (the sum
            # used to be divided by the number of samples instead of steps).
            avg_loss = running_loss / steps_since_print

            # Generate sample text
            generated_tokens = generate_text(
                model,
                start_tokens=val_context[:32],  # Keep context smaller than block_size
                max_tokens=32,  # Generate shorter samples
                temperature=0.8
            )
            sample_text = tokens_to_text(generated_tokens)

            # Print status
            print(f"\nEpoch {epoch+1}/{max_epochs}, Step {step}, Loss: {avg_loss:.4f}")
            print("\nGenerated sample:")
            print("-" * 40)
            print(sample_text)
            print("-" * 40)

            # Reset accumulators
            running_loss = 0.0
            steps_since_print = 0
            last_print_time = current_time

            # Force garbage collection
            gc.collect()

        # Update counters
        step += 1

        # An epoch ends every `steps_per_epoch` steps
        if step % steps_per_epoch == 0:
            epoch += 1
            print(f"\nCompleted epoch {epoch}/{max_epochs}")

except Exception as e:
    print(f"Training interrupted due to error: {str(e)}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise

print('\nTraining finished!')