# Model Test: How Dumb Is Goldilocks?

We trained for 10K steps and hit a loss floor at ~6.75. That's perplexity ~850.

Let's see what this model actually generates. Is it gibberish, or English-shaped gibberish?

---

*Jeffery Harrell & Alpha, December 1, 2025*

## Setup

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import load_file
from tokenizers import Tokenizer

# Device: prefer CUDA, then Apple's MPS backend, and use the CPU as a last resort.
# The checks short-circuit, so MPS is only probed when CUDA is unavailable.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
print(f"Using device: {device}")

Using device: mps


In [2]:
# === Parameters ===

# --- Files ---
GOLDILOCKS_DATA = "../Goldilocks/data"                # directory holding the tokenizer file
TOKENIZER_PATH = f"{GOLDILOCKS_DATA}/tokenizer.json"
MODEL_PATH = "model.safetensors"                      # trained weights, relative to this notebook

# --- Architecture (must match training) ---
N_LAYERS, N_HEADS = 4, 2     # Transformer layers / attention heads per layer
D_MODEL, D_FF = 128, 256     # embedding width / feed-forward width
SEQ_LEN = 128                # positional-embedding table size; also the generation context window
DROPOUT = 0.0
MODEL_DTYPE = torch.bfloat16  # dtype the model is cast to after loading

## Load Tokenizer

In [3]:
# Load the tokenizer from its JSON file and record the vocabulary size;
# vocab_size is what sizes the model's embedding table when the model is built.
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
vocab_size = tokenizer.get_vocab_size()
print(f"✓ Tokenizer: {vocab_size:,} tokens")

✓ Tokenizer: 3,988 tokens


## Load Model

In [4]:
class GPT(nn.Module):
    """Small causally-masked Transformer language model.

    Token embeddings and learned positional embeddings are summed, passed
    through a stack of pre-norm ``nn.TransformerEncoderLayer`` blocks under a
    causal mask, layer-normalized, and projected to vocabulary logits. The
    output projection shares its weight matrix with the token embedding.

    Submodule names (``tok_emb``, ``pos_emb``, ``layers``, ``ln_f``, ``head``)
    define the state-dict keys and must not be renamed.

    Args:
        vocab_size: Number of rows in the token embedding / output logits.
        d_model: Embedding and hidden width.
        n_heads: Attention heads per layer.
        n_layers: Number of Transformer layers.
        d_ff: Width of each layer's feed-forward sublayer.
        seq_len: Maximum sequence length (size of the positional table).
        dropout: Dropout probability inside each layer.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, d_ff, seq_len, dropout=0.0):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(seq_len, d_model)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=n_heads, dim_feedforward=d_ff,
                dropout=dropout, activation='gelu', batch_first=True, norm_first=True
            ) for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)
        self.head.weight = self.tok_emb.weight  # Weight tying
        self.seq_len = seq_len
        # The mask is a cache rebuilt on demand, not a learned quantity, so it
        # is registered as non-persistent to keep it out of state_dict().
        self.register_buffer('causal_mask', None, persistent=False)

    def forward(self, x):
        """Compute logits for every position of a batch of token ids.

        Args:
            x: Integer tensor of token ids with shape ``(B, T)``, ``T <= seq_len``.

        Returns:
            Tensor of logits with shape ``(B, T, vocab_size)``.

        Raises:
            ValueError: If ``T`` exceeds ``seq_len`` (there is no positional
                embedding for those positions).
        """
        _, T = x.shape
        if T > self.seq_len:
            raise ValueError(
                f"Sequence length {T} exceeds the model's maximum of {self.seq_len}"
            )

        # Rebuild the cached mask whenever the length or the device changes.
        mask = self.causal_mask
        if mask is None or mask.shape[0] != T or mask.device != x.device:
            # True above the diagonal = position may NOT attend to later tokens.
            mask = torch.triu(
                torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1
            )
            self.causal_mask = mask

        pos = torch.arange(T, device=x.device)
        h = self.tok_emb(x) + self.pos_emb(pos)
        for layer in self.layers:
            h = layer(h, src_mask=mask, is_causal=True)
        return self.head(self.ln_f(h))

# Create model skeleton: an untrained GPT whose shape comes from the
# architecture constants and the tokenizer's vocabulary size.
model_config = dict(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    n_layers=N_LAYERS,
    d_ff=D_FF,
    seq_len=SEQ_LEN,
    dropout=DROPOUT,
)
model = GPT(**model_config)

print(f"✓ Model skeleton created")

✓ Model skeleton created


In [5]:
# Load trained weights
state_dict = load_file(MODEL_PATH)

# strict=False tolerates the tied head/embedding pair being stored under a
# single name, but it also silences every other key mismatch. Inspect the
# result so a bad checkpoint cannot leave randomly initialized weights in the
# model we are about to evaluate.
load_result = model.load_state_dict(state_dict, strict=False)

tied_keys = {"head.weight", "tok_emb.weight"}
missing_keys = set(load_result.missing_keys)
# One of the tied names may be absent (the other one fills the shared tensor);
# if both are absent the embedding itself was never loaded.
if not tied_keys <= missing_keys:
    missing_keys -= tied_keys
if missing_keys:
    raise RuntimeError(
        f"{MODEL_PATH} is missing weights for: {sorted(missing_keys)}"
    )

# 'causal_mask' is the attention-mask cache GPT.forward stores on the module;
# a checkpoint may carry it (presumably when saved after a forward pass) and it
# is safe to drop. Anything else unexpected is reported.
unexpected_keys = [k for k in load_result.unexpected_keys if k != "causal_mask"]
if unexpected_keys:
    print(f"  ⚠ Ignored unexpected keys: {unexpected_keys}")

model = model.to(device).to(MODEL_DTYPE)
model.eval()

print(f"✓ Loaded weights from {MODEL_PATH}")
print(f"  State dict keys: {len(state_dict)}")

✓ Loaded weights from model.safetensors
  State dict keys: 54


## Generation Function

In [6]:
@torch.no_grad()
def generate(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0, top_k=None):
    """Generate text from a prompt by autoregressive decoding.

    Args:
        model: Module mapping a ``(1, T)`` tensor of token ids to
            ``(1, T, vocab)`` logits. Its ``seq_len`` attribute, when present,
            sets the context window; otherwise the global ``SEQ_LEN`` is used.
        tokenizer: Object with ``encode(str)`` (whose result exposes ``.ids``)
            and ``decode(list_of_ids)``.
        prompt: Text to continue; must encode to at least one token.
        max_new_tokens: Number of tokens to append.
        temperature: Logit divisor applied before sampling. ``0`` selects the
            most likely token at every step (true greedy decoding).
        top_k: If given, sample only among the ``top_k`` highest-scoring tokens.

    Returns:
        The decoded text of the prompt tokens followed by the generated tokens.

    Raises:
        ValueError: If ``temperature`` is negative, ``top_k`` is less than 1,
            or the prompt encodes to no tokens.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be >= 0, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    model.eval()

    # Encode prompt
    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids:
        # With zero tokens there is no "last position" to predict from.
        raise ValueError("prompt must encode to at least one token")

    # Run on whatever device the model lives on; only fall back to the
    # notebook-level globals when the model cannot tell us.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device
    context_len = model.seq_len if hasattr(model, "seq_len") else SEQ_LEN

    tokens = torch.tensor(prompt_ids, dtype=torch.long, device=run_device).unsqueeze(0)

    for _ in range(max_new_tokens):
        # Crop to the context window, keep the last position only, and do the
        # sampling math in float32 (the model itself may run in bfloat16).
        logits = model(tokens[:, -context_len:])[:, -1, :].float()

        if temperature == 0:
            # Greedy: dividing by zero is undefined, so take the argmax directly.
            next_token = logits.argmax(dim=-1, keepdim=True)
        else:
            logits = logits / temperature

            # Optional top-k filtering: drop everything below the k-th best score
            if top_k is not None:
                top_values, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits = logits.masked_fill(logits < top_values[:, [-1]], float('-inf'))

            # Sample
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        # Append
        tokens = torch.cat([tokens, next_token], dim=1)

    # Decode
    return tokenizer.decode(tokens[0].tolist())

print("✓ Generation function ready")

✓ Generation function ready


## Let's See What We Get

In [7]:
# Prompts to continue (the later generation tests reuse this list)
prompts = [
    "The ",
    "Once upon a time",
    "In the beginning",
    "Hello, my name is",
]

# Plain sampling from the full distribution: temperature 1.0, no top-k cut-off
sampling = dict(max_new_tokens=50, temperature=1.0)

rule = "=" * 60
print(rule)
print("GENERATION TEST (temperature=1.0, no top-k)")
print(rule)

for prompt in prompts:
    print(f"\nPrompt: {repr(prompt)}")
    print("-" * 40)
    output = generate(model, tokenizer, prompt, **sampling)
    print(output)
    print()

GENERATION TEST (temperature=1.0, no top-k)

Prompt: 'The '
----------------------------------------
The Men acc getherch I A r*ag Cor can canitsgIleill Tosace tos ex for,10,e A A lessction sit anday gon short,bTheedightst it X her


Prompt: 'Once upon a time'
----------------------------------------
Once upon a time just timerenting, in of the alsoin besthru
ia together Hehe whatandos som f.ort forN att May ser, Not toim teamon? A un ran experience weHibound tr Pil de


Prompt: 'In the beginning'
----------------------------------------
In the beginning Seicn longledge m any con itense restedW.perFuneB following chra obre and just andingal2 countryint too 18's n f sense Aflandn somethingur system ag students sanc f


Prompt: 'Hello, my name is'
----------------------------------------
Hello, my name is hera, Re earicron3cecia,atterra4ayser.
 two ory W do.Mf placeers.allet Uteram new U're re.00ust RMve m line theEBcesTly



In [8]:
# Lower temperature plus top-k: sharper distribution, sampled from the
# 10 highest-scoring tokens only (more deterministic)
sampling = dict(max_new_tokens=50, temperature=0.5, top_k=10)

rule = "=" * 60
print(rule)
print("GENERATION TEST (temperature=0.5, top_k=10)")
print(rule)

for prompt in prompts:
    print(f"\nPrompt: {repr(prompt)}")
    print("-" * 40)
    output = generate(model, tokenizer, prompt, **sampling)
    print(output)
    print()

GENERATION TEST (temperature=0.5, top_k=10)

Prompt: 'The '
----------------------------------------
The  ofs a to the to thes of a a the a, to.s and a the aings, the of. and, and a thes and,s a and in, the the of as the in. the and


Prompt: 'Once upon a time'
----------------------------------------
Once upon a time. the the, ands the and to a. to a the thes the a, to the the, to a of, the the and thes the of. a and theing and the the the,, the the theings


Prompt: 'In the beginning'
----------------------------------------
In the beginning and thes to the the the,s to a and the. the the a the in to. the. of, to,. and of the, of a the the a, a a to to and of and, the., the


Prompt: 'Hello, my name is'
----------------------------------------
Hello, my name is to of to,s.. the. to,. and to. tos to thes theed, the, ands the and, a the and.s the,, theing a the.s the and the, and.



In [9]:
# Near-greedy decoding: temperature=0.1 pushes almost all probability onto the
# top token, but tokens are still *sampled*, so an occasional non-argmax token
# can appear. (This approximates greedy decoding; it is not an exact argmax.)
sampling = dict(max_new_tokens=50, temperature=0.1)

rule = "=" * 60
print(rule)
print("GENERATION TEST (greedy, temperature=0.1)")
print(rule)

for prompt in prompts:
    print(f"\nPrompt: {repr(prompt)}")
    print("-" * 40)
    output = generate(model, tokenizer, prompt, **sampling)
    print(output)
    print()

GENERATION TEST (greedy, temperature=0.1)

Prompt: 'The '
----------------------------------------
The  the the the the the the the the the the the the the and the the the the the the the the the the the the the the the the the the the the the and the the the the the, the the the the the the the the


Prompt: 'Once upon a time'
----------------------------------------
Once upon a time the the the the the the the the the, the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


Prompt: 'In the beginning'
----------------------------------------
In the beginning the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the, the the the the the the the the


Prompt: 'Hello, my name is'
----------------------------------------
Hello, my name is the the the the the the the the the the th