# Model Testing Notebook

## CS7CS4 Machine Learning - Final Assignment 2025-26

This notebook allows you to test your trained Math and Boolean GPT models with custom prompts.

### Features:
- Load trained models
- Test with custom prompts
- Batch testing
- Generate examples for report
- Interactive testing

### Usage:
1. Run all cells in order
2. Choose which model to test (Math or Boolean)
3. Enter your prompts
4. View and analyze results

## 1. Setup and Imports

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import json
import os
from typing import List, Tuple

# Set random seed for reproducibility
torch.manual_seed(1337)
np.random.seed(1337)

# Device configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cpu


## 2. Model Architecture Definition

We need to define the same architecture used during training.

In [2]:
class Head(nn.Module):
    """Single head of self-attention."""

    def __init__(self, head_size, n_embd, block_size, dropout):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel."""

    def __init__(self, num_heads, head_size, n_embd, block_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, block_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """Feed-forward network."""

    def __init__(self, n_embd, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block."""

    def __init__(self, n_embd, n_head, block_size, dropout):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, block_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPTLanguageModel(nn.Module):
    """GPT Language Model."""

    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size, dropout):
        super().__init__()
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0):
        """Generate new tokens."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

print("Model architecture defined")

Model architecture defined


## 3. Model Loading Functions

In [3]:
def load_model_and_vocab(checkpoint_path, model_weights_path=None):
    """
    Load model and vocabulary from checkpoint.
    
    Args:
        checkpoint_path: path to complete checkpoint (with vocab)
        model_weights_path: alternative path to just weights file
    
    Returns:
        model: loaded model
        encode: encoding function
        decode: decoding function
        vocab_info: vocabulary information
    """
    # Try to load complete checkpoint first
    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from: {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, map_location=device)
        
        # Extract hyperparameters
        hyperparams = checkpoint.get('hyperparameters', {})
        vocab_size = checkpoint['vocab_size']
        chars = checkpoint['chars']
        stoi = checkpoint['stoi']
        itos = checkpoint['itos']
        
        # Create model
        model = GPTLanguageModel(
            vocab_size=vocab_size,
            n_embd=hyperparams.get('n_embd', 128),
            n_head=hyperparams.get('n_head', 4),
            n_layer=hyperparams.get('n_layer', 4),
            block_size=hyperparams.get('block_size', 32),
            dropout=hyperparams.get('dropout', 0.1)
        )
        
        # Load weights
        model.load_state_dict(checkpoint['model_state_dict'])
        model.to(device)
        model.eval()
        
        print(f"Model loaded successfully")
        print(f"  Vocabulary size: {vocab_size}")
        print(f"  Embedding dim: {hyperparams.get('n_embd', 128)}")
        print(f"  Layers: {hyperparams.get('n_layer', 4)}")
        print(f"  Heads: {hyperparams.get('n_head', 4)}")
        
    elif model_weights_path and os.path.exists(model_weights_path):
        print(f"Loading weights from: {model_weights_path}")
        print("Warning: Loading weights only, vocabulary must be loaded separately")
        raise ValueError("Please use complete checkpoint with vocabulary")
    else:
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
    
    # Create encode/decode functions
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
    
    vocab_info = {
        'vocab_size': vocab_size,
        'chars': chars,
        'stoi': stoi,
        'itos': itos
    }
    
    return model, encode, decode, vocab_info

print("Loading functions defined")

Loading functions defined


## 4. Testing Functions

In [4]:
def test_single_prompt(model, prompt, encode, decode, max_tokens=30, temperature=0.8):
    """
    Test model with a single prompt.
    
    Args:
        model: trained model
        prompt: input prompt (e.g., "5+3=")
        encode: encoding function
        decode: decoding function
        max_tokens: maximum tokens to generate
        temperature: sampling temperature
    
    Returns:
        full_output: complete generated text
        answer: extracted answer only
    """
    # Ensure prompt ends with '='
    if '=' not in prompt:
        prompt = prompt + '='
    
    # Encode and generate
    context = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
    
    with torch.no_grad():
        generated = model.generate(context, max_new_tokens=max_tokens, temperature=temperature)
    
    # Decode output
    full_output = decode(generated[0].tolist())
    
    # Extract answer (after '=' and before newline)
    if '=' in full_output:
        answer = full_output.split('=')[-1].split('\n')[0].strip()
    else:
        answer = full_output.strip()
    
    return full_output, answer


def test_multiple_prompts(model, prompts, encode, decode, max_tokens=30, temperature=0.8):
    """
    Test model with multiple prompts.
    
    Args:
        model: trained model
        prompts: list of input prompts
        encode: encoding function
        decode: decoding function
        max_tokens: maximum tokens to generate
        temperature: sampling temperature
    
    Returns:
        results: list of (prompt, full_output, answer) tuples
    """
    results = []
    
    for prompt in prompts:
        full_output, answer = test_single_prompt(
            model, prompt, encode, decode, max_tokens, temperature
        )
        results.append((prompt, full_output, answer))
    
    return results


def print_results(results, expected_answers=None):
    """
    Print test results in a formatted table.
    
    Args:
        results: list of (prompt, full_output, answer) tuples
        expected_answers: optional list of expected answers for comparison
    """
    print("\n" + "="*80)
    print("TEST RESULTS")
    print("="*80)
    
    if expected_answers:
        print(f"\n{'Prompt':<25} {'Expected':<15} {'Predicted':<15} {'Status'}")
        print("-"*80)
        
        for i, (prompt, full_output, answer) in enumerate(results):
            if '=' in prompt:
                display_prompt = prompt.split('=')[0] + '='
            else:
                display_prompt = prompt
            
            expected = expected_answers[i] if i < len(expected_answers) else "?"
            status = "✓" if answer == expected else "✗"
            
            print(f"{display_prompt:<25} {expected:<15} {answer:<15} {status}")
    else:
        print(f"\n{'Prompt':<30} {'Predicted Answer'}")
        print("-"*80)
        
        for prompt, full_output, answer in results:
            if '=' in prompt:
                display_prompt = prompt.split('=')[0] + '='
            else:
                display_prompt = prompt
            
            print(f"{display_prompt:<30} {answer}")
    
    print("="*80)
    
    # Calculate accuracy if expected answers provided
    if expected_answers:
        correct = sum(1 for i, (_, _, answer) in enumerate(results) 
                     if i < len(expected_answers) and answer == expected_answers[i])
        total = min(len(results), len(expected_answers))
        accuracy = (correct / total) * 100 if total > 0 else 0
        print(f"\nAccuracy: {correct}/{total} ({accuracy:.1f}%)")


def save_results_to_file(results, filename, expected_answers=None):
    """
    Save test results to a text file for report appendix.
    
    Args:
        results: list of (prompt, full_output, answer) tuples
        filename: output filename
        expected_answers: optional expected answers
    """
    with open(filename, 'w') as f:
        f.write("="*80 + "\n")
        f.write("MODEL TEST RESULTS\n")
        f.write("="*80 + "\n\n")
        
        for i, (prompt, full_output, answer) in enumerate(results):
            if '=' in prompt:
                display_prompt = prompt.split('=')[0] + '='
            else:
                display_prompt = prompt
            
            f.write(f"Test {i+1}:\n")
            f.write(f"  Input:     {display_prompt}\n")
            f.write(f"  Predicted: {answer}\n")
            
            if expected_answers and i < len(expected_answers):
                expected = expected_answers[i]
                status = "✓" if answer == expected else "✗"
                f.write(f"  Expected:  {expected} {status}\n")
            
            f.write("\n")
        
        if expected_answers:
            correct = sum(1 for i, (_, _, answer) in enumerate(results) 
                         if i < len(expected_answers) and answer == expected_answers[i])
            total = min(len(results), len(expected_answers))
            accuracy = (correct / total) * 100 if total > 0 else 0
            f.write(f"\nAccuracy: {correct}/{total} ({accuracy:.1f}%)\n")
    
    print(f"Results saved to: {filename}")

print("Testing functions defined")

Testing functions defined


## 5. Load Model

Choose which model to test by uncommenting the appropriate line.

In [5]:
# CHOOSE MODEL TO TEST
# Uncomment ONE of the following:

# Option 1: Math GPT
checkpoint_path = 'checkpoints/best_model.pt'  # or 'part1_complete_checkpoint.pt'
model_type = 'Math GPT'

# Option 2: Boolean GPT (uncomment when you have this model)
# checkpoint_path = 'checkpoints_boolean/best_model.pt'
# model_type = 'Boolean GPT'

print(f"Loading {model_type}...\n")

try:
    model, encode, decode, vocab_info = load_model_and_vocab(checkpoint_path)
    print(f"\n{model_type} ready for testing!")
    print(f"Vocabulary: {' '.join(vocab_info['chars'])}")
except Exception as e:
    print(f"Error loading model: {e}")
    print("\nMake sure you have:")
    print("  1. Trained the model (run part1_math_gpt.ipynb)")
    print("  2. The checkpoint file exists at the specified path")

Loading Math GPT...

Loading checkpoint from: checkpoints/best_model.pt
Model loaded successfully
  Vocabulary size: 19
  Embedding dim: 128
  Layers: 4
  Heads: 4

Math GPT ready for testing!
Vocabulary: 
 % ( ) * + - / 0 1 2 3 4 5 6 7 8 9 =


## 6. Test with Math Prompts

Test the Math GPT model with various arithmetic expressions.

In [6]:
# Define test prompts for Math GPT
math_prompts = [
    # Simple addition
    "5+3",
    "12+7",
    "25+38",
    
    # Simple subtraction
    "10-3",
    "20-15",
    "50-27",
    
    # Simple multiplication
    "6*8",
    "7*7",
    "12*3",
    
    # Simple division
    "20//4",
    "15//3",
    "100//10",
    
    # Modulo
    "17%5",
    "20%6",
    "15%4",
    
    # With parentheses
    "(3+2)*4",
    "(10-3)*2",
    "(5+5)//2",
    
    # More complex
    "10+(5*2)",
    "20-(3*4)",
    "((2+3)*4)+1",
]

# Expected answers (for verification)
math_expected = [
    "8", "19", "63",           # addition
    "7", "5", "23",             # subtraction
    "48", "49", "36",           # multiplication
    "5", "5", "10",             # division
    "2", "2", "3",              # modulo
    "20", "14", "5",            # parentheses
    "20", "8", "21",            # complex
]

print(f"Testing {model_type} with {len(math_prompts)} prompts...\n")

# Run tests
math_results = test_multiple_prompts(
    model, math_prompts, encode, decode,
    max_tokens=20, temperature=0.7
)

# Print results
print_results(math_results, math_expected)

# Save to file
save_results_to_file(math_results, 'math_test_results.txt', math_expected)

Testing Math GPT with 21 prompts...


TEST RESULTS

Prompt                    Expected        Predicted       Status
--------------------------------------------------------------------------------
5+3                       8               0               ✗
12+7                      19              2               ✗
25+38                     63              83              ✗
10-3                      7               38              ✗
20-15                     5               0               ✗
50-27                     23              104             ✗
6*8                       48                              ✗
7*7                       49                              ✗
12*3                      36              -1              ✗
20//4                     5               88              ✗
15//3                     5               24              ✗
100//10                   10              -39             ✗
17%5                      2               31              ✗
20%6                  

## 7. Test with Boolean Prompts

Test the Boolean GPT model with various logic expressions.

In [None]:
# Define test prompts for Boolean GPT
boolean_prompts = [
    # Simple AND
    "True AND True",
    "True AND False",
    "False AND False",
    
    # Simple OR
    "True OR True",
    "True OR False",
    "False OR False",
    
    # Simple XOR
    "True XOR True",
    "True XOR False",
    "False XOR False",
    
    # Simple NOT
    "NOT True",
    "NOT False",
    
    # With parentheses
    "(True OR False) AND True",
    "(True AND False) OR True",
    "NOT (True AND False)",
    
    # Complex
    "(True XOR False) AND True",
    "NOT True OR False",
    "(NOT True) AND (NOT False)",
]

# Expected answers
boolean_expected = [
    "True", "False", "False",    # AND
    "True", "True", "False",     # OR
    "False", "True", "False",    # XOR
    "False", "True",             # NOT
    "True", "True", "True",      # parentheses
    "True", "False", "False",    # complex
]

# Only run if testing Boolean model
if model_type == 'Boolean GPT':
    print(f"Testing {model_type} with {len(boolean_prompts)} prompts...\n")
    
    boolean_results = test_multiple_prompts(
        model, boolean_prompts, encode, decode,
        max_tokens=20, temperature=0.7
    )
    
    print_results(boolean_results, boolean_expected)
    save_results_to_file(boolean_results, 'boolean_test_results.txt', boolean_expected)
else:
    print("Skipping boolean tests (Math model loaded)")
    print("To test Boolean model, change checkpoint_path in Section 5")

## 8. Interactive Testing

Enter your own custom prompts to test the model interactively.

In [None]:
def interactive_test():
    """Interactive testing loop."""
    print("\n" + "="*80)
    print("INTERACTIVE TESTING MODE")
    print("="*80)
    print(f"\nTesting {model_type}")
    print("\nEnter prompts to test (or 'quit' to exit)")
    
    if model_type == 'Math GPT':
        print("Examples: 5+3, 12-7, (3+2)*4")
    else:
        print("Examples: True AND False, NOT True, (True OR False) AND True")
    
    print("-"*80)
    
    custom_results = []
    
    while True:
        prompt = input("\nEnter prompt (or 'quit'): ").strip()
        
        if prompt.lower() in ['quit', 'exit', 'q']:
            break
        
        if not prompt:
            continue
        
        # Test the prompt
        full_output, answer = test_single_prompt(
            model, prompt, encode, decode,
            max_tokens=30, temperature=0.7
        )
        
        # Display result
        if '=' in prompt:
            display_prompt = prompt.split('=')[0] + '='
        else:
            display_prompt = prompt + '='
        
        print(f"  Input:    {display_prompt}")
        print(f"  Output:   {answer}")
        
        custom_results.append((prompt, full_output, answer))
    
    # Save custom results if any
    if custom_results:
        print(f"\nTested {len(custom_results)} custom prompts")
        save_choice = input("Save results to file? (y/n): ").strip().lower()
        
        if save_choice == 'y':
            filename = f"custom_{model_type.lower().replace(' ', '_')}_results.txt"
            save_results_to_file(custom_results, filename)
    
    print("\nExiting interactive mode")

# Run interactive testing
# Uncomment the next line to start interactive mode
# interactive_test()

## 9. Batch Testing from File

Test the model with prompts loaded from a file.

In [None]:
def test_from_file(filepath, has_answers=True):
    """
    Test model with prompts from a file.
    
    File format:
        With answers: prompt=answer (e.g., "5+3=8")
        Without answers: just prompts (e.g., "5+3")
    
    Args:
        filepath: path to file with prompts
        has_answers: whether file contains expected answers
    """
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return
    
    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]
    
    if has_answers:
        # Split into prompts and expected answers
        prompts = []
        expected = []
        
        for line in lines:
            if '=' in line:
                parts = line.split('=')
                prompts.append(parts[0] + '=')
                expected.append(parts[1] if len(parts) > 1 else '')
            else:
                prompts.append(line)
                expected.append('')
    else:
        prompts = lines
        expected = None
    
    print(f"Loaded {len(prompts)} prompts from {filepath}\n")
    
    # Test prompts
    results = test_multiple_prompts(
        model, prompts, encode, decode,
        max_tokens=30, temperature=0.7
    )
    
    # Print results
    print_results(results, expected)
    
    # Save results
    output_file = filepath.replace('.txt', '_results.txt')
    save_results_to_file(results, output_file, expected)

# Example usage:
# test_from_file('my_test_prompts.txt', has_answers=True)

## 10. Generate Examples for Report

Generate a comprehensive set of examples showing both strengths and weaknesses.

In [None]:
def generate_report_examples(num_correct=10, num_incorrect=10):
    """
    Generate examples for report appendix.
    Tries to find both correct and incorrect predictions.
    """
    print("Generating examples for report...\n")
    
    if model_type == 'Math GPT':
        # Load test dataset
        test_path = 'dataset/math/testing/math_test.txt'
    else:
        test_path = 'dataset/boolean/testing/boolean_test.txt'
    
    if not os.path.exists(test_path):
        print(f"Test dataset not found: {test_path}")
        return
    
    with open(test_path, 'r') as f:
        test_expressions = [line.strip() for line in f if line.strip()]
    
    # Sample random expressions
    import random
    sample_size = min(200, len(test_expressions))
    samples = random.sample(test_expressions, sample_size)
    
    correct_examples = []
    incorrect_examples = []
    
    print(f"Testing {sample_size} samples to find good examples...")
    
    for expr in samples:
        if '=' not in expr:
            continue
        
        parts = expr.split('=')
        prompt = parts[0] + '='
        expected = parts[1] if len(parts) > 1 else ''
        
        # Test
        _, answer = test_single_prompt(
            model, prompt, encode, decode,
            max_tokens=20, temperature=0.7
        )
        
        # Categorize
        if answer == expected:
            if len(correct_examples) < num_correct:
                correct_examples.append((prompt, expected, answer, True))
        else:
            if len(incorrect_examples) < num_incorrect:
                incorrect_examples.append((prompt, expected, answer, False))
        
        # Stop if we have enough
        if len(correct_examples) >= num_correct and len(incorrect_examples) >= num_incorrect:
            break
    
    # Combine results
    all_examples = correct_examples + incorrect_examples
    expected_answers = [exp for _, exp, _, _ in all_examples]
    
    # Print summary
    print(f"\nFound:")
    print(f"  Correct examples: {len(correct_examples)}")
    print(f"  Incorrect examples: {len(incorrect_examples)}")
    
    # Save to file
    filename = f"{model_type.lower().replace(' ', '_')}_report_examples.txt"
    
    with open(filename, 'w') as f:
        f.write("="*80 + "\n")
        f.write(f"EXAMPLES FOR REPORT APPENDIX - {model_type}\n")
        f.write("="*80 + "\n\n")
        
        f.write("STRENGTHS (Correct Predictions):\n")
        f.write("-"*80 + "\n")
        for prompt, expected, answer, _ in correct_examples:
            f.write(f"Input:    {prompt}\n")
            f.write(f"Expected: {expected}\n")
            f.write(f"Output:   {answer} ✓\n")
            f.write("\n")
        
        f.write("\nWEAKNESSES (Incorrect Predictions):\n")
        f.write("-"*80 + "\n")
        for prompt, expected, answer, _ in incorrect_examples:
            f.write(f"Input:    {prompt}\n")
            f.write(f"Expected: {expected}\n")
            f.write(f"Output:   {answer} ✗\n")
            f.write("\n")
    
    print(f"\nExamples saved to: {filename}")
    print("Use these examples in your report appendix!")

# Generate examples
generate_report_examples(num_correct=15, num_incorrect=15)

## 11. Summary

This notebook provides comprehensive testing capabilities for your trained models:

1. **Single prompt testing** - Test individual expressions
2. **Batch testing** - Test multiple prompts at once
3. **Interactive mode** - Enter custom prompts
4. **File-based testing** - Load prompts from files
5. **Report generation** - Generate examples for your report

### Files Generated:
- `math_test_results.txt` - Math model test results
- `boolean_test_results.txt` - Boolean model test results
- `*_report_examples.txt` - Examples for report appendix

### Tips:
- Use **temperature** parameter to control randomness (0.5-1.0)
- Lower temperature = more deterministic outputs
- Higher temperature = more varied outputs
- Test with diverse examples to understand model behavior

### For Your Report:
Use the generated examples to demonstrate:
- Model strengths (operations it handles well)
- Model weaknesses (where it fails)
- Error patterns
- Comparative analysis between Math and Boolean models