# Complete Evaluation Suite for Tiny Backpack Model

This notebook runs all evaluation methods on the trained tiny model:
- Word-level representations
- Cross-lingual word similarity
- Sense vector analysis (Backpack only)
- Sentence-level representations
- Cross-lingual sentence similarity
- MultiSimLex benchmark evaluation

**Note**: Before running, make sure a trained model checkpoint exists at `out/tiny/ckpt.pt`.


In [1]:
# Setup and imports
import sys
import os
sys.path.append('.')

import torch
import numpy as np
import json
from transformers import AutoTokenizer
from scipy.stats import spearmanr

# Import our models and utilities
from model import BackpackLM, StandardTransformerLM
from configurator import get_config
from evaluate import (
    load_model,
    get_word_representations,
    get_sentence_representation,
    evaluate_word_similarity,
    evaluate_sentence_similarity,
    analyze_sense_vectors,
    evaluate_multisimlex,
    evaluate_cross_lingual_multisimlex
)

print("✓ All imports successful")


✓ All imports successful


In [None]:
# Device selection: use the GPU whenever PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Report which hardware the rest of the notebook will run on.
if device != 'cuda':
    print("⚠️  CUDA not available, using CPU (will be slower)")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    # total_memory is divided by 1e9 so the figure is printed in GB.
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_gb:.2f} GB")


In [None]:
# Load the trained model from its output directory together with the tokenizer,
# then print a short description of what was loaded.
out_dir = 'out/tiny'
tokenizer_name = 'xlm-roberta-base'

print(f"Loading model from {out_dir}...")
model, config = load_model(out_dir, device)

# Decide once whether this is a Backpack model; both the type label and the
# "Number of senses" line depend on it.
is_backpack = isinstance(model, BackpackLM)
if is_backpack:
    model_type = 'Backpack'
else:
    model_type = 'Standard Transformer'

print("\nModel Configuration:")
print(f"  Type: {model_type}")
print(f"  Vocab size: {config.vocab_size}")
print(f"  Embedding dim: {config.n_embd}")
if is_backpack:
    print(f"  Number of senses: {model.n_senses}")
print(f"  Block size: {config.block_size}")
print(f"  Layers: {config.n_layer}")

print(f"\nLoading tokenizer: {tokenizer_name}")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
print(f"✓ Tokenizer loaded (vocab size: {tokenizer.vocab_size})")

# Total parameter count: add up the element count of every parameter tensor.
n_params = 0
for param in model.parameters():
    n_params += param.numel()
print(f"\nTotal parameters: {n_params:,} ({n_params/1e6:.2f}M)")


## 1. Word-Level Evaluation


In [None]:
# Extract word representations for a small English and French vocabulary and
# print the shape of each returned representation.
test_words_en = ['hello', 'world', 'language', 'model', 'learning', 'bank', 'star']
test_words_fr = ['bonjour', 'monde', 'langue', 'modèle', 'apprentissage', 'banque', 'étoile']

# The loop variable is deliberately NOT named `repr`: top-level names in a
# notebook cell stay bound in the kernel namespace, so `repr` would shadow the
# built-in repr() for every cell executed afterwards.
print("\n=== English Word Representations ===")
en_reprs = get_word_representations(model, tokenizer, test_words_en, device)
# en_reprs is iterated as a word -> representation mapping; each value exposes `.shape`.
for word, word_vec in en_reprs.items():
    print(f"  {word:15s}: shape {word_vec.shape}")

print("\n=== French Word Representations ===")
fr_reprs = get_word_representations(model, tokenizer, test_words_fr, device)
for word, word_vec in fr_reprs.items():
    print(f"  {word:15s}: shape {word_vec.shape}")


In [None]:
# Cross-lingual word similarity on English/French translation pairs
translation_pairs = [
    ('hello', 'bonjour'), ('world', 'monde'), ('language', 'langue'),
    ('model', 'modèle'), ('learning', 'apprentissage'),
    ('bank', 'banque'), ('star', 'étoile'),
]

print("\n=== Cross-lingual Word Similarity ===")
similarities = evaluate_word_similarity(model, tokenizer, translation_pairs, device)

# Status labels ordered from the strictest cutoff down; the first cutoff that
# the similarity strictly exceeds wins, otherwise the fallback label is used.
word_status_cutoffs = (
    (0.7, "✓ Excellent"),
    (0.5, "○ Good"),
)
word_status_fallback = "⚠ Needs improvement"

print("\nTranslation pair similarities:")
for word1, word2, sim in similarities:
    status = next(
        (label for cutoff, label in word_status_cutoffs if sim > cutoff),
        word_status_fallback,
    )
    print(f"  {word1:15s} <-> {word2:15s}: {sim:6.4f}  {status}")


## 2. Sense Vector Analysis (Backpack Only)


In [None]:
# Sense vector analysis. Only Backpack models are analysed here; any other
# model type just gets an explanatory message.
if not isinstance(model, BackpackLM):
    print("\n⚠️  Sense vector analysis only available for Backpack models")
    print("   Current model is Standard Transformer (no sense vectors)")
else:
    print("\n=== Sense Vector Analysis ===")

    # Words with several meanings
    polysemous_words = ['bank', 'star', 'model']

    for word in polysemous_words:
        print(f"\n{word.upper()}:")
        # One call per word, asking for the top 5 predictions.
        senses = analyze_sense_vectors(model, tokenizer, [word], device, top_k=5)
        if word not in senses:
            # Nothing came back for this word; only its heading is printed.
            continue
        for sense_idx, predictions in enumerate(senses[word]):
            print(f"  Sense {sense_idx}: {predictions}")


## 3. Sentence-Level Evaluation


In [None]:
# Sentence representations: compute a representation (method='mean') for each
# English and French test sentence and print its shape.
test_sentences_en = [
    "Hello, how are you?",
    "The language model is learning.",
    "This is a test sentence.",
    "Machine learning is fascinating.",
    "Natural language processing helps computers understand text.",
]

test_sentences_fr = [
    "Bonjour, comment allez-vous?",
    "Le modèle de langue apprend.",
    "Ceci est une phrase de test.",
    "L'apprentissage automatique est fascinant.",
    "Le traitement du langage naturel aide les ordinateurs à comprendre le texte.",
]

print("\n=== Sentence Representations ===")

# Both languages are reported the same way, so one loop handles both lists.
# The per-sentence result is deliberately NOT named `repr`: top-level names in
# a notebook cell persist in the kernel namespace, so `repr` would shadow the
# built-in repr() for all later cells.
for heading, sentences in (
    ("\nEnglish sentences:", test_sentences_en),
    ("\nFrench sentences:", test_sentences_fr),
):
    print(heading)
    for sent in sentences:
        sent_vec = get_sentence_representation(model, tokenizer, sent, device, method='mean')
        # The sentence is cut/padded to 50 characters so the shapes line up in a column.
        print(f"  {sent[:50]:50s}: shape {sent_vec.shape}")


In [None]:
# Cross-lingual sentence similarity between each English sentence and its
# French translation.
def _preview(text, limit=60):
    """Return `text` limited to `limit` characters.

    An ellipsis is appended only when the text was actually cut, so short
    sentences are shown verbatim instead of looking truncated.
    """
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


# Pair sentences by position: the i-th English sentence with the i-th French one.
sentence_pairs = list(zip(test_sentences_en, test_sentences_fr))

print("\n=== Cross-lingual Sentence Similarity ===")
sent_similarities = evaluate_sentence_similarity(model, tokenizer, sentence_pairs, device)

print("\nTranslation pair sentence similarities:")
for sent1, sent2, sim in sent_similarities:
    # Status label: above 0.8 is "Excellent", above 0.6 is "Good".
    if sim > 0.8:
        status = "✓ Excellent"
    elif sim > 0.6:
        status = "○ Good"
    else:
        status = "⚠ Needs improvement"
    print(f"\n  EN: {_preview(sent1)}")
    print(f"  FR: {_preview(sent2)}")
    print(f"  Similarity: {sim:.4f}  {status}")


## 4. MultiSimLex Benchmark Evaluation

**Configure subset size** (for faster evaluation):
- Set `max_samples` to limit the number of word pairs evaluated
- Example: `max_samples=100` evaluates only the first 100 pairs (faster)
- Set `max_samples=None` to evaluate all pairs (slower but more accurate)


In [None]:
# MultiSimLex configuration.
# max_samples limits how many word pairs are evaluated:
#   None -> all pairs, an integer such as 100 -> that many pairs.
max_samples = 100  # Change this: None = all pairs, 100 = first 100 pairs, etc.

pairs_label = max_samples if max_samples else 'ALL'
print(f"MultiSimLex evaluation will use: {pairs_label} word pairs")

# Probe for the optional `datasets` library; later cells read
# `datasets_available` to decide whether the benchmark can run.
try:
    from datasets import load_dataset
except ImportError:
    datasets_available = False
    print("⚠️  datasets library not installed. Install with: pip install datasets")
else:
    datasets_available = True


In [None]:
# MultiSimLex benchmark: English, French, then English-French cross-lingual.
if not datasets_available:
    print("\n⚠️  Skipping MultiSimLex evaluation (datasets library not available)")
else:
    banner = "=" * 60
    divider = "-" * 60

    print("\n" + banner)
    print("MultiSimLex Benchmark Evaluation")
    if not max_samples:
        print("Using full dataset")
    else:
        print(f"Using subset: {max_samples} word pairs per language")
    print(banner)

    # Each entry is (key in `results`, zero-argument callable running that
    # evaluation). They run in list order, each preceded by a divider line.
    evaluation_jobs = [
        ('en', lambda: evaluate_multisimlex(
            model, tokenizer, device, language='en', max_samples=max_samples)),
        ('fr', lambda: evaluate_multisimlex(
            model, tokenizer, device, language='fr', max_samples=max_samples)),
        ('en-fr', lambda: evaluate_cross_lingual_multisimlex(
            model, tokenizer, device, 'en', 'fr', max_samples=max_samples)),
    ]

    results = {}
    for job_key, run_job in evaluation_jobs:
        print("\n" + divider)
        outcome = run_job()
        # Falsy outcomes are left out, so `results` only holds usable entries.
        if outcome:
            results[job_key] = outcome

    # Summary table, printed only when at least one evaluation produced a result.
    if results:
        print("\n" + banner)
        print("MultiSimLex Summary")
        print(banner)
        for key, result in results.items():
            print(f"{key.upper():10s}: {result['correlation']:.4f} ({result['benchmark_level']}) - {result['n_pairs']} pairs")


## 5. Training Loss Curves


In [None]:
# Load the JSON training log from the model's output directory, print the
# latest loss values and plot the train/validation loss curves.
import matplotlib.pyplot as plt

log_file = os.path.join(out_dir, 'training_log.json')

if not os.path.exists(log_file):
    print(f"\n⚠️  Training log not found: {log_file}")
else:
    with open(log_file, 'r') as f:
        training_log = json.load(f)

    # .get(..., []) keeps a log that lacks one of the series on the
    # "no data yet" path below instead of raising KeyError.
    iterations = training_log.get('iterations', [])
    train_loss = training_log.get('train_loss', [])
    val_loss = training_log.get('val_loss', [])

    # All three series must be non-empty: the report indexes [-1] on each.
    if iterations and train_loss and val_loss:
        print("\n=== Training Progress ===")
        print(f"Total evaluations: {len(iterations)}")
        print(f"Latest iteration: {iterations[-1]}")
        print(f"Latest train loss: {train_loss[-1]:.4f}")
        print(f"Latest val loss: {val_loss[-1]:.4f}")

        # Plot loss curves side by side using explicit Figure/Axes objects.
        fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(12, 5))
        panels = (
            (ax_train, train_loss, 'Train Loss', 'Training Loss', {}),
            (ax_val, val_loss, 'Val Loss', 'Validation Loss', {'color': 'orange'}),
        )
        for ax, losses, label, title, extra_style in panels:
            ax.plot(iterations, losses, label=label, linewidth=2, alpha=0.8, **extra_style)
            ax.set_xlabel('Iteration')
            ax.set_ylabel('Loss')
            ax.set_title(title)
            ax.legend()
            ax.grid(True, alpha=0.3)

        fig.tight_layout()
        plt.show()

        # Top activating words (optional section of the log): show the most
        # recent entry, limited to its first 10 words.
        top_word_entries = training_log.get('top_activating_words') or []
        if top_word_entries:
            print("\n=== Top Activating Words (from training log) ===")
            latest_words = top_word_entries[-1]
            print(f"Iteration {latest_words['iteration']}:")
            for word_info in latest_words['words'][:10]:
                print(f"  {word_info['word']:20s}: activation {word_info['activation']:.4f}")
    else:
        print("\n⚠️  No training data in log file yet")


In [None]:
# Final summary: model description, a checklist of the evaluation steps and,
# when present, the MultiSimLex correlations.
rule = "=" * 60

print("\n" + rule)
print("EVALUATION SUMMARY")
print(rule)

if isinstance(model, BackpackLM):
    model_label = 'Backpack LM'
else:
    model_label = 'Standard Transformer'
print(f"\nModel: {model_label}")
print(f"Parameters: {n_params:,} ({n_params/1e6:.2f}M)")
print(f"Device: {device}")

# Checklist entries in display order; the sense-vector entry is listed only
# for Backpack models.
completed_steps = [
    "Word representations extracted",
    "Cross-lingual word similarity computed",
]
if isinstance(model, BackpackLM):
    completed_steps.append("Sense vector analysis completed")
completed_steps.extend([
    "Sentence representations extracted",
    "Cross-lingual sentence similarity computed",
])

print()  # blank line before the checklist
for step in completed_steps:
    print(f"✓ {step}")

# `results` exists only if the MultiSimLex cell took its evaluation branch,
# hence the namespace lookup before touching it.
if datasets_available and 'results' in locals():
    print("✓ MultiSimLex benchmark evaluation completed")
    if results:
        print("\nMultiSimLex Results:")
        for key, result in results.items():
            print(f"  {key.upper()}: {result['correlation']:.4f} ({result['benchmark_level']})")

print("\n" + rule)
print("All evaluations complete!")
print(rule)
