# Full Evaluation Suite: Backpack, Transformer, and Finetuned Backpack

This notebook runs comprehensive evaluations for all three models:
1. **Backpack Model** (`out/backpack_full`)
2. **Transformer Baseline** (`out/transformer_full`)
3. **Finetuned Backpack** (`out/backpack_finetuned`)

## Evaluation Metrics:
- Translation BLEU Scores
- Translation Accuracy (Exact Match, Word-level, Character-level)
- Sentence Similarity
- Sense Vector Analysis (Backpack models only)
- Model Comparison Summary

In [None]:
# Setup: Import libraries and configure device
import os
import sys
import torch
import json
import pandas as pd

# Add current directory to path
if os.getcwd() not in sys.path:
    sys.path.insert(0, os.getcwd())

from evaluate import (
    load_model,
    evaluate_multisimlex,
    evaluate_cross_lingual_multisimlex,
    analyze_sense_vectors,
    load_test_data,
    evaluate_translation_bleu,
    evaluate_translation_accuracy,
    evaluate_sentence_similarity
)
from transformers import AutoTokenizer

# Choose the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# GPU name and CUDA version are only queried on the CUDA path.
if device != 'cpu':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")

In [None]:
# Configuration - point these at your own model and data locations.
# The current working directory is treated as the project root.
BASE_DIR = os.getcwd()

# One checkpoint directory per model (edit the relative paths if yours differ).
BACKPACK_DIR, TRANSFORMER_DIR, BACKPACK_FINETUNED_DIR = (
    os.path.join(BASE_DIR, relative_dir)
    for relative_dir in (
        'out/backpack_full',
        'out/transformer_full',
        'out/backpack_finetuned',
    )
)

# Evaluation settings and dataset location.
LANGUAGE_PAIR = 'en-fr'
TRANSLATION_SAMPLES = 500
DATA_DIR = os.path.join(BASE_DIR, 'data/europarl')

# Echo the effective configuration, one setting per line, so a wrong path or
# device is visible before any model is loaded.
print(
    "Configuration:",
    f"  Base directory: {BASE_DIR}",
    f"  Backpack: {BACKPACK_DIR}",
    f"  Transformer: {TRANSFORMER_DIR}",
    f"  Backpack Finetuned: {BACKPACK_FINETUNED_DIR}",
    f"  Data directory: {DATA_DIR}",
    f"  Translation samples: {TRANSLATION_SAMPLES}",
    f"  Device: {device}",
    sep="\n",
)

In [None]:
# Discover which of the configured models actually have a checkpoint on disk.
# `MODELS` maps a short model name to its output directory; `available_models`
# keeps only the entries whose directory contains a `ckpt.pt` file.
MODELS = {
    'backpack': BACKPACK_DIR,
    'transformer': TRANSFORMER_DIR,
    'backpack_finetuned': BACKPACK_FINETUNED_DIR,
}

available_models = {}
for name, directory in MODELS.items():
    ckpt_path = os.path.join(directory, 'ckpt.pt')
    if not os.path.exists(ckpt_path):
        # Missing checkpoint: report it and leave the model out
        print(f"‚ö†Ô∏è  {name}: {ckpt_path} (not found)")
        continue
    available_models[name] = directory
    print(f"‚úì {name}: {ckpt_path}")

if available_models:
    print(f"\nüìä Found {len(available_models)} model(s): {list(available_models.keys())}")
else:
    # Nothing to evaluate; show the working directory to help fix the paths
    print("\n‚ùå No models found! Please check the paths above.")
    print(f"Current directory: {os.getcwd()}")

## Run Full Evaluations

In [None]:
# Run the full evaluation suite for every model in `available_models`.
#
# Per model: load the checkpoint and tokenizer, run sense-vector analysis when
# the model or its config exposes Backpack attributes, then run BLEU, accuracy
# and sentence-similarity evaluation on the validation pairs. Metrics are kept
# in `all_results[model_name]` and written to
# `<model_dir>/full_evaluation_results.json`.
# If evaluating a model raises, its entry becomes {'error': <message>}.
import traceback

all_results = {}

# The validation pairs are requested with identical arguments for every model,
# so they are loaded once (on first use) and shared across models.
# NOTE(review): assumes load_test_data returns the same pairs for the same
# arguments - confirm against evaluate.py.
test_pairs = None

# Decoding settings shared by the BLEU and accuracy evaluations.
generation_kwargs = dict(
    max_samples=TRANSLATION_SAMPLES,
    max_new_tokens=100,
    temperature=0.3,
    top_k=10,
    greedy=True,
)

# Bound up front so every iteration can drop the previous model before loading.
model = None

for model_name, model_dir in available_models.items():
    print(f"\n{'='*70}")
    print(f"EVALUATING: {model_name.upper()}")
    print(f"{'='*70}")

    try:
        # Release the previous iteration's model before loading the next one so
        # two models are not held in memory at the same time.
        model = None
        if device == 'cuda':
            torch.cuda.empty_cache()

        # Load model
        print(f"\nLoading {model_name}...")
        model, config = load_model(model_dir, device)
        n_params = sum(p.numel() for p in model.parameters())
        print(f"‚úì Loaded ({n_params:,} parameters)")

        # Load tokenizer (fall back to XLM-R when the config does not name one)
        tokenizer_name = getattr(config, 'tokenizer_name', 'xlm-roberta-base')
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        results = {'model_name': model_name, 'parameters': n_params}

        # 1. Sense Vector Analysis (Backpack models only)
        if hasattr(model, 'get_sense_vectors') or hasattr(config, 'n_senses'):
            print(f"\n{'='*70}")
            print("1. SENSE VECTOR ANALYSIS")
            print(f"{'='*70}")
            test_words = ['hello', 'bonjour', 'world', 'monde', 'parliament', 'parlement']
            sense_analysis = analyze_sense_vectors(model, tokenizer, test_words, device, top_k=5)
            results['sense_analysis'] = sense_analysis

        # 2. Translation Evaluation
        print(f"\n{'='*70}")
        print("2. TRANSLATION EVALUATION")
        print(f"{'='*70}")

        # Load test data (only the first model that gets here pays the cost)
        if test_pairs is None:
            test_pairs = load_test_data(
                data_dir=DATA_DIR,
                language_pair=LANGUAGE_PAIR,
                max_samples=TRANSLATION_SAMPLES,
                split='validation'
            )

        if test_pairs:
            # Each metric is best-effort: a failure is reported and stored as
            # None so the remaining metrics still run.

            # BLEU Score
            print(f"\n2a. BLEU Score Evaluation ({len(test_pairs)} pairs)...")
            try:
                bleu_results = evaluate_translation_bleu(
                    model, tokenizer, test_pairs, device, **generation_kwargs
                )
                results['translation_bleu'] = bleu_results
                print(f"  ‚úì Average BLEU: {bleu_results['avg_bleu']:.4f}")
                print(f"  ‚úì Median BLEU: {bleu_results['median_bleu']:.4f}")
            except Exception as e:
                print(f"  ‚ùå Error: {e}")
                results['translation_bleu'] = None

            # Translation Accuracy
            print("\n2b. Translation Accuracy Evaluation...")
            try:
                accuracy_results = evaluate_translation_accuracy(
                    model, tokenizer, test_pairs, device, **generation_kwargs
                )
                results['translation_accuracy'] = accuracy_results
                print(f"  ‚úì Exact Match: {accuracy_results['exact_match_rate']:.4f}")
                print(f"  ‚úì Word Accuracy: {accuracy_results['avg_word_accuracy']:.4f}")
                print(f"  ‚úì Char Accuracy: {accuracy_results['avg_char_accuracy']:.4f}")
            except Exception as e:
                print(f"  ‚ùå Error: {e}")
                results['translation_accuracy'] = None

            # Sentence Similarity
            print("\n2c. Sentence Similarity Evaluation...")
            try:
                sent_pairs = test_pairs[:100]  # Use first 100 for speed
                sent_similarities = evaluate_sentence_similarity(
                    model, tokenizer, sent_pairs, device, method='mean'
                )
                if sent_similarities:
                    # Each entry is a 3-tuple whose last element is the score
                    similarities = [sim for _, _, sim in sent_similarities]
                    avg_sim = sum(similarities) / len(similarities)
                    results['sentence_similarity'] = {
                        'avg_similarity': avg_sim,
                        'n_pairs': len(sent_similarities),
                        'min_similarity': min(similarities),
                        'max_similarity': max(similarities)
                    }
                    print(f"  ‚úì Average Similarity: {avg_sim:.4f}")
                else:
                    results['sentence_similarity'] = None
            except Exception as e:
                print(f"  ‚ùå Error: {e}")
                results['sentence_similarity'] = None
        else:
            print("  ‚ö†Ô∏è  No test data loaded")

    except Exception as e:
        print(f"\n‚ùå Error evaluating {model_name}: {e}")
        traceback.print_exc()
        all_results[model_name] = {'error': str(e)}
    else:
        # Keep the metrics in memory first; saving is handled separately so a
        # failed write cannot replace them with an error entry.
        all_results[model_name] = results

        output_file = os.path.join(model_dir, 'full_evaluation_results.json')
        try:
            # Serialise before opening the file so a serialisation error does
            # not leave a truncated JSON file behind.
            payload = json.dumps(results, indent=2)
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            print(f"\nCould not save results for {model_name} to {output_file}: {e}")
        else:
            print(f"\n‚úì Results saved to: {output_file}")

## Comparison Summary

In [None]:
# Build one summary row per successfully evaluated model, print the table,
# report the Backpack-vs-Transformer BLEU deltas when both are available, and
# write the complete `all_results` mapping to a JSON file under `out/`.
print(f"\n{'='*70}")
print("EVALUATION SUMMARY - ALL MODELS")
print(f"{'='*70}")

summary_data = []
for model_name, results in all_results.items():
    # Failed evaluations only carry an 'error' entry; they get no table row
    if 'error' not in results:
        row = {'Model': model_name}

        # BLEU scores (a missing or empty metric is rendered as N/A)
        bleu = results.get('translation_bleu')
        for column, key in (
            ('Avg BLEU', 'avg_bleu'),
            ('Median BLEU', 'median_bleu'),
            ('Min BLEU', 'min_bleu'),
            ('Max BLEU', 'max_bleu'),
        ):
            row[column] = f"{bleu[key]:.4f}" if bleu else 'N/A'

        # Accuracy scores
        acc = results.get('translation_accuracy')
        for column, key in (
            ('Exact Match', 'exact_match_rate'),
            ('Word Acc', 'avg_word_accuracy'),
            ('Char Acc', 'avg_char_accuracy'),
        ):
            row[column] = f"{acc[key]:.4f}" if acc else 'N/A'

        # Sentence similarity
        sim = results.get('sentence_similarity')
        row['Sent Sim'] = f"{sim['avg_similarity']:.4f}" if sim else 'N/A'

        # Parameter count, with thousands separators
        if 'parameters' in results:
            row['Parameters'] = f"{results['parameters']:,}"

        summary_data.append(row)

if not summary_data:
    print("\n‚ö†Ô∏è  No results to display")
else:
    df_summary = pd.DataFrame(summary_data)
    print("\n")
    print(df_summary.to_string(index=False))

    # Backpack vs Transformer: only when both models produced BLEU results
    bp_stats = all_results.get('backpack', {}).get('translation_bleu')
    tf_stats = all_results.get('transformer', {}).get('translation_bleu')
    if bp_stats and tf_stats:
        bp_bleu = bp_stats['avg_bleu']
        tf_bleu = tf_stats['avg_bleu']
        improvement = bp_bleu - tf_bleu
        # A relative change is undefined for a non-positive baseline; show 0
        pct_improvement = 0
        if tf_bleu > 0:
            pct_improvement = improvement / tf_bleu * 100

        print(f"\n{'='*70}")
        print("BACKPACK vs TRANSFORMER COMPARISON")
        print(f"{'='*70}")
        print(f"Average BLEU Improvement: {improvement:+.4f} ({pct_improvement:+.1f}%)")

        bp_median = bp_stats['median_bleu']
        tf_median = tf_stats['median_bleu']
        median_improvement = bp_median - tf_median
        median_pct = 0
        if tf_median > 0:
            median_pct = median_improvement / tf_median * 100
        print(f"Median BLEU Improvement: {median_improvement:+.4f} ({median_pct:+.1f}%)")

    # Save the full results (error entries included) next to the model outputs
    summary_dir = os.path.join(BASE_DIR, 'out')
    os.makedirs(summary_dir, exist_ok=True)
    summary_file = os.path.join(summary_dir, 'all_models_evaluation_summary.json')
    with open(summary_file, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\n‚úì Full results saved to: {summary_file}")