In [None]:

!git clone https://github.com/imabeastdrew/Martydepth.git
%cd Martydepth
%pip install -e .

In [None]:
# Import required libraries
import torch
import numpy as np
import wandb
import json
import yaml
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
# NOTE(review): this silences every warning category for the remainder of the
# kernel session, not just noisy ones — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Add project root to Python path
# '.' refers to the kernel's current working directory, so this only finds the
# project when the notebook is running from the repository root (the first cell
# changes into it with `%cd Martydepth`).
import sys
sys.path.append('.')

# Import project modules
# Both evaluation modules expose a function named `load_model_from_wandb`;
# they are aliased here so the online and offline loaders can coexist.
from src.data.dataset import create_dataloader
from src.evaluation.evaluate import load_model_from_wandb as load_online_model
from src.evaluation.evaluate_offline import load_model_from_wandb as load_offline_model
from src.evaluation.temporal_evaluation import (
    generate_online_temporal,
    generate_offline_temporal,
    calculate_test_set_baseline_temporal,
    log_temporal_metrics,
    create_wandb_comparison_dashboard
)

# Reaching this point means every import above resolved.
print("All imports successful!")
print("📊 Using pure WandB logging for temporal evaluation visualizations")


In [None]:
# Read the evaluation settings from the project's YAML config
config_path = 'src/evaluation/configs/temporal_evaluation.yaml'
print(f"Loading configuration from: {config_path}")

with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

# Give runs launched from this notebook their own name
config['run_name'] = 'temporal_evaluation_notebook'

# Echo every setting, one per line
print("Configuration loaded from YAML:")
for setting_name, setting_value in config.items():
    print(f"  {setting_name}: {setting_value}")

# Call out the two model artifacts that will be evaluated
print("\n🎯 Using your configured model artifacts:")
for artifact_label, artifact_key in (
    ("Online: ", 'online_artifact'),
    ("Offline:", 'offline_artifact'),
):
    print(f"  {artifact_label} {config[artifact_key]}")
print("\n🚀 Ready for pure WandB temporal evaluation!")


In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

# Report the choice; the GPU name is only queried when CUDA is available
print(f"Using GPU: {torch.cuda.get_device_name()}" if has_gpu else "Using CPU")
print(f"Device: {device}")


In [None]:
# Fetch both trained models from their WandB artifacts and put them in eval mode
print("=== Loading Models ===")

# Online model: the loader result is unpacked as (model, tokenizer_info, config)
online_artifact = config['online_artifact']
print(f"Loading online model from: {online_artifact}")
online_model, online_tokenizer_info, online_config = load_online_model(online_artifact, device)
online_model.eval()
print("✅ Online model loaded successfully")

# Offline model: unpacked as (model, config, tokenizer_info) — a different
# order from the online loader.
# NOTE(review): confirm both orders against the loaders' return statements.
offline_artifact = config['offline_artifact']
print(f"Loading offline model from: {offline_artifact}")
offline_model, offline_config, offline_tokenizer_info = load_offline_model(offline_artifact, device)
offline_model.eval()
print("✅ Offline model loaded successfully")

# The online model's tokenizer info is the one used for the rest of the notebook
tokenizer_info = online_tokenizer_info

# A mismatch between the two tokenizer infos is reported but does not stop the run
tokenizers_differ = online_tokenizer_info != offline_tokenizer_info
if not tokenizers_differ:
    print("✅ Tokenizer info is consistent between models")
else:
    print("⚠️  WARNING: Online and offline models have different tokenizer info!")
    print("This may cause evaluation issues.")

print(f"\n📊 Ready to evaluate {len(config['scenarios'])} scenarios over {config['max_beats']} beats")


In [None]:
# Build one evaluation dataloader per model, each at that model's sequence length
print("=== Creating Data Loaders ===")

# Sequence length per model: take 'max_seq_length' if set, else
# 'max_sequence_length', else a per-model fallback (512 online, 256 offline).
# Missing or falsy values fall through to the next option.
online_max_seq_length = (
    online_config.get('max_seq_length')
    or online_config.get('max_sequence_length')
    or 512
)
offline_max_seq_length = (
    offline_config.get('max_seq_length')
    or offline_config.get('max_sequence_length')
    or 256
)

print(f"Online model max sequence length: {online_max_seq_length}")
print(f"Offline model max sequence length: {offline_max_seq_length}")

# Arguments common to both loaders: one sample per batch, unshuffled, loaded
# without worker processes.
shared_loader_kwargs = dict(
    data_dir=Path(config['data_dir']),
    split=config['split'],
    batch_size=1,
    num_workers=0,
    shuffle=False,
)

# Only the first element of each returned pair is kept
online_dataloader, _ = create_dataloader(
    sequence_length=online_max_seq_length,
    mode='online',
    **shared_loader_kwargs,
)
offline_dataloader, _ = create_dataloader(
    sequence_length=offline_max_seq_length,
    mode='offline',
    **shared_loader_kwargs,
)

print(f"✅ Created dataloaders for split: '{config['split']}'")
for loader_label, loader in (("Online", online_dataloader), ("Offline", offline_dataloader)):
    print(f"📦 {loader_label} dataloader: {len(loader)} batches")


In [None]:
# Temporal evaluation: compute the dataset baseline, then evaluate both models
# with the same scenarios and sampling settings.

# Generation is driven by `temperature` / `top_k`, so it presumably samples.
# Without a fixed RNG state, re-running this cell may produce different curves.
# An optional `seed` entry in the YAML config overrides the default of 0.
# NOTE(review): only the torch and numpy generators are seeded; confirm the
# evaluation helpers draw from no other RNG source.
eval_seed = int(config.get('seed', 0))


def _seed_rngs(seed):
    """Reset the torch and numpy global RNGs to a known state."""
    torch.manual_seed(seed)
    np.random.seed(seed)


# Calculate test set baseline
print("=== Calculating Test Set Baseline ===")
baseline_results = calculate_test_set_baseline_temporal(
    online_dataloader, tokenizer_info, max_beats=config['max_beats']
)
print("✅ Baseline calculation completed")

# Run temporal evaluation for online model
print("\n=== Evaluating Online Model ===")
# Re-seed before each model so its result does not depend on what ran before it
_seed_rngs(eval_seed)
online_results = generate_online_temporal(
    model=online_model,
    dataloader=online_dataloader,
    tokenizer_info=tokenizer_info,
    device=device,
    scenarios=config['scenarios'],
    max_beats=config['max_beats'],
    temperature=config['temperature'],
    top_k=config['top_k'],
    perturbation_beat=config['perturbation_beat']
)
print("✅ Online model evaluation completed")

# Run temporal evaluation for offline model
print("\n=== Evaluating Offline Model ===")
_seed_rngs(eval_seed)
offline_results = generate_offline_temporal(
    model=offline_model,
    dataloader=offline_dataloader,
    tokenizer_info=tokenizer_info,
    device=device,
    scenarios=config['scenarios'],
    max_beats=config['max_beats'],
    temperature=config['temperature'],
    top_k=config['top_k'],
    perturbation_beat=config['perturbation_beat']
)
print("✅ Offline model evaluation completed")

print(f"\n🎯 All evaluations completed! Ready for WandB logging.")
print(f"📊 Methodology matches research paper: perturbation at beat {config['perturbation_beat']} with melody transposition")


In [None]:
# Send each result set to WandB through log_temporal_metrics
print("=== Logging Results to WandB ===")

# One row per upload: (progress message, results payload, label)
logging_jobs = (
    ("📊 Logging online model results...", online_results, "online"),
    ("📊 Logging offline model results...", offline_results, "offline"),
    # The baseline is wrapped in a one-entry dict keyed "baseline"
    ("📊 Logging baseline results...", {"baseline": baseline_results}, "baseline"),
)

for progress_message, result_set, result_label in logging_jobs:
    print(progress_message)
    log_temporal_metrics(result_set, result_label, config['run_name'], config['wandb_project'])

print("✅ Individual model results logged to WandB")


In [None]:
# Build the combined dashboard from the online, offline and baseline results
print("\n=== Creating WandB Comparison Dashboard ===")

# The dashboard run is named after the notebook run with a "_comparison" suffix
dashboard_kwargs = dict(
    online_results=online_results,
    offline_results=offline_results,
    baseline_results=baseline_results,
    project_name=config['wandb_project'],
    run_name=f"{config['run_name']}_comparison",
)
create_wandb_comparison_dashboard(**dashboard_kwargs)

# Closing summary: where to look in WandB
print("✅ Comprehensive comparison dashboard created!")
print("\n🎉 Temporal evaluation completed successfully!")
print(f"📈 Check your WandB project '{config['wandb_project']}' for interactive visualizations!")
print("🔗 Look for these runs:")
for run_suffix in ("online", "offline", "baseline", "comparison"):
    print(f"   • {config['run_name']}_{run_suffix}")
