# Sample: Running the benchmarks

This notebook shows how to run each of the eight benchmarks. Ensure the project root is on your path, load your model and tokenizer, and define the NLL callables. Each section below runs one benchmark.

In [1]:
# Setup: add project root to path and set benchmark suite location
import sys
from pathlib import Path
import numpy as np

# Project root: defaults to the notebook's working directory
# (swap in Path('/path/to/CognitiveBenchmarking') when launching from elsewhere).
PROJECT_ROOT = Path.cwd()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    # Prepend the root to the import path, once.
    sys.path.insert(0, str(PROJECT_ROOT))

# Benchmark data directory as a plain string; the scripts expect a trailing slash.
PATH_TO_BENCHMARK_SUITE = str(PROJECT_ROOT.joinpath('data', 'benchmark_suite')) + '/'

In [2]:
# Load model and tokenizer (adjust model path as needed)
import torch
from amads.expectation.tokenizer import TSDTokenizer_Custom

# Checkpoint location: one directory above the project root (adjust as needed).
model_path = PROJECT_ROOT.parent / 'lstm_atepp_not_bad_quality_embedd23_hidden64_300epochs_batch4_lr0.01_tokenizer_fixed.pth'

# The checkpoint is used as a complete model object (.eval() and .predict_sequence()
# are called on the loaded value), not as a state_dict. Since PyTorch 2.6, torch.load
# defaults to weights_only=True, which refuses to unpickle arbitrary classes, so
# full-object loading has to be requested explicitly.
# NOTE(security): weights_only=False goes through pickle and can execute arbitrary
# code from the file; only load checkpoints from a source you trust.
model = torch.load(model_path, map_location='cpu', weights_only=False)
model.eval()  # switch to inference mode
# NOTE(review): presumably the model class reads this attribute to place its tensors;
# it is kept in line with map_location above. Confirm against the model definition.
model.device = 'cpu'

# Tokenizer configuration passed to TSDTokenizer_Custom via config_params.
custom_params = {'time_range': (0.01, 1), 'time_factor': 1}
tokenizer = TSDTokenizer_Custom(config_params=custom_params)

In [3]:
# Define NLL callables for the benchmarks (used by 6 of the 8 benchmarks)
def _valid_nlls(midi_path):
    """Tokenize ``midi_path``, run the model on the tokens, and collect the usable NLLs.

    Args:
        midi_path: Path of the MIDI file, passed unchanged to ``tokenizer.tokenize``.

    Returns:
        list: The entries of ``predictions.nlls`` that are not ``None``, in order.
    """
    tokens = tokenizer.tokenize(midi_path)
    predictions = model.predict_sequence(tokens)
    # None entries are skipped (presumably positions without a prediction; confirm
    # against the model's predict_sequence).
    return [nll for nll in predictions.nlls if nll is not None]


def get_mean_nll(midi_path):
    """Return the mean of the non-None NLL values the model assigns to ``midi_path``.

    Args:
        midi_path: Path of the MIDI file to score.

    Returns:
        float: Arithmetic mean of the usable NLLs. If none are usable, ``np.mean``
        of the empty list yields ``nan`` (NumPy also emits a RuntimeWarning).
    """
    return float(np.mean(_valid_nlls(midi_path)))


def get_total_nll(midi_path):
    """Return the sum of the non-None NLL values the model assigns to ``midi_path``.

    Args:
        midi_path: Path of the MIDI file to score.

    Returns:
        float: Sum of the usable NLLs; ``0.0`` if none are usable.
    """
    return float(np.sum(_valid_nlls(midi_path)))

## 1. Cadence prediction

Mean percentile of tonic resolution across 12 keys (higher = better).

In [4]:
from benchmarks import run_cadence_prediction_benchmark

# Score the model through the mean-NLL callable and print the value that comes back
# when return_all_results=False is passed.
result = run_cadence_prediction_benchmark(
    get_mean_nll,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Cadence prediction — mean percentile:', result)

Cadence prediction — mean percentile: 0.8958333333333334


## 2. Scale filling

Mean percentile of correct scale completion across keys (higher = better). Uses total NLL with control subtraction: the total NLL of the scale presented without chord context serves as the control that is subtracted.

In [5]:
from benchmarks import run_scale_filling_benchmark

# This benchmark is driven by the total-NLL callable (not the mean-NLL one).
result = run_scale_filling_benchmark(
    get_total_nll,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Scale filling — mean percentile:', result)

Scale filling — mean percentile: 0.7283333333333334


## 3. Interval recognition

Mean percentile of correct interval across 25 interval types and 5 permutations (higher = better).

In [6]:
from benchmarks import run_interval_recognition_benchmark

# n_perm=5 matches the "5 permutations" stated in the section description.
result = run_interval_recognition_benchmark(
    get_mean_nll,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
    n_perm=5,
)
print('Interval recognition — mean percentile:', result)

Interval recognition — mean percentile: 0.5766666666666667


## 4. Transposition invariance

Mean correlation of interval NLL vectors between adjacent starting notes (higher = more invariant).

In [7]:
from benchmarks import get_transposition_invariance

# Run with the mean-NLL callable and print the value that comes back when
# return_all_results=False is passed.
result = get_transposition_invariance(
    get_mean_nll,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Transposition invariance — mean adjacent-note correlation:', result)

Transposition invariance — mean adjacent-note correlation: 0.519195675494432


## 5. Human melody continuation (Lokyan t_01)

Mean Spearman correlation between model NLL and human ratings for melody continuations (higher = better alignment).

In [8]:
from benchmarks import human_melody_comparison

# Compare the model's mean NLL with the human melody-continuation ratings and
# print the value that comes back when return_all_results=False is passed.
result = human_melody_comparison(
    get_mean_nll,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Human melody continuation — mean correlation:', result)

Human melody continuation — mean correlation: 0.7545058303704535


## 6. Human chord alignment (Lokyan t_03)

Pearson correlation between model mean NLL and human harmony ratings (higher = better alignment).

In [9]:
from benchmarks import human_chord_comparison

# Compare the model's mean NLL with the human harmony ratings and print the
# value that comes back when return_all_results=False is passed.
result = human_chord_comparison(
    get_mean_nll,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Human chord alignment — Pearson correlation:', result)

Human chord alignment — Pearson correlation: 0.6867327876262146


## 7. Glass (The Hours)

Requires precomputed NLL over time for the Glass piece. Spearman correlation between model surprisal at event onsets and human surprise ranks (higher = better).

In [10]:
import pandas as pd
from benchmarks import run_glass_benchmark

# Precomputed NLL-over-time table (e.g. produced by running your model on glass.mid).
# Expected columns: 'Time (s)' and 'NLL-moving sum over 1s window' (or equivalent).
glass_csv = PROJECT_ROOT.parent.joinpath('NLL_over_time_Glass_MT.csv')  # adjust path
glass_df = pd.read_csv(glass_csv, index_col=0)

# Pull the two columns out as flat arrays for the benchmark call.
times = glass_df['Time (s)'].values.flatten()
surprisal = glass_df['NLL-moving sum over 1s window'].values.flatten()

result = run_glass_benchmark(
    times,
    surprisal,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Glass benchmark — Spearman correlation:', result)

Glass benchmark — Spearman correlation: 0.5053330096541302


## 8. Mussorgsky (Night on Bald Mountain)

Requires precomputed NLL over time for the Mussorgsky piece. Spearman correlation between model surprisal at event onsets and human surprise ranks (higher = better).

In [11]:
import pandas as pd
from benchmarks import run_mussorgsky_benchmark

# Precomputed NLL-over-time table for this piece; it is read with the same
# column names as the Glass table.
mussorgsky_csv = PROJECT_ROOT.parent.joinpath('NLL_over_time_Mussorgsky_MT.csv')  # adjust path
mussorgsky_df = pd.read_csv(mussorgsky_csv, index_col=0)

# Pull the two columns out as flat arrays for the benchmark call.
times = mussorgsky_df['Time (s)'].values.flatten()
surprisal = mussorgsky_df['NLL-moving sum over 1s window'].values.flatten()

result = run_mussorgsky_benchmark(
    times,
    surprisal,
    PATH_TO_BENCHMARK_SUITE,
    return_all_results=False,
)
print('Mussorgsky benchmark — Spearman correlation:', result)

Mussorgsky benchmark — Spearman correlation: 0.4367548635254413
