# Run situational embedding benchmarks locally

## Install dependencies

In [1]:
# Run once
!pip install sentence-transformers datasets scipy scikit-learn pandas tqdm

Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Downloading datasets-4.5.0-py3-none-any.whl (515 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 515.2/515.2 kB 12.6 MB/s eta 0:00:00
Downloading multiprocess-0.70.18-py312-none-any.whl (150 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 kB 18.3 MB/s eta 0:00:00
Downloading dill-0.4.0-py3-none-any.whl (119 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.7/119.7 kB 10.6 MB/s eta 0:00:00
Downloading xxhash-3.6.0-cp

## Setup & logging

In [2]:
import logging
import os
import sys

# Route INFO-and-above log records to stdout so they appear in the cell output,
# prefixed with a HH:MM:SS timestamp and a left-aligned level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
    stream=sys.stdout,
)

# Make sure the benchmark package is on the path
# (if running from the repo root this is not needed).
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries at the front of sys.path.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

## Inspect available tasks

In [3]:
from benchmark import TASK_REGISTRY

# One line per registered task: the registry key padded to 20 columns,
# followed by the task's description.
for name in TASK_REGISTRY:
    task = TASK_REGISTRY[name]
    print(f"  {name:20s}  →  {task.description}")

  sts                   →  Semantic Textual Similarity – STS-Benchmark
  retrieval             →  Small synthetic retrieval benchmark
  clustering            →  Clustering on AG News (4 categories)
  likert_wvs            →  Likert continuum benchmark using WVS statements


## Define models to benchmark

Models are declared as plain dicts, so adding or removing a model only means editing the list below — no changes to the benchmark package are needed.

In [4]:
# Each entry is a plain dict with a backend "type" and a "model" identifier.
MODEL_CONFIGS = [
    # Fast, ~80 MB — good baseline
    dict(
        type="sentence_transformer",
        model="sentence-transformers/all-MiniLM-L6-v2",
    ),
    # Strong small model from BAAI
    dict(
        type="sentence_transformer",
        model="BAAI/bge-small-en-v1.5",
    ),
    # Uncomment to add OpenAI (requires OPENAI_API_KEY):
    # dict(
    #     type="openai",
    #     model="text-embedding-3-small",
    # ),
]

# Tasks to run, by registry name (add "clustering" for a heavier run).
TASK_NAMES = [
    "sts",
    "retrieval",
]

## Run the benchmark

In [5]:
from pathlib import Path
from benchmark import BenchmarkRunner

# Build a runner for the models and tasks selected in the cells above.
runner = BenchmarkRunner(
    task_names=TASK_NAMES,
    model_configs=MODEL_CONFIGS,
    batch_size=128,
    show_progress=True,
    cache_dir=Path(".cache/embeddings"),  # embeddings are cached here as .npy files (see log output)
    output_dir=Path("results"),           # the run log reports summary.json being written here
)

# Run the benchmark and keep whatever the runner returns for later inspection.
results = runner.run()

17:03:19  INFO      
Model: sentence-transformers/all-MiniLM-L6-v2
17:03:19  INFO        Task: STS-B
17:03:21  INFO      Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
17:03:21  INFO      NumExpr defaulting to 8 threads.
17:03:34  INFO      [STS-B] Loading dataset sentence-transformers/stsb …


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/471k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

17:03:40  INFO      Loading sentence-transformers/all-MiniLM-L6-v2 …
17:03:40  INFO      Use pytorch device_name: mps
17:03:40  INFO      Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

17:03:47  INFO      Cached embeddings → .cache/embeddings/sentence-transformers__all-MiniLM-L6-v2__STS-B__d97c4b095262.npy
17:03:47  INFO        → main_score=0.8203  ({'pearson': 0.8273996773144039, 'spearman': 0.8203013947512559, 'main_score': 0.8203013947512559})
17:03:47  INFO        Task: Retrieval-Demo


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

17:03:47  INFO      Cached embeddings → .cache/embeddings/sentence-transformers__all-MiniLM-L6-v2__Retrieval-Demo__corpus__f6466f369234.npy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

17:03:48  INFO      Cached embeddings → .cache/embeddings/sentence-transformers__all-MiniLM-L6-v2__Retrieval-Demo__queries__8c4639200222.npy
17:03:48  INFO        → main_score=0.9252  ({'ndcg@10': 0.9251724527673773, 'recall@10': 1.0, 'main_score': 0.9251724527673773})
17:03:48  INFO      
Model: BAAI/bge-small-en-v1.5
17:03:48  INFO        Task: STS-B
17:03:48  INFO      [STS-B] Loading dataset sentence-transformers/stsb …
17:03:50  INFO      Loading BAAI/bge-small-en-v1.5 …
17:03:50  INFO      Use pytorch device_name: mps
17:03:50  INFO      Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

17:03:56  INFO      Cached embeddings → .cache/embeddings/BAAI__bge-small-en-v1.5__STS-B__d97c4b095262.npy
17:03:56  INFO        → main_score=0.8586  ({'pearson': 0.8431414417689026, 'spearman': 0.858641331743407, 'main_score': 0.858641331743407})
17:03:56  INFO        Task: Retrieval-Demo


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

17:03:56  INFO      Cached embeddings → .cache/embeddings/BAAI__bge-small-en-v1.5__Retrieval-Demo__corpus__f6466f369234.npy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

17:03:56  INFO      Cached embeddings → .cache/embeddings/BAAI__bge-small-en-v1.5__Retrieval-Demo__queries__8c4639200222.npy
17:03:56  INFO        → main_score=1.0000  ({'ndcg@10': 1.0, 'recall@10': 1.0, 'main_score': 1.0})
17:03:56  INFO      
Summary written to results/summary.json
