# Run situational embedding benchmarks locally

## Install dependencies

In [None]:
# Run once
!pip install sentence-transformers datasets scipy scikit-learn pandas tqdm

## Setup & logging

In [None]:
import logging
import os
import sys

# force=True removes any handlers already attached to the root logger before
# configuring it. Without it, basicConfig() does nothing when handlers exist
# (for example when this cell is re-run), and the settings below are ignored.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
    stream=sys.stdout,
    force=True,
)

# Make sure the benchmark package is on the path
# (if running from the repo root this is not needed).
# Guarded so that re-running the cell does not pile up duplicate entries.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

## Inspect available tasks

In [None]:
from benchmark import TASK_REGISTRY

# Print one line per registry entry: the task name padded to a 20-character
# column, followed by that task's description.
registry_entries = TASK_REGISTRY.items()
for name, task in registry_entries:
    print(f"  {name:20s}  →  {task.description}")

## Define models to benchmark

Models are plain dicts — add or remove entries in the list below; no other code changes are needed.

In [None]:
# Each entry is a plain dict with a "type" key and a "model" key.
MODEL_CONFIGS = [
    # Fast, ~80 MB — good baseline
    {"type": "sentence_transformer", "model": "sentence-transformers/all-MiniLM-L6-v2"},
    # Strong small model from BAAI
    {"type": "sentence_transformer", "model": "BAAI/bge-small-en-v1.5"},
    # Uncomment to add OpenAI (requires OPENAI_API_KEY):
    # {"type": "openai", "model": "text-embedding-3-small"},
]

# Which tasks to run (add "clustering" for a heavier run)
TASK_NAMES = ["sts", "retrieval"]

## Run the benchmark

In [None]:
from pathlib import Path
from benchmark import BenchmarkRunner

# Paths handed to the runner as its output and cache directories
# (relative, so they resolve against the current working directory).
results_dir = Path("results")
embedding_cache_dir = Path(".cache/embeddings")

# Build the runner from the MODEL_CONFIGS / TASK_NAMES chosen above, then
# execute it and keep whatever run() returns in `results`.
runner = BenchmarkRunner(
    model_configs=MODEL_CONFIGS,
    task_names=TASK_NAMES,
    output_dir=results_dir,
    cache_dir=embedding_cache_dir,
    batch_size=128,
    show_progress=True,
)
results = runner.run()