In [1]:
from tira.rest_api_client import Client
from tqdm import tqdm
import gzip
import json

# Shared TIRA client; the later cells reuse `tira` to fetch run outputs and datasets.
tira = Client()
# Fetch a single run output up front; as the last expression of the cell, the
# returned local path is displayed as the cell output.
# (Plain string literal: the original f-string had no placeholders.)
tira.get_run_output('lsr-benchmark/lightning-ir/naver-splade-v3-lexical', "trec-18-web-20251008-test")

PosixPath('/home/maik/.tira/extracted_runs/lsr-benchmark/trec-18-web-20251008-test/lightning-ir/2025-10-08-22-39-15/output')

In [2]:
def du(path):
    """Return the disk usage of ``path`` as a decimal string of 1024-byte blocks.

    Runs the external ``du`` utility and returns the first whitespace-separated
    field of its output (the size column), decoded as UTF-8.

    Args:
        path: File or directory to measure (``str`` or path-like).

    Returns:
        str: The size reported by ``du``, in units of 1024 bytes.

    Raises:
        subprocess.CalledProcessError: If ``du`` exits with a non-zero status.
        FileNotFoundError: If the ``du`` executable is not available.
    """
    import subprocess
    # -s: one summary line for the whole tree.
    # -k: force 1024-byte units. Without it the unit depends on the platform and
    #     on environment variables (e.g. POSIXLY_CORRECT or BSD du use 512-byte
    #     blocks), while the table code divides these numbers by 1024.
    return subprocess.check_output(['du', '-s', '-k', path]).split()[0].decode('utf-8')

def ds_stats(path):
    """Count the documents and queries stored in a dataset directory.

    Every line of ``corpus.jsonl.gz`` (gzip-compressed text) counts as one
    document and every line of ``queries.jsonl`` counts as one query; the
    lines themselves are not parsed.

    Args:
        path: Directory containing both files; must support the ``/`` operator
            (e.g. ``pathlib.Path``).

    Returns:
        dict: ``{"docs_count": <int>, "queries_count": <int>}``.
    """
    with gzip.open(path / 'corpus.jsonl.gz', 'rt') as corpus_file:
        num_docs = sum(1 for _ in corpus_file)

    with open(path / 'queries.jsonl', 'r') as queries_file:
        num_queries = sum(1 for _ in queries_file)

    return {"docs_count": num_docs, "queries_count": num_queries}


In [3]:
# Run names; each is fetched as 'lsr-benchmark/lightning-ir/<name>' by the
# collection loop in this cell.
EMBEDDINGS = [
    "naver-splade-v3",
    "webis-splade",
    "naver-splade-v3-distilbert",
    "naver-splade_v2_distil",
    "naver-splade-v3-doc",
    "castorini-unicoil-noexp-msmarco-passage",
    "opensearch-project-opensearch-neural-sparse-encoding-doc-v2-mini",
    "opensearch-project-opensearch-neural-sparse-encoding-doc-v3-distill",
    "naver-splade-v3-lexical",
    "opensearch-project-opensearch-neural-sparse-encoding-doc-v2-distill",
    "opensearch-project-opensearch-neural-sparse-encoding-v2-distill",
    "bge-m3",
    "bm25",
]

# Dataset identifiers of the 'lsr-benchmark' task that are measured below.
DATASETS = [
    "trec-robust-2004-fold-5-20250926-test",
    "trec-robust-2004-fold-4-20250926-test",
    "trec-robust-2004-fold-3-20250926-test",
    "trec-robust-2004-fold-2-20250926-test",
    "trec-robust-2004-fold-1-20250927-test",
    "trec-33-rag-20250926_1-training",
    "trec-29-deep-learning-passages-20250926-training",
    "trec-28-misinfo-20251008_1-test",
    "trec-28-deep-learning-passages-20250926-training",
    "trec-23-web-20251008-test",
    "trec-22-web-20251008-test",
    "trec-21-web-20251008-test",
    "trec-20-web-20251008-test",
    "trec-19-web-20251008-test",
    "trec-18-web-20251008-test",
    "tiny-example-20251002_0-training",
]

# Filled per dataset by the loop below, then serialized to overview.json.
SIZES = {}

# For every dataset, record the disk usage of each embedding's run output plus
# the size and document/query counts of the dataset itself.
for dataset in DATASETS:
    embedding_sizes = {}
    # `desc=` makes explicit that the dataset name is the progress-bar label
    # (it was previously passed as tqdm's second positional argument).
    for embedding in tqdm(EMBEDDINGS, desc=dataset):
        run_dir = tira.get_run_output(f'lsr-benchmark/lightning-ir/{embedding}', dataset)
        embedding_sizes[embedding] = du(run_dir)

    # Resolve the dataset directory once and reuse it for both measurements
    # instead of calling download_dataset twice with identical arguments.
    dataset_dir = tira.download_dataset(task='lsr-benchmark', dataset=dataset)
    SIZES[dataset] = {
        'dataset-size': du(dataset_dir),
        'dataset_stats': ds_stats(dataset_dir),
        "embedding-sizes": embedding_sizes}

# Persist the collected numbers; the table cells further down read this file.
with open('../lsr_benchmark/datasets/overview.json', 'w') as f:
    f.write(json.dumps(SIZES))

trec-robust-2004-fold-5-20250926-test: 100%|██████████| 13/13 [00:02<00:00,  5.93it/s]
trec-robust-2004-fold-4-20250926-test: 100%|██████████| 13/13 [00:01<00:00,  6.55it/s]
trec-robust-2004-fold-3-20250926-test: 100%|██████████| 13/13 [00:02<00:00,  6.42it/s]
trec-robust-2004-fold-2-20250926-test: 100%|██████████| 13/13 [00:02<00:00,  5.80it/s]
trec-robust-2004-fold-1-20250927-test: 100%|██████████| 13/13 [00:07<00:00,  1.79it/s]
trec-33-rag-20250926_1-training: 100%|██████████| 13/13 [00:01<00:00,  6.64it/s]
trec-29-deep-learning-passages-20250926-training: 100%|██████████| 13/13 [00:01<00:00,  6.70it/s]
trec-28-misinfo-20251008_1-test: 100%|██████████| 13/13 [00:03<00:00,  3.94it/s]
trec-28-deep-learning-passages-20250926-training: 100%|██████████| 13/13 [00:02<00:00,  5.91it/s]
trec-23-web-20251008-test: 100%|██████████| 13/13 [00:02<00:00,  6.25it/s]
trec-22-web-20251008-test: 100%|██████████| 13/13 [00:02<00:00,  4.97it/s]
trec-21-web-20251008-test: 100%|██████████| 13/13 [00:01<

# Table 1

In [2]:
# Maps a substring of a dataset name to the LaTeX-formatted label of the table
# row it is aggregated into. `dataset_id` matches these keys with `in`, so one
# key may cover several datasets (e.g. every "trec-robust-2004-fold-*" entry).
DATASET_IDS = {
    "trec-robust-2004": "Disks~4/5",
    "trec-33-rag": "MS~MARCO$_{2.1}$",
    "deep-learning-passages": "MS~MARCO",
    "misinfo": "ClueWeb12",
    "trec-23-web": "ClueWeb12",
    "trec-22-web": "ClueWeb12",
    "trec-21-web": "ClueWeb09",
    "trec-20-web": "ClueWeb09",
    "trec-19-web": "ClueWeb09",
    "trec-18-web": "ClueWeb09",
}

def dataset_id(m):
    """Return the table label for the dataset name ``m``.

    Collects the values of all ``DATASET_IDS`` entries whose key is a
    substring of ``m`` and requires that they agree on exactly one label.

    Args:
        m: Dataset name, e.g. ``"trec-18-web-20251008-test"``.

    Returns:
        The single matching label from ``DATASET_IDS``.

    Raises:
        AssertionError: If no key matches or the matching keys map to more
            than one distinct label.
    """
    labels = {label for key, label in DATASET_IDS.items() if key in m}
    assert len(labels) == 1, f"{m}, {labels}"
    return [*labels][0]

In [14]:
# Aggregate the per-dataset numbers from overview.json into one entry per
# table row (the label returned by `dataset_id`).
STATS_FOR_TABLE = {}
# Use a context manager so the file handle is closed deterministically.
with open('../lsr_benchmark/datasets/overview.json', 'r') as f:
    all_stats = json.loads(f.read())

# Embedding sizes of all included datasets, used for the final "All" row.
ALL_EMBEDDINGS = []

for k in all_stats:
    # The tiny example dataset is not part of the table.
    if "tiny-example" in k:
        continue
    d = dataset_id(k)
    if d not in STATS_FOR_TABLE:
        STATS_FOR_TABLE[d] = {'docs_count': 0, 'queries_count': 0, "embedding-sizes": []}
    STATS_FOR_TABLE[d]["docs_count"] += all_stats[k]["dataset_stats"]["docs_count"]
    STATS_FOR_TABLE[d]["queries_count"] += all_stats[k]["dataset_stats"]["queries_count"]
    # Sizes are stored as strings in the JSON file; convert them once and
    # feed both the per-row list and the overall list.
    sizes = [int(v) for v in all_stats[k]['embedding-sizes'].values()]
    STATS_FOR_TABLE[d]["embedding-sizes"] += sizes
    ALL_EMBEDDINGS += sizes

In [15]:
# Hard-coded, pre-formatted LaTeX cell per table row; `line` inserts the string
# verbatim as the second column. Presumably the number of relevance judgments
# per corpus -- the values are not derived anywhere in this notebook.
# `\\,` is a LaTeX thin space used as thousands separator and `\\phantom{0}`
# pads shorter numbers to the width of the six-digit entry.
JUDGMENTS = {
    "ClueWeb09": "\\phantom{0}84\\,366",
    "ClueWeb12": "\\phantom{0}51\\,765",
    "Disks~4/5": "311\\,410",
    "MS~MARCO": "\\phantom{0}20\\,646",
    "MS~MARCO$_{2.1}$": "\\phantom{0}20\\,429"
}

def line(c):
    """Format the numeric columns of one LaTeX table row.

    The columns are, in order: query count, the pre-formatted ``JUDGMENTS``
    entry, document count, average embedding size and total embedding size.
    Sizes in ``STATS_FOR_TABLE[c]["embedding-sizes"]`` are divided by 1024 for
    the value labelled MB and by 1024 * 1024 for the value labelled GB, i.e.
    they are treated as counts of 1024-byte blocks.

    Args:
        c: Row label; must be a key of both ``STATS_FOR_TABLE`` and
            ``JUDGMENTS``.

    Returns:
        str: The five cells joined by ``" & "``.

    Raises:
        KeyError: If ``c`` is missing from ``STATS_FOR_TABLE`` or ``JUDGMENTS``.
        ZeroDivisionError: If the row has no embedding sizes.
    """
    stats = STATS_FOR_TABLE[c]
    sizes = stats["embedding-sizes"]
    total = sum(sizes)
    emb_avg = total / (len(sizes) * 1024)
    emb_all = total / (1024 * 1024)
    # "\\," emits the LaTeX thin space; the previous "\," relied on an invalid
    # Python escape sequence, which triggers a warning on current interpreters.
    cells = [
        stats["queries_count"],
        JUDGMENTS[c],
        stats["docs_count"],
        f"{emb_avg:.1f}\\,MB",
        f"{emb_all:.1f}\\,GB",
    ]
    return " & ".join(str(cell) for cell in cells)

# One entry per table row: (row label, which is also the key passed to `line`,
# and the LaTeX cell naming the tracks with their citations).
TABLE_ROWS = [
    ("ClueWeb09", "Web~\\cite{clarke:2009,clarke:2010,clarke:2011,clarke:2012}"),
    ("ClueWeb12", "Web/Dec.~\\cite{abualsaud:2019,thompson:2013,thompson:2014}"),
    ("Disks~4/5", "Robust04~\\cite{voorhees:2004}"),
    ("MS~MARCO", "DL~19/20~\\cite{craswell:2020,craswell:2019}"),
    ("MS~MARCO$_{2.1}$", "RAG~24~\\cite{upadhyay:2025}"),
]

# Each row ends with the LaTeX row terminator "\\"; rows are separated by a
# blank line and the block is wrapped in a leading and a trailing newline.
table_rows = [f"{corpus} & {tracks} & {line(corpus)} \\\\" for corpus, tracks in TABLE_ROWS]
print("\n" + "\n\n".join(table_rows) + "\n")

# Summary row over all embeddings of all included datasets. The thin space is
# written as "\\," (the previous "\," was an invalid escape sequence) and the
# string is built in one f-string instead of relying on implicit concatenation
# of adjacent literals before `.format`.
emb_avg_overall = sum(ALL_EMBEDDINGS) / (len(ALL_EMBEDDINGS) * 1024)
emb_sum_overall = sum(ALL_EMBEDDINGS) / (1024 * 1024)
print(f"All & {emb_avg_overall:.1f}\\,MB & {emb_sum_overall:.1f}\\,GB")


ClueWeb09 & Web~\cite{clarke:2009,clarke:2010,clarke:2011,clarke:2012} & 198 & \phantom{0}84\,366 & 315095 & 117.9\,MB & 5.1\,GB \\

ClueWeb12 & Web/Dec.~\cite{abualsaud:2019,thompson:2013,thompson:2014} & 150 & \phantom{0}51\,765 & 225636 & 116.0\,MB & 3.7\,GB \\

Disks~4/5 & Robust04~\cite{voorhees:2004} & 249 & 311\,410 & 285756 & 106.4\,MB & 5.7\,GB \\

MS~MARCO & DL~19/20~\cite{craswell:2020,craswell:2019} & 97 & \phantom{0}20\,646 & 81737 & 40.1\,MB & 0.9\,GB \\

MS~MARCO$_{2.1}$ & RAG~24~\cite{upadhyay:2025} & 89 & \phantom{0}20\,429 & 116694 & 170.0\,MB & 1.8\,GB \\

All & 106.8\,MB & 17.2\,GB
