# Analysis LSR vs Lexical Efficiency

We run a set of retrieval engines on both lexical (i.e., BM25 weights) and learned sparse embeddings to measure how large the efficiency gap between learned sparse and lexical retrieval is on average.

In [1]:
import pandas as pd

# Load the results as a JSON Lines file (lines=True: one JSON record per line).
# NOTE(review): presumably the file written by the "Generate Evaluation DataFrame"
# section of this notebook — confirm the working directory matches.
df = pd.read_json("lsr-vs-lexical.jsonl.gz", lines=True)

In [2]:
# Summary statistics of per-query wall-clock runtime and nDCG@10 for every
# (retrieval engine, lexical flag) group, reporting the median, 90th and
# 99th percentile alongside the default count/mean/std/min/max.
(
    df[["Retrieval", "lexical", "retrieval_per_query.runtime_wallclock", "nDCG@10"]]
    .groupby(["Retrieval", "lexical"])
    .describe(percentiles=[0.5, 0.9, 0.99])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,50%,90%,99%,max,count,mean,std,min,50%,90%,99%,max
Retrieval,lexical,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
duckdb,False,60.0,33.114083,47.859442,4.68,12.1125,102.136,177.533767,185.583333,60.0,0.211922,0.060793,0.074498,0.209316,0.301201,0.321396,0.322907
duckdb,True,5.0,7.009333,1.438391,5.12,6.666667,8.468,8.5328,8.54,5.0,0.059803,0.048122,0.015273,0.039865,0.11319,0.128775,0.130507
kannolo,False,55.0,0.938318,0.163283,0.6,0.9,1.14,1.337917,1.416667,55.0,0.221168,0.052955,0.094183,0.211056,0.301118,0.322258,0.323214
naive-search,False,60.0,24.105472,22.533124,3.36,10.04,58.13,73.783008,74.58,60.0,0.211899,0.060842,0.073129,0.209583,0.302223,0.321536,0.321995
pyserini-lsr,False,60.0,21.766153,6.326525,12.333333,19.33,30.824,39.5454,40.1,60.0,0.212891,0.06041,0.077175,0.208594,0.299901,0.323193,0.326889
pyserini-lsr,True,5.0,137.551667,52.565645,82.28,127.2,190.535333,219.681533,222.92,5.0,0.060878,0.047728,0.015643,0.045508,0.113253,0.129386,0.131179
pyterrier-naive,False,5.0,17.206667,1.399008,15.82,16.66,18.736,19.2736,19.333333,5.0,0.130909,0.087462,0.050402,0.07905,0.228467,0.240874,0.242253
pyterrier-pisa,False,5.0,0.616333,0.163148,0.4,0.62,0.776,0.8336,0.84,5.0,0.11553,0.081492,0.038302,0.071597,0.206518,0.222334,0.224092
pyterrier-splade,False,60.0,29.824514,13.668734,15.32,24.307083,48.542,71.379567,80.44,60.0,0.212162,0.061369,0.07261,0.209314,0.302058,0.329814,0.330053
pyterrier-splade,True,5.0,22.607167,3.149283,19.72,21.34,26.1055,26.8168,26.895833,5.0,0.060485,0.048076,0.015531,0.042995,0.113597,0.128705,0.130384


# Generate Evaluation DataFrame

The file `lsr-vs-lexical-evaluation.jsonl.gz` used below was created with

```
lsr-benchmark evaluate './*/*/seismic' './*/*/pyterrier-splade-pisa' './*/*/pyterrier-splade' -o lsr-vs-lexical-evaluation.jsonl.gz
```

The results are intermediate. The steps below slightly enrich this evaluation file, remove datasets that do not yet have all approaches executed, and write the result to `lsr-vs-lexical.jsonl.gz`.

In [1]:
import pandas as pd
from lsr_benchmark.datasets import all_embeddings
from tira.rest_api_client import Client
import json

# Load the raw evaluation results (JSON Lines: one record per line).
df = pd.read_json('../runs/lsr-vs-lexical-evaluation.jsonl.gz', lines=True)

# "approach" is a "/"-separated path; keep only its last component as the
# retrieval engine name and drop the original column.
df["Retrieval"] = df["approach"].apply(lambda i: i.split("/")[-1])
del df["approach"]

# Retrieval engines iterated over by the dataset completeness check.
APPROACHES = ["pyterrier-splade", "pyterrier-splade-pisa", "seismic", "duckdb"]
tira = Client()

# Map every dataset id in the overview file to its 'dataset_stats' entry.
# The context manager guarantees the file handle is closed after parsing.
with open('../lsr_benchmark/datasets/overview.json') as overview_file:
    d_stats = {
        d: d_stat['dataset_stats']
        for d, d_stat in json.load(overview_file).items()
    }

In [2]:
def normalize_index_runtime(i):
    """Return the indexing wall-clock time per 1000 documents for one result row.

    The document count is looked up in ``d_stats`` via the row's
    ``"tira-dataset-id"``. Datasets with fewer than 5000 documents yield
    ``None`` instead of a normalized value.

    Assumes ``i['index.runtime_wallclock']`` is a string whose first
    space-separated token is an integer (e.g. a value followed by a unit)
    -- TODO confirm the exact format and unit.
    """
    docs_in_thousands = d_stats[i["tira-dataset-id"]]["docs_count"] / 1000

    # Too few documents: no normalized value is reported.
    if docs_in_thousands < 5:
        return None

    leading_token = i['index.runtime_wallclock'].split(' ')[0]
    return int(leading_token) / docs_in_thousands

def normalize_query_runtime(i):
    """Return the retrieval wall-clock time per query for one result row.

    The query count is looked up in ``d_stats`` via the row's
    ``"tira-dataset-id"``.

    Assumes ``i['retrieval.runtime_wallclock']`` is a string whose first
    space-separated token is an integer (e.g. a value followed by a unit)
    -- TODO confirm the exact format and unit.
    """
    total_runtime = int(i['retrieval.runtime_wallclock'].split(' ')[0])
    number_of_queries = d_stats[i["tira-dataset-id"]]["queries_count"]
    return total_runtime / number_of_queries

# Add the normalized runtimes as new columns; axis=1 hands each row to the helper.
# The index runtime helper returns None for datasets with fewer than 5000 documents.
df['index_1000.runtime_wallclock'] = df.apply(normalize_index_runtime, axis=1)
df['retrieval_per_query.runtime_wallclock'] = df.apply(normalize_query_runtime, axis=1)

In [3]:
# Approaches for which any run on a dataset is sufficient, i.e. they are not
# required to cover every embedding model.
# NOTE(review): none of these appear in APPROACHES as defined in the setup cell,
# so this exemption only takes effect if APPROACHES is extended — confirm intent.
EMBEDDING_INDEPENDENT_APPROACHES = ('pyterrier-naive', 'pyterrier-pisa')

complete_datasets = set()

# Loop-invariant: fetch the required embedding models once instead of once per
# (dataset, approach) pair.
required_embeddings = list(all_embeddings())

print(f"df complete: {len(df)}")

# A dataset is complete when every approach in APPROACHES has a run for every
# required embedding model (or is exempt, see above).
for dataset in df["ir-dataset-id"].unique():
    # Filter by dataset once and reuse the slice for all approaches.
    dataset_runs = df[df["ir-dataset-id"] == dataset]
    complete = True
    for approach in APPROACHES:
        approach_runs = dataset_runs[dataset_runs["Retrieval"] == approach]
        if approach in EMBEDDING_INDEPENDENT_APPROACHES and len(approach_runs) > 0:
            continue

        available_embeddings = approach_runs["embedding/model"].unique()
        if any(e not in available_embeddings for e in required_embeddings):
            # One missing embedding already disqualifies the dataset.
            complete = False
            break
    if complete:
        complete_datasets.add(dataset)

# .copy() detaches the filtered rows from the unfiltered frame, so adding the
# "lexical" column below does not write into a slice (SettingWithCopyWarning).
df = df[df["ir-dataset-id"].isin(complete_datasets)].copy()
print(f"df filtered: {len(df)}")

# Flag rows whose embedding model is the lexical BM25 baseline, then persist
# the enriched frame as JSON Lines.
df["lexical"] = df["embedding/model"].map(lambda i: i == "bm25")
df.to_json('lsr-vs-lexical.jsonl.gz', lines=True, orient="records")

df complete: 1525
df filtered: 1421


In [4]:
# Show the distinct retrieval engine names present in the filtered frame.
df["Retrieval"].unique()

array(['pyterrier-splade', 'naive-search', 'pytorch-naive',
       'pyterrier-splade-pisa', 'seismic', 'kannolo', 'duckdb',
       'pyterrier-naive', 'pyterrier-pisa', 'pyserini-lsr'], dtype=object)