# Analysis LSR vs Lexical Efficiency

We run a set of retrieval engines on both lexical (i.e., BM25 weights) and learned sparse embeddings to measure how large the efficiency gap between learned sparse and lexical retrieval is on average.

In [1]:
import pandas as pd

# Load the results table; `lines=True` reads the file as JSON Lines
# (one record per line).
RESULTS_PATH = "lsr-vs-lexical.jsonl.gz"
df = pd.read_json(path_or_buf=RESULTS_PATH, lines=True)

In [2]:
# Descriptive statistics (including the 50th/90th/99th percentiles) of the
# per-query retrieval runtime and nDCG@10, grouped by retrieval engine and by
# whether the run is flagged as lexical.
summary_columns = ["Retrieval", "lexical", "retrieval_per_query.runtime_wallclock", "nDCG@10"]
summary_groups = df[summary_columns].groupby(["Retrieval", "lexical"])
summary_groups.describe(percentiles=[0.5, 0.9, 0.99])

Unnamed: 0_level_0,Unnamed: 1_level_0,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,retrieval_per_query.runtime_wallclock,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10,nDCG@10
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,50%,90%,99%,max,count,mean,std,min,50%,90%,99%,max
Retrieval,lexical,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
duckdb,False,180.0,32.21208,47.097088,4.68,12.35,108.568,188.717013,274.730337,180.0,0.385041,0.171082,0.074498,0.399728,0.670479,0.751922,0.770543
duckdb,True,15.0,6.706895,1.308715,4.465116,6.755102,8.344,8.5148,8.54,15.0,0.265785,0.173558,0.015273,0.342109,0.461941,0.491123,0.4944
naive-search,False,180.0,22.428912,20.867778,2.139535,9.57,54.820365,73.512842,94.382022,180.0,0.38496,0.171117,0.073129,0.399927,0.670465,0.751922,0.770543
naive-search,True,15.0,3.720603,1.659243,1.116279,3.3,5.264,7.576485,7.94382,15.0,0.265531,0.172538,0.017147,0.341152,0.462418,0.489757,0.492498
pyterrier-splade,False,180.0,31.473835,15.977443,15.16,24.790913,51.59679,89.535169,124.168539,180.0,0.384836,0.170948,0.07261,0.398859,0.669703,0.751707,0.77063
pyterrier-splade,True,15.0,20.964726,3.990952,15.061224,20.259259,26.324,26.818017,26.895833,15.0,0.26648,0.173446,0.015531,0.340266,0.463668,0.492837,0.495772
pyterrier-splade-pisa,False,180.0,10.330056,16.349951,0.74,3.27,31.341417,69.464607,132.550562,180.0,0.384872,0.170988,0.07261,0.398854,0.669703,0.751707,0.77063
pyterrier-splade-pisa,True,15.0,0.832182,0.273555,0.469388,0.76,1.068,1.570387,1.640449,15.0,0.266326,0.173165,0.015531,0.340902,0.463581,0.491103,0.493778
seismic,False,180.0,1.256967,1.131683,0.139535,0.7,2.945438,4.308,5.78,180.0,0.385097,0.170966,0.073129,0.398987,0.670465,0.751922,0.770543
seismic,True,15.0,0.037264,0.009419,0.02,0.04,0.041327,0.054148,0.05618,15.0,0.215713,0.095977,0.079309,0.22383,0.319829,0.352747,0.35768


In [31]:
def fill(f, cnt):
    r"""Left-pad ``f`` with LaTeX ``\phantom{0}`` placeholders up to ``cnt`` units.

    Every character of ``f`` counts as one unit, and so does every prepended
    ``\phantom{0}``. If ``f`` already has ``cnt`` or more characters it is
    returned unchanged.

    Args:
        f: The already formatted cell text.
        cnt: The target width in units.

    Returns:
        ``f`` prefixed with ``max(cnt - len(f), 0)`` copies of ``\phantom{0}``.
    """
    missing = cnt - len(f)
    if missing <= 0:
        return f
    return '\\phantom{0}' * missing + f

def line(v):
    """Build the LaTeX cells of one table row for the retrieval engine ``v``.

    The statistics are computed from the module-level ``df``. The row holds,
    first for the non-lexical runs (``lexical == False``) and then for the
    lexical runs (``lexical == True``): the 50%, 90% and 99% percentiles of
    ``retrieval_per_query.runtime_wallclock`` followed by the mean ``nDCG@10``.
    Every value is formatted with three decimals and padded via ``fill``.

    Args:
        v: Value of the ``Retrieval`` column that selects the engine.

    Returns:
        The eight cells joined with ``" & "``.

    Raises:
        AssertionError: If there is not exactly one summary row for ``v`` with
            the given ``lexical`` flag.
    """
    columns = ["Retrieval", "lexical", "retrieval_per_query.runtime_wallclock", "nDCG@10"]
    stats = df[columns].groupby(["Retrieval", "lexical"]).describe(percentiles=[0.5, .9, .99]).reset_index()
    stats = stats[stats["Retrieval"] == v]

    # (summarised column, statistic, padded width) for each cell, in output order.
    cell_specs = (
        ('retrieval_per_query.runtime_wallclock', '50%', 6),
        ('retrieval_per_query.runtime_wallclock', '90%', 8),
        ('retrieval_per_query.runtime_wallclock', '99%', 8),
        ('nDCG@10', 'mean', 4),
    )

    cells = []
    for is_lexical in (False, True):
        row = stats[stats["lexical"] == is_lexical]
        assert len(row) == 1
        for column, statistic, width in cell_specs:
            # The selection holds exactly one row, so take its only value.
            value = row[(column, statistic)].iloc[0]
            cells.append(fill("{:.3f}".format(value), width))
    return " & ".join(cells)

# (LaTeX display label, value of the "Retrieval" column) for each table row,
# in the order in which the rows are printed.
LATEX_ROWS = [
    ("DuckDB", "duckdb"),
    ("Na{\\\"i}ve", "naive-search"),
    ("PISA", "pyterrier-splade-pisa"),
    ("PyTerrier", "pyterrier-splade"),
    ("Seismic", "seismic"),
]

for label, engine in LATEX_ROWS:
    # Each printed row ends with the LaTeX row terminator `\\`.
    print(label, "&", line(engine), "\\\\")

DuckDB & 12.350 & \phantom{0}108.568 & \phantom{0}188.717 & 0.385 & \phantom{0}6.755 & \phantom{0}\phantom{0}\phantom{0}8.344 & \phantom{0}\phantom{0}\phantom{0}8.515 & 0.266 \\
Na{\"i}ve & \phantom{0}9.570 & \phantom{0}\phantom{0}54.820 & \phantom{0}\phantom{0}73.513 & 0.385 & \phantom{0}3.300 & \phantom{0}\phantom{0}\phantom{0}5.264 & \phantom{0}\phantom{0}\phantom{0}7.576 & 0.266 \\
PISA & \phantom{0}3.270 & \phantom{0}\phantom{0}31.341 & \phantom{0}\phantom{0}69.465 & 0.385 & \phantom{0}0.760 & \phantom{0}\phantom{0}\phantom{0}1.068 & \phantom{0}\phantom{0}\phantom{0}1.570 & 0.266 \\
PyTerrier & 24.791 & \phantom{0}\phantom{0}51.597 & \phantom{0}\phantom{0}89.535 & 0.385 & 20.259 & \phantom{0}\phantom{0}26.324 & \phantom{0}\phantom{0}26.818 & 0.266 \\
Seismic & \phantom{0}0.700 & \phantom{0}\phantom{0}\phantom{0}2.945 & \phantom{0}\phantom{0}\phantom{0}4.308 & 0.385 & \phantom{0}0.040 & \phantom{0}\phantom{0}\phantom{0}0.041 & \phantom{0}\phantom{0}\phantom{0}0.054 & 0.216 \\


# Generate Evaluation DataFrame

The file `lsr-vs-lexical-evaluation.jsonl.gz` used below was created with

```
lsr-benchmark evaluate './*/*/seismic' './*/*/pyterrier-splade-pisa' './*/*/pyterrier-splade' -o lsr-vs-lexical-evaluation.jsonl.gz
```

The results are intermediate. The steps below slightly enrich the evaluation results, remove datasets for which not all approaches have been executed yet, and write the outcome to `lsr-vs-lexical.jsonl.gz`.

In [1]:
import pandas as pd
from lsr_benchmark.datasets import all_embeddings
from tira.rest_api_client import Client
import json

# Load the raw evaluation results (JSON Lines: one record per line).
df = pd.read_json('../runs/lsr-vs-lexical-evaluation.jsonl.gz', lines=True)
# The retrieval engine name is the last "/"-separated component of `approach`.
df["Retrieval"] = df["approach"].apply(lambda i: i.split("/")[-1])
del df["approach"]
APPROACHES = ["pyterrier-splade", "pyterrier-splade-pisa", "seismic", "duckdb", "naive-search"]
# NOTE(review): `tira` is not referenced in the cells visible here; kept in case
# later cells rely on it.
tira = Client()
d_stats = {}

# Map every dataset id in the overview file to its `dataset_stats` entry.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('../lsr_benchmark/datasets/overview.json') as overview_file:
    for d, d_stat in json.load(overview_file).items():
        d_stats[d] = d_stat['dataset_stats']

In [2]:
def normalize_index_runtime(i):
    """Return the indexing wall-clock runtime per 1000 documents.

    The document count is looked up in the module-level ``d_stats`` under the
    row's ``tira-dataset-id``. The runtime is the integer before the first
    space in ``i['index.runtime_wallclock']``.

    Args:
        i: A row (mapping-like) with the keys ``tira-dataset-id`` and
            ``index.runtime_wallclock``.

    Returns:
        The runtime divided by ``docs_count / 1000``, or ``None`` when the
        dataset has fewer than 5000 documents.
    """
    thousands_of_docs = d_stats[i["tira-dataset-id"]]["docs_count"] / 1000
    if thousands_of_docs < 5:
        # Datasets this small are excluded from the normalised value.
        return None
    runtime_text = i['index.runtime_wallclock'].split(' ')[0]
    return int(runtime_text) / thousands_of_docs

def normalize_query_runtime(i):
    """Return the retrieval wall-clock runtime per query.

    The runtime is the integer before the first space in
    ``i['retrieval.runtime_wallclock']``. The number of queries is looked up
    in the module-level ``d_stats`` under the row's ``tira-dataset-id``.

    Args:
        i: A row (mapping-like) with the keys ``tira-dataset-id`` and
            ``retrieval.runtime_wallclock``.

    Returns:
        The runtime divided by the dataset's ``queries_count``.
    """
    total_runtime = int(i['retrieval.runtime_wallclock'].split(' ')[0])
    query_count = d_stats[i["tira-dataset-id"]]["queries_count"]
    return total_runtime / query_count

# Add the two normalised runtime columns, each computed row by row.
for derived_column, normalizer in (
    ('index_1000.runtime_wallclock', normalize_index_runtime),
    ('retrieval_per_query.runtime_wallclock', normalize_query_runtime),
):
    df[derived_column] = df.apply(normalizer, axis=1)

In [3]:
# A dataset is "complete" when every approach in APPROACHES has a run for every
# embedding model returned by all_embeddings(). The list is loop-invariant, so
# it is fetched once instead of once per dataset/approach pair.
required_embeddings = set(all_embeddings())

# Approaches for which a single run (with any embedding) is enough.
# NOTE(review): neither name occurs in APPROACHES as defined in the setup cell,
# so this exemption is currently never taken - confirm whether the names are stale.
single_run_approaches = ('pyterrier-naive', 'pyterrier-pisa')

complete_datasets = set()

print(f"df complete: {len(df)}")

for dataset in df["ir-dataset-id"].unique():
    # Select the rows of this dataset once rather than once per approach.
    dataset_runs = df[df["ir-dataset-id"] == dataset]
    complete = True
    for approach in APPROACHES:
        approach_runs = dataset_runs[dataset_runs["Retrieval"] == approach]
        if approach in single_run_approaches and len(approach_runs) > 0:
            continue

        available_embeddings = approach_runs["embedding/model"].unique()
        if not required_embeddings.issubset(available_embeddings):
            # One missing embedding already disqualifies the dataset.
            complete = False
            break
    if complete:
        complete_datasets.add(dataset)

# `.copy()` makes the filtered frame independent of the original, so the column
# assignment below does not write to a slice (no chained-assignment warning).
df = df[df["ir-dataset-id"].isin(complete_datasets)].copy()
print(f"df filtered: {len(df)}")

# Runs that use the "bm25" embedding model are flagged as lexical.
df["lexical"] = df["embedding/model"].eq("bm25")
df = df[df["Retrieval"].isin(APPROACHES)]
df.to_json('lsr-vs-lexical.jsonl.gz', lines=True, orient="records")

df complete: 1538
df filtered: 1434
