In [None]:
from laminci.db import setup_local_test_postgres

In [None]:
# Keep the value returned by the laminci helper; the next cell passes it to
# `lamin init --db`. NOTE(review): presumably this starts the `pgtest` docker
# container that the final cell stops and removes — confirm against laminci.
pgurl = setup_local_test_postgres()

In [None]:
# Create the `benchmark_search` instance on the database at `pgurl`, with the
# bionty schema and local storage under ./benchmark_search.
!lamin init --name benchmark_search --db {pgurl} --schema bionty --storage ./benchmark_search

In [None]:
import lamindb as ln
import bionty as bt
import pandas as pd
import rbo
from django.test import modify_settings
from django.contrib.postgres.search import SearchQuery, SearchVector, SearchRank

In [None]:
# Override passed to `modify_settings(INSTALLED_APPS=...)` on the ranking helper:
# appends "django.contrib.postgres" to INSTALLED_APPS while the helper runs.
postgres = {"append": "django.contrib.postgres"}

In [None]:
@modify_settings(INSTALLED_APPS=postgres)
def rank_fulltext_postgres(cls, search_str, fields=None):
    """Rank all records of a registry with PostgreSQL full-text search.

    Builds a ``SearchVector`` over ``fields`` and annotates every record of
    ``cls`` with ``rank=SearchRank(vector, SearchQuery(search_str))``. No rows
    are filtered out, so records that do not match stay in the result; callers
    typically keep only the top rows with ``.head(n)``.

    Args:
        cls: Django model class (registry) to search; must have an
            ``ontology_id`` field.
        search_str: Text to search for.
        fields: Iterable of field names to build the search vector from.
            Defaults to every ``CharField``/``TextField`` of ``cls``. The
            passed object is never modified.

    Returns:
        ``pandas.DataFrame`` indexed by ``ontology_id`` with a ``rank`` column
        followed by the searched fields, sorted by ``rank`` in descending
        order. Empty (but with the expected columns) if ``cls`` has no records.
    """
    if fields is None:
        fields = [
            field.name
            for field in cls._meta.fields
            if field.get_internal_type() in {"CharField", "TextField"}
        ]
    else:
        # Work on a copy: the caller's list must not grow an "ontology_id"
        # entry as a side effect, and tuples/other iterables are accepted too.
        fields = list(fields)

    vector = SearchVector(*fields)
    query = SearchQuery(search_str)
    qs = cls.objects.annotate(rank=SearchRank(vector, query))

    # "ontology_id" is always fetched because it becomes the index, but it is
    # only part of the search vector when the caller asked for it.
    columns = fields if "ontology_id" in fields else [*fields, "ontology_id"]
    records = qs.values(*columns, "rank").list()

    # Passing `columns=` fixes the column order and keeps an empty result from
    # raising KeyError on column selection.
    df = pd.DataFrame(records, columns=["rank", *columns])
    return df.set_index("ontology_id").sort_values("rank", ascending=False)

In [None]:
# NOTE(review): presumably this fills the CellType registry from its public
# source so the searches below have records to rank — confirm against bionty docs.
bt.CellType.import_from_source()

In [None]:
# Search strings run through each search implementation in the comparison loops.
SEARCH_QUERIES = ("t cell", "stem cell", "b cell", "neural")
# Number of top-ranked rows kept per search for display and for the RBO score.
TOP_N = 20

PostgreSQL full-text search (on `name` only) vs. bionty search

In [None]:
# Object whose `.search` provides the "bionty search" side of the comparison.
ct_public = bt.CellType.public()

for query in SEARCH_QUERIES:
    print("Query:", query)

    # Postgres side: rank on the "name" field only and keep the TOP_N best rows.
    print("postgres search:")
    df_ps = rank_fulltext_postgres(bt.CellType, query, ["name"])
    df_ps = df_ps.head(TOP_N)
    display(df_ps)

    # bionty side: copy the result's index into a "name" column, then re-index
    # by ontology_id so both rankings are keyed the same way.
    df_bt = ct_public.search(query)
    df_bt["name"] = df_bt.index
    df_bt = df_bt.loc[:, ["ontology_id", "name"]]
    df_bt = df_bt.set_index("ontology_id").head(TOP_N)
    print("bionty search:")
    display(df_bt)

    # Similarity of the two ordered ontology_id lists via rbo (p=0.9).
    ids_ps = df_ps.index.to_list()
    ids_bt = df_bt.index.to_list()
    rbo_score = rbo.RankingSimilarity(ids_ps, ids_bt).rbo(p=0.9)
    print("RBO:", rbo_score)

PostgreSQL full-text search (on all text fields) vs. lamindb search

In [None]:
for query in SEARCH_QUERIES:
    print("Query:", query)

    # Postgres side: no explicit fields, so the helper falls back to the
    # model's CharField/TextField columns; keep the TOP_N best rows.
    print("postgres search:")
    df_ps = rank_fulltext_postgres(bt.CellType, query)
    df_ps = df_ps.head(TOP_N)
    display(df_ps)

    # lamindb side: registry search, reduced to the text columns and re-indexed
    # by ontology_id so both rankings are keyed the same way.
    df_ln = bt.CellType.search(query).df()
    df_ln = df_ln.loc[:, ["name", "ontology_id", "description", "synonyms"]]
    df_ln = df_ln.set_index("ontology_id").head(TOP_N)
    print("lamindb search:")
    display(df_ln)

    # Similarity of the two ordered ontology_id lists via rbo (p=0.9).
    ids_ps = df_ps.index.to_list()
    ids_ln = df_ln.index.to_list()
    rbo_score = rbo.RankingSimilarity(ids_ps, ids_ln).rbo(p=0.9)
    print("RBO:", rbo_score)

In [None]:
# Teardown: stop and remove the `pgtest` docker container, delete the
# `benchmark_search` instance without prompting, and remove its local storage
# directory created by `lamin init --storage ./benchmark_search`.
!docker stop pgtest && docker rm pgtest
!lamin delete --force benchmark_search
!rm -r ./benchmark_search