In [110]:
import os
import json
import yaml

import numpy as np
import pandas as pd
import requests

from elasticsearch import Elasticsearch

In [111]:
# Elasticsearch client shared by every cell below.
ES_HOST = "http://localhost:9200"
es = Elasticsearch(hosts=ES_HOST)

In [130]:
# Load the notebook configuration. The code below reads
# config["models"][<model>]["index_name"] and config["models"][<model>]["endpoint"].
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of depending on the locale default.
with open("config.yml", encoding="utf-8") as config_file:
  config = yaml.safe_load(config_file)

# Show which models are configured.
print(config['models'].keys())

def _get_index_name(model_name):
  """Resolve the configured alias of `model_name` to a concrete index name.

  Reads the alias from config["models"][model_name]["index_name"], asks
  Elasticsearch for the indices behind that alias and returns the first key
  of the response.
  """
  model_config = config["models"][model_name]
  alias_response = es.indices.get_alias(name=model_config["index_name"])
  index_names = list(alias_response.keys())
  return index_names[0]

# print(_get_index_name('e5-small'))

dict_keys(['e5-small', 'e5-base', 'e5-large', 'e5-instruct', 'minilm', 'seznam-mpnet'])


In [113]:
def get_embeddings(model_name, query):
    """Embed `query` with the inference endpoint configured for `model_name`.

    Args:
        model_name: Key into config["models"]; its "endpoint" entry is the URL
            the request is posted to.
        query: Text to embed. For model names containing "e5-instruct" the
            text is wrapped in an instruction prompt; for other names
            containing "e5" it is prefixed with "query: ".

    Returns:
        The JSON response converted to a NumPy array, flattened and returned
        as a plain one-dimensional list.

    Raises:
        RuntimeError: If the endpoint answers with a non-2xx status. The
            message carries the HTTP status and the "error" field of the
            response when there is one, otherwise the raw response text.
    """
    api_url = config["models"][model_name]["endpoint"]
    api_token = os.getenv("INFERENCE_API_TOKEN")

    # "e5-instruct" must be tested first because it also contains "e5".
    if "e5-instruct" in model_name:
        query = f'Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: {query}'
    elif "e5" in model_name:
        query = f"query: {query}"

    response = requests.post(
        api_url,
        # The Authorization header is sent only when a token is set.
        headers={"Authorization": f"Bearer {api_token}"} if api_token else {},
        params={"wait_for_model": True},
        json={"inputs": query},
        timeout=300,
    )

    if not response.ok:
        # The failure body is not guaranteed to be a JSON object with an
        # "error" key (e.g. an HTML page from a proxy); never let the error
        # reporting itself hide the real failure.
        try:
            detail = response.json()["error"]
        except (ValueError, KeyError, TypeError):
            detail = response.text
        raise RuntimeError(
            f"Inference request for {model_name!r} failed "
            f"with HTTP {response.status_code}: {detail}"
        )

    return np.array(response.json()).flatten().tolist()

In [114]:
# Model whose index the `_id` values in `input_ratings` belong to; the ids are
# fetched from the index resolved via `_get_index_name(input_index_name)`.
input_index_name = 'e5-instruct'

# Hand-assigned ratings, one entry per document. Only `_id` and `rating` are
# read by the cells below; `title` is a human-readable label (the remapping
# step reads the title from the indexed document instead).
# NOTE(review): the ratings look like graded relevance (higher = more relevant)
# for the `eval_query` defined further down — confirm.
input_ratings = [
  {"title": "Staroměstský orloj",      "rating": 5, "_id": "suUx340Bb3AqLNPefoTf"},
  {"title": "Orloj",                   "rating": 4, "_id": "6-Ux340Bb3AqLNPefoLe"},
  {"title": "Mistr Hanuš",             "rating": 3, "_id": "aegy340Bb3AqLNPeMUWQ"},
  {"title": "Mikuláš z Kadaně",        "rating": 3, "_id": "cecy340Bb3AqLNPeFtSK"},
  {"title": "Staroměstská radnice",    "rating": 2, "_id": "eOUx340Bb3AqLNPem_Ce"},
  {"title": "Staroměstské náměstí",    "rating": 2, "_id": "KeUx340Bb3AqLNPeiKIW"},
  {"title": "Hodinová věž",            "rating": 1, "_id": "BOcy340Bb3AqLNPeELeA"},
  {"title": "Jakub Čech (hodinář)",    "rating": 1, "_id": "peky340Bb3AqLNPefocl"},
  {"title": "Cisiojan",                "rating": 1, "_id": "5eUx340Bb3AqLNPedHIA"},
  {"title": "Brněnský orloj",          "rating": 0, "_id": "oOcy340Bb3AqLNPeFtWK"},
  {"title": "Olomoucký orloj",         "rating": 0, "_id": "XeUx340Bb3AqLNPeiaoE"},
  {"title": "Ostravský orloj",         "rating": 0, "_id": "ouky340Bb3AqLNPegpcZ"},
  {"title": "Pohádkový orloj",         "rating": 0, "_id": "gucx340Bb3AqLNPe_We0"},
  {"title": "Kadaňský orloj",          "rating": 0, "_id": "0eoy340Bb3AqLNPeyo8T"},
  {"title": "Hospodský orloj",         "rating": 0, "_id": "duky340Bb3AqLNPehaSJ"},
  {"title": "Chmelový orloj",          "rating": 0, "_id": "0-wz340Bb3AqLNPeVyP5"},
  {"title": "Ovocný trh (Praha)",      "rating": 0, "_id": "e-cy340Bb3AqLNPeCpVV"},
  {"title": "Nábřeží Maxipsa Fíka",    "rating": 0, "_id": "zuoy340Bb3AqLNPevWUA"},
  {"title": "Seznam gymnázií v Česku", "rating": 0, "_id": "ZOcy340Bb3AqLNPeEsSL"},
  {"title": "Seznam ulic v Praze",     "rating": 0, "_id": "P-wz340Bb3AqLNPetut7"},]

In [115]:
# es.get(index=_get_index_name('e5-instruct'), id=input_ratings[12]['_id'])['_source']['title']
# es.search(index="wikipedia-search-v5", q='title:"Staroměstský orloj"')

In [116]:
# Re-key the hand-made ratings for every model's index.
#
# The `_id`s in `input_ratings` are looked up in the index behind
# `input_index_name`; for every other index the matching document is found by
# its title, and the rating is attached to the id found there.

def _find_doc_id_by_title(index_name, title, size=10):
  """Return the `_id` of the document in `index_name` titled `title`.

  A phrase query also matches longer titles that merely contain the phrase
  (searching "Orloj" matches every "... orloj" article), so the hit whose
  title is exactly `title` is preferred. When none of the first `size` hits
  matches exactly, the best-scoring hit is returned.

  Raises:
    LookupError: If the search returns no hit at all.
  """
  hits = es.search(
    index=index_name,
    # Structured query instead of a `title:"..."` query string, so quotes or
    # other query-syntax characters inside a title cannot break the request.
    query={"match_phrase": {"title": title}},
    size=size,
    source_includes=["title"],
  )["hits"]["hits"]

  if not hits:
    raise LookupError(f"No document titled {title!r} found in index {index_name!r}")

  for hit in hits:
    if hit["_source"]["title"] == title:
      return hit["_id"]
  return hits[0]["_id"]


# The source documents do not depend on the model, so resolve the source index
# and read the rated titles once instead of once per model.
source_index = _get_index_name(input_index_name)
rated_titles = [
  (
    es.get(index=source_index, id=item['_id'], source_includes=['title'])['_source']['title'],
    item['rating'],
  )
  for item in input_ratings
]

output_ratings = {}

for model_name in config['models'].keys():
  # Resolve the alias once per model rather than for every rated document.
  model_index = _get_index_name(model_name)
  output_ratings[model_name] = [
    {
      "title": title,
      "rating": rating,
      "_index": model_index,
      "_id": _find_doc_id_by_title(model_index, title),
    }
    for title, rating in rated_titles
  ]

# output_ratings

In [117]:
def query_lexical(query):
    """Build the search body for the lexical (full-text) baseline.

    The body is a nested query over `parts` that runs a `match` on
    `parts.chunk` with the given query text.
    """
    match_clause = {"match": {"parts.chunk": query}}
    nested_clause = {"path": "parts", "query": match_clause}
    return {"query": {"nested": nested_clause}}

# query_lexical('foo')

In [118]:
def query_semantic(model_name, query):
    """Build the search body for the semantic (kNN) search of one model.

    The query text is embedded with `get_embeddings(model_name, query)` and
    the resulting vector is used in a `knn` clause on `parts.embedding`,
    wrapped in a nested query over `parts`.
    """
    knn_clause = {
        "field": "parts.embedding",
        "query_vector": get_embeddings(model_name, query),
        "num_candidates": 1000,
    }
    nested_clause = {"path": "parts", "query": {"knn": knn_clause}}
    return {"query": {"nested": nested_clause}}

# query_semantic('e5-small', 'foo')

In [119]:
# Query text every search variant is evaluated with.
eval_query = "staroměstský orloj"
# Default rank cut-off `k` of the nDCG metric.
eval_size = 10

def run_evaluations(client, definitions, k=None):
    """Run one rank-evaluation call per definition and collect the responses.

    Args:
        client: Object exposing `rank_eval(index=..., metric=..., requests=...)`.
        definitions: Iterable of dicts. The "index" entry names the index to
            evaluate against; all remaining entries form the rated request
            sent to the API. The dicts are not modified.
        k: Rank cut-off of the normalized DCG metric. Defaults to the
            module-level `eval_size` at call time.

    Returns:
        List of the `rank_eval` responses, in the order of `definitions`.

    Raises:
        KeyError: If a definition has no "index" entry.
    """
    metric = {"dcg": {"k": eval_size if k is None else k, "normalize": True}}

    responses = []
    for definition in definitions:
        rated_request = {key: value for key, value in definition.items() if key != "index"}
        responses.append(
            client.rank_eval(
                index=definition["index"],
                metric=metric,
                # The API documents `requests` as an array of rated requests,
                # so the single request is wrapped in a list.
                requests=[rated_request],
            )
        )
    return responses

In [120]:
# One rank-evaluation definition per search variant: the lexical baseline
# first, then one semantic (kNN) definition per model in `output_ratings`.
definitions = []

# The lexical baseline runs against the index the `input_ratings` ids come
# from, so it is derived from `input_index_name` rather than a repeated
# literal. The alias is resolved once instead of once per rating row.
lexical_index = _get_index_name(input_index_name)

definitions.append(
    {
        "id": "lexical",
        "index": lexical_index,
        "request": query_lexical(eval_query),
        "ratings": [
            {
                "_index": lexical_index,
                "_id": item["_id"],
                "rating": item["rating"],
            }
            for item in input_ratings
        ],
    }
)

for model_name, ratings in output_ratings.items():
    # Resolve the alias once per model instead of once per rating row.
    model_index = _get_index_name(model_name)
    definitions.append({
        "id": model_name,
        "index": model_index,
        "request": query_semantic(model_name, eval_query),
        "ratings": [
            {
                "_index": model_index,
                "_id": item["_id"],
                "rating": item["rating"],
            }
            for item in ratings
        ]
    })

# definitions

In [121]:
# Run every definition and keep each response as a plain dict.
eval_results = []
for response in run_evaluations(es, definitions):
    eval_results.append(dict(response))

# print(yaml.dump(eval_results[1]['details']))

In [122]:
# Plain-text score table: a header line, a rule, then one row per request id.
header = f"nDCG@{eval_size}".rjust(25)
rule = '-'*50
print(header, rule, sep="\n")

for evaluation in eval_results:
  per_request = evaluation['details']
  for name in per_request:
    score = per_request[name]['metric_score']
    print(f"{name:>15}  ", score)

                  nDCG@10
--------------------------------------------------
        lexical   0.16687993983753535
       e5-small   0.8768083590039725
        e5-base   0.5798168917598391
       e5-large   0.8532545915999133
    e5-instruct   0.8067346399279625
         minilm   0.0
   seznam-mpnet   0.0


In [123]:
# Attach `title` and `url` to every hit of every evaluation so the results can
# be read without looking documents up by id. Mutates `eval_results` in place.
for result in eval_results:
    for details in result['details'].values():
        hits = details['hits']
        if not hits:
            # A request that returned nothing has no index to read from and
            # nothing to enrich.
            continue

        # All hits of one request are read from the index of the first hit.
        index_name = hits[0]['hit']['_index']
        doc_ids = [hit['hit']['_id'] for hit in hits]

        docs = es.mget(
            index=index_name, ids=doc_ids, source_includes=['title', 'url'], filter_path=['docs._id', 'docs._source']
        )['docs']

        # Pair hits with documents by `_id` rather than by list position.
        sources_by_id = {doc['_id']: doc['_source'] for doc in docs}

        for hit in hits:
            source = sources_by_id[hit['hit']['_id']]
            hit.update({
                "title": source['title'],
                "url": source['url'],
            })

In [124]:
# eval_results[0]['details']

In [125]:
# Flatten the evaluations into one summary record per evaluated request:
# request id, metric label, score, and the title/url of every returned hit.
results = []
for evaluation in eval_results:
  for metric, details in evaluation['details'].items():
    hit_summaries = [
      {"title": hit['title'], "url": hit['url']}
      for hit in details['hits']
    ]
    results.append({
      "metric": metric,
      "metric_name": f"nDCG@{eval_size}",
      "score": details['metric_score'],
      "hits": hit_summaries,
    })

In [126]:
# Persist the summary as JSON. `ensure_ascii=False` writes the Czech titles
# as-is, so the file must be opened as UTF-8 explicitly; with the locale
# default the dump fails on platforms whose default encoding cannot represent
# them. The output directory is created when missing.
os.makedirs("tmp", exist_ok=True)
with open("tmp/rank_eval_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [127]:
from IPython.display import Markdown, display

# Best score first. Sorted in place, as before, so `results` keeps this order
# for anything that runs afterwards.
results.sort(key=lambda x: x['score'], reverse=True)

# One column per evaluated request ("<id> / <score>") listing its hit titles
# in rank order. Columns are built as Series so requests that returned fewer
# hits than the others are padded instead of making the constructor fail on
# unequal column lengths.
df = pd.DataFrame(data={
  f"{result['metric']} / {result['score']:.3f}": pd.Series(
    [hit['title'] for hit in result['hits']], dtype="object"
  )
  for result in results
}).fillna("")

display(Markdown(df.to_markdown(index=False,tablefmt="grid")))

+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| e5-small / 0.877                         | e5-large / 0.853              | e5-instruct / 0.807           | e5-base / 0.580           | lexical / 0.167               | minilm / 0.000             | seznam-mpnet / 0.000                  |
+==========================================+===============================+===============================+===========================+===============================+============================+=======================================+
| Staroměstský orloj                       | Staroměstský orloj            | Staroměstský orloj            | Ovocný trh (Praha)        | Kadaňský orloj                | Ruské kolo                 | Bosiljevo                             |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Orloj                                    | Orloj                         | Orloj                         | Staroměstský orloj        | Jakub Čech (hodinář)          | Dlask tlustozobý           | Bělečko                               |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Jakub Čech (hodinář)                     | Ovocný trh (Praha)            | Slovenský orloj               | Orloj                     | Pohádkový orloj v Ostravě     | Čečetka tmavá              | Česká Rybná                           |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Mistr Hanuš                              | Dům U Bílého orla (Rytířská)  | Dům U Červeného orla (Jilská) | Pohádkový orloj v Ostravě | Chmelový orloj                | Komerční aranžování květin | Oldřiš (rozcestník)                   |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Pokojový orloj Jana Maška                | Dům U Červeného orla (Jilská) | Ostravský orloj               | Chmelový orloj            | Jan Táborský z Klokotské Hory | Helheim                    | Seznam budov na Staroměstském náměstí |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Slovenský orloj                          | Kadaňský orloj                | Pohádkový orloj v Ostravě     | Křižovnické náměstí       | Orloj                         | Křížová cesta (Žulová)     | Skupice (Postoloprty)                 |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Pohádkový orloj v Ostravě                | Mistr Hanuš                   | Stará radnice (Prachatice)    | Očnice                    | Ludvík Hainz                  | NGC 6752                   | Na Perštýně (Praha)                   |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Jan Táborský z Klokotské Hory            | Slovenský orloj               | Dům U Bílého orla (Rytířská)  | Slovenský orloj           | Seznam osobností Prahy        | Hyperbola                  | Římskokatolická farnost Vrbice        |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Olomoucký orloj                          | Pohádkový orloj v Ostravě     | Jan Táborský z Klokotské Hory | Pokojový orloj Jana Maška | Mistr Hanuš                   | Vratislav Bělík            | Velké Přílepy                         |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+
| Chronologie starověkého Blízkého východu | Staré Město (Praha)           | Kadaňský orloj                | Mistr Hanuš               | Cisiojan                      | Paježura Attenboroughova   | Stará Ves (Stará Ves nad Ondřejnicí)  |
+------------------------------------------+-------------------------------+-------------------------------+---------------------------+-------------------------------+----------------------------+---------------------------------------+

### Debug

In [128]:
# r = run_evaluations(es, definitions[4:5])
# print(yaml.dump(r[0].body))

In [129]:
# r = es.search(index="wikipedia-search-v5", body=query_semantic('minilm', eval_query), _source_excludes=["parts","url"])
# print(yaml.dump(r.body['hits']['hits']))