In [34]:
import os
import json

import pandas as pd

In [35]:
from tqdm.auto import tqdm

In [36]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode text into vectors.
# dim_model is the embedding width recorded for this model.
model_name = "all-mpnet-base-v2"
dim_model = 768
model = SentenceTransformer(model_name)



In [37]:
# Sanity check: encoding a single string should yield a 1-D vector whose
# length matches dim_model (768).
model.encode('Hello World').shape

(768,)

In [38]:
from dotenv import load_dotenv

# Only call load_dotenv() when a ".env" file exists at this relative path
# (i.e. in the current working directory); otherwise the process environment
# is used as-is.
if os.path.isfile(".env"):
    load_dotenv()

In [39]:
# Load the evaluation set from CSV and convert it to a list of dicts,
# one dict per CSV row (orient='records').
df_ground_truth = pd.read_csv('df_ground_truth.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [40]:
# Peek at the first record to confirm which keys are available.
ground_truth[0]

{'document_id': '77612833d7fd891bbd5300974dd06ec6',
 'question': 'What evidence does Lee Strobel present to support the idea that Jesus rose from the dead?',
 'title': 'The Case for Easter: Journalist Investigates the Evidence for the Resurrection',
 'authors': "['Lee Strobel']",
 'categories': "['Religion']"}

In [41]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: Sequence of per-query relevance lists. Each inner
            list holds one boolean per returned result, in rank order.

    Returns:
        Share of inner lists that contain ``True``, as a float in [0, 1].
        Returns 0.0 when ``relevance_total`` is empty instead of raising
        ``ZeroDivisionError``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total_queries

In [42]:
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query, the score is ``1 / rank`` of the FIRST relevant result
    (rank is 1-based), or 0 when no result is relevant. The final value is
    the average of these per-query scores.

    Args:
        relevance_total: Sequence of per-query relevance lists. Each inner
            list holds one boolean per returned result, in rank order.

    Returns:
        MRR as a float in [0, 1]. Returns 0.0 when ``relevance_total`` is
        empty instead of raising ``ZeroDivisionError``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # result list with several matches would push MRR above 1.0.
                break

    return total_score / total_queries

In [43]:
def evaluate(ground_truth, search_function):
    """Run every ground-truth query through a search function and score it.

    NOTE(review): a second ``evaluate`` is defined further down in this
    notebook and shadows this one once that cell has run.

    Args:
        ground_truth: Iterable of dict records, each with a 'document_id'
            key naming the expected document.
        search_function: Callable that takes one ground-truth record and
            returns a list of result dicts, each with a 'document_id' key.

    Returns:
        Dict with 'hit_rate' and 'mrr' computed over all queries.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        # Ground-truth records and search results both expose the id under
        # 'document_id' (not 'document' / 'id', which raised KeyError).
        doc_id = q['document_id']
        results = search_function(q)
        relevance = [d['document_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [44]:
def elastic_search_knn(vector, es_client, field="query_vector", es_index="default",
                       k=5, num_candidates=10000):
    """Run a kNN vector search against Elasticsearch and return the hit sources.

    Args:
        vector: Query embedding, sent as the ``query_vector`` of the kNN clause.
        es_client: Client object exposing ``search(index=..., body=...)``.
        field: Name of the indexed vector field to search.
        es_index: Name of the index to query.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Number of candidates considered by the kNN search.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        List with the ``_source`` dict of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        # Restrict the payload to the fields used downstream.
        "_source": [
            "title",
            "review_summary",
            "review_text",
            "description",
            "authors",
            "publisher",
            "categories",
            "review_score",
            "document_id",
        ],
    }
    es_results = es_client.search(
        index=es_index,
        body=search_query,
    )
    return [hit['_source'] for hit in es_results['hits']['hits']]

In [45]:
from elasticsearch import Elasticsearch

# Connection settings come from the environment, with local-development
# fallbacks when the variables are unset.
ELASTICSEARCH_URI = os.getenv('ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_INDEX = os.getenv('ELASTICSEARCH_INDEX', 'default-index-name')
# Shared client used by the search helpers below.
es_client = Elasticsearch(ELASTICSEARCH_URI) 

In [46]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth set.

    Each record is passed to ``search_function``; every returned document is
    marked relevant when its 'document_id' equals the record's 'document_id'.

    Returns:
        Dict with the aggregate 'hit_rate' and 'mrr', the per-query
        'relevance' lists, and the expected ids under 'ground_truth'.
    """
    expected_ids = []
    per_query_relevance = []

    for record in tqdm(ground_truth):
        expected_id = record['document_id']
        hits = search_function(record)
        per_query_relevance.append(
            [hit['document_id'] == expected_id for hit in hits]
        )
        expected_ids.append(expected_id)

    metrics = dict(
        hit_rate=hit_rate(per_query_relevance),
        mrr=mrr(per_query_relevance),
        relevance=per_query_relevance,
        ground_truth=expected_ids,
    )
    return metrics

In [47]:
def question_text_vector_knn(q):
    """Embed the record's question and kNN-search the 'text_vector' field.

    Uses the notebook-level ``model``, ``es_client`` and
    ``ELASTICSEARCH_INDEX``; returns whatever ``elastic_search_knn`` returns.
    """
    return elastic_search_knn(
        vector=model.encode(q['question']),
        es_client=es_client,
        field='text_vector',
        es_index=ELASTICSEARCH_INDEX,
    )

In [48]:
# Evaluate retrieval over the whole ground-truth set, searching the
# 'text_vector' field with the embedded question.
results = evaluate(ground_truth, question_text_vector_knn)

100%|██████████| 1000/1000 [00:46<00:00, 21.40it/s]


In [49]:
# Show only the aggregate metrics; the per-query lists stay in `results`.
# NOTE(review): the recorded run shows 0.0 for both metrics over 1000 queries.
# That more likely points to a document_id mismatch between the ground truth
# and the queried index (or a wrong index name) than to genuinely poor
# retrieval - TODO confirm.
{
    'hit_rate': results['hit_rate'],
    'mrr': results['mrr'],
}

{'hit_rate': 0.0, 'mrr': 0.0}