In [1]:
import os
import json

import pandas as pd

In [2]:
from tqdm.auto import tqdm

In [3]:
from sentence_transformers import SentenceTransformer

# Embedding model used to encode questions, together with the size of the
# vectors it produces (confirmed by the shape check in the next cell).
model_name = "all-mpnet-base-v2"
dim_model = 768
model = SentenceTransformer(model_name)

In [4]:
# Sanity check: encoding a sample sentence should yield a vector with
# dim_model (768) components.
model.encode('Hello World').shape

(768,)

In [5]:
from dotenv import load_dotenv

# Only call load_dotenv() when a .env file exists in the working directory;
# otherwise the os.getenv lookups later in the notebook fall back to the
# process environment or their defaults.
if os.path.isfile(".env"):
    print("using .env file")
    load_dotenv()

using .env file


In [6]:
# Load the evaluation questions and convert the frame into a list of dicts,
# one per CSV row, which is the shape the evaluation loop iterates over.
df_ground_truth = pd.read_csv('df_ground_truth.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [7]:
# Peek at the first record to see which keys each ground-truth entry carries.
ground_truth[0]

{'document_id': '8bfa3b6f784445636db49a39649fd346',
 'question': 'What makes the 5th Marine Regiment Sniper Platoon a standout group during the Vietnam War according to the author?',
 'title': '13 Cent Killers: The 5th Marine Snipers in Vietnam',
 'authors': "['John Culbertson']",
 'categories': "['History']"}

In [8]:
def hit_rate(relevance_total):
    """Return the fraction of queries that retrieved the expected document.

    Args:
        relevance_total: Sequence with one entry per query. Each entry is a
            sequence of boolean flags, one per retrieved result, where
            ``True`` marks a result that matches the expected document.

    Returns:
        Share of queries (between 0.0 and 1.0) whose flags contain at least
        one ``True``. Returns 0.0 when ``relevance_total`` is empty.
    """
    total_queries = len(relevance_total)

    # Without this guard an empty evaluation set raises ZeroDivisionError.
    if total_queries == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total_queries

In [9]:
def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (ranks start at 1). Queries with no relevant result
    contribute 0.

    Args:
        relevance_total: Sequence with one entry per query. Each entry is a
            sequence of boolean flags ordered by rank, where a true flag
            marks a result that matches the expected document.

    Returns:
        Mean reciprocal rank between 0.0 and 1.0. Returns 0.0 when
        ``relevance_total`` is empty.
    """
    total_queries = len(relevance_total)

    # Without this guard an empty evaluation set raises ZeroDivisionError.
    if total_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; summing every hit would let a query
                # with several matching results score above 1.0.
                break

    return total_score / total_queries

In [10]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each must provide the expected
            document under the ``'document_id'`` key.
        search_function: Callable that takes one ground-truth record and
            returns an iterable of result dicts, each with a
            ``'document_id'`` key, ordered by rank.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        # The ground-truth records and the `_source` fields requested from
        # Elasticsearch both name the identifier 'document_id'; the previous
        # 'document' / 'id' keys are absent and raised KeyError.
        doc_id = q['document_id']
        results = search_function(q)
        relevance = [d['document_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [11]:
def elastic_search_knn(vector, es_client, field="query_vector", es_index="default",
                       k=5, num_candidates=10000):
    """Run a kNN vector search and return the `_source` of each hit.

    Args:
        vector: Query embedding to search with.
        es_client: Client object exposing ``search(index=..., body=...)``.
        field: Name of the indexed vector field to search against.
        es_index: Name of the index to query.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Value sent as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        List of `_source` dicts, in the order the hits were returned, limited
        to the fields listed in the request's ``_source`` filter.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        # Restrict the payload to the fields the notebook uses;
        # 'document_id' is what the evaluation compares against.
        "_source": [
            "title",
            "review_summary",
            "review_text",
            "description",
            "authors",
            "publisher",
            "categories",
            "review_score",
            "document_id",
        ],
    }
    es_results = es_client.search(
        index=es_index,
        body=search_query,
    )
    return [hit['_source'] for hit in es_results['hits']['hits']]

In [12]:
from elasticsearch import Elasticsearch

# Connection settings are read from the environment, falling back to local
# defaults. Note that the environment variable is named ELASTICSEARCH_URL
# while the Python constant is ELASTICSEARCH_URI.
ELASTICSEARCH_URI = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')
ELASTICSEARCH_INDEX = os.getenv('ELASTICSEARCH_INDEX', 'default-index-name')
es_client = Elasticsearch(ELASTICSEARCH_URI)

In [13]:
def evaluate(ground_truth, search_function):
    """Evaluate a search function and keep the per-query details.

    Args:
        ground_truth: Iterable of records, each with a ``'document_id'`` key
            naming the expected document.
        search_function: Callable that takes one record and returns an
            iterable of result dicts, each with a ``'document_id'`` key.

    Returns:
        Dict with the aggregate ``'hit_rate'`` and ``'mrr'``, the per-query
        relevance flags under ``'relevance'``, and the expected document ids
        under ``'ground_truth'`` (both in the order of ``ground_truth``).
    """
    expected_ids = []
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected = record['document_id']
        retrieved = search_function(record)

        # One flag per retrieved result: does it match the expected document?
        flags = []
        for doc in retrieved:
            flags.append(doc['document_id'] == expected)

        per_query_flags.append(flags)
        expected_ids.append(expected)

    metrics = {
        'hit_rate': hit_rate(per_query_flags),
        'mrr': mrr(per_query_flags),
    }
    metrics['relevance'] = per_query_flags
    metrics['ground_truth'] = expected_ids
    return metrics

In [14]:
def question_text_vector_knn(q):
    """Search the 'text_vector' field with the embedding of a question.

    Args:
        q: Ground-truth record; its ``'question'`` value is encoded with the
            notebook-level ``model``.

    Returns:
        Whatever ``elastic_search_knn`` returns for that embedding when run
        against ``ELASTICSEARCH_INDEX`` using ``es_client``.
    """
    embedding = model.encode(q['question'])
    return elastic_search_knn(
        vector=embedding,
        es_client=es_client,
        field='text_vector',
        es_index=ELASTICSEARCH_INDEX,
    )

In [15]:
# Run the full evaluation: every ground-truth question is encoded and sent
# through the kNN search on the 'text_vector' field.
results = evaluate(ground_truth, question_text_vector_knn)

  0%|          | 0/992 [00:00<?, ?it/s]

In [16]:
# Display only the aggregate metrics; the per-query details stay in `results`.
{metric: results[metric] for metric in ('hit_rate', 'mrr')}

{'hit_rate': 0.3709677419354839, 'mrr': 0.286021505376344}