In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch

In [2]:
# Silence noisy library warnings so the evaluation outputs below stay readable.
import warnings
# Ignore every DeprecationWarning, regardless of which module raises it.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# Ignore UserWarning only when it originates from modules matching "torch".
warnings.filterwarnings("ignore", category=UserWarning, module="torch")
# NOTE(review): the message-based filter below is left disabled; the version-mismatch
# message it targets still appears in a later cell's output, so it may not be emitted
# through the `warnings` machinery at all — confirm before re-enabling.
#warnings.filterwarnings("ignore", message="You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.")

# Read dataset

In [3]:
# Load the ground-truth query sets. Three files are read, but only the "_c" set is
# used for the evaluation below (see the active pd.concat call).
# The trailing comments presumably name the podcast categories each file covers — TODO confirm.
df_ground_truth_c = pd.read_csv('../data/processed/eval_ground-truth-data_1000q_c.csv') # 'TV & Film', 'Technology'
df_ground_truth_b = pd.read_csv('../data/processed/eval_ground-truth-data_1000q_b.csv') #Technology, True Crime
df_ground_truth_a = pd.read_csv('../data/processed/eval_ground-truth-data_1000q.csv') #Art
# Alternative: evaluate on all three sets combined.
#df_ground_truth = pd.concat([df_ground_truth_c,df_ground_truth_b,df_ground_truth_a])
df_ground_truth = pd.concat([df_ground_truth_c])
# Alternative, smaller or larger ground-truth files kept for quick experiments.
#df_ground_truth = pd.read_csv('../data/processed/eval_ground-truth-data_10.csv')

#df_ground_truth = pd.read_csv('../data/processed/eval_ground-truth-data_small.csv')
#df_ground_truth = pd.read_csv('../data/processed/eval_ground-truth-data_1500.csv') #1500*5 queries
# One dict per row, so each record can be accessed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')
# Display the first record as a sanity check.
ground_truth[0]

{'query': 'Latest reviews on Marvel and DC movies',
 'category': 'TV & Film',
 'podcast_id': 'id642680582'}

# Keyword-based Search

In [4]:
# Client for the local Elasticsearch instance; the search helpers below read it as a global.
es_client = Elasticsearch('http://localhost:9200')

In [5]:
def elastic_search(query):
    """Run a keyword (multi_match) search and return the matching documents.

    The query text is matched against the ``description`` and ``name`` fields
    (``name`` is boosted by 2) using the ``best_fields`` strategy.

    Args:
        query: Free-text search string.

    Returns:
        A list with the ``_source`` of each hit (at most 10), in the order
        Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["description", "name^2"],
        "type": "best_fields",
    }
    request_body = {
        "size": 10,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    response = es_client.search(
        index='podcasts_multi-qa-minilm-l6-cos-v1__dims_384',
        body=request_body,
    )

    return [hit['_source'] for hit in response['hits']['hits']]

# Semantic Search

In [6]:
def elastic_search_knn(field, vector, index_name):
    """Run an approximate kNN search against a vector field.

    Args:
        field: Name of the vector field to search.
        vector: Query embedding passed as ``query_vector``.
        index_name: Elasticsearch index to query.

    Returns:
        A list with the ``_source`` of each hit, restricted to the
        ``description``, ``name``, ``category`` and ``id`` fields.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 30,
            "num_candidates": 10000,
        },
        "_source": ["description", "name", "category", "id"],
    }

    es_results = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in es_results['hits']['hits']]

def description_vector_knn(query, model_embed, index_name):
    """Embed ``query`` with ``model_embed`` and kNN-search ``description_vector``.

    Args:
        query: Text to embed.
        model_embed: Object exposing ``encode(text)`` that returns the query vector.
        index_name: Elasticsearch index to search.

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded query.
    """
    query_vector = model_embed.encode(query)
    return elastic_search_knn(
        field='description_vector',
        vector=query_vector,
        index_name=index_name,
    )

def description_vector_knn_mpnet(query):
    """Semantic search using the ``multi-qa-mpnet-base-dot-v1`` embedding model.

    The model is loaded once, on the first call, and cached on the function
    object. Previously a new ``SentenceTransformer`` was constructed for every
    query, which repeated the model load on each evaluation step.

    Args:
        query: Text to embed and search for.

    Returns:
        The documents returned by ``description_vector_knn`` for the
        ``podcasts_multi-qa-mpnet-base-dot-v1__dims_768`` index.
    """
    index_name = 'podcasts_multi-qa-mpnet-base-dot-v1__dims_768'
    model_name = 'multi-qa-mpnet-base-dot-v1'
    # Reuse the already-loaded model if this function has been called before.
    model_embed = getattr(description_vector_knn_mpnet, '_model_embed', None)
    if model_embed is None:
        model_embed = SentenceTransformer(model_name)
        description_vector_knn_mpnet._model_embed = model_embed
    return description_vector_knn(query, model_embed=model_embed, index_name=index_name)
    
def description_vector_knn_miniLM(query):
    """Semantic search using the ``multi-qa-MiniLM-L6-cos-v1`` embedding model.

    The model is loaded once, on the first call, and cached on the function
    object. Previously a new ``SentenceTransformer`` was constructed for every
    query, which repeated the model load on each evaluation step.

    Args:
        query: Text to embed and search for.

    Returns:
        The documents returned by ``description_vector_knn`` for the
        ``podcasts_multi-qa-minilm-l6-cos-v1__dims_384`` index.
    """
    index_name = 'podcasts_multi-qa-minilm-l6-cos-v1__dims_384'
    model_name = 'multi-qa-MiniLM-L6-cos-v1'
    # Reuse the already-loaded model if this function has been called before.
    model_embed = getattr(description_vector_knn_miniLM, '_model_embed', None)
    if model_embed is None:
        model_embed = SentenceTransformer(model_name)
        description_vector_knn_miniLM._model_embed = model_embed
    return description_vector_knn(query, model_embed=model_embed, index_name=index_name)
    
def description_vector_knn_distilbert(query):
    """Semantic search using the ``multi-qa-distilbert-cos-v1`` embedding model.

    The model is loaded once, on the first call, and cached on the function
    object. Previously a new ``SentenceTransformer`` was constructed for every
    query, which repeated the model load on each evaluation step.

    Args:
        query: Text to embed and search for.

    Returns:
        The documents returned by ``description_vector_knn`` for the
        ``podcasts_multi-qa-distilbert-cos-v1__dims_768`` index.
    """
    index_name = 'podcasts_multi-qa-distilbert-cos-v1__dims_768'
    model_name = 'multi-qa-distilbert-cos-v1'
    # Reuse the already-loaded model if this function has been called before.
    model_embed = getattr(description_vector_knn_distilbert, '_model_embed', None)
    if model_embed is None:
        model_embed = SentenceTransformer(model_name)
        description_vector_knn_distilbert._model_embed = model_embed
    return description_vector_knn(query, model_embed=model_embed, index_name=index_name)

# Metrics
More evaluation metrics are described in the [LLM Zoomcamp evaluation-metrics guide](https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-vector-search/eval/evaluation-metrics.md).

## Hit rate (or Recall at k)

Measures the proportion of queries for which at least one relevant document is retrieved in the top k results.

Formula: HR@k = (Number of queries with at least one relevant document in top k) / |Q|

In [7]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One list per query; each inner list holds a boolean
            per returned document (``True`` when the document is relevant).

    Returns:
        The hit rate as a float in [0, 1]. An empty ``relevance_total``
        yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    num_queries = len(relevance_total)
    if num_queries == 0:
        # No queries evaluated: report 0.0 rather than dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / num_queries

## Mean Reciprocal Rank (MRR)

Evaluates the rank position of the first relevant document

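Formula: MRR = (1 / |Q|) * Σ (1 / rank_i) for i = 1 to |Q|, where rank_i is the position of the first relevant document for query i; a query with no relevant document in its results contributes 0.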

In [8]:
def mrr(relevance_total):
    """Mean Reciprocal Rank of the first relevant document per query.

    Args:
        relevance_total: One list per query; each inner list holds a boolean
            per returned document, in ranked order.

    Returns:
        The mean over all queries of ``1 / rank`` of the first relevant
        document (1-based). A query with no relevant document contributes 0.
        An empty ``relevance_total`` yields 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    num_queries = len(relevance_total)
    if num_queries == 0:
        # No queries evaluated: report 0.0 rather than dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the FIRST relevant document counts; without this break
                # a query with several relevant hits would add more than one
                # reciprocal rank and could push the score above 1.
                break

    return total_score / num_queries

In [9]:
# Show one ground-truth record again: evaluate() below reads its 'query' and 'podcast_id' keys.
ground_truth[0]

{'query': 'Latest reviews on Marvel and DC movies',
 'category': 'TV & Film',
 'podcast_id': 'id642680582'}

In [10]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    For every record, ``search_function`` is called with the record's
    ``query`` and each returned document's ``id`` is compared with the
    record's ``podcast_id``.

    Args:
        ground_truth: Iterable of dicts with ``query`` and ``podcast_id`` keys.
        search_function: Callable taking a query string and returning a list
            of documents, each with an ``id`` key.

    Returns:
        A dict with the ``hit_rate`` and ``mrr`` computed over all queries.
    """
    def relevance_for(record):
        # Per-result flags: True where the retrieved document is the expected one.
        expected_id = record['podcast_id']
        retrieved = search_function(record['query'])
        return [doc['id'] == expected_id for doc in retrieved]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Let's run evaluation for all models

In [11]:
# Baseline: keyword multi_match search over description and name (name boosted).
evaluate(ground_truth, elastic_search)

  0%|          | 0/972 [00:00<?, ?it/s]

{'hit_rate': 0.8960905349794238, 'mrr': 0.8198714808282712}

In [12]:
# Semantic kNN search with multi-qa-MiniLM-L6-cos-v1 embeddings.
evaluate(ground_truth, description_vector_knn_miniLM)

  0%|          | 0/972 [00:00<?, ?it/s]

{'hit_rate': 0.7098765432098766, 'mrr': 0.5416250244953948}

In [13]:
# Semantic kNN search with multi-qa-mpnet-base-dot-v1 embeddings.
evaluate(ground_truth, description_vector_knn_mpnet)

  0%|          | 0/972 [00:00<?, ?it/s]

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You t

{'hit_rate': 0.6779835390946503, 'mrr': 0.5097622313671695}

In [14]:
# Semantic kNN search with multi-qa-distilbert-cos-v1 embeddings.
evaluate(ground_truth, description_vector_knn_distilbert)

  0%|          | 0/972 [00:00<?, ?it/s]

{'hit_rate': 0.7067901234567902, 'mrr': 0.535610180286106}