# Retrieval evaluation

In [2]:
# Imports
import pandas as pd
import h5py
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
import os
import random
from tqdm.notebook import trange, tqdm
import pickle
import gzip
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import torch

  from tqdm.autonotebook import tqdm, trange


## 1 - Load the data

In [3]:
# Create a function to load the data and apply all the cleaning developed in the Ingestion notebook
def load_dataset(data_path="../data/ancient_sources.csv.gz"):
    """Load the sources CSV, clean it and attach a normalized embedding to each record.

    Cleaning steps: the original row index is stored in an ``Id`` column, rows
    without ``text`` are removed, missing ``section`` values become ``""`` and
    every text is truncated by ``truncate_texts``.

    Args:
        data_path: Path to the gzip-compressed CSV file.

    Returns:
        A list of dicts (one per remaining row) holding the CSV columns, ``Id``
        and a ``text_embedding`` entry.
    """
    raw_df = pd.read_csv(data_path, compression="gzip")

    # Append index as ID (taken before any row is removed)
    raw_df["Id"] = raw_df.index

    # Delete rows without text; work on a copy instead of mutating in place
    clean_df = raw_df.dropna(subset=["text"]).copy()

    # Transform rows with NaN section to ""
    clean_df["section"] = clean_df["section"].fillna("")

    # Truncate longest texts
    clean_df["text"] = clean_df["text"].apply(truncate_texts)

    # Transform into a list of dicts
    df_dict = clean_df.to_dict(orient="records")

    # Load embeddings and append to each dict.
    # NOTE(review): embeddings are matched by position in the cleaned list, not
    # by "Id" - this assumes the embeddings file was built from the same
    # cleaned row order; confirm against the Ingestion notebook.
    embeddings = load_embeddings()
    normalized_embeddings = np.apply_along_axis(normalize_vector, 1, embeddings)
    for i, source in enumerate(df_dict):
        source["text_embedding"] = normalized_embeddings[i]

    return df_dict
    

def count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)


def truncate_texts(text, max_num_words=512):
    """Keep at most ``max_num_words`` words of ``text``, joined by single spaces.

    The text is split on whitespace, so runs of whitespace collapse to one
    space even when nothing is cut off.
    """
    kept_words = text.split()[:max_num_words]
    return " ".join(kept_words)


def load_embeddings(path="../data/embeddings.h5"):
    """Read the whole ``'embeddings'`` dataset from the HDF5 file at ``path``.

    The file is opened read-only and closed again before returning.
    """
    with h5py.File(path, 'r') as h5_file:
        # Slicing with [:] loads the full dataset into memory
        return h5_file['embeddings'][:]


def load_title_embeddings(path="../data/title_embeddings.h5"):
    """Read the whole ``'title_embeddings'`` dataset from the HDF5 file at ``path``.

    Args:
        path: Location of the HDF5 file; defaults to the project data folder.

    Returns:
        The full content of the ``'title_embeddings'`` dataset.
    """
    # Open the file the caller asked for rather than a fixed location
    with h5py.File(path, 'r') as hf:
        dataset = hf['title_embeddings']

        # Load the data into a NumPy array
        title_embeddings_array = dataset[:]
        return title_embeddings_array


def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean length.

    A vector whose norm is zero is returned unchanged to avoid dividing by zero.
    """
    length = np.linalg.norm(vector)
    return vector if length == 0 else vector / length

In [4]:
# Load the cleaned records: a list of dicts, each carrying its normalized text embedding
df_dict = load_dataset()

## 2 - Generate ground truth dataset

In [5]:
# Login keys: read environment variables from a local .env file
# (presumably including the OpenAI API key - confirm against the project's .env)
load_dotenv()

True

In [6]:
# Client used by generate_question() to interact with OpenAI
client = OpenAI()

In [7]:
# Prompt template asking the LLM for one specific question about a record.
# The {author}, {title} and {text} placeholders are filled in by generate_prompt().
prompt_template = """
You are an expert historian tasked with generating a question based on the following historical record. 
Your question should be specific and answerable using the information in the record. However, you must avoid repeating too many exact 
words from the record. Focus on key details like dates, persons, events, the author or title. Keep the question concise 
and insightful.

Historical record:

Author: {author}
Title: {title}
Text: {text}

Question:
""".strip()

def generate_prompt(data, prompt_template=prompt_template):
    """Fill the placeholders of ``prompt_template`` with the fields of ``data``.

    ``data`` is a mapping whose keys are passed to ``str.format`` as keyword
    arguments; keys the template does not mention are ignored.
    """
    filled_prompt = prompt_template.format(**data)
    return filled_prompt

In [8]:
def generate_question(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text."""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )

    # Only the first returned choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content

Let's generate 1000 questions about 1000 randomly chosen ancient sources in our list:

In [9]:
# Keep only the records whose text is long enough to be relevant (at least 20 words)
df_dict_filtered = [
    record for record in df_dict if len(record["text"].split()) >= 20
]

In [15]:
# Choose 1000 random records.
# Seed the generator first so that a fresh, top-to-bottom run draws the same
# records here and in the later random.sample / random.shuffle calls.
random.seed(42)
random_elements = random.sample(df_dict_filtered, 1000)

In [18]:
# Ground-truth entries: one {"id", "question"} dict per sampled record
ground_truth = []

for elem in tqdm(random_elements):
    # Ask the LLM for a question about this record and keep the record's Id
    # as the expected search result for that question
    prompt = generate_prompt(elem)
    question = generate_question(prompt)
    question_with_answer_id = {"id": elem["Id"], "question": question}
    ground_truth.append(question_with_answer_id)

  0%|          | 0/1000 [00:00<?, ?it/s]

Let's generate another 500 questions that ask for the author and title:

In [40]:
# Prompt asking for a question about which authors and titles discuss the text.
# Only the {text} placeholder is used, so author and title are not shown to the model.
prompt_template_author = """
You are a skilled history student. Your task is to create a question about the provided historical text that 
focuses on identifying the authors and titles that discuss the events and persons mentioned in the text. Your question 
should encourage an in-depth search for relevant historical references, avoiding direct reuse of the text's wording.

Historical text: {text}

Question:
""".strip()

In [28]:
# Choose 500 random records.
# This is a fresh draw from the same filtered list, so it may overlap with the
# records sampled earlier.
random_elements = random.sample(df_dict_filtered, 500)

In [41]:
# Generate one author/title-oriented question per sampled record and append it
# to the existing ground_truth list
for elem in tqdm(random_elements):
    prompt = generate_prompt(elem, prompt_template=prompt_template_author)
    question = generate_question(prompt)
    question_with_answer_id = {"id": elem["Id"], "question": question}
    ground_truth.append(question_with_answer_id)

  0%|          | 0/500 [00:00<?, ?it/s]

And finally another 500 more generic questions:

In [None]:
# Prompt asking for a simple, beginner-level question.
# Uses the {title} and {text} placeholders; the author is not shown to the model.
prompt_template_generic = """
Imagine you are a history student who is just starting to learn about ancient texts. Your task is to create a straightforward question 
based on the provided historical text. The question should be simple and directly answerable using the information given in the text and its title.

Title: {title}
Text: {text}

Question:
""".strip()

In [43]:
# Choose 500 random records.
# This is a fresh draw from the same filtered list, so it may overlap with the
# records sampled earlier.
random_elements = random.sample(df_dict_filtered, 500)

In [44]:
# Generate one generic question per sampled record and append it to the
# existing ground_truth list
for elem in tqdm(random_elements):
    prompt = generate_prompt(elem, prompt_template=prompt_template_generic)
    question = generate_question(prompt)
    question_with_answer_id = {"id": elem["Id"], "question": question}
    ground_truth.append(question_with_answer_id)

  0%|          | 0/500 [00:00<?, ?it/s]

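Finally, let's add another 1000 questions with a different prompt (**TODO**: the prompt below is still an unfinished draft and no questions are generated from it yet, so the saved ground truth contains the 2000 questions generated above):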

In [None]:
# TODO: unfinished draft - the template has no instruction text yet and no
# cell in this notebook uses it to generate questions.
prompt_template_mine = """

Title: {title}
Text: {text}

Question:
""".strip()

In [45]:
# Shuffle the list in place so the questions from the different prompts are mixed
random.shuffle(ground_truth)

# Save the ground-truth dataset as a gzip-compressed pickle
with gzip.open('../data/ground_truth.pkl.gz', 'wb') as f:
    pickle.dump(ground_truth, f, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Load the data back.
# NOTE: unpickling can execute arbitrary code - only load a file that was
# written by the save cell of this notebook.
with gzip.open('../data/ground_truth.pkl.gz', 'rb') as f:
    ground_truth = pickle.load(f)

## 3 - Rewrite search functions

In [6]:
# Elastic client, pointing at a local Elasticsearch instance
es_client = Elasticsearch("http://localhost:9200")

# The index should be already created and the info indexed in the container;
# this notebook only queries it
index_name = "ancient_sources_db_index"

In [7]:
# Load the msmarco-roberta-base-ance-firstp model; es_search() uses it to embed each query
model = SentenceTransformer('msmarco-roberta-base-ance-firstp')

# The vectors have to be normalized in order to use dot product similarity in elasticsearch.
# NOTE: this repeats the normalize_vector defined in the data-loading section.
def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean length; a zero-norm vector is returned unchanged."""
    length = np.linalg.norm(vector)
    return vector if length == 0 else vector / length

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

3_LayerNorm/config.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

3_LayerNorm/model.safetensors:   0%|          | 0.00/6.33k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/7.14k [00:00<?, ?B/s]

In [9]:
def es_search(query, num_results=5, boost=0.5, num_candidates=1000):
    """Run a combined vector + keyword search for ``query`` and return the hits.

    Args:
        query: Question text; used both for the embedding and for keyword matching.
        num_results: Number of hits requested (also used as ``k`` of the kNN clause).
        boost: Boost value given to both the kNN clause and the keyword clause.
        num_candidates: ``num_candidates`` of the kNN clause.

    Returns:
        The ``response["hits"]["hits"]`` list of the Elasticsearch response,
        with the ``text_embedding`` field excluded from each ``_source``.
    """
    # Embed the query and normalize it like the indexed vectors
    query_vector = normalize_vector(model.encode(query))

    # Vector (kNN) clause over the stored text embeddings
    knn_clause = dict(
        field="text_embedding",
        query_vector=query_vector,
        k=num_results,
        num_candidates=num_candidates,
        boost=boost,
    )

    # Keyword clause: match the raw query against author, title and text
    multi_match = {
        "query": query,
        "fields": ["author", "title", "text"],
        "type": "best_fields",
        "boost": boost,
    }
    keyword_clause = {"bool": {"must": {"multi_match": multi_match}}}

    # Exclude the embedding vectors of the response
    source_filter = {"excludes": ["text_embedding"]}

    result = es_client.search(
        index=index_name,
        query=keyword_clause,
        knn=knn_clause,
        size=num_results,
        _source=source_filter,
    )
    return result["hits"]["hits"]

## 4 - Hit Rate (HR) and Mean Reciprocal Rank (MRR)

The initially computed HR and MRR (without any tuning) were:
- **HR = 0.5295**
- **MRR = 0.4226**

Let's see if these metrics can be improved.

In [17]:
# One list of booleans per question: True where a returned hit is the record
# the question was generated from
relevance_total = []

for query in tqdm(ground_truth):
    results = es_search(query["question"])
    # Compare the expected record id with the Id of each hit, in the order returned
    relevance = [query["id"]==r["_source"]["Id"] for r in results]
    relevance_total.append(relevance)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [18]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One sequence of booleans per query, True where the
            hit at that position is the expected record.

    Returns:
        The hit rate as a float in [0, 1]; 0.0 when there are no queries.
    """
    # No queries: report 0.0 instead of dividing by zero
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Mean Reciprocal Rank over all queries.

    For each query the score is 1 / rank of the first relevant hit (ranks
    start at 1), or 0 when no hit is relevant. The scores are averaged over
    all queries.

    Args:
        relevance_total: One sequence of booleans per query, True where the
            hit at that position is the expected record.

    Returns:
        The mean reciprocal rank as a float; 0.0 when there are no queries.
    """
    # No queries: report 0.0 instead of dividing by zero
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts towards the reciprocal rank
                break

    return total_score / len(relevance_total)

In [19]:
hr = hit_rate(relevance_total)
mrr = mrr(relevance_total)

In [20]:
print(f"Hit-rate: {hr}")
print(f"MRR: {mrr}")

Hit-rate: 0.5295
MRR: 0.4226333333333342
