# Retrieval evaluation

In [24]:
# Imports
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
import pandas as pd
import h5py
import numpy as np
from dotenv import load_dotenv
#from huggingface_hub import login
from openai import OpenAI
import os
import random
from tqdm.notebook import trange, tqdm
import pickle
import gzip

## Load the data

In [2]:
# Create a function to load the data and apply all the cleaning developed in the Ingestion notebook
def load_dataset(data_path="../data/ancient_sources.csv.gz", embeddings_path=None):
    """Load the source records, clean them and attach a normalized embedding to each.

    Cleaning steps: the DataFrame index as read from the CSV is stored in an
    ``Id`` column, rows without ``text`` are removed, a missing ``section``
    becomes ``""`` and every text is passed through ``truncate_texts``.

    Args:
        data_path: Path to the gzip-compressed CSV with the sources.
        embeddings_path: Optional path handed to ``load_embeddings``; when
            ``None`` that function's own default location is used.

    Returns:
        A list with one dict per remaining row (the CSV columns plus ``Id``)
        and an extra ``text_embedding`` key holding the unit-normalized
        embedding row for that record.

    Raises:
        ValueError: If the number of embedding rows differs from the number
            of records left after cleaning.
    """
    df = (
        pd.read_csv(data_path, compression="gzip")
        # Store the index as ID before any row is removed
        .assign(Id=lambda frame: frame.index)
        # Delete rows without text
        .dropna(subset=["text"])
        .assign(
            # Transform rows with NaN section to ""
            section=lambda frame: frame["section"].fillna(""),
            # Truncate longest texts
            text=lambda frame: frame["text"].apply(truncate_texts),
        )
    )

    # Transform into a list of dicts
    records = df.to_dict(orient="records")

    # Load embeddings (default location unless a path was given)
    embeddings = load_embeddings() if embeddings_path is None else load_embeddings(embeddings_path)

    # Records and embeddings are paired purely by position, so a count mismatch
    # would silently attach the wrong vector to a record. Fail loudly instead.
    if len(embeddings) != len(records):
        raise ValueError(
            f"Found {len(embeddings)} embeddings for {len(records)} records; "
            "they are paired by position, so the counts must match."
        )

    normalized_embeddings = np.apply_along_axis(normalize_vector, 1, embeddings)
    for record, embedding in zip(records, normalized_embeddings):
        record["text_embedding"] = embedding

    return records
    

def count_words(text):
    """Return how many whitespace-separated words ``text`` contains."""
    words = text.split()
    return len(words)


def truncate_texts(text, max_num_words=512):
    """Keep at most ``max_num_words`` whitespace-separated words of ``text``.

    The words are re-joined with single spaces, so runs of whitespace are
    collapsed even when nothing has to be cut.
    """
    words = text.split()
    kept = words[:max_num_words] if len(words) > max_num_words else words
    return " ".join(kept)


def load_embeddings(path="../data/embeddings.h5"):
    """Read the ``embeddings`` dataset of the HDF5 file at ``path`` into memory.

    Returns:
        The complete dataset contents as a NumPy array.
    """
    with h5py.File(path, "r") as h5_file:
        # Slicing with [:] pulls the whole dataset into an array before the
        # file is closed on leaving the ``with`` block.
        return h5_file["embeddings"][:]


def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean length.

    A vector whose norm is exactly zero is handed back unchanged, since it
    cannot be scaled without dividing by zero.
    """
    magnitude = np.linalg.norm(vector)
    return vector / magnitude if magnitude != 0 else vector

In [3]:
# Load the data: a list of record dicts, each carrying its cleaned fields and
# a normalized "text_embedding", read from the default data locations
df_dict = load_dataset()

## Generate ground truth dataset

In [4]:
# Login keys: read them from a local .env file into the environment
# (presumably this is where the OpenAI client finds its API key; confirm)
load_dotenv()

True

In [5]:
# Client used for every chat-completion request in this notebook.
# No key is passed explicitly, so credentials are presumably picked up from
# the environment loaded in the previous cell.
client = OpenAI()

In [6]:
# Default prompt: asks the model for one specific question about a record,
# filled in from the record's author, title and text
prompt_template = """
You are an expert historian tasked with generating a question based on the following historical record. 
Your question should be specific and answerable using the information in the record. However, you must avoid repeating too many exact 
words from the record. Focus on key details like dates, persons, events, the author or title. Keep the question concise 
and insightful.

Historical record:

Author: {author}
Title: {title}
Text: {text}

Question:
""".strip()

def generate_prompt(data, prompt_template=prompt_template):
    """Fill ``prompt_template`` with the fields of one record.

    Args:
        data: Dict whose keys supply the template's ``{placeholders}``; keys
            the template does not mention are ignored.
        prompt_template: ``str.format``-style template; defaults to the
            historian template defined above.

    Returns:
        The formatted prompt string.
    """
    filled_prompt = prompt_template.format(**data)
    return filled_prompt

In [34]:
def generate_question(prompt, model="gpt-4o-mini"):
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Text of the user message.
        model: Name of the chat model to query; the default keeps the model
            this notebook has always used.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only one completion is requested, so the first choice is the answer
    return response.choices[0].message.content

Let's generate 1000 questions about 1000 randomly chosen ancient sources in our list:

In [None]:
# Keep only the records whose text is long enough to be relevant
# (let's say at least 20 words)
df_dict_filtered = [record for record in df_dict if len(record["text"].split()) >= 20]

In [15]:
# Choose 1000 random records.
# Seed the global generator first: this draw, the later samples and the final
# shuffle all use it, so a Restart & Run All picks the same records each time.
random.seed(42)
random_elements = random.sample(df_dict_filtered, 1000)

In [18]:
# Each entry pairs the "Id" of a sampled record with the question generated for it
ground_truth = []

for elem in tqdm(random_elements):
    question = generate_question(generate_prompt(elem))
    ground_truth.append({"id": elem["Id"], "question": question})

  0%|          | 0/1000 [00:00<?, ?it/s]

Let's generate another 500 questions that ask for the author and title:

In [40]:
# Template for questions about which authors and titles cover a passage.
# Only {text} is filled in, so the prompt itself never shows the record's
# author or title.
prompt_template_author = """
You are a skilled history student. Your task is to create a question about the provided historical text that 
focuses on identifying the authors and titles that discuss the events and persons mentioned in the text. Your question 
should encourage an in-depth search for relevant historical references, avoiding direct reuse of the text's wording.

Historical text: {text}

Question:
""".strip()

In [28]:
# Draw a new random sample of 500 records (replaces the previous sample)
random_elements = random.sample(df_dict_filtered, k=500)

In [41]:
# Add one author/title question per sampled record to the existing list
for elem in tqdm(random_elements):
    question = generate_question(
        generate_prompt(elem, prompt_template=prompt_template_author)
    )
    ground_truth.append({"id": elem["Id"], "question": question})

  0%|          | 0/500 [00:00<?, ?it/s]

And finally, 500 additional questions of a more generic kind:

In [42]:
# Template for simple, beginner-level questions; filled in from the record's
# {title} and {text} only
prompt_template_generic = """
Imagine you are a history student who is just starting to learn about ancient texts. Your task is to create a straightforward question 
based on the provided historical text. The question should be simple and directly answerable using the information given in the text and its title.

Title: {title}
Text: {text}

Question:
""".strip()

In [43]:
# Draw another random sample of 500 records (replaces the previous sample)
random_elements = random.sample(df_dict_filtered, k=500)

In [44]:
# Add one generic question per sampled record to the existing list
for elem in tqdm(random_elements):
    question = generate_question(
        generate_prompt(elem, prompt_template=prompt_template_generic)
    )
    ground_truth.append({"id": elem["Id"], "question": question})

  0%|          | 0/500 [00:00<?, ?it/s]

In [45]:
# Reorder the collected questions in place so the saved list no longer
# follows the order in which the batches were generated
random.shuffle(ground_truth)

# Persist the ground-truth dataset as a gzip-compressed pickle
with gzip.open("../data/ground_truth.pkl.gz", mode="wb") as archive:
    pickle.dump(ground_truth, archive, protocol=pickle.HIGHEST_PROTOCOL)