# Retrieval evaluation

In [15]:
# Imports
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
import pandas as pd
import h5py
import numpy as np
from dotenv import load_dotenv
#from huggingface_hub import login
from openai import OpenAI
import os
import random
from tqdm.notebook import trange, tqdm

## Load the data

In [3]:
# Create a function to load the data and apply all the cleaning developed in the Ingestion notebook
def load_dataset(data_path="../data/ancient_sources.csv.gz"):
    """Load the ancient-sources CSV, clean it and attach one embedding per record.

    Cleaning steps, in order:
      * store the original row index in an ``Id`` column,
      * drop rows whose ``text`` is missing,
      * replace missing ``section`` values with ``""``,
      * shorten every ``text`` with ``truncate_texts``.

    Args:
        data_path: Path to the gzip-compressed CSV file.

    Returns:
        list[dict]: One dict per remaining row (column name -> value) with an
        extra ``"text_embedding"`` key holding that record's embedding after
        ``normalize_vector`` has been applied to it.

    Raises:
        ValueError: If the number of stored embeddings differs from the number
            of cleaned records, since they could not be paired reliably.
    """
    raw_df = pd.read_csv(data_path, compression="gzip")

    # Build the cleaned frame in one chain instead of mutating in place.
    # ``Id`` is taken before any row is dropped, so it keeps the original index.
    clean_df = (
        raw_df
        .assign(Id=raw_df.index)
        .dropna(subset=["text"])
        .assign(
            section=lambda frame: frame["section"].fillna(""),
            text=lambda frame: frame["text"].apply(truncate_texts),
        )
    )

    # Transform into a list of dicts
    records = clean_df.to_dict(orient="records")

    # Embeddings are matched to records purely by position.
    # NOTE(review): this assumes the embeddings file was written from the
    # already-cleaned rows, in the same order -- confirm against the notebook
    # that produced it.
    embeddings = load_embeddings()
    if len(embeddings) != len(records):
        raise ValueError(
            f"Found {len(embeddings)} embeddings for {len(records)} records; "
            "they cannot be aligned by position."
        )

    normalized_embeddings = np.apply_along_axis(normalize_vector, 1, embeddings)
    for source, embedding in zip(records, normalized_embeddings):
        source["text_embedding"] = embedding

    return records
    

def count_words(text):
    """Return how many whitespace-separated words ``text`` contains."""
    words = text.split()
    return len(words)


def truncate_texts(text, max_num_words=512):
    """Keep at most ``max_num_words`` words of ``text``.

    The text is split on whitespace and re-joined with single spaces, so any
    run of whitespace in the input becomes one space in the result.

    Args:
        text: String to shorten.
        max_num_words: Maximum number of words to keep.

    Returns:
        str: The (possibly shortened) text, words separated by single spaces.
    """
    tokens = text.split()
    kept = tokens[:max_num_words] if len(tokens) > max_num_words else tokens
    return " ".join(kept)


def load_embeddings(path="../data/embeddings.h5"):
    """Read the full ``embeddings`` dataset from an HDF5 file.

    Args:
        path: Location of the HDF5 file.

    Returns:
        The contents of the ``embeddings`` dataset, loaded into memory as a
        NumPy array.
    """
    with h5py.File(path, 'r') as h5_file:
        # Slice with [:] while the file is still open so the data is
        # materialised as an in-memory array before the handle closes.
        return h5_file['embeddings'][:]


def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean length.

    A vector whose norm is zero is returned unchanged, which avoids dividing
    by zero.
    """
    magnitude = np.linalg.norm(vector)
    return vector if magnitude == 0 else vector / magnitude

In [4]:
# Load the data
df_dict = load_dataset()

## Generate ground truth dataset

In [5]:
# Login keys
load_dotenv()

True

In [6]:
# To interact with OpenAI
client = OpenAI()

In [35]:
# Function to generate a prompt asking for questions about the data
# Template for the user message sent to the model; the {author}, {title} and
# {text} placeholders are filled from a single source record.
prompt_template = """
You are an expert historian tasked with generating a question based on the following historical record. 
Your question should be specific and answerable using the information in the record. However, you must avoid repeating too many exact 
words from the record. Focus on key details like dates, persons, events, the author or title. Keep the question concise 
and insightful.

Historical record:

Author: {author}
Title: {title}
Text: {text}

Question:
""".strip()


def generate_prompt(data, prompt_template=prompt_template):
    """Fill ``prompt_template`` with the fields of one record.

    Args:
        data: Mapping whose entries are passed to ``str.format`` as keyword
            arguments; keys the template does not reference are ignored.
        prompt_template: Format string to fill. Defaults to the module-level
            template.

    Returns:
        str: The formatted prompt.
    """
    filled_prompt = prompt_template.format(**data)
    return filled_prompt

In [31]:
def generate_question(source):
    """Ask the chat model for a question about one source record.

    The prompt is built with ``generate_prompt`` and sent as a single user
    message to the ``gpt-4o-mini`` model through the module-level ``client``.

    Args:
        source: Record passed to ``generate_prompt``.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": generate_prompt(source)}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Let's generate one question for each of 500 randomly chosen ancient sources from our list:

In [12]:
# Number of sources to sample and the seed that makes the sample repeatable
# across kernel restarts.
NUM_QUESTIONS = 500
SAMPLE_SEED = 42

ground_truth = []

# A dedicated, seeded generator keeps the selection reproducible without
# touching the global `random` state. The size is capped so a dataset smaller
# than NUM_QUESTIONS does not make `sample` raise.
sample_size = min(NUM_QUESTIONS, len(df_dict))
random_elements = random.Random(SAMPLE_SEED).sample(df_dict, sample_size)

for elem in tqdm(random_elements):
    question = generate_question(elem)
    # load_dataset stores the row identifier under "Id"; an "id" key exists
    # only if the CSV itself provides such a column, so prefer it when present
    # and fall back to "Id" otherwise.
    source_id = elem["id"] if "id" in elem else elem["Id"]
    question_with_answer_id = {"id": source_id, "question": question}
    ground_truth.append(question_with_answer_id)