# Retrieval evaluation

In [15]:
# Imports
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import h5py
import numpy as np
from dotenv import load_dotenv
import os

In [16]:
# Load environment variables from the .env file
load_dotenv()

True

In [17]:
print(os.getenv('HUGGINGFACE_API_KEY'))

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


## Load the data

In [6]:
# Create a function to load the data and apply all the cleaning developed in the Ingestion notebook
def load_dataset(data_path="../data/ancient_sources.csv.gz"):
    df = pd.read_csv(data_path, compression="gzip")

    # Append index as ID
    df['Id'] = df.index

    # Delete rows without text
    df.drop(df[df["text"].isna()].index, inplace=True)
    
    # Transform rows with NaN section to ""
    df['section'] = df['section'].fillna("")

    # Truncate longest texts
    df["num_words"] = df["text"].apply(count_words)
    df["text"] = df["text"].apply(truncate_texts)

    # Drop number of words column
    df.drop('num_words', axis=1, inplace=True)

    # Transform into a list of dicts
    df_dict = df.to_dict(orient="records")

    # Load embeddings and append to each dict
    embeddings = load_embeddings()
    normalized_embeddings = np.apply_along_axis(normalize_vector, 1, embeddings)
    for i, source in enumerate(df_dict):
        source["text_embedding"] = normalized_embeddings[i]

    return df_dict
    

def count_words(text):
    return len(text.split())


def truncate_texts(text, max_num_words=512):
    word_list = text.split()
    if len(word_list) > max_num_words:
        word_list = word_list[:max_num_words]

    return " ".join(word_list)


def load_embeddings(path="../data/embeddings.h5"):
    with h5py.File(path, 'r') as hf:
        dataset = hf['embeddings']
        
        # Load the data into a NumPy array
        embeddings = dataset[:]
        return embeddings


def normalize_vector(vector):
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector  # Avoid division by zero
    return vector / norm

In [7]:
# Load the data
df_dict = load_dataset()

## Generate ground truth dataset

In [8]:
# Function to interact with LLaMA 2
def chat_completions_llama2(prompt, size="small"):
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    
    if size == "large":
        model_name = "meta-llama/Llama-2-70b-chat-hf"
    elif size == "medium":
        model_name = "meta-llama/Llama-2-13b-chat-hf"

    # Load the LLaMA 2 model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Check if everything can run in GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Tokenize the input text
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Generate a response
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    
    # Decode and return the generated response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [9]:
# Function to generate a prompt asking for questions about the data
prompt_template = """
You are an expert historian tasked with generating a question based on the following historical record. 
Your question should be specific and answerable using the information in the record. However, you must avoid repeating too many exact 
words from the record. Focus on key details like dates, persons, events, the author, title, or section. Keep the question concise 
and insightful.

Historical record:

Author: {author}
Title: {title}
Section: {section}
Text: {text}
""".strip()

def generate_prompt(data, prompt_template=prompt_template):
    return prompt_template.format(**data)

In [10]:
print(generate_prompt(df_dict[78]))

You are an expert historian tasked with generating a question based on the following historical record. 
Your question should be specific and answerable using the information in the record. However, you must avoid repeating too many exact 
words from the record. Focus on key details like dates, persons, events, the author, title, or section. Keep the question concise 
and insightful.

Historical record:

Author: M. Tullius Cicero
Title: Divination
Section: Divination Book I, section 3
Text: [ 3 ] And, indeed, what colony did Greece ever send into Aeolia, Ionia, Asia, Sicily, or Italy without consulting the Pythian or Dodonian oracle, or that of Jupiter Hammon? Or what war did she ever undertake without first seeking the counsel of the gods? 2. Nor is it only one single mode of divination that has been employed in public and in private. For, to say nothing of other nations, how many our own people have embraced! In the first place, according to tradition, Romulus, the father of this Cit

In [18]:
chat_completions_llama2(generate_prompt(df_dict[78]))

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-66ddc44d-5648de7c171143e35a086e8a;7595a70f-7280-464c-8998-cf658c2113c0)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it.