# Retrieval evaluation

In [17]:
# Imports
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import h5py
import numpy as np
from dotenv import load_dotenv
from huggingface_hub import login
from openai import OpenAI
import os

## Load the data

In [2]:
# Create a function to load the data and apply all the cleaning developed in the Ingestion notebook
def load_dataset(data_path="../data/ancient_sources.csv.gz"):
    """Load the sources CSV, clean it and attach a normalized embedding per row.

    Cleaning steps, in order:
      1. store the original row index in a new ``Id`` column,
      2. drop rows whose ``text`` is missing,
      3. replace missing ``section`` values with ``""``,
      4. shorten every ``text`` with ``truncate_texts``.

    Args:
        data_path: Path to the gzip-compressed CSV file.

    Returns:
        A list of dicts (one per remaining row). Each dict carries the CSV
        columns, ``"Id"`` and a ``"text_embedding"`` entry holding the
        embedding row after it went through ``normalize_vector``.
    """
    df = (
        pd.read_csv(data_path, compression="gzip")
        # Append index as ID (done before dropping rows, so IDs keep the
        # position each row had in the raw file)
        .assign(Id=lambda frame: frame.index)
        # Delete rows without text
        .dropna(subset=["text"])
        .assign(
            # Transform rows with NaN section to ""
            section=lambda frame: frame["section"].fillna(""),
            # Truncate longest texts
            text=lambda frame: frame["text"].apply(truncate_texts),
        )
    )

    # Transform into a list of dicts
    records = df.to_dict(orient="records")

    # Load embeddings and append to each dict
    # NOTE(review): this assumes row i of the embeddings file belongs to the
    # i-th record *after* cleaning -- confirm against the Ingestion notebook.
    embeddings = load_embeddings()
    normalized_embeddings = np.apply_along_axis(normalize_vector, 1, embeddings)
    for i, source in enumerate(records):
        source["text_embedding"] = normalized_embeddings[i]

    return records
    

def count_words(text):
    """Return how many whitespace-separated words ``text`` contains."""
    words = text.split()
    return len(words)


def truncate_texts(text, max_num_words=512):
    """Keep at most ``max_num_words`` whitespace-separated words of ``text``.

    The kept words are re-joined with single spaces, so runs of whitespace
    in the input are collapsed even when nothing is cut off.
    """
    # Slicing past the end of the list is harmless, so no length check is needed
    kept_words = text.split()[:max_num_words]
    return " ".join(kept_words)


def load_embeddings(path="../data/embeddings.h5"):
    """Read the whole ``embeddings`` dataset of an HDF5 file into memory.

    Args:
        path: Location of the HDF5 file.

    Returns:
        The contents of the ``embeddings`` dataset, obtained by slicing it
        with ``[:]`` while the file is still open.
    """
    with h5py.File(path, 'r') as h5_file:
        # Slice with [:] so the data is read out before the file is closed
        return h5_file['embeddings'][:]


def normalize_vector(vector):
    """Divide ``vector`` by its ``np.linalg.norm`` so the result has norm 1.

    A vector whose norm is zero is returned unchanged, which avoids a
    division by zero.
    """
    length = np.linalg.norm(vector)
    return vector if length == 0 else vector / length

In [3]:
# Load the data
# Despite its name, `df_dict` is a list of dicts (one per cleaned row), each
# carrying its normalized embedding under the "text_embedding" key.
df_dict = load_dataset()

## Generate ground truth dataset

In [8]:
# Log in to the Hugging Face Hub with the token read from the environment
load_dotenv()  # makes variables from a local .env file available to os.getenv
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')  # None when the variable is not set
login(huggingface_api_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/codespace/.cache/huggingface/token
Login successful


In [None]:
# Download model and tokenizer
# NOTE(review): the saved output shows a multi-GB download (first shard is
# 4.97G), so this cell is slow on a fresh cache.
# Alternative checkpoint, currently disabled:
# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Function to interact with the model
def chat_completions_model(prompt, model, tokenizer, max_new_tokens=100):
    """Generate text for ``prompt`` with a locally loaded model.

    Args:
        prompt: Text passed to the tokenizer.
        model: Object exposing ``.to(device)`` and ``.generate(...)``.
        tokenizer: Callable producing the model inputs; must also expose
            ``.decode(...)``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to 100, the value that used to be
            hard-coded.

    Returns:
        The decoded first sequence returned by ``model.generate``, with
        special tokens skipped.
    """
    # Run on the GPU when one is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Tokenize the input text and move the tensors next to the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate a response; gradients are not needed for inference
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode and return the generated response
    # NOTE(review): outputs[0] is decoded in full; for decoder-only models this
    # presumably contains the prompt tokens as well as the continuation --
    # confirm that callers expect the echoed prompt.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

In [6]:
# To interact with Ollama
# The OpenAI client is pointed at a local server on port 11434 instead of the
# OpenAI API; 'ollama' is passed as a placeholder API key.
# NOTE(review): presumably the local server ignores the key -- confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [11]:
# Prompt template asking the model to write one question about a historical
# record. It is filled with str.format and expects the placeholders
# {author}, {title}, {section} and {text}; .strip() removes the leading and
# trailing newlines of the triple-quoted literal.
prompt_template = """
You are an expert historian tasked with generating a question based on the following historical record. 
Your question should be specific and answerable using the information in the record. However, you must avoid repeating too many exact 
words from the record. Focus on key details like dates, persons, events, the author, title, or section. Keep the question concise 
and insightful.

Historical record:

Author: {author}
Title: {title}
Section: {section}
Text: {text}

Question:
""".strip()

def generate_prompt(data, prompt_template=prompt_template):
    """Fill ``prompt_template`` with the fields of one record.

    Args:
        data: Dict whose entries are passed to ``str.format`` as keyword
            arguments; it must provide every placeholder of the template.
        prompt_template: Format string to fill. Defaults to the
            module-level ``prompt_template``.

    Returns:
        The formatted prompt string.
    """
    filled_prompt = prompt_template.format(**data)
    return filled_prompt

In [12]:
# Build a sample prompt from a single record (position 78 in the cleaned
# list) to try the question generation on one example
prompt = generate_prompt(df_dict[78])

In [12]:
# Send the sample prompt as a single user message to the "llama2" model
# through the local client.
# NOTE(review): the saved output of this cell is a 500 error -- the model
# needs 8.4 GiB of system memory but only 5.0 GiB were available -- so the
# print below did not run in the saved session.
response = client.chat.completions.create(
    model="llama2",
    messages=[{"role": "user", "content": prompt}]
)

# Show the text of the first returned choice
print(response.choices[0].message.content)

InternalServerError: Error code: 500 - {'error': {'message': 'model requires more system memory (8.4 GiB) than is available (5.0 GiB)', 'type': 'api_error', 'param': None, 'code': None}}