# RAG

## 1. Install Packages & Load Models

In [None]:
# Uncomment these lines to install the necessary packages into the kernel's environment
# %pip install llama-index-core
# %pip install llama-index-llms-huggingface
# %pip install llama-index-embeddings-huggingface
# %pip install setproctitle

In [None]:
# Restrict this kernel to one GPU. Comment this cell out (or change the device
# index) if GPU selection is not required.
import os
import setproctitle
# Expose only device 0 to CUDA; this cell runs before the cell that imports torch.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Set the title of the kernel process to "python".
setproctitle.setproctitle("python")

In [None]:
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate

import pandas as pd
import torch

In [None]:
# Embedding model registered globally on the llama-index Settings object.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [None]:
# Load the Hugging Face access token (create one if not yet available) needed to
# download gated models.
# `!export ...` only sets the variable inside a throwaway subshell, so the kernel
# process never sees it; the token is placed on os.environ instead. HF_TOKEN is
# the variable name huggingface_hub reads.
import os
from getpass import getpass

# Reuse a token already present in the environment (HUGGINGFACE_TOKEN is accepted
# for backward compatibility with the name this notebook used before).
_hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
if not _hf_token:
    # Prompt instead of hardcoding so the secret is never saved in the notebook.
    _hf_token = getpass("Hugging Face access token: ")
os.environ["HF_TOKEN"] = _hf_token

In [None]:
# The same checkpoint id is used for the model and its tokenizer, and the same
# token budget for the context window and the tokenizer's max_length.
MISTRAL_CHECKPOINT = "mistralai/Mistral-7B-Instruct-v0.3"
MISTRAL_MAX_TOKENS = 32768

# Instruction wrapper placed around every query (specific to Mistral-7B).
query_wrapper_prompt = PromptTemplate("[INST] {query_str} [/INST]")

Settings.llm = HuggingFaceLLM(
    model_name=MISTRAL_CHECKPOINT,
    tokenizer_name=MISTRAL_CHECKPOINT,
    device_map="auto",
    context_window=MISTRAL_MAX_TOKENS,
    max_new_tokens=512,
    tokenizer_kwargs={"max_length": MISTRAL_MAX_TOKENS},
    model_kwargs={"torch_dtype": torch.bfloat16},
    # Sampling is disabled. NOTE(review): pad_token_id 2 is presumably the
    # tokenizer's EOS id - confirm against the checkpoint's config.
    generate_kwargs={"do_sample": False, "pad_token_id": 2},
    query_wrapper_prompt=query_wrapper_prompt,
)

In [None]:
# CSV holding the benchmark rows; later cells read its "table", "context",
# "question" and "relevant_tables" columns.
BENCHMARK_CSV = "BX1_chicago_corrected.csv"
benchmark = pd.read_csv(BENCHMARK_CSV)
# summaries = pd.read_csv("BC1_chicago.csv")  # Necessary for content benchmarks only
benchmark.head()

## 2. Index Documents

In [None]:
def create_content_documents(df):
    """Build one Document per row of `df` (for content benchmarks).

    Each Document's text is the row's "summary" value, its metadata carries the
    row's "table" value, and its id combines the quoted table name with the
    row's index label.
    """
    def _to_document(row_label):
        table_name = df["table"][row_label]
        return Document(
            text=df["summary"][row_label],
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [_to_document(row_label) for row_label in df.index]

def create_context_documents(df):
    """Build one Document per row of `df` (for context benchmarks).

    Each Document's text is the row's "context" value, its metadata carries the
    row's "table" value, and its id is "doc_" followed by the row's index label.
    """
    def _documents():
        for row_label in df.index:
            table_name = df["table"][row_label]
            context_text = df["context"][row_label]
            yield Document(
                text=context_text,
                metadata={"table": table_name},
                doc_id=f"doc_{row_label}",
            )

    return list(_documents())

In [None]:
# Get documents for context benchmarks (reads the "context" column of `benchmark`)
documents = create_context_documents(benchmark)

# Get documents for content benchmarks (reads the "summary" column of `summaries`)
# documents = create_content_documents(summaries)

In [None]:
# Build the vector index over the prepared documents.
# NOTE(review): the Documents are handed straight to the constructor; the
# documented entry point for Documents is VectorStoreIndex.from_documents -
# confirm that bypassing it is intended.
vector_index = VectorStoreIndex(documents)
print("Index created")

## 3. Evaluate RAG

In [None]:
def get_query(question: str, k: int):
    """Append the response-format instruction to `question`.

    The returned prompt asks for `k` relevant datasets as a ranked Python list
    on a "- Datasets:" line, followed by a short "- Explanation:" line.
    """
    prompt_lines = (
        f"{question} Provide your response in the following format:",
        f"- Datasets: {k} datasets that are relevant to the query (ordered from the most relevant) in a valid Python list format.",
        "- Explanation: Explain briefly why these datasets are relevant to the query.",
    )
    return "\n".join(prompt_lines)

In [None]:
import ast
import json
import re


def _parse_dataset_candidate(candidate: str):
    """Parse one bracketed snippet into a list; return None when it cannot be parsed."""
    candidate = candidate.strip()
    # Bare identifiers such as [owner/name-id] become valid JSON once quoted.
    quoted = re.sub(r'(\w+/\w+-\w+)', r'"\1"', candidate)
    # Order matters: JSON, then quoted-JSON (the two original strategies), and only
    # then a Python literal, so inputs that used to parse keep the same result.
    attempts = (
        (json.loads, candidate),
        (json.loads, quoted),
        (ast.literal_eval, candidate),
    )
    for parser, text in attempts:
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, list):
            return parsed
    return None


def _balanced_list_span(text: str):
    """Return the substring from the first '[' through its matching ']', or None."""
    start = text.find("[")
    if start == -1:
        return None
    depth = 0
    for pos in range(start, len(text)):
        char = text[pos]
        if char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
            if depth == 0:
                return text[start:pos + 1]
    return None


def extract_and_format_datasets(input_string: str) -> list[str]:
    """Extract the dataset list that follows "- Datasets: " in an LLM response.

    The list may be JSON, a Python literal (e.g. single-quoted strings), or bare
    identifiers of the form ``word/word-word``; text before the opening bracket
    is skipped, and lists nested one level deep are flattened.

    Args:
        input_string: Raw response text expected to contain a "- Datasets: " line.

    Returns:
        The parsed items in their original order, or an empty list when the
        marker is missing or nothing parseable follows it (best effort: this
        function does not raise on malformed responses).
    """
    if not isinstance(input_string, str):
        return []

    sections = input_string.split('- Datasets: ')
    if len(sections) < 2:
        return []
    tail = sections[1]

    # First candidate: everything up to the first ']' (also repairs a response that
    # was cut off before the closing bracket). Second candidate: the
    # bracket-balanced span, which is what makes nested lists parseable.
    candidates = [tail.split(']')[0] + ']']
    balanced = _balanced_list_span(tail)
    if balanced is not None and balanced not in candidates:
        candidates.append(balanced)

    for candidate in candidates:
        parsed = _parse_dataset_candidate(candidate)
        if parsed is None:
            continue
        flattened = []
        for item in parsed:
            if isinstance(item, list):
                flattened.extend(item)
            else:
                flattened.append(item)
        return flattened

    # Worst case scenario: nothing could be parsed.
    return []

In [None]:
import ast
import re

def evaluate_context_benchmark(benchmark, query_engine, k):
    """Run every benchmark question through `query_engine` and score the answers.

    Each question is wrapped with `get_query`, sent to the engine, and the
    dataset list is parsed out of the response with
    `extract_and_format_datasets`. Only the first returned dataset that appears
    in the row's expected list is scored for that question.

    Args:
        benchmark: DataFrame with a "question" column and a "relevant_tables"
            column holding either a list or its Python-literal string form.
            Rows are read by position, so any index is accepted.
        query_engine: Object exposing `query(str)`; the response is converted
            with `str()` before parsing.
        k: Number of datasets the LLM is asked to return.

    Returns:
        dict with "accuracy" (share of questions with at least one expected
        dataset returned), "Mean Precision@1" (share whose first returned
        dataset is expected) and "MRR" (mean reciprocal rank of the first hit).
        All three are 0.0 for an empty benchmark.
    """
    total = len(benchmark)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0

    # Materialize the columns so rows are visited by position rather than by
    # index label (a filtered or re-indexed frame has no labels 0..n-1).
    questions = benchmark["question"].tolist()
    relevant_tables = benchmark["relevant_tables"].tolist()

    for i, (question, raw_expected) in enumerate(zip(questions, relevant_tables)):
        # CSV-loaded frames store the list as text; accept real lists as well.
        if isinstance(raw_expected, str):
            expected_datasets = ast.literal_eval(raw_expected)
        else:
            expected_datasets = raw_expected
        query = get_query(question, k)
        query_response = query_engine.query(query)
        datasets = extract_and_format_datasets(str(query_response))

        print(datasets)
        for rank, dataset in enumerate(datasets):
            if dataset in expected_datasets:
                accuracy_sum += 1
                if rank == 0:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += (1 / (rank + 1))
                break  # Only the best-ranked hit counts.
        if i % 10 == 0:  # Checkpointing
            print("=" * 50)
            print(f"Index: {i}")
            print(accuracy_sum)
            print(precision_at_1_sum)
            print(reciprocal_rank_sum)
            print("=" * 50)
    return {
        "accuracy": accuracy_sum/total,
        "Mean Precision@1": precision_at_1_sum/total,
        "MRR": reciprocal_rank_sum/total,
    }

In [None]:
import time

# Single knob: number of nodes retrieved per query and number of datasets the
# LLM is asked to rank. Adjust k here.
TOP_K = 10

start_time = time.time()
# The query engine is built inside the timed region.
result = evaluate_context_benchmark(
    benchmark,
    vector_index.as_query_engine(similarity_top_k=TOP_K),
    TOP_K,
)
end_time = time.time()

print(f"Total time elapsed: {end_time-start_time} seconds")
print(f"Result: {result}")