# Implementation, Testing and Evaluation for Optimal Retriever in RAG

#### Notebook Outline
1. Imports and Configurations
2. Creation of Vector Databases
3. Retrieval Methods
4. Evaluations

This notebook reuses and adapts functions from the Baseline RAG notebook (.ipynb).

### 1. Imports and Configurations

Imports

In [None]:
# === Standard Library Imports ===
import json
import os
import sys

# === Third-Party Libraries ===
from tqdm import tqdm
from dotenv import load_dotenv

# === OpenAI Integration ===
import openai
from openai import OpenAI
from langchain_openai import OpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# === LangChain Core Modules ===
from langchain.schema import Document
from langchain.docstore.document import Document
from langchain_core.documents import Document
from langchain.retrievers import BM25Retriever, TFIDFRetriever, EnsembleRetriever

# === Type Hints ===
from typing import Any, List

# === Project Root Configuration ===
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# === Local Project Modules ===
from ipynb_notebooks.baseline.rag_utils.baseline_rag import (
    load_documents_for_sparse_retrieval,
    load_vector_database,
    generate_answer,
    translate_query_to_german_if_needed
)

from ipynb_notebooks.evaluation_datasets.retrieval_eval.eval_vector_dataset_generator import generate_evalset
from ipynb_notebooks.evaluation_datasets.retrieval_eval.retrieval_metrics import run_retrieval_evaluation
from ipynb_notebooks.evaluation_datasets.generation_eval.generation_metrics import run_generation_evaluation


Configurations

In [None]:
# Load environment variables. Assumes that the project directory contains a .env file with API keys
load_dotenv()

# Set the OpenAI API key from the environment variables
# Make sure to update "OPENAI_API_KEY" to match the variable name in your .env file
# (a missing variable raises KeyError here, before any API call is attempted)
openai.api_key = os.environ['OPENAI_API_KEY']

# Define constants for paths (all relative to the notebook's working directory)
DATA_PATH = "../../data/laws_and_ordinances.json"  # JSON file containing the URLs of the law and ordinance documents
DATA_PATH_SHORT_VERSION = "../../data/laws_and_ordinances_short_version.json" # JSON file containing a subset of all URLs for testing purposes
CHROMA_PATH = "chroma_dbs/chroma"  # Directory to save the Chroma vector store

### 2. Creation of Vector Databases

**Why Creating Separate Chroma Databases for Each Retrieval Strategy Is Not Necessary**

In contrast to chunking experiments, evaluating different retrieval strategies does not require generating separate Chroma vector databases. This is because all strategies operate over the same underlying document corpus and embeddings. Retrieval techniques such as Dense Similarity Search, BM25, TF-IDF, Multi-Query, or Hybrid approaches differ only in how they search or rank the embedded documents—not in how the documents are chunked or stored.

As long as the Chroma DB is generated using a consistent chunking strategy and embedding model, it provides a shared semantic space that is sufficient for fair comparison across retrieval methods. Creating separate vector stores per strategy would introduce unnecessary redundancy and would not improve the validity of the evaluation.

In [None]:
# Path of the Chroma vector store that all retrieval strategies in this notebook share
chroma_db_optimal_retrieval_method = "../chroma_dbs/chroma_chunksize1024_overlap128_c800ccc6_optimal_retrieval_method"

### 3. Retrieval Methods

In [None]:
def create_multi_query(base_prompt: str, n_variants: int) -> List[str]:
    """Generate alternative phrasings of a search query with an LLM.

    Args:
        base_prompt: The original search query.
        n_variants: Number of alternative queries requested from the model.

    Returns:
        A list starting with ``base_prompt`` followed by the non-empty,
        whitespace-stripped lines of the model response. The model is only
        *asked* for ``n_variants`` lines, so the number of returned variants
        is not guaranteed.
    """
    # NOTE(review): `OpenAI` resolves to the last import of that name in this
    # file (langchain_openai); confirm this wrapper is the intended one for the
    # chat model "gpt-4o-mini".
    model = OpenAI(model="gpt-4o-mini")

    # Adapted system and human prompt
    system_msg = SystemMessagePromptTemplate.from_template(
        "You are an assistant that rewrites search phrases into alternative forms to broaden retrieval coverage."
    )
    human_msg = HumanMessagePromptTemplate.from_template(
        "Please create {n_variants} alternative search queries based on the following input: {base_prompt}. "
        "List each query on a new line without numbering or bullet points."
    )

    prompt_chain = ChatPromptTemplate.from_messages([system_msg, human_msg])
    formatted_messages = prompt_chain.format_prompt(base_prompt=base_prompt, n_variants=n_variants).to_messages()

    # The model is called with a single string, so both messages are joined
    final_input = formatted_messages[0].content + "\n" + formatted_messages[1].content
    response = model(final_input)

    # Skip blank lines and surrounding whitespace so that no empty query
    # variant is handed to the retrievers
    variants = [line.strip() for line in response.splitlines() if line.strip()]

    return [base_prompt, *variants]

def create_multi_query_list(prompt_batch: List[str], n_variants: int, output_name: str) -> List[List[str]]:
    """Generate query variants for a batch of prompts and persist them as JSON.

    Args:
        prompt_batch: Original queries, processed in order.
        n_variants: Number of variants requested per query.
        output_name: File name WITHOUT extension; ".json" is appended and the
            file is written to "../retrieval_inputs/multi_queries/".

    Returns:
        One list of query variants per input prompt, in input order.
    """
    collected_queries = [
        {
            "prompt_id": prompt_id,  # 1-based position of the prompt in the batch
            "query_variants": create_multi_query(original_prompt, n_variants),
        }
        for prompt_id, original_prompt in enumerate(
            tqdm(prompt_batch, desc="Generating query variants"), start=1
        )
    ]

    output_file = f"../retrieval_inputs/multi_queries/{output_name}.json"
    # Make sure the target directory exists so the (expensive) LLM results
    # are not lost to a FileNotFoundError at write time
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(collected_queries, file, ensure_ascii=False, indent=2)

    return [entry["query_variants"] for entry in collected_queries]


In [None]:
def create_document_hypo_docs(query: str) -> Document:
    """Let the LLM draft a short hypothetical answer passage for a legal query.

    Args:
        query: The legal question to answer hypothetically.

    Returns:
        A Document whose page_content is the raw model output and whose
        metadata stores the query under "original_query".
    """
    llm = OpenAI(model="gpt-4o-mini")

    # Build the two-part chat prompt and render it with the query
    messages = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(
            """You are a specialist in German energy legislation. Formulate a concise explanatory paragraph (3–4 sentences) that addresses the legal question below in a clear and expert tone."""
        ),
        HumanMessagePromptTemplate.from_template(
            "Legal Query: {question}"
        ),
    ]).format_prompt(question=query).to_messages()

    # The model receives one plain string: system text, newline, user text
    system_text = messages[0].content
    user_text = messages[1].content
    answer = llm(system_text + "\n" + user_text)

    return Document(page_content=answer, metadata={"original_query": query})

def create_document_hypo_docs_list(query_list: List[str], path_to_save: str) -> List[Document]:
    """Generate one hypothetical document per query and persist them as JSON.

    Args:
        query_list: Queries, processed in order.
        path_to_save: File name WITHOUT extension; ".json" is appended and the
            file is written to "../retrieval_inputs/hypo_documents/".

    Returns:
        The generated documents, in the same order as ``query_list``.
    """
    documents_generated: List[Document] = []
    export_data = []

    for query_id, q in enumerate(tqdm(query_list, desc="Generating hypothetical documents"), start=1):
        doc = create_document_hypo_docs(q)
        documents_generated.append(doc)

        # Only plain fields are exported so the entry is JSON-serializable
        export_data.append({
            "query_id": query_id,  # 1-based position of the query
            "hypo_docs_doc": {
                "page_content": doc.page_content,
                "metadata": doc.metadata
            }
        })

    output_path = f"../retrieval_inputs/hypo_documents/{path_to_save}.json"
    # Make sure the target directory exists so the (expensive) LLM results
    # are not lost to a FileNotFoundError at write time
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(export_data, file, ensure_ascii=False, indent=2)

    return documents_generated

In [None]:
def retrieve_dense(vector_db, query, top_k):
    """Run a dense similarity search and return only the documents.

    The vector store returns (document, relevance score) pairs; the scores
    are discarded and the store's ordering is kept.
    """
    scored_hits = vector_db.similarity_search_with_relevance_scores(query, k=top_k)
    return [document for document, _score in scored_hits]


def retrieve_sparse(doc_source_path, query, top_k, backend="BM25"):
    """Retrieve the top_k chunks with a sparse (lexical) retriever.

    The chunks are loaded from ``doc_source_path`` and indexed on every call.
    ``backend="BM25"`` selects BM25; any other value selects TF-IDF.
    """
    corpus = load_documents_for_sparse_retrieval(doc_source_path)

    if backend == "BM25":
        sparse_retriever = BM25Retriever.from_documents(corpus)
    else:
        sparse_retriever = TFIDFRetriever.from_documents(corpus)

    sparse_retriever.k = top_k
    return sparse_retriever.get_relevant_documents(query)

def retrieve_hybrid_mmr(doc_source_path, vector_db, user_input, final_k, backend="BM25"):
    """Fuse an MMR dense search with a sparse search via reciprocal rank fusion.

    Both searches contribute 25 candidates (MMR picks them from 50 fetched
    hits); the fused ranking is cut to ``final_k``. ``backend="BM25"`` selects
    BM25 for the sparse side, any other value selects TF-IDF.
    """
    corpus = load_documents_for_sparse_retrieval(doc_source_path)
    dense_candidates = vector_db.max_marginal_relevance_search(query=user_input, k=25, fetch_k=50)

    if backend == "BM25":
        sparse_retriever = BM25Retriever.from_documents(corpus)
    else:
        sparse_retriever = TFIDFRetriever.from_documents(corpus)
    sparse_retriever.k = 25
    sparse_candidates = sparse_retriever.get_relevant_documents(user_input)

    # The ensemble is only used for its rank-fusion routine; the result lists
    # computed above are passed in directly
    fusion = EnsembleRetriever(retrievers=[vector_db.as_retriever(), sparse_retriever])
    fused_ranking = fusion.weighted_reciprocal_rank([dense_candidates, sparse_candidates])
    return fused_ranking[:final_k]


def retrieve_reranked_hybrid(doc_source_path, vector_db, query, rerank_limit, final_k, backend="BM25", dense_weight=0.5):
    """Combine sparse and dense retrieval in a weighted ensemble.

    Args:
        doc_source_path: Source of the chunks for the sparse retriever.
        vector_db: Vector store used for the dense similarity search.
        query: Search query.
        rerank_limit: Number of candidates each retriever contributes.
        final_k: Number of documents returned after fusion.
        backend: "BM25" selects BM25; any other value selects TF-IDF.
        dense_weight: Ensemble weight of the dense retriever; the sparse
            retriever receives ``1 - dense_weight``.

    Returns:
        The first ``final_k`` documents of the fused ranking.
    """
    raw_docs = load_documents_for_sparse_retrieval(doc_source_path)
    sparse_cls = BM25Retriever if backend == "BM25" else TFIDFRetriever
    sparse_ret = sparse_cls.from_documents(raw_docs)
    sparse_ret.k = rerank_limit

    dense_ret = vector_db.as_retriever(search_kwargs={"k": rerank_limit}, search_type="similarity")

    # Weights are matched to retrievers by position: the sparse retriever is
    # listed first, so it must get the complement of dense_weight
    hybrid = EnsembleRetriever(
        retrievers=[sparse_ret, dense_ret],
        weights=[1 - dense_weight, dense_weight],
    )
    return hybrid.get_relevant_documents(query)[:final_k]


def retrieve_multi_query(vector_db, user_input, top_k, sub_queries):
    """Run a dense search per query variant and fuse the rankings.

    If ``sub_queries`` is empty or None, three variants are generated from
    ``user_input``. The fused ranking is cut to ``top_k``.
    """
    queries = sub_queries if sub_queries else create_multi_query(user_input, 3)

    # One dense retriever per query variant, so the ensemble has a retriever
    # entry for every result list
    dense_retrievers = []
    for _ in queries:
        dense_retrievers.append(vector_db.as_retriever(search_kwargs={"k": top_k}))

    ranked_lists = []
    for retriever, variant in zip(dense_retrievers, queries):
        ranked_lists.append(retriever.get_relevant_documents(variant))

    fusion = EnsembleRetriever(retrievers=dense_retrievers)
    return fusion.weighted_reciprocal_rank(ranked_lists)[:top_k]


def retrieve_hybrid_multi_query(doc_source_path, vector_db, user_input, top_k, sub_queries):
    """Run dense and BM25 retrieval per query variant and fuse all rankings.

    Args:
        doc_source_path: Source of the chunks for the BM25 retriever.
        vector_db: Vector store used for the dense search.
        user_input: Original query; used to generate three variants when
            ``sub_queries`` is empty or None.
        top_k: Candidates per retriever and query, and size of the result.
        sub_queries: Pre-generated query variants, or a falsy value.

    Returns:
        The first ``top_k`` documents of the reciprocal-rank-fused ranking.
    """
    sparse_docs = load_documents_for_sparse_retrieval(doc_source_path)
    query_variants = sub_queries or create_multi_query(user_input, 3)

    # Both retrievers are independent of the query, so build them once
    # instead of re-indexing the whole corpus for every variant
    dense_ret = vector_db.as_retriever(search_kwargs={"k": top_k})
    bm25_ret = BM25Retriever.from_documents(sparse_docs)
    bm25_ret.k = top_k

    result_pool = []
    retriever_pool = []

    for q in query_variants:
        # Keep one retriever entry per result list (dense first, then BM25)
        for retriever in (dense_ret, bm25_ret):
            retriever_pool.append(retriever)
            result_pool.append(retriever.get_relevant_documents(q))

    return EnsembleRetriever(retrievers=retriever_pool).weighted_reciprocal_rank(result_pool)[:top_k]


def retrieve_hypo_docs(vector_db, hypo_docs_input_doc, user_input, top_k):
    """Search the vector store with the text of a hypothetical document.

    If no hypothetical document is supplied (falsy value), one is generated
    from ``user_input`` first.
    """
    if hypo_docs_input_doc:
        hypothetical = hypo_docs_input_doc
    else:
        hypothetical = create_document_hypo_docs(user_input)

    search_text = hypothetical.page_content
    return vector_db.similarity_search(search_text, k=top_k)


def retrieve_hybrid_hypo_docs(doc_source_path, vector_db, hypo_docs_input_doc, user_input, top_k):
    """Query an equally weighted BM25 + dense ensemble with a hypothetical document.

    If no hypothetical document is supplied (falsy value), one is generated
    from ``user_input`` first. The fused ranking is cut to ``top_k``.
    """
    if hypo_docs_input_doc:
        hypothetical = hypo_docs_input_doc
    else:
        hypothetical = create_document_hypo_docs(user_input)

    corpus = load_documents_for_sparse_retrieval(doc_source_path)

    bm25_retriever = BM25Retriever.from_documents(corpus)
    bm25_retriever.k = top_k
    dense_retriever = vector_db.as_retriever(search_kwargs={"k": top_k})

    ensemble = EnsembleRetriever(retrievers=[bm25_retriever, dense_retriever], weights=[0.5, 0.5])
    fused_ranking = ensemble.get_relevant_documents(hypothetical.page_content)
    return fused_ranking[:top_k]




In [None]:
def retrieve_documents(vector_db, chunk_documents_path: str, strategy: str, top_k: int, user_query: str,
                       score_cutoff: float = 0.75, rerank_limit: int = 50, dense_ratio: float = 0.5,
                       hypo_doc: Document = None, multi_queries: List[str] = None):
    """Dispatch a query to the retrieval strategy selected by name.

    Args:
        vector_db: Vector store used by all dense strategies.
        chunk_documents_path: Source of the chunks for sparse retrieval.
        strategy: One of the keys of the dispatcher below.
        top_k: Number of documents to return.
        user_query: The user's query; it is passed through
            ``translate_query_to_german_if_needed`` before retrieval.
        score_cutoff: Currently unused; kept for interface compatibility.
        rerank_limit: Candidates per retriever for the weighted hybrids.
        dense_ratio: Weight of the dense retriever in the weighted hybrids.
        hypo_doc: Optional pre-generated hypothetical document.
        multi_queries: Optional pre-generated query variants.

    Returns:
        The list of retrieved documents.

    Raises:
        ValueError: If ``strategy`` is not supported.
    """
    # Every handler takes the (translated) query so that all strategies
    # search with the same text
    retrieval_dispatcher = {
        "TF-IDF": lambda q: retrieve_sparse(doc_source_path=chunk_documents_path, query=q, top_k=top_k, backend="TF-IDF"),
        "BM25": lambda q: retrieve_sparse(doc_source_path=chunk_documents_path, query=q, top_k=top_k, backend="BM25"),
        "Dense": lambda q: retrieve_dense(vector_db, query=q, top_k=top_k),
        "MMR": lambda q: vector_db.max_marginal_relevance_search(query=q, k=top_k, fetch_k=50),
        "Hypo_Docs": lambda q: retrieve_hypo_docs(vector_db=vector_db, hypo_docs_input_doc=hypo_doc, user_input=q, top_k=top_k),
        "Multiple_Queries": lambda q: retrieve_multi_query(vector_db=vector_db, user_input=q, top_k=top_k, sub_queries=multi_queries),
        "Hybrid_BM25_Dense": lambda q: retrieve_reranked_hybrid(doc_source_path=chunk_documents_path, vector_db=vector_db, query=q, rerank_limit=rerank_limit, final_k=top_k, backend="BM25", dense_weight=dense_ratio),
        "Hybrid_TF-IDF_Dense": lambda q: retrieve_reranked_hybrid(doc_source_path=chunk_documents_path, vector_db=vector_db, query=q, rerank_limit=rerank_limit, final_k=top_k, backend="TF-IDF", dense_weight=dense_ratio),
        "Hybrid_MMR_BM25": lambda q: retrieve_hybrid_mmr(doc_source_path=chunk_documents_path, vector_db=vector_db, user_input=q, final_k=top_k, backend="BM25"),
        "Hybrid_Hypo_Docs": lambda q: retrieve_hybrid_hypo_docs(doc_source_path=chunk_documents_path, vector_db=vector_db, hypo_docs_input_doc=hypo_doc, user_input=q, top_k=top_k),
        "Hybrid_Multiple_Queries": lambda q: retrieve_hybrid_multi_query(doc_source_path=chunk_documents_path, vector_db=vector_db, user_input=q, top_k=top_k, sub_queries=multi_queries),
    }

    # Validate first so an unsupported strategy fails before the translation
    # helper is invoked
    if strategy not in retrieval_dispatcher:
        raise ValueError(f"Retrieval strategy '{strategy}' not supported.")

    translated_query = translate_query_to_german_if_needed(user_query)

    return retrieval_dispatcher[strategy](translated_query)


In [None]:
def rag_pipeline_retrieval_method(query, database, chunk_documents_path, k, model_name:str="gpt-4o-mini", retrieval_method: str="Dense", hypo_document: Document=None, multiple_queries: List[str]=None):
    """Retrieve context with the chosen strategy and generate an answer.

    Args:
        query: The user's question.
        database: Vector store handed to the retrieval step.
        chunk_documents_path: Source of the chunks for sparse retrieval.
        k: Number of documents to retrieve.
        model_name: Model name passed to ``generate_answer``.
        retrieval_method: Strategy name understood by ``retrieve_documents``.
        hypo_document: Optional pre-generated hypothetical document.
        multiple_queries: Optional pre-generated query variants
            (``None`` instead of a shared mutable default).

    Returns:
        A tuple ``(response, sources, chunk_contexts, chunk_ids,
        chunk_indices)``; the four lists are aligned with the retrieved
        documents, and metadata values are ``None`` where a key is missing.
    """
    results = retrieve_documents(vector_db=database, chunk_documents_path=chunk_documents_path, strategy=retrieval_method, top_k=k,
                                 user_query=query, hypo_doc=hypo_document, multi_queries=multiple_queries)

    # The answer is generated from the original (untranslated) query
    response = generate_answer(results, query, model_name)

    sources = [doc.metadata.get("source") for doc in results]
    retrieved_chunk_contexts = [doc.page_content for doc in results]
    retrieved_chunk_ids = [doc.metadata.get("chunk_id") for doc in results]
    retrieved_chunk_indices = [doc.metadata.get("chunk_index") for doc in results]

    return response, sources, retrieved_chunk_contexts, retrieved_chunk_ids, retrieved_chunk_indices

In [None]:
def enrich_eval_dataset_with_rag_responses_for_optimal_retrieval(eval_dataset, chroma_path, chunk_documents_path, k, model_name, retrieval_method, hypo_document, multiple_queries, optimization="3_optimal_retrieval_method/"):
    """Run the RAG pipeline for every evaluation entry and store the results.

    Args:
        eval_dataset: Path of the evaluation dataset (JSON list of entries
            with a "query" field).
        chroma_path: Path of the vector store to load.
        chunk_documents_path: Source of the chunks for sparse retrieval.
        k: Number of documents to retrieve per query.
        model_name: Model name used for answer generation.
        retrieval_method: Strategy name understood by ``retrieve_documents``.
        hypo_document: Pre-generated hypothetical documents, indexed like the
            dataset entries (used by the hypo-doc strategies only), or falsy.
        multiple_queries: Pre-generated query variants, indexed like the
            dataset entries (used by the multi-query strategies only), or falsy.
        optimization: Sub-folder (with trailing slash) below "eval_datasets/".

    Returns:
        Path of the written, enriched dataset.
    """
    db = load_vector_database(chroma_path)

    with open(eval_dataset, "r", encoding="utf-8") as f:
        eval_dataset_json = json.load(f)

    # These names must match the strategy keys of `retrieve_documents`;
    # otherwise the pre-generated inputs are silently ignored and regenerated
    # by the LLM for every single query
    uses_hypo_docs = retrieval_method in ("Hypo_Docs", "Hybrid_Hypo_Docs") and bool(hypo_document)
    uses_multi_queries = retrieval_method in ("Multiple_Queries", "Hybrid_Multiple_Queries") and bool(multiple_queries)

    enriched_dataset = []

    for i, entry in enumerate(tqdm(eval_dataset_json, desc="Processing RAG responses")):
        query = entry["query"]

        hypo_doc = hypo_document[i] if uses_hypo_docs else None
        multi_query = multiple_queries[i] if uses_multi_queries else None

        response, _, retrieved_chunk_contexts, retrieved_chunk_ids, retrieved_chunk_indices = rag_pipeline_retrieval_method(
            query=query,
            database=db,
            chunk_documents_path=chunk_documents_path,
            k=k,
            model_name=model_name,
            retrieval_method=retrieval_method,
            hypo_document=hypo_doc,
            multiple_queries=multi_query,
        )

        entry["generated_response"] = response
        entry["retrieved_chunk_contexts"] = retrieved_chunk_contexts
        entry["retrieved_chunk_ids"] = retrieved_chunk_ids
        entry["retrieved_chunk_indices"] = retrieved_chunk_indices

        enriched_dataset.append(entry)

    output_path = f"eval_datasets/{optimization}{eval_dataset.split('/')[-1].replace('.json', '')}_{retrieval_method}.json"
    # Create the target folder if needed so a long run is not lost at write time
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(enriched_dataset, f, indent=2, ensure_ascii=False)

    return output_path

### 4. Evaluations

#### 4.1 Preparing the Evaluation Dataset and Documents for Multi Query and Hypothetical Documents

Since no new Chroma DB had to be created, the evaluation data set from the RAG baseline can also be reused. The data set was copied and renamed to ensure completeness.

In [None]:
# Evaluation dataset used for every retrieval strategy in this notebook
eval_dataset_optimal_retrieval_method = "eval_datasets/3_optimal_retrieval_method/artificial_evaluation_dataset_for_chroma_chunksize1024_overlap128_c800ccc6_optimal_retrieval_method.json"

In [None]:
def extract_queries_from_eval_dataset(json_path: str) -> List[str]:
    """Return the "query" field of every entry of an evaluation dataset.

    The file at ``json_path`` must contain a JSON list of objects; the
    queries are returned in file order.
    """
    with open(json_path, "r", encoding="utf-8") as dataset_file:
        records = json.load(dataset_file)
    return [record["query"] for record in records]

In [None]:
### Create multi query list ###

queries = extract_queries_from_eval_dataset(eval_dataset_optimal_retrieval_method)

# create_multi_query_list appends ".json" itself, so only the bare file name
# is passed (otherwise the file ends in ".json.json")
path_to_save = "multi_query_list_1024_128_optimal_retrieval"
multi_queries = create_multi_query_list(queries, 3, path_to_save)

In [None]:
# Inspect the generated query variants (one list per evaluation query)
print(multi_queries)

In [None]:
### Create multiple hypothetical documents ###

# create_document_hypo_docs_list appends ".json" itself, so only the bare
# file name is passed (otherwise the file ends in ".json.json")
path_to_save = "hypothetical_documents_1024_128_optimal_retrieval"

hypo_documents = create_document_hypo_docs_list(queries, path_to_save)

In [None]:
# Inspect the generated hypothetical documents (one per evaluation query)
print(hypo_documents)

#### 4.2 Enrich Evaluation Datasets with Responses

In [None]:
# Strategy names must match the dispatcher keys in `retrieve_documents`
retrieval_methods = ["TF-IDF", "BM25", "Dense", "MMR", "Hypo_Docs", "Multiple_Queries", "Hybrid_TF-IDF_Dense", "Hybrid_BM25_Dense", "Hybrid_MMR_BM25", "Hybrid_Hypo_Docs", "Hybrid_Multiple_Queries"] 

# Maps strategy name -> path of the enriched evaluation dataset written for it
enriched_datasets = {}

for method in retrieval_methods:
    
    print(f"Enriching evaluation dataset for {method} retriever:")
    
    # The same dataset, vector store, k and model are used for every strategy,
    # so the runs differ only in the retrieval method
    # NOTE(review): chunk_documents_path is a bare file name — confirm how
    # load_documents_for_sparse_retrieval resolves it
    enriched = enrich_eval_dataset_with_rag_responses_for_optimal_retrieval(
        eval_dataset=eval_dataset_optimal_retrieval_method,
        chroma_path=chroma_db_optimal_retrieval_method,
        chunk_documents_path="1815_documents_for_sparse_retrieval_1024_128_default_baseline.json",  
        k=6,
        model_name="gpt-4o-mini",
        retrieval_method=method,
        hypo_document=hypo_documents,
        multiple_queries=multi_queries
    )

    enriched_datasets[method] = enriched


#### 4.3 Evaluate Retrieval & Generation

In [None]:
# These names are used as keys into `enriched_datasets`, so they must be
# identical to the strategy names used in the enrichment step
retrieval_methods = ["TF-IDF", "BM25", "Dense", "MMR", "Hypo_Docs", "Multiple_Queries", "Hybrid_TF-IDF_Dense", "Hybrid_BM25_Dense", "Hybrid_MMR_BM25", "Hybrid_Hypo_Docs", "Hybrid_Multiple_Queries"]


In [None]:
# Result containers, keyed by the model name built in the loop below
evaluation_results_optimal_chunking = {}
generation_results_optimal_chunking = {}

# Last path component of the vector store path, used to label the runs
db_name = chroma_db_optimal_retrieval_method.split("/")[-1]

for index, method in enumerate(retrieval_methods):
    
    # File name of the enriched dataset, relative to the evaluation datasets folder
    json_filename = f"3_optimal_retrieval_method/{enriched_datasets[method].split('/')[-1]}"
    # NOTE(review): removing '_optimal_retrieval' leaves a trailing '_method'
    # in the label, because db_name ends with '_optimal_retrieval_method'
    model_name = f"optimal_retrieval_{index+1}_{method}_{db_name.replace('_optimal_retrieval', '')}"

    print(f"\nEvaluating {model_name} using dataset {json_filename}...")

    retrieval_result = run_retrieval_evaluation(
        json_filename=json_filename,
        model_name=model_name
    )

    generation_result = run_generation_evaluation(
        json_filename=json_filename,
        model_name=model_name
    )

    evaluation_results_optimal_chunking[model_name] = retrieval_result
    generation_results_optimal_chunking[model_name] = generation_result

In [None]:
import pandas as pd
import glob
import os

# Collect the per-strategy result files written by the evaluation step.
# The combined files start with "combined_", so they do not match these
# patterns when the cell is re-run.
folder_path = "eval_results/3_optimal_retrieval_method"
pattern_retrieval = os.path.join(folder_path, "optimal_retrieval*retrieval_evaluation.csv")
pattern_generation = os.path.join(folder_path, "optimal_retrieval*generation_evaluation.csv")
# glob returns files in arbitrary order; sort for a reproducible row order
csv_retrieval_files = sorted(glob.glob(pattern_retrieval))
csv_generation_files = sorted(glob.glob(pattern_generation))

print(f"Länge Retrieval Files: {len(csv_retrieval_files)}")
print(f"Länge Generation Files: {len(csv_generation_files)}")

# pd.concat fails with an unspecific "No objects to concatenate" on an empty
# list, so report the actual problem instead
if not csv_retrieval_files or not csv_generation_files:
    raise ValueError(
        f"No evaluation CSV files found in '{folder_path}' "
        f"(retrieval: {len(csv_retrieval_files)}, generation: {len(csv_generation_files)})."
    )

df_retrieval = [pd.read_csv(csv_file) for csv_file in csv_retrieval_files]
df_generation = [pd.read_csv(csv_file) for csv_file in csv_generation_files]

combined_df_retrieval = pd.concat(df_retrieval, ignore_index=True)
combined_df_generation = pd.concat(df_generation, ignore_index=True)

# Save
output_path_retrieval = os.path.join(folder_path, "combined_optimal_retriever_retrieval_evaluation.csv")
output_path_generation = os.path.join(folder_path, "combined_optimal_retriever_generation_evaluation.csv")

combined_df_retrieval.to_csv(output_path_retrieval, index=False)
combined_df_generation.to_csv(output_path_generation, index=False)

print(f"✅ Done! Retrieval: {output_path_retrieval}\n✅ Generation: {output_path_generation}")