# Implementation, Testing and Evaluation of Fully Optimized RAG - Hybrid (Vector + Graph) Approach

#### Notebook Outline
1. Imports and Configurations
2. Creation of the Vector and Graph Databases
3. Querying the Hybrid Vector + Graph Databases
4. Output of the Fully Optimized Hybrid RAG Pipeline
5. Evaluations

### 1. Imports and Configurations

Imports

In [None]:

# === Standard Library ===
import csv
import json
import os
import re
import shutil
import sys
import uuid
from urllib.request import urlopen
import time
from collections import Counter
import threading
import random

# === Third-Party Libraries ===
import numpy as np
import tiktoken
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm, trange
from typing import List, Union, Tuple, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed
from neo4j import GraphDatabase


# === LangChain Core ===
from langchain.docstore.document import Document
from langchain.schema import Document  # (Optional: doppelt zu obigem)
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import TFIDFRetriever, EnsembleRetriever

# === LangChain Community Integrations ===
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# === OpenAI / LangChain OpenAI ===
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import OpenAI

# === Local Project Modules ===
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from ipynb_notebooks.baseline.rag_utils.baseline_rag import (
    clean_text,
    save_documents_for_sparse_retrieval,
    load_documents_for_sparse_retrieval,
    save_to_chroma, 
    translate_query_to_german_if_needed,
    load_vector_database,
    generate_answer
)

from ipynb_notebooks.single_stage_enhancements.rankGPT_rerank import rankgpt_rerank

from ipynb_notebooks.evaluation_datasets.generation_eval.generation_metrics import run_generation_evaluation
from ipynb_notebooks.evaluation_datasets.retrieval_eval.eval_vector_dataset_generator import generate_evalset
from ipynb_notebooks.evaluation_datasets.retrieval_eval.retrieval_metrics import run_retrieval_evaluation
from ipynb_notebooks.evaluation_datasets.generation_eval.llm_as_a_judge import run_llm_judge_parallel, run_llm_rejudge_parallel, calculate_and_visualize_scores_of_evaluation_scheme

Configurations

In [None]:
# Move up two levels ("../..") from the notebook's working directory to the project base
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Construct the path to .env.neo4j_aura in the base directory
env_path = os.path.join(BASE_DIR, ".env.neo4j_aura")

# Load the default .env first, then .env.neo4j_aura; override=True lets values
# from the second file replace variables that are already set
load_dotenv()
load_dotenv(env_path, override=True)

# Set the OpenAI API key from the environment variables
# Make sure to update "OPENAI_API_KEY" to match the variable name in your .env file
# (os.environ[...] raises KeyError when the variable is missing)
openai.api_key = os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=openai.api_key)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Configure Neo4j Aura database connection (os.getenv returns None for unset variables)
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# NOTE(review): ingest_chunks_to_neo4j and graph_retriever_from_chunks open and close
# their own drivers; this module-level driver is not closed anywhere in this notebook.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))


# Define constants for paths
DATA_PATH = "../../data/laws_and_ordinances.json"  # JSON file listing the URLs of the law and ordinance documents
DATA_PATH_SHORT_VERSION = "../../data/laws_and_ordinances_short_version.json" # JSON file with a subset of the URLs for testing purposes
CHROMA_PATH = "chroma_dbs/chroma"  # Directory to save the Chroma vector store

Helper Functions

In [None]:
def load_paragraph_documents(datapath: str):
    """Fetch every paragraph page listed in a JSON index and return the loaded documents.

    The index is read from ``datapath`` and is expected to hold the lists
    ``"laws"`` and ``"ordinances"``; each entry carries a ``"title"`` and a list
    of ``"paragraphs"`` with ``"paragraph_url"``, ``"paragraph_name"`` and
    ``"paragraph_ID"``. Each URL is loaded with ``WebBaseLoader``, its text is
    passed through ``clean_text`` and the metadata is extended with law title,
    category, paragraph fields, a fresh UUID (``chunk_id``) and a running
    1-based ``chunk_index``.

    Args:
        datapath: Path to the JSON index file.

    Returns:
        List of the loaded paragraph-level documents.

    Raises:
        ValueError: If not a single document could be loaded.
    """
    with open(datapath, "r", encoding="utf-8") as index_file:
        index = json.load(index_file)

    collected = []
    next_chunk_index = 1  # counts across all categories and laws

    for category in ["laws", "ordinances"]:
        for entry in tqdm(index.get(category, []), desc=f"→ Processing {category}"):
            title = entry.get("title", "Unknown Title")
            paragraph_bar = tqdm(
                entry.get("paragraphs", []),
                desc=f"  ↳ Paragraphs in '{title}'",
                leave=False,
            )

            for para in paragraph_bar:
                para_url = para.get("paragraph_url", "")
                para_name = para.get("paragraph_name", "Unknown Paragraph")

                # Nothing to download without a URL: report it and move on.
                if not para_url:
                    print(f"No paragraph URL found for {title}")
                    continue

                try:
                    for page in WebBaseLoader(para_url).load():
                        page.page_content = clean_text(page.page_content)
                        page.metadata.update({
                            "law_title": title,
                            "category": category,
                            "paragraph_id": para.get("paragraph_ID"),
                            "paragraph_name": para_name,
                            "paragraph_url": para_url,
                            "chunk_id": str(uuid.uuid4()),
                            "chunk_index": next_chunk_index,
                        })
                        collected.append(page)
                        next_chunk_index += 1
                except Exception as e:
                    # Best effort: a failing page is reported and skipped.
                    print(f"Error loading paragraph from URL {para_url}: {e}")

    if not collected:
        raise ValueError("No paragraph documents could be loaded from the input.")

    print(f"Successfully loaded {len(collected)} paragraph-level documents.")
    return collected

### 2. Creation of Vector and Graph Databases with Paragraph-Wise Chunks

In [None]:
# Number of worker threads used by ingest_chunks_to_neo4j
MAX_PARALLEL = 10
# Seconds slept after each relation write in process_single_chunk
SLEEP_BETWEEN_CALLS = 1
# NOTE(review): not acquired anywhere in the code visible in this notebook — confirm whether it is still needed
lock = threading.Lock()


def sanitize_relation(relation: str) -> str:
    """Turn a free-text relation label into an upper-case ``[A-Z0-9_]`` identifier.

    German umlauts and ``ß`` are transliterated, ``§`` becomes ``PARAGRAPH_``,
    spaces and hyphens become underscores, surrounding whitespace is dropped
    and every remaining character outside ``A-Z0-9_`` is removed.

    Returns:
        The sanitized label, or ``"UNDEFINED_RELATION"`` if nothing is left.
    """
    # One translation table covers all single-character substitutions; none of
    # the replacement texts contains a character that is itself a key.
    substitutions = str.maketrans({
        "Ä": "AE", "Ö": "OE", "Ü": "UE",
        "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
        "§": "PARAGRAPH_", " ": "_", "-": "_",
    })
    candidate = relation.strip().translate(substitutions).upper()
    identifier = re.sub(r"[^A-Z0-9_]", "", candidate)
    if identifier:
        return identifier
    return "UNDEFINED_RELATION"


def extract_relations_from_chunk(text: str) -> list[dict]:
    """Ask gpt-4o-mini for entity relations in ``text`` and parse the JSON answer.

    The model is instructed (in German) to answer with a JSON list of
    ``{"head", "relation", "tail"}`` objects. The first bracketed list found in
    the reply is decoded with ``json.loads``.

    Returns:
        The decoded JSON list, or ``[]`` when the reply contains no such list
        or cannot be parsed (the parsing error is printed).
    """
    import openai  # important: requires the OpenAI client to be installed and configured

    system_prompt = (
        "Du bist ein KI-System für juristische Wissensmodellierung. "
        "Extrahiere alle relevanten Entitäten und ihre Beziehungen aus folgendem Gesetzestext. "
        "Gib das Ergebnis als reine JSON-Liste zurück:\n"
        "[{\"head\": \"...\", \"relation\": \"...\", \"tail\": \"...\"}]"
    )
    user_prompt = f"Text:\n\"\"\"\n{text}\n\"\"\""
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=conversation,
    )
    raw = completion.choices[0].message.content

    try:
        # The model may wrap the list in extra text, so only the list itself is decoded.
        found = re.search(r"\[\s*{.*?}\s*\]", raw, re.DOTALL)
        if not found:
            return []
        return json.loads(found.group())
    except Exception as e:
        print(f"Parsing Error: {e}\nAnswer: {raw}")
        return []
    

def safe_write_transaction(driver, func, retries=3, *args, **kwargs):
    """Run ``func`` in a Neo4j write transaction, retrying on deadlocks.

    A new session is opened for every attempt. ``Session.execute_write`` is
    used when the session offers it; ``Session.write_transaction`` (deprecated
    in the 5.x driver) is only used as a fallback for older drivers.

    Args:
        driver: Object whose ``session()`` returns a context-managed session.
        func: Transaction function; called as ``func(tx, *args, **kwargs)``.
        retries: Maximum number of attempts.
        *args: Extra positional arguments forwarded to ``func``. Because
            ``retries`` precedes ``*args``, it must be given positionally
            whenever extra positional arguments are passed.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        ``True`` if the transaction function completed, ``False`` otherwise.
        Failures are printed and never raised, so callers stay best-effort.
    """
    for attempt in range(retries):
        try:
            with driver.session() as session:
                run_write = getattr(session, "execute_write", None) or session.write_transaction
                run_write(func, *args, **kwargs)
            return True
        except Exception as e:
            # Deadlocks are recognised by the error text; only those are retried.
            if "DeadlockDetected" in str(e) and attempt < retries - 1:
                print(f"🔁 Retry due to deadlock (attempt {attempt + 1})")
                time.sleep(random.uniform(1.0, 2.0))  # randomized backoff
            else:
                print(f"❌ Write transaction failed: {e}")
                return False
    # Only reached when retries < 1, i.e. nothing was attempted.
    return False

def process_single_chunk(i, doc, driver):
    """Write one chunk, its law node and its extracted relations to Neo4j.

    Steps: merge the ``Law`` node, merge the ``Chunk`` node, link both with
    ``HAS_CHUNK``, then extract relations from the chunk text and merge them as
    ``Entity`` nodes connected by the sanitized relation type and linked to the
    chunk via ``HAS_ENTITY``. All writes go through ``safe_write_transaction``.

    Args:
        i: Position of the chunk in the input list (not used by this function).
        doc: Document with ``page_content`` and a ``metadata`` dict containing
            at least ``chunk_id`` and ``chunk_index``.
        driver: Neo4j driver handed to ``safe_write_transaction``.

    Raises:
        KeyError: If ``chunk_id`` or ``chunk_index`` is missing from the metadata.
    """
    chunk_id = doc.metadata["chunk_id"]
    chunk_index = doc.metadata["chunk_index"]
    # NOTE(review): load_paragraph_documents stores the law name under
    # "law_title", while this reads "title" — confirm which key is intended.
    title = doc.metadata.get("title", "UnknownLaw")
    source = doc.metadata.get("source", "unknown")
    text = doc.page_content

    # 1. Create or merge the Law node
    def merge_law(tx):
        tx.run("MERGE (l:Law {title: $title})", {"title": title})
    safe_write_transaction(driver, merge_law)

    # 2. Create or update the Chunk node
    def merge_chunk(tx):
        tx.run("""
            MERGE (c:Chunk {chunk_id: $chunk_id})
            SET c.text = $text, c.chunk_index = $chunk_index, c.title = $title, c.source = $source
        """, {
            "chunk_id": chunk_id,
            "text": text,
            "chunk_index": chunk_index,
            "title": title,
            "source": source
        })
    safe_write_transaction(driver, merge_chunk)

    # 3. Create the HAS_CHUNK relationship
    def link_law_chunk(tx):
        tx.run("""
            MATCH (l:Law {title: $title}), (c:Chunk {chunk_id: $chunk_id})
            MERGE (l)-[:HAS_CHUNK]->(c)
        """, {"title": title, "chunk_id": chunk_id})
    safe_write_transaction(driver, link_law_chunk)

    # 4. Extract relations from the chunk text and insert into graph
    try:
        relations = extract_relations_from_chunk(text)
        for rel in relations:
            # The relations come from an LLM reply; skip a malformed entry
            # instead of letting it abort the remaining relations of the chunk.
            if not isinstance(rel, dict):
                print(f"⚠️ Skipping malformed relation in chunk {chunk_id[:6]}: {rel!r}")
                continue
            head = rel.get("head")
            tail = rel.get("tail")
            raw_relation = rel.get("relation")
            if any(value is None or value == "" for value in (head, tail, raw_relation)):
                print(f"⚠️ Skipping incomplete relation in chunk {chunk_id[:6]}: {rel!r}")
                continue

            rel_type = sanitize_relation(str(raw_relation))

            # Relationship types cannot be passed as query parameters, so the
            # type is interpolated. It only contains A-Z, 0-9 and "_" after
            # sanitizing; the backticks keep types that start with a digit
            # (e.g. "1_ABSATZ") valid Cypher.
            def merge_relation(tx, head=head, tail=tail, rel_type=rel_type):
                tx.run(f"""
                    MERGE (h:Entity {{id: $head}})
                    MERGE (t:Entity {{id: $tail}})
                    MERGE (h)-[:`{rel_type}`]->(t)
                    WITH h, t
                    MATCH (c:Chunk {{chunk_id: $chunk_id}})
                    MERGE (c)-[:HAS_ENTITY]->(h)
                    MERGE (c)-[:HAS_ENTITY]->(t)
                """, {
                    "head": head,
                    "tail": tail,
                    "chunk_id": chunk_id
                })

            safe_write_transaction(driver, merge_relation)
            time.sleep(SLEEP_BETWEEN_CALLS)

    except Exception as e:
        print(f"❌ Error in chunk {chunk_id[:6]}: {e}")

def ingest_chunks_to_neo4j(chunks: list):
    """Ingest all chunks into Neo4j using a pool of worker threads.

    One driver is opened for the whole run and shared by up to
    ``MAX_PARALLEL`` workers, each executing ``process_single_chunk``. An
    exception raised inside a worker is printed (it would otherwise be lost
    inside its future) and does not stop the remaining chunks. The driver is
    closed even if the run is interrupted.

    Args:
        chunks: Documents to ingest.
    """
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
    try:
        with ThreadPoolExecutor(max_workers=MAX_PARALLEL) as executor:
            futures = [
                executor.submit(process_single_chunk, i, doc, driver)
                for i, doc in enumerate(chunks)
            ]
            for future in tqdm(as_completed(futures), total=len(futures), desc="Neo4j Ingest"):
                error = future.exception()
                if error is not None:
                    print(f"❌ Chunk ingest failed: {error}")
    finally:
        driver.close()

In [None]:
def generate_synchronized_databases(datapath, chunk_size: str = "paragraph_wise_chunking", chunk_overlap: str = "no_overlap", optimization: str = "fully_optimized_rag_pipeline_hybrid", baseline=False):
    """Load the paragraph documents once and store them in all three backends.

    The same document list is written to the sparse-retrieval file, to Chroma
    and to Neo4j, so every store holds identical ``chunk_id`` values.

    Args:
        datapath: JSON index passed to ``load_paragraph_documents``.
        chunk_size, chunk_overlap, optimization, baseline: Forwarded unchanged
            to the two storage helpers.

    Returns:
        The value returned by ``save_to_chroma``.
    """
    paragraph_docs = load_paragraph_documents(datapath)

    # NOTE(review): the two helpers receive `optimization` and `baseline` in
    # opposite positional order — confirm against their signatures.
    save_documents_for_sparse_retrieval(paragraph_docs, chunk_size, chunk_overlap, optimization, baseline)

    print("Storing in Chroma ...")
    vector_store_path = save_to_chroma(paragraph_docs, chunk_size, chunk_overlap, baseline, optimization)

    print("Ingest in Neo4j ...")
    ingest_chunks_to_neo4j(paragraph_docs)

    print("Both databases were successfully synchronized.")
    return vector_store_path

### 3. Querying of Hybrid Vector + Graph Databases

In [None]:
def _keep_by_normalized_score(docs, scores, threshold):
    """Return the docs whose min-max normalized score is at least ``threshold``."""
    scores = list(scores)
    if not scores:
        return []

    lowest, highest = min(scores), max(scores)
    span = highest - lowest
    if span == 0:
        # All scores are identical (always true for a single document), so
        # normalization cannot rank them; dropping everything would discard
        # the only evidence available.
        return list(docs)

    return [
        doc
        for doc, score in zip(docs, scores)
        if (score - lowest) / (span + 1e-8) >= threshold
    ]


def filter_documents(
    results: Union[List[Document], List[Tuple[Document, float]]],
    query: str,
    score_threshold: float = 0.25
) -> List[Document]:
    """
    Filters documents by their min-max normalized relevance score.

    If ``results`` holds (Document, score) tuples, the given scores are used.
    Otherwise the query and the document texts are embedded with
    ``text-embedding-3-small`` and scored by cosine similarity. Scores are
    normalized relative to the best and worst document of this result set, so
    the threshold is relative, not absolute. When all scores are equal, every
    document is kept.

    Args:
        results: List of Documents or (Document, Score) tuples.
        query: The search query string.
        score_threshold: Normalized similarity threshold between 0 and 1.

    Returns:
        List of Documents that pass the threshold, in their original order.
        An empty input yields an empty list without calling the embedding model.
    """
    if not results:
        return []

    # Case 1: Scores are already provided
    if isinstance(results[0], tuple):
        docs = [doc for doc, _ in results]
        scores = [score for _, score in results]
    # Case 2: Score the documents against the query via embeddings
    else:
        docs = list(results)
        embedder = OpenAIEmbeddings(model="text-embedding-3-small")
        query_vec = embedder.embed_query(text=query)
        doc_vecs = embedder.embed_documents(texts=[doc.page_content for doc in docs])
        scores = cosine_similarity([query_vec], doc_vecs)[0]

    return _keep_by_normalized_score(docs, scores, score_threshold)


In [None]:
def retrieve_documents(query_text, vectordb, chunk_documents_path: str, k: int, rerank_k: int = 50, dense_percent: float = 0.5, thresh_hold: float = 0.25):
    """Retrieve, filter and rerank documents with a sparse + dense ensemble.

    The query is translated to German if needed, then a TF-IDF retriever built
    from the stored chunk documents and the dense retriever of ``vectordb`` are
    combined in an ``EnsembleRetriever``. The first ``k`` fused results are
    filtered with ``filter_documents`` and reranked with RankGPT.

    Args:
        query_text: Original user query (used for filtering and reranking).
        vectordb: Vector store supporting ``len()`` and ``as_retriever``.
        chunk_documents_path: File with the documents for the TF-IDF retriever.
        k: Number of fused results kept before filtering.
        rerank_k: Number of candidates each retriever contributes.
        dense_percent: Ensemble weight of the dense retriever; the sparse
            retriever gets ``1 - dense_percent``.
        thresh_hold: Normalized score threshold passed to ``filter_documents``.

    Returns:
        List of reranked documents; an empty list if the database is empty or
        no document survives filtering.
    """
    if len(vectordb) == 0:
        # Always return a list so callers can iterate over the result.
        print("No documents available in the database.")
        return []

    query_de = translate_query_to_german_if_needed(query_text)

    # Hybrid-Retriever Method Dense + TF-IDF with EnsembleRetriever
    # NOTE(review): the TF-IDF index is rebuilt from file on every call.
    documents = load_documents_for_sparse_retrieval(chunk_documents_path)

    sparse_retriever = TFIDFRetriever.from_documents(documents)
    sparse_retriever.k = rerank_k

    dense_retriever = vectordb.as_retriever(search_kwargs={"k": rerank_k}, search_type="similarity")

    # Weights are positional: they must follow the order of `retrievers`.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[sparse_retriever, dense_retriever],
        weights=[1 - dense_percent, dense_percent]
    )

    result_documents = ensemble_retriever.get_relevant_documents(query_de)[:k]

    # Post-Retrieval Optimization: Filtering Documents upon Relevancy
    filtered_documents = filter_documents(results=result_documents, query=query_text, score_threshold=thresh_hold)
    if not filtered_documents:
        # Nothing to rerank; avoid an LLM call on an empty candidate list.
        return []

    # Post-Retrieval Optimization: Reranking Retrieved Documents with RankGPT
    return rankgpt_rerank(query_text, filtered_documents, model_name="gpt-4o-mini", window_size=4, step=1)

In [None]:
def rewrite_query(original_query, last_response):
    """Append the previous response to the query as a German hint for the next round."""
    template = "{}. Hinweis: Beachte bei der Beantwortung auch: {}"
    return template.format(original_query, last_response)

In [None]:
def compute_cosine_similarity_with_embeddings(text1, text2, model="text-embedding-3-small"):
    """
    Embed both texts in one OpenAI request and return their cosine similarity.

    Args:
        text1: First text.
        text2: Second text.
        model: Name of the OpenAI embedding model.

    Returns:
        The cosine similarity of the two embeddings as a Python float.
    """
    response = openai.embeddings.create(model=model, input=[text1, text2])

    # response.data holds one entry per input, in request order.
    first_vec, second_vec = (
        np.array(response.data[position].embedding) for position in (0, 1)
    )

    pairwise = cosine_similarity([first_vec], [second_vec])
    return float(pairwise[0][0])

In [None]:
def select_most_consistent_answer(answers):
    """Pick the answer that is on average most similar to the other answers.

    Blank entries (``""``, whitespace-only or non-string values, e.g. the
    placeholders recorded for failed generations) are neither embedded nor
    eligible to win. They keep their position in the returned arrays with a
    similarity of 0, so the arrays always line up with ``answers``.

    Args:
        answers: List of candidate answer strings.

    Returns:
        Tuple ``(best_answer, avg_sim, similarity_matrix)``:
        ``avg_sim`` has one entry per answer (mean cosine similarity to the
        non-blank answers, including itself) and ``similarity_matrix`` is the
        ``len(answers) x len(answers)`` cosine-similarity matrix. If no answer
        is usable, the first answer (or ``""`` for an empty list) is returned
        together with all-zero arrays.
    """
    count = len(answers)
    avg_sim = np.zeros(count)
    similarity_matrix = np.zeros((count, count))

    usable = [
        position
        for position, answer in enumerate(answers)
        if isinstance(answer, str) and answer.strip()
    ]
    if not usable:
        return (answers[0] if count else ""), avg_sim, similarity_matrix

    embeddings = embedding_model.embed_documents(texts=[answers[position] for position in usable])
    usable_sim = cosine_similarity(embeddings)
    usable_avg = usable_sim.mean(axis=1)

    # Scatter the results back so row/column i always refers to answers[i].
    similarity_matrix[np.ix_(usable, usable)] = usable_sim
    avg_sim[usable] = usable_avg

    best_index = usable[int(np.argmax(usable_avg))]
    return answers[best_index], avg_sim, similarity_matrix

In [None]:
def plot_answer_similarity_heatmap(
    answers: list[str],
    labels: list[str] = None,
    similarity_matrix: Optional[np.ndarray] = None,
    avg_similarities: Optional[np.ndarray] = None,
    title: str = "LLM Answer Similarity"
):
    """Show a heatmap of the pairwise similarity between answers.

    Args:
        answers: The answer texts; only embedded when no matrix is supplied.
        labels: Axis labels; defaults to ``Model 1``, ``Model 2``, ...
        similarity_matrix: Precomputed similarity matrix. If omitted, the
            answers are encoded with the ``all-MiniLM-L6-v2`` sentence
            transformer and compared by cosine similarity.
        avg_similarities: Optional per-answer averages appended to the y-axis
            labels.
        title: Plot title.

    Returns:
        The similarity matrix that was plotted.
    """
    if similarity_matrix is None:
        # Imported lazily: only needed when the matrix has to be computed here.
        from sentence_transformers import SentenceTransformer
        from sklearn.metrics.pairwise import cosine_similarity
        encoder = SentenceTransformer("all-MiniLM-L6-v2")
        similarity_matrix = cosine_similarity(encoder.encode(answers))

    if labels is None:
        labels = [f"Model {i+1}" for i in range(len(answers))]

    if avg_similarities is None:
        row_labels = labels
    else:
        row_labels = [
            f"{name}\nØ={avg:.4f}" for name, avg in zip(labels, avg_similarities)
        ]

    # Plot
    plt.figure(figsize=(9, 7))
    heatmap_options = dict(
        annot=True,
        fmt=".4f",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=row_labels,
        square=True,
        cbar=True,
    )
    sns.heatmap(similarity_matrix, **heatmap_options)
    plt.title(title)
    plt.xlabel("LLM Answer")
    plt.ylabel("LLM Answer")
    plt.tight_layout()
    plt.show()

    return similarity_matrix

In [None]:
def run_graph_query(chunk_id: str, driver):
    """Collect the entity relations reachable from one chunk in the graph.

    Starting at the entities attached to the chunk via ``HAS_ENTITY``, the
    query follows outgoing relationships for up to two hops and returns every
    complete (head, relation, tail) triple.

    Args:
        chunk_id: Value of the ``chunk_id`` property of the Chunk node.
        driver: Object whose ``session()`` returns a context-managed session.

    Returns:
        One dict per triple with ``chunk_id``, ``chunk_index``, ``law_title``
        and a ``context`` string ``"head - relation -> tail"``. On any error
        the error is printed and an empty list is returned.
    """
    query = """
    MATCH (c:Chunk {chunk_id: $chunk_id})-[:HAS_ENTITY]->(e1:Entity)
    OPTIONAL MATCH (e1)-[r1]->(e2:Entity)
    OPTIONAL MATCH (e2)-[r2]->(e3:Entity)
    WITH c, 
         collect({head: e1.id, rel: type(r1), tail: e2.id}) +
         collect({head: e2.id, rel: type(r2), tail: e3.id}) AS relations
    UNWIND relations AS relmap
    WITH c, relmap.head AS head, relmap.rel AS relation, relmap.tail AS tail
    WHERE head IS NOT NULL AND relation IS NOT NULL AND tail IS NOT NULL
    RETURN
        c.chunk_id AS chunk_id,
        c.chunk_index AS chunk_index,
        c.title AS law_title,
        head,
        relation,
        tail
    """

    triples = []
    try:
        # The result is consumed while the session is still open.
        with driver.session() as session:
            for record in session.run(query, {"chunk_id": chunk_id}):
                triples.append({
                    "chunk_id": record["chunk_id"],
                    "chunk_index": record["chunk_index"],
                    "law_title": record["law_title"],
                    "context": f'{record["head"]} - {record["relation"]} -> {record["tail"]}'
                })
    except Exception as e:
        print(f"❌ Error in chunk {chunk_id[:6]}: {e}")
        return []
    return triples


In [None]:
def graph_retriever_from_chunks(chunk_ids: List[str], top_k: int = 20) -> dict:
    """Fetch graph relations for the given chunks and build a bounded prompt context.

    Each chunk is queried with ``run_graph_query`` (two worker threads). The
    results are kept in the order of ``chunk_ids`` so the token-limited context
    is reproducible. Only relations of the ``top_k`` chunks with the most
    relations are kept, duplicates are removed, and relation strings are added
    until a budget of 3000 tokens would be exceeded.

    Args:
        chunk_ids: ``chunk_id`` values of the chunks to expand.
        top_k: Number of chunks (ranked by relation count) whose relations are kept.

    Returns:
        Dict with ``prompt_context`` (list of relation strings),
        ``retrieved_chunk_indices`` and ``retrieved_law_titles``. All three
        lists are empty when ``chunk_ids`` is empty; no database connection is
        opened in that case.
    """
    if not chunk_ids:
        return {
            "prompt_context": [],
            "retrieved_chunk_indices": [],
            "retrieved_law_titles": []
        }

    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
    try:
        with ThreadPoolExecutor(max_workers=2) as executor:
            # map() yields results in input order, unlike as_completed().
            per_chunk_rows = executor.map(lambda cid: run_graph_query(cid, driver), chunk_ids)
            retrieved_nodes = [row for rows in per_chunk_rows for row in rows]
    finally:
        driver.close()

    # Rank chunks by how many relations they contributed
    index_counts = Counter(entry["chunk_index"] for entry in retrieved_nodes)
    top_indices = {idx for idx, _ in index_counts.most_common(top_k)}
    filtered_nodes = [entry for entry in retrieved_nodes if entry["chunk_index"] in top_indices]

    # Token limit: keep de-duplicated contexts until the budget is reached
    contexts = list(dict.fromkeys(entry["context"] for entry in filtered_nodes))
    enc = tiktoken.encoding_for_model("gpt-4o-mini")
    MAX_TOKENS = 3000
    final_contexts = []
    token_count = 0

    for ctx in contexts:
        tokens = len(enc.encode(ctx))
        if token_count + tokens > MAX_TOKENS:
            break
        final_contexts.append(ctx)
        token_count += tokens

    chunk_indices = {entry["chunk_index"] for entry in filtered_nodes}
    law_titles = {entry["law_title"] for entry in filtered_nodes}

    return {
        "prompt_context": final_contexts,
        "retrieved_chunk_indices": list(chunk_indices),
        "retrieved_law_titles": list(law_titles)
    }

In [None]:
def _trim_contexts_to_token_budget(contexts, encoding, max_tokens):
    """Return the longest prefix of ``contexts`` whose summed token count fits ``max_tokens``."""
    kept = []
    used_tokens = 0
    for ctx in contexts:
        cost = len(encoding.encode(ctx))
        if used_tokens + cost > max_tokens:
            break
        kept.append(ctx)
        used_tokens += cost
    return kept


def fully_optimized_rag_pipeline_hybrid(
    query,
    database,
    chunk_documents_path: str,
    k=5,
    model_name="gpt-4o-mini",
    n_consistency=5,
    temperature=0.7,
    heatmap=False
):
    """Answer a query with hybrid (vector + graph) retrieval and self-consistency.

    1. Retrieve documents with ``retrieve_documents``.
    2. Expand the retrieved chunks with graph relations via
       ``graph_retriever_from_chunks``.
    3. Merge graph and vector contexts (graph first, duplicates removed) and
       cut the list at 10000 tokens.
    4. Generate ``n_consistency`` answers and return the one selected by
       ``select_most_consistent_answer``.

    Args:
        query: User query.
        database: Vector store handed to ``retrieve_documents``.
        chunk_documents_path: Document file for the sparse retriever.
        k: Number of documents requested from ``retrieve_documents``.
        model_name: Generation model; also selects the tokenizer for trimming.
        n_consistency: Number of answers to sample (must be at least 1).
        temperature: Sampling temperature for generation.
        heatmap: If True, print all answers and plot their similarity matrix.

    Returns:
        Tuple ``(response, sources, contexts, chunk_ids, chunk_indices)``.

    Raises:
        ValueError: If ``n_consistency`` is smaller than 1.
    """
    if n_consistency < 1:
        raise ValueError("n_consistency must be at least 1.")

    retrieved_sources = []
    retrieved_ids = []
    retrieved_indices = []
    retrieved_ids_set = set()

    # Vector retrieval
    vector_results = retrieve_documents(
        query_text=query,
        vectordb=database,
        chunk_documents_path=chunk_documents_path,
        k=k
    )

    # Record each chunk once, in retrieval order
    for doc in vector_results:
        chunk_id = doc.metadata.get("chunk_id")
        if chunk_id not in retrieved_ids_set:
            retrieved_sources.append(doc.metadata.get("source"))
            retrieved_ids.append(chunk_id)
            retrieved_indices.append(doc.metadata.get("chunk_index"))
            retrieved_ids_set.add(chunk_id)

    # Graph retrieval based on accumulated chunk IDs
    graph_results = graph_retriever_from_chunks(chunk_ids=retrieved_ids, top_k=20)

    # Combine both vector and graph contexts (remove duplicates)
    vector_contexts = [doc.page_content for doc in vector_results]
    graph_contexts = graph_results.get("prompt_context", [])
    combined_context = list(dict.fromkeys(graph_contexts + vector_contexts))

    # Token-based context trimming
    try:
        enc = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # tiktoken does not know this model name; the count is only used for
        # budgeting, so a general-purpose encoding is an acceptable estimate.
        enc = tiktoken.get_encoding("cl100k_base")
    MAX_TOKENS = 10000
    final_contexts = _trim_contexts_to_token_budget(combined_context, enc, MAX_TOKENS)

    # Self-consistency generation
    answers = []
    for i in range(n_consistency):
        try:
            answer = generate_answer(final_contexts, query, model_name, temperature)
            answers.append(answer)
        except Exception as e:
            print(f"Error at answer no. {i+1}: {e}")
            # Placeholder keeps answer i aligned with label "Ans i+1" in the heatmap.
            answers.append("")
        time.sleep(0.5)

    response, similarities, similarity_matrix = select_most_consistent_answer(answers)

    if heatmap:
        print("\n--- Self-Consistency Answers ---")
        for i, a in enumerate(answers):
            print(f"[{i+1}] ({similarities[i]:.4f}): {a}")
        plot_answer_similarity_heatmap(
            answers=answers,
            labels=[f"Ans {i+1}" for i in range(n_consistency)],
            similarity_matrix=similarity_matrix,
            avg_similarities=similarities
        )

    return response.strip(), retrieved_sources, final_contexts, retrieved_ids, retrieved_indices


### 4. Output of the Fully Optimized Hybrid RAG Pipeline

In [None]:
# Same file as the DATA_PATH constant defined in the configuration cell
datapath = "../../data/laws_and_ordinances.json"

# Loads all paragraphs and fills the sparse-retrieval file, Chroma and Neo4j;
# the returned Chroma path is reused by the cells below
chroma_path_fully_optimized_rag_pipeline_hybrid = generate_synchronized_databases(datapath)

In [None]:
# Example query (German): scope of application of §1 EmoG
query = "Welchen Anwendungsbereich umfasst §1 des Elektromobilitätsgesetz - EmoG?"
database = load_vector_database(chroma_path=chroma_path_fully_optimized_rag_pipeline_hybrid)
# NOTE(review): hard-coded file name — presumably the file written by save_documents_for_sparse_retrieval; confirm
chunk_documents_path="2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_hybrid.json"
model_name = "gpt-4o-mini"  # or any other supported model

# Run the hybrid pipeline once with its default k, n_consistency and temperature
response, sources, retrieved_chunk_contexts, retrieved_chunk_ids, retrieved_chunk_indices = fully_optimized_rag_pipeline_hybrid(query=query, 
                                                                                                                                database=database,
                                                                                                                                chunk_documents_path=chunk_documents_path,
                                                                                                                                model_name=model_name)

In [None]:
# Display the results of the example query (retrieved_chunk_indices is not printed)
print(f"Query: {query} \n")
print(f"Response: {response} \n")
print(f"Sources: {sources} \n")
print(f"Retrieved Chunk Contexts: {retrieved_chunk_contexts} \n")
print(f"Retrieved Chunk Ids: {retrieved_chunk_ids} \n")

### 5. Evaluations

#### Generate Evaluation Dataset

In [None]:
# Generate 50 evaluation queries from the Chroma store: 60% single, 20% multi_specific,
# 20% multi_intra_document. The result is later passed as a file path to the enrichment function.
eval_dataset = generate_evalset(chroma_db=chroma_path_fully_optimized_rag_pipeline_hybrid, test_set_size=50, 
                 query_distribution={"single": 0.6, "multi_specific": 0.2, "multi_intra_document": 0.2})

#### Enrich Evaluation Dataset

In [None]:
def enrich_eval_dataset_with_fully_optimized_rag_responses_hybrid(eval_dataset, chroma_path, chunk_documents_path, model_name="gpt-4o-mini"):
    """Run the hybrid RAG pipeline for every query of an evaluation set and save the results.

    Each entry of the JSON list in ``eval_dataset`` must contain a ``"query"``.
    The entry is extended with ``generated_response``,
    ``retrieved_chunk_contexts``, ``retrieved_chunk_ids`` and
    ``retrieved_chunk_indices``.

    Args:
        eval_dataset: Path to the evaluation dataset (JSON list of entries).
        chroma_path: Path of the Chroma store to query.
        chunk_documents_path: Document file for the sparse retriever.
        model_name: Generation model used by the pipeline.

    Returns:
        Path of the written file: the input path with a trailing ``.json``
        replaced by ``_rag_enriched.json``.
    """
    db = load_vector_database(chroma_path)

    with open(eval_dataset, "r", encoding="utf-8") as f:
        eval_dataset_json = json.load(f)

    enriched_dataset = []

    for entry in tqdm(eval_dataset_json, desc="Processing RAG responses"):
        # Run fully optimized RAG pipeline
        response, _, retrieved_chunk_contexts, retrieved_chunk_ids, retrieved_chunk_indices = fully_optimized_rag_pipeline_hybrid(
            query=entry["query"],
            database=db,
            chunk_documents_path=chunk_documents_path,
            model_name=model_name,
        )

        # Add new fields to the entry
        entry["generated_response"] = response
        entry["retrieved_chunk_contexts"] = retrieved_chunk_contexts
        entry["retrieved_chunk_ids"] = retrieved_chunk_ids
        entry["retrieved_chunk_indices"] = retrieved_chunk_indices

        enriched_dataset.append(entry)

    # Strip only the file extension; a ".json" elsewhere in the path (e.g. in
    # a directory name) must stay untouched.
    output_path = f"{eval_dataset.removesuffix('.json')}_rag_enriched.json"

    # Store results as new json file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(enriched_dataset, f, indent=2, ensure_ascii=False)

    return output_path

In [None]:
# Same hard-coded sparse-retrieval document file as in the example-query cell
chunk_documents_path = "2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_hybrid.json"

# Enrich the generated evaluation set with pipeline responses; returns the path of the enriched file
enriched_evalset = enrich_eval_dataset_with_fully_optimized_rag_responses_hybrid(eval_dataset=eval_dataset, 
                                       chroma_path = chroma_path_fully_optimized_rag_pipeline_hybrid, 
                                       chunk_documents_path=chunk_documents_path,
                                       model_name="gpt-4o-mini")

#### Evaluate RAG Retrieval

In [None]:
# NOTE: this rebinds `model_name` (previously the LLM name "gpt-4o-mini") to the
# label passed to the evaluation functions below
model_name="fully_optimized_rag_hybrid"

# Only the file name (last path component) of the enriched set is passed on
retrieval_result = run_retrieval_evaluation(json_filename=enriched_evalset.split("/")[-1], model_name=model_name, evaluation_mode="final_eval")
display(retrieval_result)

#### Evaluate RAG Generation

In [None]:
# Generation metrics on the same enriched evaluation set (file name only)
generation_results = run_generation_evaluation(json_filename=enriched_evalset.split("/")[-1], model_name=model_name, evaluation_mode="final_eval") 
display(generation_results)

#### Evaluate RAG Generation on Golden Evaluation Dataset

In [None]:
# Show which Chroma store the following golden-dataset evaluation runs against
print(chroma_path_fully_optimized_rag_pipeline_hybrid)

In [None]:
# Golden QA evaluation set for generation quality
golden_dataset = "eval_datasets/golden_qa_evalset_generation.json"
chunk_documents_path = "2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_hybrid.json"

# Enrich the golden set with pipeline responses; returns the path of the enriched file
enriched_golden_evalset = enrich_eval_dataset_with_fully_optimized_rag_responses_hybrid(eval_dataset=golden_dataset, 
                                       chroma_path = chroma_path_fully_optimized_rag_pipeline_hybrid, 
                                       chunk_documents_path=chunk_documents_path,
                                       model_name="gpt-4o-mini")

In [None]:
# Generation metrics on the enriched golden set; `model_name` is the evaluation
# label set in the retrieval-evaluation cell
generation_results_golden_dataset = run_generation_evaluation(json_filename=enriched_golden_evalset.split("/")[-1], 
                                                              model_name=model_name, 
                                                              evaluation_mode="final_eval") 
display(generation_results_golden_dataset)

#### LLM-as-a-Judge for Comparison 

In [None]:
# NOTE(review): this hard-coded name contains "_hybrid", whereas the enrichment
# function above writes "<input name>_rag_enriched.json" — confirm this file
# exists or use `enriched_golden_evalset` instead.
input_path = "eval_datasets/golden_qa_evalset_generation_hybrid_rag_enriched.json"
first_output_path = "eval_results/golden_qa_evalset_optimized_hybrid_rag_llm_as_a_judge_first_results.json"
final_rejudge_output_path = "eval_results/golden_qa_evalset_optimized_hybrid_rag_llm_as_a_judge_final_rejudge_results.json"
# Passed as max_workers to both judge runs below
max_workers = 10

In [None]:
# LLM-as-a-Judge for Comparison and Further Justification
# Two passes: the first judge run writes its results to first_output_path; the
# rejudge run takes the path returned by the first run as its input.

llm_as_a_judge_first_eval_results_path = run_llm_judge_parallel(input_path=input_path, output_path=first_output_path, max_workers=max_workers)
llm_as_a_judge_rejudge_results_path = run_llm_rejudge_parallel(input_path=llm_as_a_judge_first_eval_results_path, output_path=final_rejudge_output_path, max_workers=max_workers)

In [None]:
# Output names handed to the scoring/visualization helper for each judge pass
output_file_name_LLMaaJ_first = "1b_hybrid_llm_as_a_judge_first_results"
output_file_name_LLMaaJ_rejudge = "1b_hybrid_llm_as_a_judge_rejudge_results"

# Compute and visualize the scores of the first judge pass and of the rejudge pass
llm_as_a_judge_first_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(llm_as_a_judge_first_eval_results_path, output_file_name_LLMaaJ_first)
llm_as_a_judge_final_rejudge_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(llm_as_a_judge_rejudge_results_path, output_file_name_LLMaaJ_rejudge)