# Implementation, Testing and Evaluation of Fully Optimized RAG - Vector Approach

#### Notebook Outline
1. Imports and Configurations
2. Creation of Vector Database
3. Querying the Vector Database
4. Output of Optimized RAG Pipelines
5. Evaluations

### 1. Imports and Configurations

Imports

In [1]:
# === Standard Library ===
import csv
import json
import os
import re
import shutil
import sys
import uuid
from urllib.request import urlopen
import time
import concurrent.futures

# === Third-Party Libraries ===
import numpy as np
import tiktoken
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm, trange
from typing import List, Union, Tuple, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed


# === LangChain Core ===
from langchain.docstore.document import Document
from langchain.schema import Document  # (Optional: doppelt zu obigem)
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import BM25Retriever, TFIDFRetriever, EnsembleRetriever

# === LangChain Community Integrationen ===
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# === OpenAI / LangChain OpenAI ===
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import OpenAI

# === Lokale Projektmodule ===
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from ipynb_notebooks.baseline.rag_utils.baseline_rag import (
    clean_text,
    save_documents_for_sparse_retrieval,
    load_documents_for_sparse_retrieval,
    save_to_chroma, 
    translate_query_to_german_if_needed,
    detect_language_name,
    load_vector_database,
    generate_answer
)

from ipynb_notebooks.single_stage_enhancements.rankGPT_rerank import rankgpt_rerank

from ipynb_notebooks.evaluation_datasets.generation_eval.generation_metrics import run_generation_evaluation
from ipynb_notebooks.evaluation_datasets.retrieval_eval.eval_vector_dataset_generator import generate_evalset
from ipynb_notebooks.evaluation_datasets.retrieval_eval.retrieval_metrics import run_retrieval_evaluation
from ipynb_notebooks.evaluation_datasets.generation_eval.llm_as_a_judge import run_llm_judge_parallel, run_llm_rejudge_parallel, calculate_and_visualize_scores_of_evaluation_scheme


  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jonas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jonas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


Configurations

In [None]:
# Load environment variables. Assumes that the project directory contains a .env file with API keys
load_dotenv()

# Set the OpenAI API key from the environment variables
# Make sure to update "OPENAI_API_KEY" to match the variable name in your .env file
# (a missing variable raises KeyError here)
openai.api_key = os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=openai.api_key)
# Shared embedding model, read as a global by select_most_consistent_answer
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Define constants for paths
DATA_PATH = "../../data/laws_and_ordinances.json"  # JSON file listing the URLs of the law and ordinance documents
DATA_PATH_SHORT_VERSION = "../../data/laws_and_ordinances_short_version.json" # JSON file with a subset of the URLs for testing purposes
CHROMA_PATH = "chroma_dbs/chroma"  # Directory to save the Chroma vector store

Helper Functions

In [None]:
def load_paragraph_documents(datapath: str):
    """
    Load one document per paragraph URL listed in a law/ordinance index file.

    The JSON file at `datapath` is read for the categories "laws" and
    "ordinances". Every paragraph that has a "paragraph_url" is fetched with
    WebBaseLoader, its text is passed through `clean_text`, and its metadata
    is extended with the law title, category, paragraph fields, a fresh UUID
    ("chunk_id") and a running 1-based "chunk_index".

    Args:
        datapath: Path to the JSON index file.

    Returns:
        List of the loaded documents, in processing order.

    Raises:
        ValueError: If not a single document could be loaded.
    """
    with open(datapath, "r", encoding="utf-8") as index_file:
        index = json.load(index_file)

    loaded = []
    next_chunk_index = 1  # counts across both categories, not per law

    for category in ("laws", "ordinances"):
        for entry in tqdm(index.get(category, []), desc=f"→ Processing {category}"):
            title = entry.get("title", "Unknown Title")

            for para in tqdm(entry.get("paragraphs", []), desc=f"  ↳ Paragraphs in '{title}'", leave=False):
                para_url = para.get("paragraph_url", "")
                para_name = para.get("paragraph_name", "Unknown Paragraph")

                if not para_url:
                    print(f"No paragraph URL found for {title}")
                    continue

                try:
                    # Fetch the paragraph page; a failure only skips this paragraph
                    for doc in WebBaseLoader(para_url).load():
                        doc.page_content = clean_text(doc.page_content)
                        doc.metadata.update({
                            "law_title": title,
                            "category": category,
                            "paragraph_id": para.get("paragraph_ID"),
                            "paragraph_name": para_name,
                            "paragraph_url": para_url,
                            "chunk_id": str(uuid.uuid4()),
                            "chunk_index": next_chunk_index,
                        })
                        loaded.append(doc)
                        next_chunk_index += 1
                except Exception as e:
                    print(f"Error loading paragraph from URL {para_url}: {e}")

    if not loaded:
        raise ValueError("No paragraph documents could be loaded from the input.")

    print(f"Successfully loaded {len(loaded)} paragraph-level documents.")
    return loaded

### 2. Creation of Vector Database with Paragraph-Wise Chunks

In [None]:
def generate_data_store_from_paragraphs(datapath: str, chunk_size: str = "paragraph_wise_chunking", chunk_overlap: str = "no_overlap", baseline: bool = False, optimization: str = "fully_optimized_rag_pipeline_vector"):
    """
    Build the paragraph-wise data store from the index file at `datapath`.

    The loaded paragraph documents are first saved for sparse retrieval and
    then written to Chroma. `chunk_size`, `chunk_overlap`, `baseline` and
    `optimization` are forwarded unchanged to both save helpers.

    Returns:
        Whatever `save_to_chroma` returns (used by the notebook as the Chroma path).
    """
    paragraph_docs = load_paragraph_documents(datapath)

    # Keep a copy of the documents for the sparse (BM25) retriever
    save_documents_for_sparse_retrieval(paragraph_docs, chunk_size, chunk_overlap, optimization, baseline)

    return save_to_chroma(
        paragraph_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        baseline=baseline,
        optimization=optimization,
    )

### 3. Querying of Vector Database

In [None]:
def _filter_by_normalized_score(docs, scores, score_threshold):
    """Keep the docs whose min-max normalized score reaches `score_threshold`."""
    eps = 1e-8
    scores = [float(score) for score in scores]
    if not scores:
        return []

    min_score, max_score = min(scores), max(scores)
    spread = max_score - min_score

    # With a single document or all-equal scores, min-max normalization maps
    # every score to 0 and would discard everything. There is no ranking
    # information to filter on, so keep all documents.
    if spread <= eps:
        return list(docs)

    return [
        doc
        for doc, score in zip(docs, scores)
        if (score - min_score) / (spread + eps) >= score_threshold
    ]


def filter_documents(
    results: Union[List[Document], List[Tuple[Document, float]]],
    query: str,
    score_threshold: float = 0.25
) -> List[Document]:
    """
    Filters documents based on relevance scores or cosine similarity using normalized scores.

    Scores are min-max normalized over the given results, so the threshold is
    relative to this result set. If all scores are (almost) identical, e.g.
    for a single result, all documents are kept.

    Args:
        results: List of Documents or (Document, Score) tuples.
        query: The search query string. Only used when `results` carries no scores.
        score_threshold: Normalized similarity threshold between 0 and 1.

    Returns:
        List of Documents that pass the similarity threshold. Empty input
        yields an empty list without calling the embedding model.
    """
    if not results:
        return []

    # Case 1: Scores are already provided
    if isinstance(results[0], tuple):
        docs = [doc for doc, _ in results]
        scores = [score for _, score in results]
        return _filter_by_normalized_score(docs, scores, score_threshold)

    # Case 2: No scores given, so compute query/document cosine similarity
    docs = results
    doc_texts = [doc.page_content for doc in docs]

    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

    # Embed query and documents
    query_vec = embedding_model.embed_query(text=query)
    doc_vecs = embedding_model.embed_documents(texts=doc_texts)

    # Compute similarity
    similarity_scores = cosine_similarity([query_vec], doc_vecs)[0]

    return _filter_by_normalized_score(docs, similarity_scores, score_threshold)


In [None]:
def retrieve_documents(query_text, vectordb, chunk_documents_path: str, k: int, thresh_hold: float = 0.25):
    """
    Retrieve, filter and rerank documents for a query with a hybrid (dense + BM25) search.

    Steps:
        1. Translate the query to German if needed.
        2. Fetch 25 candidates via max-marginal-relevance search on the vector
           store and 25 via BM25 over the documents stored for sparse retrieval.
        3. Fuse both result lists with weighted reciprocal rank (weights 0.5/0.5)
           and keep the top `k`.
        4. Drop documents below the normalized similarity threshold.
        5. Rerank the remaining documents with RankGPT.

    Args:
        query_text: The user query.
        vectordb: Vector store supporting `len()`, `as_retriever` and
            `max_marginal_relevance_search`.
        chunk_documents_path: Path passed to `load_documents_for_sparse_retrieval`.
        k: Number of fused documents kept before filtering.
        thresh_hold: Score threshold forwarded to `filter_documents`.

    Returns:
        List of reranked documents. An empty list is returned when the
        database is empty or no document passes the filter.
    """
    if len(vectordb) == 0:
        # Always return a list so that callers can iterate the result directly
        print("No documents available in the database.")
        return []

    query_de = translate_query_to_german_if_needed(query_text)

    # Hybrid retrieval: dense (MMR) + sparse (BM25), fused with EnsembleRetriever
    sparse_corpus = load_documents_for_sparse_retrieval(chunk_documents_path)

    # The dense retriever object is handed to the EnsembleRetriever constructor;
    # the fusion below runs on the two precomputed result lists.
    vectordb_retriever = vectordb.as_retriever(search_kwargs={"k": 25}, search_type="similarity")
    result_documents_mmr = vectordb.max_marginal_relevance_search(query=query_de, k=25, fetch_k=50)

    bm25_retriever = BM25Retriever.from_documents(sparse_corpus)
    bm25_retriever.k = 25
    result_documents_bm25 = bm25_retriever.get_relevant_documents(query_de)

    ensemble_retriever = EnsembleRetriever(retrievers=[vectordb_retriever, bm25_retriever], weights=[0.5, 0.5])
    fused_documents = ensemble_retriever.weighted_reciprocal_rank([result_documents_mmr, result_documents_bm25])
    fused_documents = fused_documents[:k]

    # Post-Retrieval Optimization: Filtering Documents upon Relevancy
    filtered_documents = filter_documents(results=fused_documents, query=query_de, score_threshold=thresh_hold)
    if not filtered_documents:
        # Nothing left to rerank
        return []

    # Post-Retrieval Optimization: Reranking Retrieved Documents with RankGPT
    return rankgpt_rerank(query_de, filtered_documents, model_name="gpt-4o-mini", window_size=4, step=1)

In [None]:
def select_most_consistent_answer(answers):
    """
    Pick the answer that is on average most similar to all answers.

    The answers are embedded with the module-level `embedding_model`, a
    pairwise cosine-similarity matrix is computed, and the answer with the
    highest row mean wins.

    Returns:
        Tuple of (winning answer, per-answer mean similarities, similarity matrix).
    """
    answer_vectors = embedding_model.embed_documents(texts=answers)
    pairwise = cosine_similarity(answer_vectors)

    # Row mean includes the self-similarity on the diagonal
    mean_per_answer = pairwise.mean(axis=1)
    winner = int(np.argmax(mean_per_answer))

    return answers[winner], mean_per_answer, pairwise

In [None]:
def plot_answer_similarity_heatmap(
    answers: list[str],
    labels: list[str] = None,
    similarity_matrix: Optional[np.ndarray] = None,
    avg_similarities: Optional[np.ndarray] = None,
    title: str = "LLM Answer Similarity"
):
    """
    Draw a heatmap of the pairwise similarity between answers.

    Args:
        answers: The answer texts.
        labels: Tick labels; defaults to "Model 1", "Model 2", ...
        similarity_matrix: Precomputed similarity matrix. If omitted, it is
            computed from "all-MiniLM-L6-v2" sentence embeddings of `answers`.
        avg_similarities: Optional per-answer averages appended to the y-axis labels.
        title: Plot title.

    Returns:
        The similarity matrix that was plotted.
    """
    if labels is None:
        labels = [f"Model {position}" for position, _ in enumerate(answers, start=1)]

    if similarity_matrix is None:
        from sentence_transformers import SentenceTransformer
        from sklearn.metrics.pairwise import cosine_similarity
        encoder = SentenceTransformer("all-MiniLM-L6-v2")
        similarity_matrix = cosine_similarity(encoder.encode(answers))

    # Only the y-axis carries the average annotation; the x-axis keeps plain labels
    if avg_similarities is None:
        row_labels = labels
    else:
        row_labels = [
            f"{name}\nØ={avg:.4f}" for name, avg in zip(labels, avg_similarities)
        ]

    plt.figure(figsize=(9, 7))
    heatmap_options = dict(
        annot=True,
        fmt=".4f",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=row_labels,
        square=True,
        cbar=True,
    )
    sns.heatmap(similarity_matrix, **heatmap_options)
    plt.title(title)
    plt.xlabel("LLM Answer")
    plt.ylabel("LLM Answer")
    plt.tight_layout()
    plt.show()

    return similarity_matrix

In [None]:
def fully_optimized_rag_pipeline_vector(
    query,
    database,
    chunk_documents_path: str,
    k=6,
    model_name="gpt-4o-mini",
    n_consistency=5,
    temperature=0.7,
    heatmap=False,
    max_context_docs=3
):
    """
    Run the fully optimized vector RAG pipeline for one query.

    Documents are retrieved with `retrieve_documents`, the top
    `max_context_docs` of them are used as generation context, and
    `n_consistency` answers are generated. The answer that is on average most
    similar to the others (self-consistency) is returned.

    Args:
        query: The user query.
        database: Vector store passed to `retrieve_documents`.
        chunk_documents_path: Path of the documents used for sparse retrieval.
        k: Number of documents requested from retrieval.
        model_name: Model used for answer generation.
        n_consistency: Number of answers to generate.
        temperature: Sampling temperature for generation.
        heatmap: If True, print all answers and plot their similarity heatmap.
        max_context_docs: Number of top retrieved documents given to the generator.

    Returns:
        Tuple of (response, retrieved_sources, retrieved_contexts,
        retrieved_ids, retrieved_indices). The retrieved_* lists cover all
        retrieved documents (deduplicated by chunk_id), not only the ones
        used as generation context.

    Raises:
        RuntimeError: If none of the generation attempts produced an answer.
    """
    # Retrieve documents from vector
    results = retrieve_documents(
        query_text=query,
        vectordb=database,
        chunk_documents_path=chunk_documents_path,
        k=k
    )
    # Tolerate a (documents, message) tuple in case the retriever signals an
    # empty database that way.
    if isinstance(results, tuple):
        results = results[0]

    retrieved_contexts = []
    retrieved_sources = []
    retrieved_ids = []
    retrieved_indices = []
    seen_chunk_ids = set()

    for doc in results:
        chunk_id = doc.metadata.get("chunk_id")
        if chunk_id in seen_chunk_ids:
            continue
        seen_chunk_ids.add(chunk_id)
        retrieved_contexts.append(doc.page_content)
        retrieved_sources.append(doc.metadata.get("source"))
        retrieved_ids.append(chunk_id)
        retrieved_indices.append(doc.metadata.get("chunk_index"))

    # Only the top-ranked documents are used for generation, so that little
    # irrelevant noise is added to the answer generation
    context_docs = results[:max_context_docs]

    # Self-Consistency Generation
    answers = []
    for i in range(n_consistency):
        try:
            answer = generate_answer(results=context_docs, query_text=query, model_name=model_name, temperature=temperature)
        except Exception as e:
            print(f"Error at answer no. {i+1}: {e}")
            answer = None

        # Failed or blank generations are skipped: empty strings must not be
        # sent to the embedding model and should not take part in the vote.
        if answer and answer.strip():
            answers.append(answer)

        if i < n_consistency - 1:
            time.sleep(0.5)  # short pause between generation requests

    if not answers:
        raise RuntimeError(
            f"All {n_consistency} answer generation attempts failed; no response could be produced."
        )

    response, similarities, similarity_matrix = select_most_consistent_answer(answers)

    if heatmap:
        print("\n--- Self-Consistency Antworten ---")
        for i, a in enumerate(answers):
            print(f"[{i+1}] ({similarities[i]:.4f}): {a}")
        plot_answer_similarity_heatmap(
            answers=answers,
            # One label per answer that is actually present
            labels=[f"Ans {i+1}" for i in range(len(answers))],
            similarity_matrix=similarity_matrix,
            avg_similarities=similarities
        )

    return response.strip(), retrieved_sources, retrieved_contexts, retrieved_ids, retrieved_indices


### 4. Output of Optimized RAG Pipelines

In [None]:
# Build the paragraph-wise vector store from the full law/ordinance index
chroma_path_fully_optimized_rag_pipeline_vector = generate_data_store_from_paragraphs(datapath=DATA_PATH)

print(chroma_path_fully_optimized_rag_pipeline_vector)

In [None]:
# Example run of the pipeline against the persisted vector store
query = "Welchen Anwendungsbereich umfasst §1 des Elektromobilitätsgesetz - EmoG?"
chroma_path_fully_optimized_rag_pipeline_vector = "../chroma_dbs/chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector"
database = load_vector_database(chroma_path=chroma_path_fully_optimized_rag_pipeline_vector)
chunk_documents_path = "2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_vector.json"
model_name = "gpt-4o-mini"  # or any other supported model

(
    response,
    sources,
    retrieved_chunk_contexts,
    retrieved_chunk_ids,
    retrieved_chunk_indices,
) = fully_optimized_rag_pipeline_vector(
    query=query,
    database=database,
    chunk_documents_path=chunk_documents_path,
    model_name=model_name,
    heatmap=True,
)

In [None]:
# Display the results, one labelled line each
for label, value in (
    ("Query", query),
    ("Response", response),
    ("Sources", sources),
    ("Retrieved Chunk Contexts", retrieved_chunk_contexts),
    ("Retrieved Chunk Ids", retrieved_chunk_ids),
):
    print(f"{label}: {value} \n")

### 5. Evaluations

#### Generate Evaluation Dataset

In [None]:
# Generate an artificial evaluation set of 50 queries from the vector store
eval_dataset = generate_evalset(
    chroma_db=chroma_path_fully_optimized_rag_pipeline_vector,
    test_set_size=50,
    query_distribution={"single": 0.6, "multi_specific": 0.2, "multi_intra_document": 0.2},
)

#### Enrich Evaluation Dataset

In [None]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

def process_single_entry(entry, db, chunk_documents_path, model_name):
    """
    Run the RAG pipeline for one evaluation entry and store the outcome in it.

    The entry is modified in place: "generated_response",
    "retrieved_chunk_contexts", "retrieved_chunk_ids" and
    "retrieved_chunk_indices" are set. If the pipeline raises, the response
    holds the error message and the retrieved_* fields are empty lists.

    Returns:
        The same `entry` object.
    """
    question = entry["query"]

    try:
        response, _, contexts, chunk_ids, chunk_indices = fully_optimized_rag_pipeline_vector(
            query=question,
            database=db,
            chunk_documents_path=chunk_documents_path,
            model_name=model_name
        )
        enrichment = {
            "generated_response": response,
            "retrieved_chunk_contexts": contexts,
            "retrieved_chunk_ids": chunk_ids,
            "retrieved_chunk_indices": chunk_indices,
        }
    except Exception as e:
        enrichment = {
            "generated_response": f"Fehler: {e!s}",
            "retrieved_chunk_contexts": [],
            "retrieved_chunk_ids": [],
            "retrieved_chunk_indices": [],
        }

    entry.update(enrichment)

    # Delay between requests to stay under rate limits
    time.sleep(1.2)
    return entry

def enrich_eval_dataset_with_fully_optimized_rag_responses_vector_parallel(
    eval_dataset,
    chroma_path,
    chunk_documents_path,
    model_name="gpt-4o-mini",
    max_workers=20
):
    """
    Enrich every entry of an evaluation dataset with RAG pipeline output, in parallel.

    Each entry of the JSON file is processed by `process_single_entry` in a
    thread pool. The enriched entries are sorted by "query_id" and written
    next to the input as `<input without .json>_rag_enriched.json`.

    Args:
        eval_dataset: Path to the evaluation dataset JSON file (a list of entries).
        chroma_path: Path of the Chroma database to load.
        chunk_documents_path: Path of the documents used for sparse retrieval.
        model_name: Model used for answer generation.
        max_workers: Number of worker threads.

    Returns:
        Path of the written enriched dataset.
    """
    db = load_vector_database(chroma_path)

    with open(eval_dataset, "r", encoding="utf-8") as f:
        eval_dataset_json = json.load(f)

    enriched_dataset = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_single_entry, entry, db, chunk_documents_path, model_name): entry
            for entry in eval_dataset_json
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing RAG responses (parallel)"):
            try:
                result = future.result()
            except Exception as e:
                # Entries whose worker raised are reported and left out of the output
                entry = futures[future]
                print(f"Fehler bei query_id={entry.get('query_id')}: {e}")
            else:
                if result:
                    enriched_dataset.append(result)

    # Results arrive in completion order; restore the order by query_id
    sorted_enriched_dataset = sorted(enriched_dataset, key=lambda x: x.get("query_id", 0))

    # Strip only the trailing extension; '.json' elsewhere in the path
    # (e.g. in a directory name) must stay untouched.
    output_path = f"{eval_dataset.removesuffix('.json')}_rag_enriched.json"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(sorted_enriched_dataset, f, indent=2, ensure_ascii=False)

    return output_path

In [None]:
# Enrich the artificial evaluation dataset with pipeline responses
eval_dataset = "eval_datasets/artificial_evaluation_dataset_for_chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector.json"
chroma_path_fully_optimized_rag_pipeline_vector = "../chroma_dbs/chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector"
chunk_documents_path = "2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_vector.json"

enriched_evalset = enrich_eval_dataset_with_fully_optimized_rag_responses_vector_parallel(
    eval_dataset=eval_dataset,
    chroma_path=chroma_path_fully_optimized_rag_pipeline_vector,
    chunk_documents_path=chunk_documents_path,
    model_name="gpt-4o-mini",
)

#### Evaluate RAG Retrieval

In [None]:
eval_dataset = "eval_datasets/artificial_evaluation_dataset_for_chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector.json"
# To evaluate a previously saved enriched file instead of the one produced above, uncomment:
# enriched_evalset = "eval_datasets/artificial_evaluation_dataset_for_chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector_rag_enriched_saved2.json"
chroma_path_fully_optimized_rag_pipeline_vector = "../chroma_dbs/chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector"
chunk_documents_path = "2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_vector.json"
model_name = "fully_optimized_rag_pipeline_vector"

# Only the file name (without directory) is passed to the evaluation
retrieval_result = run_retrieval_evaluation(
    json_filename=enriched_evalset.rsplit("/", 1)[-1],
    model_name=model_name,
    evaluation_mode="final_eval",
)
display(retrieval_result)

#### Evaluate RAG Generation

In [None]:
# Generation metrics on the enriched artificial dataset (file name only, no directory)
generation_results = run_generation_evaluation(
    json_filename=enriched_evalset.rsplit("/", 1)[-1],
    model_name=model_name,
    evaluation_mode="final_eval",
)
display(generation_results)

In [None]:
# Show the path of the enriched artificial evaluation dataset
print(enriched_evalset)

#### Evaluate RAG Generation on Golden Evaluation Dataset

In [None]:
# Enrich the golden QA dataset with pipeline responses
golden_dataset = "eval_datasets/golden_qa_evalset_generation.json"
chroma_path_fully_optimized_rag_pipeline_vector = "../chroma_dbs/chroma_chunksizeparagraph_wise_chunking_overlapno_overlap_56f329f9_fully_optimized_rag_pipeline_vector"
chunk_documents_path = "2960_documents_for_sparse_retrieval_paragraph_wise_chunking_no_overlap_fully_optimized_rag_pipeline_vector.json"

enriched_golden_evalset = enrich_eval_dataset_with_fully_optimized_rag_responses_vector_parallel(
    eval_dataset=golden_dataset,
    chroma_path=chroma_path_fully_optimized_rag_pipeline_vector,
    chunk_documents_path=chunk_documents_path,
    model_name="gpt-4o-mini",
    max_workers=20,
)

In [None]:
# Show the path of the enriched golden evaluation dataset
print(enriched_golden_evalset)

In [None]:
# Generation metrics on the enriched golden dataset (file name only, no directory)
model_name = "fully_optimized_rag_pipeline_vector_golden_qa_set"

generation_results_golden_dataset = run_generation_evaluation(
    json_filename=enriched_golden_evalset.rsplit("/", 1)[-1],
    model_name=model_name,
    evaluation_mode="final_eval",
)
display(generation_results_golden_dataset)

#### Manual Evaluation and LLM-as-a-Judge for Comparison 

In [None]:
# Launch the manual evaluation script with Streamlit on the enriched golden dataset (IPython shell command)
!streamlit run ../evaluation_datasets/generation_eval/manual_eval.py "eval_datasets/golden_qa_evalset_generation_vector_rag_enriched.json"


In [None]:
# Input and output files for the LLM-as-a-Judge runs
# NOTE(review): the enrichment function writes "<input>_rag_enriched.json", i.e.
# "golden_qa_evalset_generation_rag_enriched.json"; this name carries an extra
# "_vector" — presumably the file was renamed by hand, confirm that it exists.
input_path = "eval_datasets/golden_qa_evalset_generation_vector_rag_enriched.json"
first_output_path = "eval_results/golden_qa_evalset_optimized_vector_rag_llm_as_a_judge_first_results.json"
final_rejudge_output_path = "eval_results/golden_qa_evalset_optimized_vector_rag_llm_as_a_judge_final_rejudge_results.json"
max_workers = 10  # passed as max_workers to both judge runs

In [None]:
# LLM-as-a-Judge for Comparison and Further Justification

# First pass: judge the enriched golden dataset
llm_as_a_judge_first_eval_results_path = run_llm_judge_parallel(
    input_path=input_path,
    output_path=first_output_path,
    max_workers=max_workers,
)

# Second pass: re-judge the results of the first pass
llm_as_a_judge_rejudge_results_path = run_llm_rejudge_parallel(
    input_path=llm_as_a_judge_first_eval_results_path,
    output_path=final_rejudge_output_path,
    max_workers=max_workers,
)

In [None]:
# Score the manual evaluation and both LLM-as-a-Judge passes with the same scheme
manual_results_path = "golden_qa_evalset_optimized_vector_rag_manual_results"
output_file_name_manual = "1a_vector_manual_results"
output_file_name_LLMaaJ_first = "1a_vector_llm_as_a_judge_first_results"
output_file_name_LLMaaJ_rejudge = "1a_vector_llm_as_a_judge_rejudge_results"

eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(
    manual_results_path,
    output_file_name_manual,
)
llm_as_a_judge_first_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(
    llm_as_a_judge_first_eval_results_path,
    output_file_name_LLMaaJ_first,
)
llm_as_a_judge_final_rejudge_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(
    llm_as_a_judge_rejudge_results_path,
    output_file_name_LLMaaJ_rejudge,
)