# Implementation, Testing and Evaluation for Optimal Chunking in RAG

#### Notebook Outline
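1. Imports and Configurations
2. Creation of Vector Databases (incl. 2.1 Special Case of Dynamic Chunking)
3. Evaluations (3.1 Generate Evaluation Datasets, 3.2 Enrich Evaluation Datasets with Responses, 3.3 Evaluate Retrieval & Generation)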

This notebook reuses functions from the Baseline RAG notebook (.ipynb) and adapts them where necessary.

### 1. Imports and Configurations

Imports

In [None]:
# === Standard Library Imports ===
import os
import sys
import re
import uuid
import json
from urllib.request import urlopen

# === Scientific and Utility Libraries ===
from tqdm import tqdm


# === Environment Management ===
from dotenv import load_dotenv

# === OpenAI and Tokenization ===
import openai

# === LangChain Community Integrations ===
from langchain_community.document_loaders import WebBaseLoader

# === Project Root Configuration ===
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# === Custom Project Modules ===
from ipynb_notebooks.baseline.rag_utils.baseline_rag import (
    generate_data_store,
    enrich_eval_dataset_with_rag_responses,
    clean_text,
    save_to_chroma,
    save_documents_for_sparse_retrieval
)

from ipynb_notebooks.evaluation_datasets.retrieval_eval.eval_vector_dataset_generator import generate_evalset
from ipynb_notebooks.evaluation_datasets.retrieval_eval.retrieval_metrics import run_retrieval_evaluation
from ipynb_notebooks.evaluation_datasets.generation_eval.generation_metrics import run_generation_evaluation


Configurations

In [None]:
# Load environment variables from a .env file (expected to contain the API keys).
load_dotenv()

# Set the OpenAI API key from the environment variables.
# Adjust "OPENAI_API_KEY" if the variable in your .env file has a different name;
# a missing variable raises KeyError here.
openai.api_key = os.environ['OPENAI_API_KEY']

# Path constants (relative to the notebook's working directory)
DATA_PATH = "../../data/laws_and_ordinances.json"  # JSON file listing the law and ordinance documents and their URLs
DATA_PATH_SHORT_VERSION = "../../data/laws_and_ordinances_short_version.json" # JSON file with a subset of the URLs, for testing purposes
CHROMA_PATH = "chroma_dbs/chroma"  # Location used for the Chroma vector store

### 2. Creation of Vector Databases

In [None]:
# Generate one Chroma vector database per (chunk_size, chunk_overlap) pair.
# Every chunk size is built twice: without overlap and with an overlap of
# one eighth of the chunk size.
chunk_configurations = [
    (128, 0),
    (128, 16),
    (256, 0),
    (256, 32),
    (512, 0),
    (512, 64),
    (1024, 0),
    (1024, 128),
    (2048, 0),
    (2048, 256)
]

# Maps "chroma_<size>_<overlap>" to the value returned by generate_data_store
# (used further below as the path of the corresponding Chroma store).
chroma_dbs = {}

for size, overlap in chunk_configurations:
    key = f"chroma_{size}_{overlap}"
    chroma_dbs[key] = generate_data_store(
        datapath=DATA_PATH,  # use the shared constant instead of repeating the literal path
        chunk_size=size,
        chunk_overlap=overlap,
        optimization="optimal_chunking"
    )
    


#### 2.1 Special Case of Dynamic Chunking

Here, the creation of the Chroma vector database is adapted so that the laws and ordinances are not split into chunks of a fixed size; instead, each paragraph of a law is vectorized as a whole. The aim is to investigate whether this improves the correctness and factual accuracy of the model's answers.

In [None]:
def load_paragraph_documents(datapath: str):
    """Load every paragraph referenced in a JSON file as its own document.

    The JSON file is read for the top-level keys ``"laws"`` and
    ``"ordinances"``. Each entry may provide a ``"title"`` and a list of
    ``"paragraphs"``; each paragraph may provide ``"paragraph_url"``,
    ``"paragraph_name"`` and ``"paragraph_ID"``. The content behind every
    paragraph URL is fetched with ``WebBaseLoader``, passed through
    ``clean_text`` and tagged with metadata describing its origin.

    Args:
        datapath: Path to the JSON file (opened as UTF-8).

    Returns:
        List of the loaded documents, in processing order. Each document
        carries a fresh UUID as ``chunk_id`` and a running 1-based
        ``chunk_index``.

    Raises:
        ValueError: If not a single document could be loaded.
    """
    with open(datapath, "r", encoding="utf-8") as json_file:
        payload = json.load(json_file)

    loaded_docs = []
    next_index = 1  # running index across all categories, entries and paragraphs

    for category in ["laws", "ordinances"]:
        category_entries = payload.get(category, [])
        for entry in tqdm(category_entries, desc=f"→ Processing {category}"):
            title = entry.get("title", "Unknown Title")
            entry_paragraphs = entry.get("paragraphs", [])

            for para in tqdm(entry_paragraphs, desc=f"  ↳ Paragraphs in '{title}'", leave=False):
                para_url = para.get("paragraph_url", "")
                para_name = para.get("paragraph_name", "Unknown Paragraph")

                # Paragraphs without a URL cannot be fetched: report and move on.
                if not para_url:
                    print(f"No paragraph URL found for {title}")
                    continue

                try:
                    for doc in WebBaseLoader(para_url).load():
                        doc.page_content = clean_text(doc.page_content)

                        origin_metadata = {
                            "law_title": title,
                            "category": category,
                            "paragraph_id": para.get("paragraph_ID"),
                            "paragraph_name": para_name,
                            "paragraph_url": para_url,
                            "chunk_id": str(uuid.uuid4()),
                            "chunk_index": next_index,
                        }
                        doc.metadata.update(origin_metadata)

                        loaded_docs.append(doc)
                        next_index += 1
                except Exception as e:
                    # Best effort: a failing paragraph is reported but does not
                    # abort the remaining downloads.
                    print(f"Error loading paragraph from URL {para_url}: {e}")

    if not loaded_docs:
        raise ValueError("No paragraph documents could be loaded from the input.")

    print(f"Successfully loaded {len(loaded_docs)} paragraph-level documents.")
    return loaded_docs


In [None]:
def generate_data_store_from_paragraphs(
    datapath: str,
    chunk_size: str = "paragraph_wise_chunking",
    chunk_overlap: str = "no_overlap",
    baseline: bool = False,
    optimization: str = "default",
):
    """Build the paragraph-wise data stores and return the Chroma result.

    Loads one document per paragraph via ``load_paragraph_documents``, hands
    the documents to ``save_documents_for_sparse_retrieval`` and finally to
    ``save_to_chroma``.

    Args:
        datapath: Path to the JSON file describing laws and ordinances.
        chunk_size: Label forwarded to ``save_documents_for_sparse_retrieval``.
        chunk_overlap: Label forwarded to ``save_documents_for_sparse_retrieval``.
        baseline: Forwarded to both save helpers.
        optimization: Forwarded to both save helpers.

    Returns:
        Whatever ``save_to_chroma`` returns (used by the notebook as the
        path of the Chroma store).
    """
    paragraph_docs = load_paragraph_documents(datapath)
    save_documents_for_sparse_retrieval(paragraph_docs, chunk_size, chunk_overlap, optimization, baseline)

    # NOTE(review): the Chroma store is always saved with the fixed labels
    # "_dynamic" / 0 rather than the chunk_size / chunk_overlap arguments --
    # presumably to control the store's name; confirm this is intended.
    return save_to_chroma(
        paragraph_docs,
        chunk_size="_dynamic",
        chunk_overlap=0,
        baseline=baseline,
        optimization=optimization,
    )

In [None]:
# Build the paragraph-wise ("dynamic") vector store from the same source file
# as the fixed-size stores and register it next to them in chroma_dbs.
chroma_dbs["chroma_paragraph_wise_chunking"] = generate_data_store_from_paragraphs(
    datapath=DATA_PATH,  # use the shared constant instead of repeating the literal path
    optimization="optimal_chunking"
)

### 3. Evaluations

#### 3.1 Generate Evaluation Datasets

In [None]:
# One generated evaluation set per vector store, keyed like chroma_dbs.
eval_sets = {}

for key, db in chroma_dbs.items():
    print(f"Erzeuge Testset für {key}...")

    # Presumably the share of each query type in the generated test set.
    query_distribution = {}
    query_distribution["single"] = 0.6
    query_distribution["multi_specific"] = 0.2
    query_distribution["multi_intra_document"] = 0.2

    eval_sets[key] = generate_evalset(
        chroma_db=db,
        test_set_size=50,
        optimization="1_optimal_chunking/",
        query_distribution=query_distribution,
    )


#### 3.2 Enrich Evaluation Datasets with Responses

In [None]:
# Enriched evaluation datasets, keyed like chroma_dbs / eval_sets.
enriched_eval_datasets = {}

for key, chroma_path in chroma_dbs.items():
    # Evaluation set that was generated for this vector store.
    eval_dataset = eval_sets[key]
    print(f"Enriching dataset '{eval_dataset}' using Chroma index: {key}...")

    # Add the RAG responses produced with this store and the given model.
    enriched = enrich_eval_dataset_with_rag_responses(
        eval_dataset=eval_dataset,
        chroma_path=chroma_path,
        model_name="gpt-4o-mini",
    )
    enriched_eval_datasets[key] = enriched

#### 3.3 Evaluate Retrieval & Generation

In [None]:
# Retrieval and generation results, both keyed by the derived model name.
evaluation_results_optimal_chunking = {}
generation_results_optimal_chunking = {}

for key, chroma_path in chroma_dbs.items():
    # Last path component of the store path identifies the database.
    db_name = chroma_path.split("/")[-1]
    key_parts = key.split('_')
    db_name_parts = db_name.split('_')

    json_filename = f"1_optimal_chunking/artificial_evaluation_dataset_for_{db_name}_rag_enriched.json"
    # Model name: 2nd and 3rd token of the key plus the 4th token of the db name.
    model_name = f"optimal_chunking_rag_{key_parts[1]}_{key_parts[2]}_{db_name_parts[3]}"

    print(f"Evaluating {model_name} using dataset {json_filename}...")

    # Both evaluations receive the same dataset file and model name.
    eval_kwargs = {"json_filename": json_filename, "model_name": model_name}
    retrieval_result = run_retrieval_evaluation(**eval_kwargs)
    generation_result = run_generation_evaluation(**eval_kwargs)

    # Store only after both evaluations have finished.
    evaluation_results_optimal_chunking[model_name] = retrieval_result
    generation_results_optimal_chunking[model_name] = generation_result



In [None]:
import pandas as pd

# One row per model: the model name followed by all retrieval metrics and all
# generation metrics. If both result dicts share a key, the generation value
# wins because it is unpacked last.
rows = [
    {
        "model": model_name,
        **evaluation_results_optimal_chunking[model_name],
        **generation_results_optimal_chunking[model_name],
    }
    for model_name in evaluation_results_optimal_chunking
]

# Turn the rows into a table and persist it next to the notebook.
results_df = pd.DataFrame(rows)
results_df.to_csv("combined_evaluation_results.csv", index=False)

In [None]:
def extract_chunksize_and_overlap(filename):
    """Return the ``(chunksize, overlap)`` pair encoded in a file name.

    Only the base name of *filename* is inspected; it is searched for the
    pattern ``rag_<chunksize>_<overlap>`` with both parts being digits.

    Args:
        filename: File name or path.

    Returns:
        Tuple of two ints when the pattern is found, otherwise
        ``(inf, inf)`` so that such files sort after all numbered ones.
    """
    found = re.search(r'rag_(\d+)_(\d+)', os.path.basename(filename))
    if found is None:
        return (float('inf'), float('inf'))

    size_text, overlap_text = found.groups()
    return (int(size_text), int(overlap_text))

In [None]:
import pandas as pd
import glob
import os

folder_path = "eval_results/1_optimal_chunking"
pattern_retrieval = os.path.join(folder_path, "optimal_chunking*retrieval_evaluation.csv")
pattern_generation = os.path.join(folder_path, "optimal_chunking*generation_evaluation.csv")


def _load_sorted_evaluation_csvs(pattern):
    """Read all CSV files matching *pattern* into a single DataFrame.

    The files are ordered by the (chunksize, overlap) key returned by
    extract_chunksize_and_overlap before they are concatenated, so the rows
    of the result follow that order.

    Args:
        pattern: Glob pattern selecting the per-model result files.

    Returns:
        One DataFrame with the rows of all matching files and a fresh index.

    Raises:
        ValueError: If no file matches the pattern.
    """
    csv_files = sorted(glob.glob(pattern), key=extract_chunksize_and_overlap)
    if not csv_files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(f"No evaluation CSV files found for pattern '{pattern}'.")
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)


# Load everything first so that nothing is written if either set is missing.
combined_df_retrieval = _load_sorted_evaluation_csvs(pattern_retrieval)
combined_df_generation = _load_sorted_evaluation_csvs(pattern_generation)

output_path_retrieval = os.path.join(folder_path, "combined_optimal_chunking_retrieval_evaluation.csv")
combined_df_retrieval.to_csv(output_path_retrieval, index=False)
output_path_generation = os.path.join(folder_path, "combined_optimal_chunking_generation_evaluation.csv")
combined_df_generation.to_csv(output_path_generation, index=False)

print(f"Done! Retrieval stored in {output_path_retrieval} & Generation Evaluation in {output_path_generation}")

In [None]:
# Show the collected generation-evaluation results as the cell output.
generation_results_optimal_chunking