# Implementation, Testing and Evaluation for Optimal Post-Retrieval in RAG

#### Notebook Outline
1. Imports and Configurations
2. Creation of Vector Databases
3. Post-Retrieval Optimization: Filtering, Reranking, Summarizing & Fusion
4. Evaluations

This notebook reuses functions from the Baseline RAG notebook (`.ipynb`) and adapts them for the post-retrieval experiments.

### 1. Imports and Configurations

Imports

In [None]:
# === Standard Library Imports ===
import json
import os
import sys
import time

# === Scientific and Utility Libraries ===
import torch
from tqdm import tqdm
from dotenv import load_dotenv
from typing import List, Union, Tuple

# === Language Detection and Text Similarity ===
from sklearn.metrics.pairwise import cosine_similarity

# === Transformers and Sentence Embeddings ===
from transformers import AutoTokenizer, AutoModel, T5ForConditionalGeneration, T5Tokenizer
from sentence_transformers import CrossEncoder
from FlagEmbedding import FlagReranker

# === OpenAI Integrations ===
import openai
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

# === LangChain Core Components ===
from langchain.schema import Document
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.retrievers.document_compressors import CohereRerank

# === Project Root Configuration ===
# Resolve the directory two levels above the current working directory and put it on
# sys.path (once) so that the `ipynb_notebooks.*` imports below can be resolved.
# NOTE(review): this assumes the notebook is started from a folder two levels below
# the repository root — confirm when running from a different working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# === Custom Project Modules ===
from ipynb_notebooks.single_stage_enhancements.rankGPT_rerank import rankgpt_rerank

from ipynb_notebooks.baseline.rag_utils.baseline_rag import (
    load_vector_database,
    retrieve_documents,
    generate_answer,
    translate_query_to_german_if_needed,
    detect_language_name
)

from ipynb_notebooks.evaluation_datasets.retrieval_eval.eval_vector_dataset_generator import generate_evalset
from ipynb_notebooks.evaluation_datasets.retrieval_eval.retrieval_metrics import run_retrieval_evaluation
from ipynb_notebooks.evaluation_datasets.generation_eval.generation_metrics import run_generation_evaluation

Configurations

In [None]:
# Load environment variables. Assumes that the project directory contains a .env file with API keys
load_dotenv()

# Set the OpenAI API key from the environment variables.
# os.environ[...] raises KeyError if the variable is missing, so a misconfigured .env fails here.
# Make sure to update "OPENAI_API_KEY" to match the variable name in your .env file
openai.api_key = os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=openai.api_key)

# Cohere API key (None if the variable is not set, because os.getenv is used).
cohere_api_key = os.getenv("COHERE_API_KEY")


# NOTE(review): no explicit Cohere client is created here and `cohere_api_key` is not
# referenced anywhere else in this notebook; CohereRerank presumably reads COHERE_API_KEY
# from the environment itself — confirm.

# Define constants for paths
DATA_PATH = "../../data/laws_and_ordinances.json"  # JSON file listing the URLs of the law and ordinance documents
DATA_PATH_SHORT_VERSION = "../../data/laws_and_ordinances_short_version.json" # JSON file with a subset of the URLs, for testing purposes
CHROMA_PATH = "chroma_dbs/chroma"  # Directory to save the Chroma vector store

### 2. Creation of Vector Databases

**Why Creating Separate Chroma Databases for Each Retrieval Process Is Not Necessary**

In contrast to chunking experiments, evaluating different retrieval strategies does not require generating separate Chroma vector databases. This is because all strategies operate over the same underlying document corpus and embeddings. Retrieval processes such as iterative, recursive or adaptive approaches differ only in how they search the embedded documents—not in how the documents are chunked or stored. The same holds for the post-retrieval steps evaluated in this notebook (filtering, reranking and summarizing): they only operate on the chunks that have already been retrieved.

As long as the Chroma DB is generated using a consistent chunking strategy and embedding model, it provides a shared semantic space that is sufficient for fair comparison across retrieval methods. Creating separate vector stores per strategy would introduce unnecessary redundancy and would not improve the validity of the evaluation.

In [None]:
# Path of the persisted Chroma vector store that every experiment cell below passes as `chroma_path`.
chroma_db_optimal_reranking = "../chroma_dbs/chroma_chunksize1024_overlap128_c800ccc6_optimal_reranking"

### 3. Post-Retrieval Optimization: Filtering, Reranking, Summarizing & Fusion

In [None]:
def filter_documents(
    results: Union[List[Document], List[Tuple[Document, float]]],
    query: str,
    score_threshold: float = 0.25
) -> List[Document]:
    """
    Keep only the documents whose min-max normalized score reaches a threshold.

    Scores are normalized *within the given result set*: the best-scoring document
    maps to ~1 and the worst-scoring one to 0. With any positive threshold the
    lowest-scoring document is therefore always dropped, unless all scores are equal.

    Args:
        results: List of Documents or (Document, Score) tuples. If scores are present
            they are used as-is; otherwise the cosine similarity between the query and
            each document is computed with OpenAI ``text-embedding-3-small`` embeddings.
        query: The search query string (only used when no scores are provided).
        score_threshold: Normalized similarity threshold between 0 and 1.

    Returns:
        List of Documents that pass the threshold, in their original order. An empty
        input yields an empty list. If all scores are (numerically) identical, e.g.
        for a single result, every document is kept because no ranking information
        is available to filter on.

    NOTE(review): this assumes a higher score means "more relevant". If the provided
    scores are distances (lower = better) the filter would keep the wrong end of the
    list — verify against `retrieve_documents`.
    """
    # Nothing retrieved: nothing to embed or normalize.
    if not results:
        return []

    if isinstance(results[0], tuple):
        # Case 1: scores are already provided alongside the documents.
        docs = [doc for doc, _ in results]
        scores = [float(score) for _, score in results]
    else:
        # Case 2: plain documents, so derive scores from embedding similarity.
        docs = list(results)
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
        query_vec = embedding_model.embed_query(text=query)
        doc_vecs = embedding_model.embed_documents(texts=[doc.page_content for doc in docs])
        scores = [float(s) for s in cosine_similarity([query_vec], doc_vecs)[0]]

    min_score, max_score = min(scores), max(scores)
    score_range = max_score - min_score

    # Degenerate range (single document or identical scores): min-max normalization
    # would map everything to 0 and silently discard all documents.
    if score_range <= 1e-8:
        return docs

    return [
        doc
        for doc, score in zip(docs, scores)
        # The small epsilon is kept from the original formula so results are unchanged.
        if (score - min_score) / (score_range + 1e-8) >= score_threshold
    ]


In [None]:
def mean_pooling(token_embeddings, attention_mask):
    """
    Average token embeddings along dimension 1, ignoring masked-out positions.

    Args:
        token_embeddings: Tensor of token vectors; the mask is broadcast to its full
            size and the reduction runs over dimension 1
            (presumably ``(batch, seq_len, hidden)`` — confirm with the caller).
        attention_mask: Tensor with one entry per token (1 = keep, 0 = ignore),
            i.e. ``token_embeddings`` without its last dimension.

    Returns:
        Tensor with dimension 1 reduced away. Rows whose mask is all zeros yield a
        zero vector instead of NaN.
    """
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * input_mask_expanded).sum(1)
    # Clamp the token count so a fully masked row does not divide by zero.
    token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / token_counts

In [None]:
def print_doc_order(title, docs):
    """
    Print the ``chunk_index`` metadata of every document, in list order.

    Documents without a ``chunk_index`` entry are shown as ``"N/A"``. The output is
    a blank line followed by ``<title>: [<indices>]``.
    """
    chunk_positions = []
    for document in docs:
        chunk_positions.append(document.metadata.get("chunk_index", "N/A"))

    print(f"\n{title}: {chunk_positions}")


# Loaded BGE rerankers keyed by model name, so the weights are loaded once per
# session instead of once per query.
_BGE_RERANKERS = {}


def rerank_with_bge(docs, query, top_k, doc_order=False):
    """
    Rerank documents with the ``BAAI/bge-reranker-base`` cross-encoder.

    Args:
        docs: Documents to rerank (each must expose ``page_content``).
        query: Query string the documents are scored against.
        top_k: Number of best-scoring documents to return.
        doc_order: If True, print the chunk indices before and after reranking.

    Returns:
        Up to ``top_k`` documents sorted by descending reranker score; ``[]`` for
        empty input (the model is not loaded in that case).
    """
    if doc_order:
        print_doc_order("Before Reranking (BGE)", docs)

    ranked_docs = []
    if docs:
        model_name = 'BAAI/bge-reranker-base'
        reranker = _BGE_RERANKERS.get(model_name)
        if reranker is None:
            reranker = _BGE_RERANKERS[model_name] = FlagReranker(model_name)

        # The reranker scores (query, passage) pairs; the query must come first.
        pairs = [[query, doc.page_content] for doc in docs]
        scores = reranker.compute_score(pairs, batch_size=32)

        # Some FlagEmbedding versions return a bare float when a single pair is scored.
        if not isinstance(scores, (list, tuple)):
            scores = [scores]

        # Sort on the score only, so documents themselves are never compared.
        scored_docs = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)
        ranked_docs = [doc for _, doc in scored_docs[:top_k]]

    if doc_order:
        print_doc_order("After Reranking (BGE)", ranked_docs)

    return ranked_docs


def rerank_with_cohere(docs, query, top_k, doc_order=False, model="rerank-english-v3.0", pause_seconds=7):
    """
    Rerank documents with the Cohere rerank API via LangChain's ``CohereRerank``.

    Args:
        docs: Documents to rerank.
        query: Query string the documents are scored against.
        top_k: Number of documents the API should return (``top_n``).
        doc_order: If True, print the chunk indices before and after reranking.
        model: Cohere rerank model name. Defaults to the previously hard-coded
            English model.
        pause_seconds: Seconds to sleep after each API call (default 7, as before);
            presumably there to stay within an API rate limit — confirm. Use 0 to
            disable.

    Returns:
        List of reranked documents as returned by ``compress_documents``; ``[]`` for
        empty input (no API call and no pause in that case).

    NOTE(review): the prompts in this notebook are German and queries are translated
    to German, yet the default model is the English one. A multilingual rerank model
    may be a better fit — evaluate before changing the default.
    NOTE(review): no API key is passed explicitly; CohereRerank presumably reads
    COHERE_API_KEY from the environment — confirm.
    """
    if doc_order:
        print_doc_order("Before Reranking (Cohere)", docs)

    ranked_docs = []
    if docs:
        compressor = CohereRerank(top_n=top_k, user_agent="langchain", model=model)
        ranked_docs = list(compressor.compress_documents(documents=docs, query=query))
        if pause_seconds > 0:
            time.sleep(pause_seconds)

    if doc_order:
        print_doc_order("After Reranking (Cohere)", ranked_docs)

    return ranked_docs


# Loaded (tokenizer, model) pairs keyed by (model name, device), so the checkpoint is
# loaded once per session instead of once per query.
_COLBERT_MODELS = {}


def rerank_with_colbert(docs, query, top_k, device, doc_order=False):
    """
    Rerank documents with a ColBERT-style late-interaction (MaxSim) score.

    For every query token the maximum cosine similarity over all document tokens is
    taken, and these maxima are summed to form the document score.

    Args:
        docs: Documents to rerank (each must expose ``page_content``).
        query: Query string the documents are scored against.
        top_k: Number of best-scoring documents to return.
        device: Torch device the model and inputs are moved to.
        doc_order: If True, print the chunk indices before and after reranking.

    Returns:
        Up to ``top_k`` documents sorted by descending score; ``[]`` for empty input
        (the model is not loaded in that case).

    NOTE(review): the checkpoint is loaded through ``AutoModel``, so the scores are
    computed on the encoder's last hidden states. ColBERT's own linear projection
    layer is presumably not applied this way — confirm, and consider the official
    ColBERT tooling if exact ColBERTv2 scores are required.
    """
    if doc_order:
        print_doc_order("Before Reranking (ColBERT)", docs)

    ranked_docs = []
    if docs:
        model_name = "colbert-ir/colbertv2.0"
        cache_key = (model_name, str(device))
        if cache_key not in _COLBERT_MODELS:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name).to(device).eval()
            _COLBERT_MODELS[cache_key] = (tokenizer, model)
        tokenizer, model = _COLBERT_MODELS[cache_key]

        scores = []
        with torch.no_grad():
            query_tokens = tokenizer(query, return_tensors="pt", truncation=True, padding=True).to(device)
            query_embeds = model(**query_tokens).last_hidden_state.squeeze(0)
            # L2-normalize so the dot products below are cosine similarities, as in
            # ColBERT's late interaction; otherwise vector norms dominate the score.
            query_embeds = torch.nn.functional.normalize(query_embeds, p=2, dim=-1)

            for doc in docs:
                doc_tokens = tokenizer(doc.page_content, return_tensors="pt", truncation=True, padding=True).to(device)
                doc_embeds = model(**doc_tokens).last_hidden_state.squeeze(0)
                doc_embeds = torch.nn.functional.normalize(doc_embeds, p=2, dim=-1)

                # Rows: query tokens, columns: document tokens.
                sim_matrix = torch.matmul(query_embeds, doc_embeds.T)
                max_sim = sim_matrix.max(dim=1).values.sum().item()
                scores.append((max_sim, doc))

        ranked_docs = [doc for _, doc in sorted(scores, key=lambda x: x[0], reverse=True)][:top_k]

    if doc_order:
        print_doc_order("After Reranking (ColBERT)", ranked_docs)

    return ranked_docs



# Loaded (tokenizer, model) pairs keyed by (model name, device), so the checkpoint is
# loaded once per session instead of once per query.
_MONOT5_MODELS = {}


def rerank_with_monot5(docs, query, top_k, device, doc_order=False):
    """
    Rerank documents with MonoT5 (``castorini/monot5-base-msmarco``).

    Each document is scored with the probability of the token "true" versus "false"
    at the first decoding step. The previous implementation decoded the generated
    label into a binary 1.0/0.0 score, which left the order inside each group
    identical to the input order.

    Args:
        docs: Documents to rerank (each must expose ``page_content``).
        query: Query string the documents are scored against.
        top_k: Number of best-scoring documents to return.
        device: Torch device the model and inputs are moved to.
        doc_order: If True, print the chunk indices before and after reranking.

    Returns:
        Up to ``top_k`` documents sorted by descending relevance probability; ``[]``
        for empty input (the model is not loaded in that case).

    NOTE(review): ``truncation=True`` cuts over-long inputs at the end, which would
    remove the trailing "Relevant:" marker for long chunks — verify chunk lengths
    against the tokenizer's maximum length.
    """
    if doc_order:
        print_doc_order("Before Reranking (MonoT5)", docs)

    ranked_docs = []
    if docs:
        model_name = "castorini/monot5-base-msmarco"
        cache_key = (model_name, str(device))
        if cache_key not in _MONOT5_MODELS:
            model = T5ForConditionalGeneration.from_pretrained(model_name).to(device).eval()
            tokenizer = T5Tokenizer.from_pretrained(model_name)
            _MONOT5_MODELS[cache_key] = (tokenizer, model)
        tokenizer, model = _MONOT5_MODELS[cache_key]

        inputs = [f"Query: {query} Document: {doc.page_content} Relevant:" for doc in docs]
        encodings = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(device)

        # Vocabulary ids of the two answer tokens the model was trained to emit.
        true_id = tokenizer.convert_tokens_to_ids("▁true")
        false_id = tokenizer.convert_tokens_to_ids("▁false")

        # One decoder step starting from the decoder start token is enough to read the
        # logits of the first generated token.
        decoder_input_ids = torch.full(
            (len(docs), 1),
            model.config.decoder_start_token_id,
            dtype=torch.long,
            device=device,
        )

        with torch.no_grad():
            first_step_logits = model(**encodings, decoder_input_ids=decoder_input_ids).logits[:, 0, :]

        # Softmax restricted to the two answer tokens; column 0 is P("true").
        true_false_logits = first_step_logits[:, [true_id, false_id]]
        relevance = torch.softmax(true_false_logits, dim=-1)[:, 0].tolist()

        scores = list(zip(relevance, docs))
        ranked_docs = [doc for _, doc in sorted(scores, key=lambda x: x[0], reverse=True)][:top_k]

    if doc_order:
        print_doc_order("After Reranking (MonoT5)", ranked_docs)

    return ranked_docs


def rerank_with_rankgpt(docs, query, model_name, doc_order=False):
    """
    Rerank documents by delegating to ``rankgpt_rerank``.

    The helper is called with ``window_size=4`` and ``step=1``. When ``doc_order`` is
    True the chunk indices are printed before and after the call. Whatever
    ``rankgpt_rerank`` returns is passed through unchanged.
    """
    if not doc_order:
        return rankgpt_rerank(query, docs, model_name=model_name, window_size=4, step=1)

    print_doc_order("Before Reranking (RankGPT)", docs)
    llm_ordered = rankgpt_rerank(query, docs, model_name=model_name, window_size=4, step=1)
    print_doc_order("After Reranking (RankGPT)", llm_ordered)
    return llm_ordered


In [None]:
def rerank_documents(results, query, top_k=5, model_name="gpt-4o-mini", reranker_type="cohere", device="cpu", doc_order=False):
    """
    Rerank retrieved results with the selected reranker and return the best ``top_k``.

    Args:
        results: List of Documents or (Document, score) tuples; scores are dropped.
        query: Query string the documents are ranked against.
        top_k: Maximum number of documents to return.
        model_name: LLM name, only used by the "LLM_reranker_rankGPT" reranker.
        reranker_type: One of "bge", "cohere", "colbert", "monot5",
            "LLM_reranker_rankGPT".
        device: Torch device, only used by the "colbert" and "monot5" rerankers.
        doc_order: If True, the rerankers print chunk indices before and after.

    Returns:
        The reranked documents as produced by the selected reranker.

    Raises:
        ValueError: If ``reranker_type`` is not one of the supported names.
    """
    # Extract documents from results (handle tuple format if necessary)
    if results and isinstance(results[0], tuple):
        docs_only = [doc for doc, _ in results]
    else:
        docs_only = results or []

    # Lazily evaluated dispatch table: only the selected reranker is executed.
    reranker_dispatcher = {
        "bge": lambda: rerank_with_bge(docs_only, query, top_k, doc_order),
        "cohere": lambda: rerank_with_cohere(docs_only, query, top_k, doc_order),
        "colbert": lambda: rerank_with_colbert(docs_only, query, top_k, device, doc_order),
        "monot5": lambda: rerank_with_monot5(docs_only, query, top_k, device, doc_order),
        # rerank_with_rankgpt takes no top_k, so the cut-off is applied here to keep
        # all rerankers consistent.
        "LLM_reranker_rankGPT": lambda: list(
            rerank_with_rankgpt(docs_only, query, model_name, doc_order)
        )[:top_k],
    }

    if reranker_type not in reranker_dispatcher:
        raise ValueError(f"Unknown reranker_type '{reranker_type}' provided.")

    return reranker_dispatcher[reranker_type]()  # Execute selected reranker


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.schema import Document
from openai import OpenAI
from typing import List
from tqdm import tqdm

def summarize_single_top_chunk(doc: Document, query: str, model_name: str = "gpt-4o-mini", temperature: float = 0.0) -> str:
    """
    Ask the chat model for a query-focused summary of one chunk.

    The chunk title is taken from the ``law_title`` metadata entry, falling back to
    ``title`` and finally to a German placeholder. On success the returned string
    contains the title and the summary; if anything fails during the request or
    while reading the response, the error is printed and a German fallback message
    carrying the title is returned instead.
    """
    llm = OpenAI()
    metadata = doc.metadata
    title = metadata.get("law_title") or metadata.get("title", "Unbekannter Titel")

    prompt = f"""
Du bist ein juristischer KI-Assistent. Fasse den folgenden Gesetzesauszug aus dem Dokument "{title}" 
so zusammen, dass alle relevanten Informationen in Bezug auf die Frage enthalten sind.

Frage: {query}

--- Gesetzestext ---
{doc.page_content}
--- Ende Gesetzestext ---

Zusammenfassung (max. 5 Stichpunkte):
"""

    request_args = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
    }

    try:
        completion = llm.chat.completions.create(**request_args)
        # Reading the content stays inside the try block so a missing message body
        # is reported through the same fallback path as an API error.
        summary_text = completion.choices[0].message.content.strip()
        return f"Chunk Titel: {title}\n Zusammenfassung: {summary_text}"
    except Exception as e:
        print(f"Error summarizing top chunk: {e}")
        return f"TITEL: {title}\nFehler beim Zusammenfassen dieses Chunks."

def summarize_top_chunks_with_query_parallel(
    top_chunks: List[Document],
    query: str,
    model_name: str = "gpt-4o-mini",
    temperature: float = 0.0,
    max_workers: int = 4
) -> List[str]:
    """
    Summarize each top chunk in parallel and return the summaries in input order.

    Args:
        top_chunks: Chunks to summarize, expected in ranking order.
        query: Query the summaries should focus on.
        model_name: Chat model passed to ``summarize_single_top_chunk``.
        temperature: Sampling temperature passed to ``summarize_single_top_chunk``.
        max_workers: Maximum number of concurrent summarization threads.

    Returns:
        One summary string per chunk, in the same order as ``top_chunks``. A chunk
        whose summarization raised contributes an empty string. Empty input yields
        an empty list without starting a thread pool.
    """
    if not top_chunks:
        return []

    summaries = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(summarize_single_top_chunk, doc, query, model_name, temperature)
            for doc in top_chunks
        ]

        # Collect in submission order. Iterating with as_completed would return the
        # summaries in completion order and scramble the ranking of the top chunks.
        for future in futures:
            try:
                summaries.append(future.result())
            except Exception as e:
                print(f"Error during summarization of a top chunk: {e}")
                summaries.append("")

    return summaries


In [None]:
def summarize_support_chunks_combined(
    support_chunks: List[Document],
    query: str,
    model_name: str = "gpt-4o-mini",
    temperature: float = 0.0
) -> str:
    """
    Combine all support chunks and summarize them with a single LLM call.

    Args:
        support_chunks: Chunks to condense; each contributes its title (``law_title``
            or ``title`` metadata, else a German placeholder) and its text.
        query: Query the extraction should focus on.
        model_name: Chat model to call.
        temperature: Sampling temperature for the call.

    Returns:
        The model's answer with surrounding whitespace stripped. An empty string if
        there are no support chunks (no LLM call is made, so the model cannot invent
        context from an empty prompt). A German error message if the call fails.
    """
    # Without any support text there is nothing to extract.
    if not support_chunks:
        return ""

    client = OpenAI()

    # Combine context into one string with titles
    sections = []
    for doc in support_chunks:
        title = doc.metadata.get("law_title") or doc.metadata.get("title", "Unbekannter Titel")
        sections.append(f"\nTITEL: {title}\n{doc.page_content}\n")
    combined_text = "".join(sections)

    prompt = f"""
Du bist ein juristischer KI-Assistent. Extrahiere aus den folgenden Gesetzestexten nur die Informationen,
die direkt relevant für die folgende Frage sind. Die Antwort soll kurz, strukturiert und präzise sein.

Frage: {query}

--- Gesetzestexte ---
{combined_text}
--- Ende Gesetzestexte ---

Antwort (in maximal 10 Stichpunkten):
"""

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error summarizing support chunks: {e}")
        return "Fehler beim Zusammenfassen der unterstützenden Chunks."

In [None]:
from typing import List

def summarizing_and_prompt_engineering(
    retrieved_chunks: List[Document],
    query: str,
    model_name: str = "gpt-4o-mini",
    temperature: float = 0.0,
    n_top_chunks: int = 2
) -> str:
    """
    Build the final answer prompt from summaries of the retrieved chunks.

    The first ``n_top_chunks`` chunks are summarized individually and in parallel
    (focus: accuracy). The remaining "support" chunks are condensed in one combined
    call (focus: extracting query-relevant information). The summaries are created
    for the German version of the query, while the final prompt contains the
    original query and the detected language the answer should be written in.

    Args:
        retrieved_chunks: Chunks in ranking order (best first).
        query: Original user query.
        model_name: Chat model used for the summarization calls.
        temperature: Sampling temperature used for the summarization calls.
        n_top_chunks: Number of leading chunks that get an individual summary
            (default 2, the previously hard-coded value).

    Returns:
        The stripped final prompt string.
    """
    query_de = translate_query_to_german_if_needed(query)
    detected_language = detect_language_name(query)

    # split into top chunks and support chunks for summary
    top_chunks = retrieved_chunks[:n_top_chunks]
    support_chunks = retrieved_chunks[n_top_chunks:]

    # Summarize top chunks with query context (parallel)
    top_chunk_summaries = summarize_top_chunks_with_query_parallel(
        top_chunks=top_chunks,
        query=query_de,
        model_name=model_name,
        temperature=temperature
    )

    # Summarize support chunks in a single call. Skipped when there are none, so the
    # model is never asked to "extract" information from an empty context.
    support_summary = ""
    if support_chunks:
        support_summary = summarize_support_chunks_combined(
            support_chunks=support_chunks,
            query=query_de,
            model_name=model_name,
            temperature=temperature
        )

    # Construct context section
    context_block = "\n\n---\n".join(top_chunk_summaries)
    if support_summary:
        context_block += "\n\n--- Unterstützender Kontext ---\n" + support_summary

    # Final prompt
    prompt_template = f"""
        Du bist ein hilfreicher, juristischer KI-Assistent für Gesetzestexte im deutschen Energie- und Versorgungsbereich. 
        Generiere eine kurze, präzise, konsistente und vollständige Gesamtantwort von max. 200 Tokens basierend auf folgendem Kontext:

        Frage:
        {query}
        ---
        Kontext:
        {context_block}
        ---
        Sprache in der geantwortet werden soll: 
        {detected_language}
        """

    return prompt_template.strip()


In [None]:
def rag_pipeline_post_retrieval(
    query,
    database,
    model_name="gpt-4o-mini",
    filtering: bool = False,
    reranking: bool = False,
    reranker_type: str = "cohere",
    summarizing_prompt_engineering: bool = False,
    k: int = 6,
    thresh_hold: float = 0.75,
    doc_order: bool = False,
    top_k: int = 5,
    device: str = "cpu"
):
    """
    Run retrieval followed by the optional post-retrieval steps and answer generation.

    Steps, in order: retrieve ``k`` results, optionally filter them by normalized
    score, optionally rerank them, then generate the answer either from a
    summarization-based prompt or from the remaining results directly.

    Args:
        query: User query.
        database: Vector database handle passed to ``retrieve_documents``.
        model_name: Chat model used for reranking (RankGPT), summarizing and answering.
        filtering: Enable ``filter_documents``.
        reranking: Enable ``rerank_documents``.
        reranker_type: Reranker name passed to ``rerank_documents``.
        summarizing_prompt_engineering: Enable the summarization-based prompt.
        k: Number of results to retrieve.
        thresh_hold: Normalized score threshold used when filtering.
        doc_order: If True, the rerankers print chunk indices before and after.
        top_k: Number of documents kept after reranking (default 5, the previously
            hard-coded value).
        device: Torch device forwarded to the local rerankers (default "cpu", which
            is what they used before).

    Returns:
        Tuple ``(response, sources, retrieved_chunk_contexts, retrieved_chunk_ids,
        retrieved_chunk_indices)`` describing the answer and the final documents.

    NOTE(review): in the summarization branch ``generate_answer`` receives the prompt
    string as ``results``; this assumes it accepts a string there — confirm against
    the baseline implementation.
    """
    # Document Retrieval
    retrieved_results = retrieve_documents(query, db=database, k=k)

    # Filtering
    if filtering:
        retrieved_results = filter_documents(results=retrieved_results, query=query, score_threshold=thresh_hold)

    # Reranking
    if reranking:
        retrieved_results = rerank_documents(
            retrieved_results,
            query,
            top_k=top_k,
            model_name=model_name,
            reranker_type=reranker_type,
            device=device,
            doc_order=doc_order,
        )

    # Summarization & Prompt Engineering
    if summarizing_prompt_engineering:

        if retrieved_results and isinstance(retrieved_results[0], tuple):
            retrieved_results = [doc for doc, _ in retrieved_results]

        prompt = summarizing_and_prompt_engineering(
            retrieved_chunks=retrieved_results,
            query=query,
            model_name=model_name,
            temperature=0.2
        )
        response = generate_answer(results=prompt, query_text=query, model_name=model_name)
    else:
        # Normal generation with plain concatenation of chunks
        response = generate_answer(results=retrieved_results, query_text=query, model_name=model_name)

    # Extract metadata for logging/tracing. Results may still be (Document, score)
    # tuples when no post-retrieval step unwrapped them, so unwrap defensively here.
    final_docs = [item[0] if isinstance(item, tuple) else item for item in retrieved_results]

    sources = [doc.metadata.get("source") for doc in final_docs]
    retrieved_chunk_contexts = [doc.page_content for doc in final_docs]
    retrieved_chunk_ids = [doc.metadata.get("chunk_id") for doc in final_docs]
    retrieved_chunk_indices = [doc.metadata.get("chunk_index") for doc in final_docs]

    return response, sources, retrieved_chunk_contexts, retrieved_chunk_ids, retrieved_chunk_indices


### 4. Evaluations

#### 4.1 Preparing the Evaluation Dataset

Since no new Chroma DB had to be created, the evaluation dataset from the RAG baseline can be reused. For completeness, the dataset was copied and renamed for this experiment.

In [None]:
# Path of the evaluation dataset (JSON) that every experiment cell below passes as `eval_dataset`.
eval_dataset_optimal_reranking = "eval_datasets/5_optimal_reranking/artificial_evaluation_dataset_for_chroma_chunksize1024_overlap128_c800ccc6_optimal_reranking.json"

#### 4.2 Enrich Evaluation Datasets with Responses

In [None]:
def enrich_eval_dataset_with_rag_responses_for_optimal_reranking(eval_dataset, chroma_path, config, k, model_name, reranker_type="cohere", optimization="5_optimal_reranking/", thresh_hold=0.75, doc_order=False):
    """
    Run the post-retrieval RAG pipeline for every evaluation entry and save the result.

    Each entry of the evaluation dataset is extended with the generated response and
    the retrieved chunk contexts, ids and indices. The enriched list is written to
    ``eval_datasets/<optimization><label>[_<thresh_hold>]_[<reranker_type>_]rag_enriched.json``,
    where the threshold part is added only when filtering is enabled and the reranker
    part only when reranking is enabled.

    Args:
        eval_dataset: Path to the evaluation dataset JSON (a list of entries with a
            "query" key).
        chroma_path: Path passed to ``load_vector_database``.
        config: Dict with the keys "filtering", "reranking",
            "summarizing_prompt_engineering" and "label".
        k: Number of results to retrieve per query.
        model_name: Chat model passed to the pipeline.
        reranker_type: Reranker name passed to the pipeline.
        optimization: Sub-folder (with trailing slash) below ``eval_datasets/``.
        thresh_hold: Normalized score threshold used when filtering.
        doc_order: If True, the rerankers print chunk indices before and after.

    Returns:
        Path of the written JSON file.
    """
    db = load_vector_database(chroma_path)

    with open(eval_dataset, "r", encoding="utf-8") as f:
        eval_dataset_json = json.load(f)

    enriched_dataset = []

    for entry in tqdm(eval_dataset_json, desc="Processing RAG responses"):
        query = entry["query"]

        # Run the RAG pipeline
        response, _, retrieved_chunk_contexts, retrieved_chunk_ids, retrieved_chunk_indices = rag_pipeline_post_retrieval(
                query=query,
                database=db,
                model_name=model_name,
                filtering=config["filtering"],
                reranking=config["reranking"],
                reranker_type=reranker_type,
                summarizing_prompt_engineering=config["summarizing_prompt_engineering"],
                k=k,
                thresh_hold=thresh_hold,
                doc_order=doc_order
            )

        # Add the new fields to the entry (the loaded entry dict is extended in place)
        entry["generated_response"] = response
        entry["retrieved_chunk_contexts"] = retrieved_chunk_contexts
        entry["retrieved_chunk_ids"] = retrieved_chunk_ids
        entry["retrieved_chunk_indices"] = retrieved_chunk_indices

        enriched_dataset.append(entry)

    # File-name parts that only appear when the corresponding step was active.
    reranker_name = reranker_type + "_" if config["reranking"] else ""
    thresh_hold_label = "_" + str(thresh_hold) if config["filtering"] else ""

    output_path = f"eval_datasets/{optimization}{config['label']}{thresh_hold_label}_{reranker_name}rag_enriched.json"

    # Make sure the target folder exists, e.g. when a custom `optimization` is used.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save the result as a new JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(enriched_dataset, f, indent=2, ensure_ascii=False)

    return output_path

In [None]:
# Experiment configurations. Each entry switches the three post-retrieval steps on or
# off; "label" becomes part of the output file name of the enriched dataset.
# Index: 0 = filtering only, 1 = reranking only, 2 = summarizing only,
#        3 = all three combined, 4 = filtering + reranking.
configs = [
            {"filtering": True, "reranking": False, "summarizing_prompt_engineering": False, "label": "filtering"},
            {"filtering": False, "reranking": True, "summarizing_prompt_engineering": False, "label": "reranking"},
            {"filtering": False, "reranking": False, "summarizing_prompt_engineering": True, "label": "summarizing_prompt_engineering"},
            {"filtering": True, "reranking": True, "summarizing_prompt_engineering": True, "label": "all_combined"},
            {"filtering": True, "reranking": True, "summarizing_prompt_engineering": False, "label": "filtering_reranking_combined"},
          ]

# Collects the output paths of all enriched datasets; re-running this cell resets the list.
enriched_datasets = []

In [None]:
# Filtering: run the filtering-only configuration (configs[0]) once per normalized
# score threshold, retrieving k=15 candidates each time, and record each output path.
thresh_holds = [0.25, 0.5, 0.75]

for thresh_hold in thresh_holds: 
    enriched_datasets.append(enrich_eval_dataset_with_rag_responses_for_optimal_reranking(
            eval_dataset=eval_dataset_optimal_reranking,
            chroma_path=chroma_db_optimal_reranking,
            config=configs[0],
            k=15,
            model_name="gpt-4o-mini",
            thresh_hold=thresh_hold
        ))

In [None]:
# Reranking: run the reranking-only configuration (configs[1]) once per reranker,
# retrieving k=15 candidates each time, and record each output path.
# The names must match the keys of the dispatcher in `rerank_documents`.
reranker_types = ["bge", "cohere", "colbert", "monot5", "LLM_reranker_rankGPT"]

for reranker_type in reranker_types: 
    enriched_datasets.append(enrich_eval_dataset_with_rag_responses_for_optimal_reranking(
            eval_dataset=eval_dataset_optimal_reranking,
            chroma_path=chroma_db_optimal_reranking,
            config=configs[1],
            k=15,
            model_name="gpt-4o-mini",
            reranker_type=reranker_type
        ))

In [None]:
# Summarizing: run the summarizing-only configuration (configs[2]) with k=6 retrieved
# chunks. Filtering and reranking are disabled in this config, so the default
# reranker type and threshold are not applied.
enriched_datasets.append(enrich_eval_dataset_with_rag_responses_for_optimal_reranking(
        eval_dataset=eval_dataset_optimal_reranking,
        chroma_path=chroma_db_optimal_reranking,
        config=configs[2],
        k=6,
        model_name="gpt-4o-mini"
    ))

In [None]:
# All combined: filtering (threshold 0.25), RankGPT reranking and summarizing
# (configs[3]) on k=15 retrieved chunks.
enriched_datasets.append(enrich_eval_dataset_with_rag_responses_for_optimal_reranking(
        eval_dataset=eval_dataset_optimal_reranking,
        chroma_path=chroma_db_optimal_reranking,
        config=configs[3],
        k=15,
        model_name="gpt-4o-mini",
        reranker_type="LLM_reranker_rankGPT",
        thresh_hold=0.25
    ))

In [None]:
# Filtering and Reranking combined: filtering (threshold 0.25) followed by RankGPT
# reranking, without summarizing (configs[4]), on k=15 retrieved chunks.
enriched_datasets.append(enrich_eval_dataset_with_rag_responses_for_optimal_reranking(
        eval_dataset=eval_dataset_optimal_reranking,
        chroma_path=chroma_db_optimal_reranking,
        config=configs[4],
        k=15,
        model_name="gpt-4o-mini",
        reranker_type="LLM_reranker_rankGPT",
        thresh_hold=0.25
    ))

#### 4.3 Evaluate Retrieval & Generation

In [None]:
# Preview: print the file name and a derived model name for every enriched dataset.
# No evaluation is run in this cell.
for index, dataset in enumerate(enriched_datasets): 
    # Keep only the file name (last path component).
    json_filename = dataset.split("/")[-1]
    # Strip the known prefix/suffix; str.replace is a no-op when the text is absent.
    # For the file names built by the enrichment function above, only the
    # '_rag_enriched.json' suffix is present.
    model_name = f"optimal_post_retrieval_reranking_{index+1}_{json_filename.replace('retrieval_eval_dataset_for_', '').replace('_rag_enriched.json', '')}"  
    print(json_filename)
    print(model_name)

In [None]:
# Run the retrieval and generation evaluation for every enriched dataset and collect
# the results per model name.
# NOTE(review): the dict names say "chunking" although this notebook evaluates
# post-retrieval steps; they are left unchanged because later cells may use them.
evaluation_results_optimal_chunking = {}
generation_results_optimal_chunking = {}

for index, dataset in enumerate(enriched_datasets):
    # The evaluation helpers receive the file name prefixed with the experiment folder.
    json_filename = f"5_optimal_reranking/{dataset.split('/')[-1]}"
    # Derive a unique model name from the running index and the shortened file name;
    # str.replace is a no-op when the given text is not part of the file name.
    model_name = f"optimal_post_retrieval_reranking_{index+1}_{json_filename.replace('5_optimal_reranking/artificial_evaluation_dataset_for_chroma_chunksize1024_overlap128_c800ccc6_optimal_reranking_', '').replace('_rag_enriched.json', '')}"  
    
    print(model_name)

    print(f"\nEvaluating {model_name} using dataset {json_filename}...")

    retrieval_result = run_retrieval_evaluation(
        json_filename=json_filename,
        model_name=model_name
    )

    generation_result = run_generation_evaluation(
        json_filename=json_filename,
        model_name=model_name
    )

    evaluation_results_optimal_chunking[model_name] = retrieval_result
    generation_results_optimal_chunking[model_name] = generation_result

In [None]:
from pathlib import Path
import pandas as pd

# Define base folder and file patterns
folder_path = Path("eval_results") / "5_optimal_reranking"
pattern_retrieval = "optimal_post_retrieval*retrieval_evaluation.csv"
pattern_generation = "optimal_post_retrieval*generation_evaluation.csv"

# Find matching CSV files. Path.glob yields entries in arbitrary, OS-dependent order,
# so sort them to make the row order of the combined files reproducible.
# The combined output files start with "combined_" and therefore never match the
# patterns above on a re-run.
csv_retrieval_files = sorted(folder_path.glob(pattern_retrieval))
csv_generation_files = sorted(folder_path.glob(pattern_generation))

print(f"🔍 Found {len(csv_retrieval_files)} retrieval files.")
print(f"🔍 Found {len(csv_generation_files)} generation files.")

# Load the retrieval and generation evaluation files (one DataFrame per file)
df_retrieval = [pd.read_csv(f) for f in csv_retrieval_files]
df_generation = [pd.read_csv(f) for f in csv_generation_files]

# Concatenate if there is at least one file
if df_retrieval:
    combined_df_retrieval = pd.concat(df_retrieval, ignore_index=True)
    output_path_retrieval = folder_path / "combined_optimal_reranking_retrieval_evaluation.csv"
    combined_df_retrieval.to_csv(output_path_retrieval, index=False)
    print(f"✅ Retrieval results saved to: {output_path_retrieval}")
else:
    print("⚠️ No retrieval CSV files found.")

if df_generation:
    combined_df_generation = pd.concat(df_generation, ignore_index=True)
    output_path_generation = folder_path / "combined_optimal_reranking_generation_evaluation.csv"
    combined_df_generation.to_csv(output_path_generation, index=False)
    print(f"✅ Generation results saved to: {output_path_generation}")
else:
    print("⚠️ No generation CSV files found.")
