# Evaluation Data Generation

In [None]:
import os
# Both variables are set before `import torch` below.
# Per the original note, a ratio of 0.0 removes the MPS memory limit.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# Presumably silences the Hugging Face tokenizers parallelism warning - TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd
import gc
import torch
from tqdm import tqdm
from ragas import evaluate
from ragas.dataset_schema import RagasDataset
from datasets import Dataset
from ragas import evaluate
from packages.person import Person
from packages.globals import EMBEDDINGS
from packages.llm_config import LLMConfig
from packages.evaluate_rag import EvaluationPipeline
from langchain_community.embeddings import OpenAIEmbeddings
from packages.vector_store_handler import VectorStoreHandler
from packages.document_processing import DocumentProcessing
from packages.bm25_retriever import BM25Retriever
from packages.vector_store_handler import HybridRetriever
from packages.init_chain import InitializeQuesionAnsweringChain

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity,
    ResponseRelevancy
)

%load_ext autoreload
%autoreload 2

In [2]:
def cleanup_memory():
    """Free cached accelerator memory (CUDA and/or MPS).

    Runs a garbage-collection pass first, then asks PyTorch to empty its
    cache on every backend that reports itself as available. Prints a
    confirmation line when done and returns nothing.
    """
    gc.collect()
    # The original only handled MPS; CUDA is covered as well so the helper
    # does what its description promises on non-Apple GPUs.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    print("[INFO] Memory cleanup completed")


In [3]:
name = "Washington"
# True: evaluate every question in the CSV; False: only a short test subset.
# NOTE(review): the original note promised 42 / 7 questions, but the loading
# cell truncates with head(3) - confirm the intended subset size.
use_full_dataset = True

In [None]:
# Build the LLM a single time; it is handed to every pipeline run further down.
llm_config = LLMConfig(temperature=0.0)
# NOTE(review): use_openai=True on a method named get_local_llm - presumably
# selects an OpenAI-backed model; confirm in packages.llm_config.
llm = llm_config.get_local_llm(use_openai=True)
print("[INFO] LLM loaded once - will be reused for all pipelines")

In [None]:
# Load the evaluation questions for the selected person.
questions_csv = f'./autogen_questions/{name}/hilfreich.csv'
df_eval = pd.read_csv(questions_csv)

# Quick-test mode keeps only the first three rows.
df_eval = df_eval if use_full_dataset else df_eval.head(3)

eval_questions = df_eval["question"].tolist()
question_types = df_eval["question_type"].tolist()

print(f"[INFO] Loaded {len(eval_questions)} questions")
type_counts = pd.Series(question_types).value_counts().to_dict()
print(f"[INFO] Question types: {type_counts}")
print(f"[INFO] Columns in df_eval: {df_eval.columns.tolist()}")

In [None]:
def run_single_test(
    name: str,
    config: dict,
    questions: list[str],
    llm=None,
    *,
    search_kwargs_num: int = 3,
    use_openai_embeddings: bool = False,
):
    """
    Run one answer-generation pass for a single pipeline configuration.

    Loads the vector store that matches the configuration (building it from
    ``./data/<person name>`` when its path does not exist yet), wires up a
    dense or hybrid retriever, builds the question-answering chain and lets
    the evaluation pipeline answer every question.

    Args:
        name: Person whose documents are queried; also used in the BM25
            index directory name.
        config: Pipeline settings. Must contain the keys ``splitter_type``,
            ``chunk_size``, ``chunk_overlap``, ``retrieval_mode`` and
            ``use_reranker``.
        questions: Questions to answer.
        llm: Optional pre-built LLM to reuse. When ``None`` a new one is
            created via ``LLMConfig(temperature=0.0).get_local_llm()``.
        search_kwargs_num: Value forwarded as ``search_kwargs_num`` / ``k``
            to the vector store handler, the retrievers and the QA chain.
            Defaults to the previously hard-coded 3.
        use_openai_embeddings: Use ``OpenAIEmbeddings()`` instead of the
            project-wide ``EMBEDDINGS``. Defaults to the previously
            hard-coded ``False``.

    Returns:
        Whatever ``EvaluationPipeline.generate_answers_with_metadata()``
        returns, or an empty list when the database could not be created
        or loaded.

    Raises:
        KeyError: If ``config`` lacks one of the required keys.
    """
    # Read the configuration first so a malformed entry fails before any
    # expensive model or index loading starts.
    splitter_type = config["splitter_type"]
    chunk_size = config["chunk_size"]
    chunk_overlap = config["chunk_overlap"]
    retrieval_mode = config["retrieval_mode"]
    use_reranker = config["use_reranker"]

    person = Person(name=name)
    embedding = OpenAIEmbeddings() if use_openai_embeddings else EMBEDDINGS

    if llm is None:
        llm_config = LLMConfig(temperature=0.0)
        llm = llm_config.get_local_llm()

    vectorstore_handler = VectorStoreHandler(
        embeddings=embedding,
        search_kwargs_num=search_kwargs_num
    )

    # The same four values identify the store for path lookup, creation and
    # loading, so they are defined once.
    store_kwargs = {
        "vector_store_name": person.name,
        "splitter_type": splitter_type,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }

    database_path = vectorstore_handler._get_vector_store_path(**store_kwargs)
    print(f"[INFO] FAISS path (expected): {database_path}")

    if not os.path.exists(database_path):
        print(f"Datenbank unter {database_path} nicht gefunden. Erstelle sie neu...")
        doc_processor = DocumentProcessing(embeddings=embedding)
        split_documents = doc_processor.get_chunked_documents(
            directory_path=f"./data/{person.name}",
            splitter_type=splitter_type,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        _, db = vectorstore_handler.create_db_and_retriever(
            chunked_documents=split_documents,
            **store_kwargs,
        )
    else:
        print(f"Lade existierende Datenbank von {database_path}...")
        _, db = vectorstore_handler.get_db_and_retriever(**store_kwargs)

    if db is None:
        print(f"[ERROR] Database creation/loading failed for {splitter_type}. Skipping this configuration.")
        return []

    if retrieval_mode == 'hybrid':
        print("[INFO] Using HYBRID Retriever for this run.")
        bm25_index_dir = f"./bm25_indexes/{name}_{splitter_type}"
        print(f"[INFO] Using BM25 index: {bm25_index_dir} | exists: {os.path.isdir(bm25_index_dir)}")
        bm25_retriever = BM25Retriever(bm25_index_dir)
        retriever = HybridRetriever(db=db, bm25_retriever=bm25_retriever, k=search_kwargs_num)
    else:
        # Any value other than 'hybrid' is treated as dense retrieval.
        print("[INFO] Using DENSE Retriever for this run.")
        retriever = db.as_retriever(search_kwargs={"k": search_kwargs_num})

    qa_chain = InitializeQuesionAnsweringChain(
        llm=llm,
        retriever=retriever,
        db=db,
        person=person,
        search_kwargs_num=search_kwargs_num,
        use_reranker=use_reranker
    )

    eval_pipeline = EvaluationPipeline(
        qa_chain=qa_chain,
        eval_questions=questions
    )

    return eval_pipeline.generate_answers_with_metadata()

In [None]:
# Every combination of retrieval mode x reranker flag x splitter preset.
# Order: dense, dense + reranker, hybrid, hybrid + reranker; within each
# group the recursive, sentence and semantic splitters follow one another.
evaluation_matrix = [
    {
        "pipeline_name": f"{mode}_{label}{'_rerank' if rerank else ''}",
        "splitter_type": splitter,
        "chunk_size": size,
        "chunk_overlap": overlap,
        "retrieval_mode": mode,
        "use_reranker": rerank,
    }
    for mode in ("dense", "hybrid")
    for rerank in (False, True)
    # (name suffix, splitter_type, chunk_size, chunk_overlap)
    for label, splitter, size, overlap in (
        ("recursive", "recursive", 1000, 30),
        ("sentence", "sentence_transformer", 256, 30),
        ("semantic", "semantic", 0, 0),
    )
]

In [None]:
# Generate answers for every pipeline configuration. A failing pipeline is
# reported and skipped so the remaining configurations still run.
all_results = {}
for config in evaluation_matrix:
    pipeline_name = config["pipeline_name"]
    print(f"\n--- GENERATING ANSWERS FOR: {pipeline_name} ---")
    try:
        generated_answers = run_single_test(
            name=name,
            config=config,
            questions=eval_questions,
            llm=llm,
        )
        # run_single_test returns an empty list when the database could not
        # be created or loaded. Without this guard the empty frame would be
        # filled with question types only and counted as a success.
        if len(generated_answers) == 0:
            print(f"[ERROR] Pipeline {pipeline_name} produced no answers - skipping")
            continue
        df_results = pd.DataFrame(generated_answers)
        df_results["question_type"] = question_types
        all_results[pipeline_name] = df_results
    except Exception as e:
        print(f"[ERROR] Pipeline {pipeline_name} failed: {e}")
    finally:
        # Free memory after every run, whether it succeeded, failed or was skipped.
        cleanup_memory()
print(f"[INFO] Successfully completed {len(all_results)} pipelines")

In [10]:
# Score columns that are picked out of the Ragas result frame.
metric_columns = [
    'context_precision',
    'faithfulness',
    'answer_relevancy',
    'context_recall',
    'answer_correctness',
    'semantic_similarity',
]

In [None]:
ground_truths = df_eval['ground_truth'].tolist()
final_evaluation_results = {}

# Score the generated answers of every completed pipeline with Ragas.
# Each pipeline is handled in isolation: a failure is reported and the loop
# moves on, so results already written for other pipelines are not lost.
for pipeline_name, df_generated in all_results.items():
    print(f"\n{'='*20} RUNNING RAGAS ON: {pipeline_name} {'='*20}")

    try:
        # 1. Prepare the dataset for Ragas: attach the reference answers,
        #    once as plain values and once wrapped in single-element lists.
        df_generated['ground_truth'] = ground_truths
        df_generated['ground_truths'] = df_generated['ground_truth'].apply(lambda x: [x])

        ragas_dataset = Dataset.from_pandas(df_generated)

        # 2. Run the Ragas evaluation
        ragas_results = evaluate(
            ragas_dataset,
            metrics=[
                context_precision,
                faithfulness,
                answer_relevancy,
                context_recall,
                answer_correctness,
                answer_similarity,
            ]
        )

        # 3. Keep only the score columns and put them next to the generated data
        df_ragas_scores = ragas_results.to_pandas()
        df_scores_only = df_ragas_scores[metric_columns]

        final_df = pd.concat(
            [df_generated.reset_index(drop=True), df_scores_only.reset_index(drop=True)],
            axis=1,
        )
        final_df = final_df.drop(columns=['ground_truth'], errors='ignore')

        # 4. Save the complete, final result. The frame is only registered
        #    after the CSV was written, so the summary count below reflects
        #    files that actually exist.
        os.makedirs(f"./results/{name}", exist_ok=True)
        final_df.to_csv(f"./results/{name}/{pipeline_name}_final_results.csv", index=False)
        final_evaluation_results[pipeline_name] = final_df
    except Exception as e:
        print(f"[ERROR] Ragas evaluation for {pipeline_name} failed: {e}")
        continue

    print(f"--- FINISHED: Final results for {pipeline_name} saved. ---")
    print("Final DataFrame columns:", final_df.columns.tolist())

print(f"🎉 EVALUATION COMPLETE! Results saved for {len(final_evaluation_results)} pipelines.")

