# Recursive Pipeline Rerun (Fixed Splitter) + Chunk-Size Experiment

After fixing the recursive splitter (sliding-window → standard RecursiveCharacterTextSplitter),
this notebook reruns ALL recursive pipelines:

- **4 Chapter 5 replacements** (dense/hybrid × rerank/no-rerank, all 1000/30)
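- **9 Chunk-Size Experiment runs** (500/30, 1000/30, 1500/50 × 3 pipeline types)
- **3 overlap**: the three 1000/30 chunk-size runs repeat Chapter 5 settings → 10 unique configs, but all 13 runs are executed so that each results folder is self-contained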

**Prerequisites:** Run `python prebuild_all_recursive.py` first to rebuild ALL BM25 indices.

**Results:**
- Chapter 5 results → `./results/Washington/final_run_42Q/` (overwrites old recursive CSVs)
- Chunk-Size results → `./results/Washington/chunk_size_experiment/`

In [None]:
import os
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-17.0.18.8-hotspot"

import torch  # MUST import before pandas to avoid DLL conflict on Windows
import gc
import pandas as pd
from ragas import evaluate
from datasets import Dataset
from packages.person import Person
from packages.globals import EMBEDDINGS
from packages.llm_config import LLMConfig
from packages.evaluate_rag import EvaluationPipeline
from packages.vector_store_handler import VectorStoreHandler
from packages.document_processing import DocumentProcessing
from packages.bm25_retriever import BM25Retriever
from packages.vector_store_handler import HybridRetriever
from packages.init_chain import InitializeQuesionAnsweringChain

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity,
)

%load_ext autoreload
%autoreload 2

In [None]:
def cleanup_memory():
    """Free Python garbage and cached accelerator memory between pipeline runs.

    Runs a full garbage collection, then empties the PyTorch cache of every
    accelerator backend that reports itself as available (Apple MPS and/or
    CUDA), and finally prints a confirmation line.

    Returns:
        None
    """
    gc.collect()

    # Older torch builds have no `torch.backends.mps`; treat that as "no MPS".
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        torch.mps.empty_cache()

    # On CUDA machines the MPS branch never fires, so release that cache too.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("[INFO] Memory cleanup completed")

In [None]:
# Corpus/person under evaluation; also the sub-folder name under ./results.
name = "Washington"
RESULTS_CHAP5 = f"./results/{name}/final_run_42Q"
RESULTS_CHUNK = f"./results/{name}/chunk_size_experiment"

# Make sure both output folders exist before any CSV is written.
for _results_dir in (RESULTS_CHAP5, RESULTS_CHUNK):
    os.makedirs(_results_dir, exist_ok=True)

print(f"[INFO] Chapter 5 results: {RESULTS_CHAP5}")
print(f"[INFO] Chunk-Size results: {RESULTS_CHUNK}")

In [None]:
# Build the LLM a single time with temperature 0.0; the instance is passed
# to every pipeline run instead of being re-created per config.
llm_config = LLMConfig(
    temperature=0.0,
)
llm = llm_config.get_local_llm(
    use_openai=False,
)
print("[INFO] LLM loaded once (Mistral 7B local) - will be reused for all pipelines")

In [None]:
# Evaluation set: one row per question with its type and reference answer.
questions_csv = f'./autogen_questions/{name}/hilfreich.csv'
df_eval = pd.read_csv(questions_csv)

# Pull the three columns out as plain lists, in the same order they are used
# later (questions -> answer generation, types -> result tagging,
# ground truths -> RAGAS scoring).
eval_questions, question_types, ground_truths = (
    df_eval[column].tolist()
    for column in ("question", "question_type", "ground_truth")
)

print(f"[INFO] Loaded {len(eval_questions)} questions")
print(f"[INFO] Question types: {pd.Series(question_types).value_counts().to_dict()}")

In [None]:
def run_single_test(name: str, config: dict, questions: list[str], llm=None):
    """
    Run one retrieval/QA pipeline configuration and return its generated answers.

    Supports both Chapter 5 (base BM25 path) and chunk-size experiment
    (chunk_size-aware BM25 path) configurations.

    Args:
        name: Name used to construct the `Person` and the BM25 index directory.
        config: Pipeline settings. Required keys: "splitter_type",
            "chunk_size", "chunk_overlap", "retrieval_mode", "use_reranker".
            Optional key "bm25_path_mode": "base" selects
            `./bm25_indexes/<name>_<splitter>`; any other value (default
            "chunk_aware") selects
            `./bm25_indexes/<name>_<splitter>_<chunk_size>_<chunk_overlap>`.
            Only consulted when "retrieval_mode" is 'hybrid'; every other
            retrieval mode uses the dense FAISS retriever.
        questions: Evaluation questions handed to the `EvaluationPipeline`.
        llm: Already-loaded LLM to reuse. When None, a local LLM is created
            with temperature 0.0.

    Returns:
        The result of `EvaluationPipeline.generate_answers_with_metadata()`,
        or an empty list when the vector store could not be built/loaded or
        the required BM25 index directory does not exist.

    Raises:
        KeyError: If a required key is missing from `config`.
    """
    person = Person(name=name)
    embedding = EMBEDDINGS
    search_kwargs_num = 3

    if llm is None:
        # Request the local model explicitly, matching how the notebook loads
        # its shared LLM, instead of relying on the method's default.
        llm_config = LLMConfig(temperature=0.0)
        llm = llm_config.get_local_llm(use_openai=False)

    splitter_type = config["splitter_type"]
    chunk_size = config["chunk_size"]
    chunk_overlap = config["chunk_overlap"]
    retrieval_mode = config["retrieval_mode"]
    use_reranker = config["use_reranker"]
    bm25_path_mode = config.get("bm25_path_mode", "chunk_aware")

    vectorstore_handler = VectorStoreHandler(
        embeddings=embedding,
        search_kwargs_num=search_kwargs_num
    )

    database_path = vectorstore_handler._get_vector_store_path(
        vector_store_name=person.name,
        splitter_type=splitter_type,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    print(f"[INFO] FAISS path: {database_path}")

    # Build the FAISS store on first use for this splitter/size/overlap
    # combination; otherwise load the one already on disk.
    if not os.path.exists(database_path):
        print("FAISS DB not found. Building...")
        doc_processor = DocumentProcessing(embeddings=embedding)
        split_documents = doc_processor.get_chunked_documents(
            directory_path=f"./data/{person.name}",
            splitter_type=splitter_type,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        _, db = vectorstore_handler.create_db_and_retriever(
            chunked_documents=split_documents,
            vector_store_name=person.name,
            splitter_type=splitter_type,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    else:
        print("Loading existing FAISS DB...")
        _, db = vectorstore_handler.get_db_and_retriever(
            vector_store_name=person.name,
            splitter_type=splitter_type,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    if db is None:
        print("[ERROR] Database creation/loading failed. Skipping.")
        return []

    if retrieval_mode == 'hybrid':
        # BM25 path depends on mode
        if bm25_path_mode == "base":
            bm25_index_dir = f"./bm25_indexes/{name}_{splitter_type}"
        else:
            bm25_index_dir = f"./bm25_indexes/{name}_{splitter_type}_{chunk_size}_{chunk_overlap}"
        print(f"[INFO] HYBRID Retriever | BM25: {bm25_index_dir} | exists: {os.path.isdir(bm25_index_dir)}")
        if not os.path.isdir(bm25_index_dir):
            # BM25 indexes are never built here; they must be prebuilt.
            print("[ERROR] BM25 index not found! Run prebuild_all_recursive.py first.")
            return []
        bm25_retriever = BM25Retriever(bm25_index_dir)
        retriever = HybridRetriever(db=db, bm25_retriever=bm25_retriever, k=search_kwargs_num)
    else:
        print("[INFO] DENSE Retriever")
        retriever = db.as_retriever(search_kwargs={"k": search_kwargs_num})

    qa_chain = InitializeQuesionAnsweringChain(
        llm=llm,
        retriever=retriever,
        db=db,
        person=person,
        search_kwargs_num=search_kwargs_num,
        use_reranker=use_reranker
    )

    eval_pipeline = EvaluationPipeline(
        qa_chain=qa_chain,
        eval_questions=questions
    )

    return eval_pipeline.generate_answers_with_metadata()

In [None]:
# === COMPLETE RECURSIVE PIPELINE MATRIX ===
# 13 runs covering 10 unique configs: 4 Chapter 5 + 9 Chunk-Size, where the
# three 1000/30 chunk-size runs repeat settings already used for Chapter 5.
#
# save_to: where results go ("chap5" or "chunk")
# bm25_path_mode: "base" = ./bm25_indexes/Washington_recursive
#                 "chunk_aware" = ./bm25_indexes/Washington_recursive_1000_30


def _recursive_config(pipeline_name, chunk_size, chunk_overlap, retrieval_mode,
                      use_reranker, save_to, bm25_path_mode=None):
    """Build one matrix entry for the recursive splitter.

    The "bm25_path_mode" key is only added when a value is given, so entries
    created without it carry exactly seven keys.
    """
    entry = {
        "pipeline_name": pipeline_name,
        "splitter_type": "recursive",
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "retrieval_mode": retrieval_mode,
        "use_reranker": use_reranker,
        "save_to": [save_to],
    }
    if bm25_path_mode is not None:
        entry["bm25_path_mode"] = bm25_path_mode
    return entry


# ============================================================
# CHAPTER 5 REPLACEMENTS (hybrid runs use the base BM25 path for compatibility)
# These overwrite the old sliding-window results in final_run_42Q
# ============================================================
evaluation_matrix = [
    _recursive_config("dense_recursive", 1000, 30, "dense", False, "chap5"),
    _recursive_config("dense_recursive_rerank", 1000, 30, "dense", True, "chap5"),
    _recursive_config("hybrid_recursive", 1000, 30, "hybrid", False, "chap5", bm25_path_mode="base"),
    _recursive_config("hybrid_recursive_rerank", 1000, 30, "hybrid", True, "chap5", bm25_path_mode="base"),
]

# ============================================================
# CHUNK-SIZE EXPERIMENT: 500/30, 1000/30, 1500/50
# Each size gets the same three pipeline types, saved to chunk_size_experiment.
#
# The 1000/30 runs overlap with Chapter 5. The Chapter 5 hybrid versions use
# the base BM25 path; these use the chunk_aware BM25 path for the chunk-size
# comparison. Since the FAISS DB is the same (same chunk_size/overlap),
# answers will be identical for dense pipelines. For hybrid, BM25 index
# content is the same (same chunks), just a different path, so results should
# also be identical. We re-run anyway for clean, self-contained chunk
# experiment CSVs.
# ============================================================
for _size, _overlap in ((500, 30), (1000, 30), (1500, 50)):
    evaluation_matrix.extend([
        _recursive_config(f"dense_recursive_{_size}", _size, _overlap, "dense", False, "chunk"),
        _recursive_config(f"dense_recursive_rerank_{_size}", _size, _overlap, "dense", True, "chunk"),
        _recursive_config(f"hybrid_recursive_rerank_{_size}", _size, _overlap, "hybrid", True, "chunk"),
    ])

print(f"[INFO] {len(evaluation_matrix)} pipeline configs to run")
for c in evaluation_matrix:
    print(f"  {c['pipeline_name']:<35} size={c['chunk_size']}, mode={c['retrieval_mode']}, rerank={c['use_reranker']}, save={c['save_to']}")

In [None]:
# === GENERATE ANSWERS + RAGAS SCORE + SAVE (per pipeline, crash-safe) ===
# For every config: generate answers, back them up, score them with RAGAS,
# and write the final CSV(s). A pipeline whose final CSV already exists in
# every target folder is skipped, so the cell can be re-run after a crash.
import traceback

# Score columns copied from the RAGAS result frame into the final CSV.
# NOTE(review): 'semantic_similarity' is assumed to be the column produced by
# the `answer_similarity` metric in the installed ragas version - confirm.
metric_columns = [
    'context_precision', 'faithfulness', 'answer_relevancy',
    'context_recall', 'answer_correctness', 'semantic_similarity'
]

completed = []
skipped = []
failed = []


def _result_csv_path(target, pipeline_name, suffix):
    """Return `<results dir>/<pipeline_name>_<suffix>.csv` for a save target.

    "chap5" maps to RESULTS_CHAP5; every other target maps to RESULTS_CHUNK.
    """
    base_dir = RESULTS_CHAP5 if target == "chap5" else RESULTS_CHUNK
    return f"{base_dir}/{pipeline_name}_{suffix}.csv"


for i, config in enumerate(evaluation_matrix):
    pipeline_name = config["pipeline_name"]
    save_to = config["save_to"]

    # Check if already completed (crash recovery)
    all_exist = all(
        os.path.exists(_result_csv_path(target, pipeline_name, "final_results"))
        for target in save_to
    )
    if all_exist:
        print(f"[SKIP] {pipeline_name} — CSV already exists, skipping.")
        skipped.append(pipeline_name)
        continue

    print(f"\n{'='*60}")
    print(f"[{i+1}/{len(evaluation_matrix)}] PIPELINE: {pipeline_name}")
    print(f"{'='*60}")

    try:
        # --- Step 1: Generate answers (LOCAL, no internet needed) ---
        print("[STEP 1/3] Generating answers with Mistral 7B...")
        generated_answers = run_single_test(
            name=name, config=config, questions=eval_questions, llm=llm,
        )
        if not generated_answers:
            print(f"[WARN] No answers returned for {pipeline_name}")
            failed.append(pipeline_name)
            continue  # the `finally` clause below still runs the cleanup

        df_results = pd.DataFrame(generated_answers)
        df_results["question_type"] = question_types
        df_results["chunk_size"] = config["chunk_size"]
        df_results["chunk_overlap"] = config["chunk_overlap"]
        print(f"[OK] {len(df_results)} answers generated")

        # Save raw answers immediately (backup before RAGAS)
        for target in save_to:
            backup_path = _result_csv_path(target, pipeline_name, "answers_raw")
            df_results.to_csv(backup_path, index=False)
            print(f"[BACKUP] Raw answers saved to {backup_path}")

        # --- Step 2: RAGAS scoring (NEEDS INTERNET for OpenAI judge) ---
        print("[STEP 2/3] Running RAGAS evaluation (needs internet)...")
        df_results['ground_truth'] = ground_truths
        # Also provide the reference answer as a one-element list column.
        df_results['ground_truths'] = df_results['ground_truth'].apply(lambda x: [x])

        ragas_dataset = Dataset.from_pandas(df_results)
        ragas_results = evaluate(
            ragas_dataset,
            metrics=[
                context_precision, faithfulness, answer_relevancy,
                context_recall, answer_correctness, answer_similarity,
            ]
        )
        df_ragas_scores = ragas_results.to_pandas()
        df_scores_only = df_ragas_scores[metric_columns]

        # Both frames are re-indexed from 0 so the column-wise concat pairs
        # rows by position.
        final_df = pd.concat([df_results.reset_index(drop=True), df_scores_only.reset_index(drop=True)], axis=1)
        if 'ground_truth' in final_df.columns:
            final_df = final_df.drop(columns=['ground_truth'])

        # --- Step 3: Save final results ---
        print("[STEP 3/3] Saving final results...")
        for target in save_to:
            path = _result_csv_path(target, pipeline_name, "final_results")
            final_df.to_csv(path, index=False)
            print(f"[SAVED] {path}")

        f_score = final_df['faithfulness'].mean()
        ar_score = final_df['answer_relevancy'].mean()
        cp_score = final_df['context_precision'].mean()
        cr_score = final_df['context_recall'].mean()
        print(f"  F={f_score:.3f} AR={ar_score:.3f} CP={cp_score:.3f} CR={cr_score:.3f}")
        completed.append(pipeline_name)

    except Exception as e:
        # One failing pipeline must not abort the remaining runs: record it
        # and move on to the next config.
        print(f"[ERROR] Pipeline {pipeline_name} failed: {e}")
        traceback.print_exc()
        failed.append(pipeline_name)

    finally:
        # Runs after success, failure, and the "no answers" early exit.
        cleanup_memory()

print(f"\n{'='*60}")
print(f"DONE! Completed: {len(completed)}, Skipped: {len(skipped)}, Failed: {len(failed)}")
if failed:
    print(f"Failed pipelines: {failed}")
print(f"{'='*60}")

In [None]:
# === COMPARISON TABLE ===
# Prints the per-pipeline mean of four RAGAS metrics from the saved final CSVs.
import glob

# Metrics shown in every table row, in column order F / AR / CP / CR.
_TABLE_METRICS = ('faithfulness', 'answer_relevancy', 'context_precision', 'context_recall')


def _mean_score_cells(frame):
    """Return the four metric means of *frame* as 8-wide, 3-decimal cells joined by spaces."""
    return " ".join(f"{frame[metric].mean():>8.3f}" for metric in _TABLE_METRICS)


print("\n=== CHAPTER 5 RECURSIVE RESULTS (replaced) ===")
print(f"{'Pipeline':<35} {'F':>8} {'AR':>8} {'CP':>8} {'CR':>8}")
print("-" * 70)
for csv_file in sorted(glob.glob(f"{RESULTS_CHAP5}/*recursive*_final_results.csv")):
    df = pd.read_csv(csv_file)
    pname = os.path.basename(csv_file).replace('_final_results.csv', '')
    print(f"{pname:<35} {_mean_score_cells(df)}")

print("\n=== CHUNK-SIZE EXPERIMENT RESULTS ===")
print(f"{'Pipeline':<35} {'Size':>6} {'F':>8} {'AR':>8} {'CP':>8} {'CR':>8}")
print("-" * 80)
for csv_file in sorted(glob.glob(f"{RESULTS_CHUNK}/*_final_results.csv")):
    df = pd.read_csv(csv_file)
    pname = os.path.basename(csv_file).replace('_final_results.csv', '')
    # Chunk size is read from the first row; '?' when the column is absent.
    if 'chunk_size' in df.columns:
        cs = int(df['chunk_size'].iloc[0])
    else:
        cs = '?'
    print(f"{pname:<35} {cs:>6} {_mean_score_cells(df)}")

# Reference: dense_semantic (unaffected by splitter fix)
ref_path = f"{RESULTS_CHAP5}/dense_semantic_final_results.csv"
if os.path.exists(ref_path):
    df_ref = pd.read_csv(ref_path)
    print(f"\n{'dense_semantic (REF)':<35} {'---':>6} {_mean_score_cells(df_ref)}")