In [1]:
import os
import pandas as pd
from datasets import Dataset
import json
from dotenv import load_dotenv

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

In [2]:
# 1. Load API key from .env
# Reads the .env file into the process environment; the cell echoes the call's
# boolean return value.
# NOTE(review): presumably this supplies the LLM provider key that the RAGAS
# metrics need — confirm which variable name is expected.
load_dotenv()

True

In [3]:
# 2. Path settings
# Input CSVs holding the generated answers to score, plus the output folder for
# the aggregated scores. All paths are relative to the working directory.
BASELINE_PATH  = "../data/evaluation/all_experiment_generations_for_ragas.csv"
ENHANCED_PATH  = "../data/evaluation/enhanced_experiment_generations.csv"
RESULTS_DIR    = "../results"
# Create the output folder up front; exist_ok=True keeps this cell re-runnable.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [4]:
# Read both generation tables from the CSV paths configured above.
baseline_df, enhanced_df = pd.read_csv(BASELINE_PATH), pd.read_csv(ENHANCED_PATH)

# Preview at most the first ten column names of each table as a sanity check.
for label, frame in (("Baseline", baseline_df), ("Enhanced", enhanced_df)):
    print(f"{label} columns:", frame.columns[:10].tolist(), "...")

Baseline columns: ['question', 'answer', 'q_len', 'q_bin', 'gen_naive_k1_all-MiniLM-L6-v2', 'gen_cot_k1_all-MiniLM-L6-v2', 'gen_persona_k1_all-MiniLM-L6-v2', 'gen_naive_k3_all-MiniLM-L6-v2', 'gen_cot_k3_all-MiniLM-L6-v2', 'gen_persona_k3_all-MiniLM-L6-v2'] ...
Enhanced columns: ['question', 'answer', 'rewritten_query', 'gen_enhanced_k5_persona_mpnet', 'ctx_count_used', 'contexts'] ...


In [5]:
def prepare_ragas_dataset(df, gen_col, ctx_col):
    """Convert a generations DataFrame into a RAGAS-compatible Dataset.

    Args:
        df: DataFrame containing "question" and "answer" columns plus the
            columns named by ``gen_col`` and ``ctx_col``.
        gen_col: Name of the column holding the generated answer to evaluate.
        ctx_col: Name of the column holding the retrieved contexts as one
            string, with chunks separated by a line consisting solely of
            three dashes.

    Returns:
        A ``Dataset`` with the columns "question", "answer" (the generated
        text from ``gen_col``), "retrieved_contexts" (a list of chunk strings
        per row) and "reference" (the ground truth taken from df["answer"]).

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    required = ["question", "answer", gen_col, ctx_col]
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Same exception type as a plain df[col] lookup, but names every gap at once.
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    def _as_text(column):
        # Fill before casting: astype(str) alone turns a missing value into the
        # literal text "nan", which would then be scored as if it were content.
        return df[column].fillna("").astype(str).tolist()

    def _split_contexts(raw):
        # Non-string cells (e.g. NaN) mean "no retrieved context" for that row.
        if not isinstance(raw, str):
            return []
        # Drop blank chunks left behind by leading/trailing/doubled separators.
        return [chunk for chunk in raw.split("\n---\n") if chunk.strip()]

    data = {
        "question": _as_text("question"),
        "answer": _as_text(gen_col),
        "retrieved_contexts": [_split_contexts(raw) for raw in df[ctx_col]],
        "reference": _as_text("answer"),
    }
    return Dataset.from_dict(data)

In [6]:
# Pipelines to evaluate. Each entry names the DataFrame to read from, the column
# with the generated answer ("gen_col") and the column with the retrieved
# contexts ("ctx_col"); these are the arguments passed to prepare_ragas_dataset.
SYSTEMS = {
    "naive": {
        "df": baseline_df,
        "gen_col": "gen_naive_k1_all-MiniLM-L6-v2",
        "ctx_col": "contexts"   # original baseline contexts (k=1)
    },
    # NOTE(review): column names suggest persona prompt, k=5, all-mpnet-base-v2
    # retrieval — confirm against the experiment that produced the CSV.
    "best_baseline": {
        "df": baseline_df,
        "gen_col": "gen_persona_k5_all-mpnet-base-v2",
        "ctx_col": "contexts_persona_k5_all-mpnet-base-v2"
    },
    "enhanced": {
        "df": enhanced_df,
        "gen_col": "gen_enhanced_k5_persona_mpnet",
        "ctx_col": "contexts"   # "contexts" column of the enhanced CSV
    }
}

In [7]:
# Metrics computed for every pipeline listed in SYSTEMS.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall]

# Maps pipeline name -> {metric name -> average score, or None if no valid score}.
results = {}

for name, cfg in SYSTEMS.items():
    print(f"\nRunning RAGAS on {name} pipeline...")

    dataset = prepare_ragas_dataset(cfg["df"], cfg["gen_col"], cfg["ctx_col"])
    n_rows = len(dataset)

    # Flat list of per-row scores for each metric.
    results_accum = {m.name: [] for m in metrics}

    # Run row-by-row to allow custom logging
    for i in range(n_rows):
        row_ds = dataset.select([i])  # select a single row
        scores = evaluate(
            row_ds,
            metrics,
            batch_size=1,      # force single-row evaluation
            show_progress=False
        )

        # Collect scores for each metric; a lookup may yield a one-element
        # list (e.g. [0.82]) or a bare number, so flatten while collecting.
        for m in metrics:
            value = scores[m.name]
            if isinstance(value, (list, tuple)):
                results_accum[m.name].extend(value)
            else:
                results_accum[m.name].append(value)

        # Log progress every 10 rows (and at the very end)
        if (i + 1) % 10 == 0 or (i + 1) == n_rows:
            print(f"   {name} → processed {i + 1}/{n_rows} rows")

    # Compute average score per metric, skipping rows that have no score.
    # A single NaN/None would otherwise turn the whole average into NaN, which
    # json.dump then writes as the non-standard token NaN.
    avg_scores = {}
    for metric_name, vals in results_accum.items():
        # float() also normalises numpy scalars to plain floats for JSON output.
        valid = [float(v) for v in vals if not pd.isna(v)]
        skipped = len(vals) - len(valid)
        if skipped:
            print(f"   {name} → {metric_name}: skipped {skipped} row(s) without a score")
        avg_scores[metric_name] = sum(valid) / len(valid) if valid else None

    results[name] = avg_scores
    print(f"Finished {name} pipeline | Averages: {avg_scores}")


Running RAGAS on naive pipeline...
   naive → processed 10/120 rows
   naive → processed 20/120 rows
   naive → processed 30/120 rows
   naive → processed 40/120 rows
   naive → processed 50/120 rows
   naive → processed 60/120 rows
   naive → processed 70/120 rows
   naive → processed 80/120 rows
   naive → processed 90/120 rows
   naive → processed 100/120 rows
   naive → processed 110/120 rows
   naive → processed 120/120 rows
Finished naive pipeline | Averages: {'faithfulness': 0.6976388888888889, 'answer_relevancy': 0.7473360854279245, 'context_precision': 0.7083333332625, 'context_recall': 0.5833333333333334}

Running RAGAS on best_baseline pipeline...
   best_baseline → processed 10/120 rows
   best_baseline → processed 20/120 rows
   best_baseline → processed 30/120 rows
   best_baseline → processed 40/120 rows
   best_baseline → processed 50/120 rows
   best_baseline → processed 60/120 rows
   best_baseline → processed 70/120 rows
   best_baseline → processed 80/120 rows
   b

In [8]:
# Persist the per-pipeline averages, then echo them for a quick visual check.
json_path = os.path.join(RESULTS_DIR, "ragas_eval_results.json")
serialized = json.dumps(results, indent=4)
with open(json_path, "w") as out_file:
    out_file.write(serialized)

print(f"RAGAs results saved to {json_path}")
print(json.dumps(results, indent=2))

RAGAs results saved to ../results/ragas_eval_results.json
{
  "naive": {
    "faithfulness": 0.6976388888888889,
    "answer_relevancy": 0.7473360854279245,
    "context_precision": 0.7083333332625,
    "context_recall": 0.5833333333333334
  },
  "best_baseline": {
    "faithfulness": 0.81875,
    "answer_relevancy": 0.7305946047978084,
    "context_precision": 0.7300347221667872,
    "context_recall": 0.7666666666666667
  },
  "enhanced": {
    "faithfulness": 0.8354166666666667,
    "answer_relevancy": 0.7706981561337491,
    "context_precision": 0.8678124999354726,
    "context_recall": 0.7833333333333333
  }
}
