## Generative models test on student leases (manually annotated data -> 500+ questions, 50+ hours of work)


In [None]:
# Mount Google Drive at /content/drive; the paths configured for this
# notebook are all located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path

# Root folder of the experiment on the mounted Google Drive.
BASE_DIR = Path("/content/drive/My Drive/Dissertation/phase_2/generative")
# Folder that is globbed for the agreement PDFs and the *_QA PDFs; the
# intermediate markdown files are written here as well.
AGREEMENTS_DIR = BASE_DIR / "agreements"
# Single JSON file holding the gold-standard Q&A pairs of every agreement.
GOLD_STANDARD_JSON = BASE_DIR / "gold_standard.json"
PROCESSED_DIR = BASE_DIR / "processed"         # output for processed main PDFs
# Folder handed to the plotting functions as the destination for saved charts.
IMAGES_DIR = BASE_DIR / "images"



# Echo the resolved locations so a wrong Drive path is spotted early.
print("Base Directory:", BASE_DIR)
print("Agreements Directory:", AGREEMENTS_DIR)
print("Gold Standard JSON:", GOLD_STANDARD_JSON)


In [None]:
# Notebook-only cell: installs third-party packages for this notebook.
# NOTE(review): sentence_transformers is imported below but is not in this
# list, and whoosh is installed but not imported in the visible code - confirm.
pip install pymupdf4llm pdfplumber transformers rank_bm25 whoosh faiss-cpu evaluate bert-score tqdm rouge_score

# IMPORTS & CONFIGURATION & PATHS

In [36]:

##############################
# Imports
##############################
import os
import time  # ensure time is imported
import re
import json
import string
from pathlib import Path
from typing import List, Dict
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Use your PDF-to-markdown converter; here we use pymupdf4llm as in your snippet.
import pymupdf4llm

# For alternative PDF extraction (if needed)
import pdfplumber

# For robust token counting, we use a Hugging Face tokenizer.
from transformers import AutoTokenizer, pipeline

# For TF-IDF retrieval:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For BM25 retrieval:
from rank_bm25 import BM25Okapi


# For FAISS dense retrieval:
import faiss
from sentence_transformers import SentenceTransformer


# For metrics:
import numpy as np
import evaluate
from bert_score import score as bertscore_score
import torch



##############################
# CONFIGURATION
##############################

# Maximum number of tokens per chunk. Kept below 512 to leave room for the
# question tokens and special tokens.
MAX_CHUNK_TOKENS = 400

# Tokenizer used only for counting tokens while chunking (a BERT tokenizer is
# used as an example; it is not tied to the generative models loaded later).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


# PDF PREPROCESSING & MARKDOWN EXTRACTION

In [37]:
def pdf_to_markdown(pdf_path: Path) -> str:

    """
    Render the PDF located at ``pdf_path`` as a markdown string.

    The path is converted to ``str`` before it is handed to
    ``pymupdf4llm.to_markdown``. pdfplumber is a possible fallback should
    pymupdf4llm have trouble with a document.
    """

    source = str(pdf_path)
    markdown = pymupdf4llm.to_markdown(source)
    return markdown

def clean_text(text: str) -> str:

    """
    Strip markdown asterisks and every non-ASCII character from ``text``.

    Mis-decoded artifacts such as a stray 'a-circumflex' are non-ASCII, so the
    ASCII round-trip removes them together with all other non-ASCII
    characters; no dedicated regex pass is needed for them.

    Returns the cleaned text with leading/trailing whitespace removed.
    """

    # Dropping non-ASCII first also removes the mojibake characters that were
    # previously targeted by separate (and redundant) substitutions.
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")

    # Runs of '*' are markdown emphasis markers; remove them entirely.
    without_emphasis = re.sub(r'\*+', '', ascii_only)

    return without_emphasis.strip()

def split_markdown_by_headers(markdown: str) -> List[Dict[str, str]]:

    """
    Partition markdown text into sections delimited by ATX headers.

    A header is a line starting with one to six '#' characters followed by
    whitespace. Each returned dict has a 'title' (the header text, or None
    for text that precedes the first header) and a 'content' string made of
    the section's lines, each terminated by a newline.
    """

    header_re = re.compile(r'^(#{1,6})\s+(.*)')
    collected = []
    title = None
    body_lines = []

    def flush():
        # Keep a section when it has a header, or when headerless text is
        # more than whitespace.
        body = "".join(piece + "\n" for piece in body_lines)
        if title is not None or body.strip():
            collected.append({"title": title, "content": body})

    for raw_line in markdown.splitlines():
        found = header_re.match(raw_line)
        if found is None:
            body_lines.append(raw_line)
            continue
        # A new header closes the section gathered so far.
        flush()
        title = found.group(2).strip()
        body_lines = []

    flush()
    return collected

def process_content(text: str) -> str:

    """
    Flatten ``text`` onto a single line.

    Newlines become spaces, every run of whitespace collapses to one space,
    and surrounding whitespace is trimmed.
    """

    flattened = text.replace("\n", " ")
    return re.sub(r'\s+', ' ', flattened).strip()

def preprocess_markdown_file(file_path: Path) -> List[Dict[str, str]]:

    """
    Load a markdown file and turn it into a list of non-empty sections.

    The file is read as UTF-8, cleaned with ``clean_text``, split with
    ``split_markdown_by_headers``, and each section's content is flattened
    with ``process_content``. Sections whose content ends up empty are
    dropped.
    """

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_markdown = handle.read()

    kept_sections = []

    for section in split_markdown_by_headers(clean_text(raw_markdown)):
        section["content"] = process_content(section["content"])
        # Sections without any remaining text carry no information.
        if section["content"].strip():
            kept_sections.append(section)

    return kept_sections

def save_to_json(data, filename: Path):

    """
    Write ``data`` to ``filename`` as UTF-8 JSON and print a confirmation.

    The output uses a 4-space indent and keeps non-ASCII characters verbatim.
    """

    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

    print(f"Data saved to {filename}")

def process_all_agreement_pdfs():

    """
    Convert every main agreement PDF into markdown and a per-agreement JSON.

    For each ``Agreement_*.pdf`` in ``AGREEMENTS_DIR`` (files whose stem
    contains '_QA' are the Q&A PDFs and are skipped):
      1. the PDF is converted to markdown and saved next to it as ``<stem>.md``;
      2. that markdown file is preprocessed into sections;
      3. the sections are saved to ``PROCESSED_DIR/<stem>.json``.
    """

    # The output folder is not guaranteed to exist yet; without it the first
    # JSON write fails with FileNotFoundError.
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

    # sorted() makes the processing order (and the printed log) reproducible.
    for pdf_file in sorted(AGREEMENTS_DIR.glob("Agreement_*.pdf")):

        if "_QA" in pdf_file.stem:
            continue  # Skip Q&A PDFs
        print(f"Processing {pdf_file.name} ...")

        # Convert PDF to markdown text.
        md_text = pdf_to_markdown(pdf_file)

        # Persist the markdown so it can be inspected later.
        output_md_path = AGREEMENTS_DIR / f"{pdf_file.stem}.md"
        output_md_path.write_text(md_text, encoding="utf-8")
        print(f"Saved markdown to {output_md_path}")

        # Read back exactly the file that was just written, rather than
        # re-deriving its location from the PDF path.
        sections = preprocess_markdown_file(output_md_path)

        # Save processed sections to JSON.
        output_json = PROCESSED_DIR / f"{pdf_file.stem}.json"
        save_to_json(sections, output_json)


##############################
# FURTHER CHUNKING USING TOKEN COUNTS
##############################

def chunk_section_by_tokens(section: Dict[str, str], max_tokens: int = MAX_CHUNK_TOKENS) -> List[Dict[str, str]]:

    """
    Split a section into sub-chunks of at most ``max_tokens`` tokens.

    Token counts come from the module-level Hugging Face ``tokenizer``.
    A section that already fits is returned unchanged as a one-element list.
    Otherwise its content is split on sentence boundaries and sentences are
    packed greedily into chunks; a sentence that alone exceeds the limit is
    further split on whitespace so that it does not produce an over-limit
    chunk. Every sub-chunk keeps the section's title.
    """

    text = section["content"]

    # tokenize() returns the list of token strings; only its length is needed.
    if len(tokenizer.tokenize(text)) <= max_tokens:
        return [section]

    title = section["title"]
    chunks = []
    pending = []          # pieces collected for the chunk being built
    pending_tokens = 0    # token count of the pieces in ``pending``

    for piece, piece_tokens in _iter_token_bounded_pieces(text, max_tokens):

        if pending_tokens + piece_tokens > max_tokens:
            # Adding this piece would overflow: close the current chunk and
            # start a new one with the piece.
            if pending:
                chunks.append({
                    "title": title,
                    "content": " ".join(pending).strip()
                })
            pending = [piece]
            pending_tokens = piece_tokens
        else:
            pending.append(piece)
            pending_tokens += piece_tokens

    if pending:
        chunks.append({
            "title": title,
            "content": " ".join(pending).strip()
        })

    return chunks


def _iter_token_bounded_pieces(text, max_tokens):

    """
    Yield ``(piece, token_count)`` pairs covering ``text`` in order.

    Pieces are sentences (naive split after '.', '!' or '?'). A sentence with
    more than ``max_tokens`` tokens is broken into runs of whitespace-separated
    words that each stay within the limit where possible.
    """

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence_tokens = len(tokenizer.tokenize(sentence))

        if sentence_tokens <= max_tokens:
            yield sentence, sentence_tokens
            continue

        # Over-long sentence: pack words greedily. Per-word counts are summed,
        # which is assumed to match the count of the joined words for this
        # tokenizer - TODO confirm if the tokenizer is changed.
        words = []
        words_tokens = 0
        for word in sentence.split():
            word_tokens = len(tokenizer.tokenize(word))
            if words and words_tokens + word_tokens > max_tokens:
                yield " ".join(words), words_tokens
                words = []
                words_tokens = 0
            words.append(word)
            words_tokens += word_tokens
        if words:
            yield " ".join(words), words_tokens

def further_chunk_sections(sections: List[Dict[str, str]], max_tokens: int = MAX_CHUNK_TOKENS) -> List[Dict[str, str]]:

    """
    Run ``chunk_section_by_tokens`` on every section and return all resulting
    sub-chunks as one flat list, in section order.
    """

    return [
        piece
        for section in sections
        for piece in chunk_section_by_tokens(section, max_tokens=max_tokens)
    ]


# GOLD STANDARD Q&A EXTRACTION

In [38]:
def save_to_json_QA(data, filename):

    """
    Write ``data`` to ``filename`` as UTF-8 JSON (4-space indent, non-ASCII
    characters kept verbatim) and print a gold-standard confirmation line.
    """

    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

    print(f"Gold standard JSON saved to {filename}")



def extract_text_from_pdf(pdf_path):

    """
    Return the text of a PDF, read page by page with pdfplumber.

    Each page that yields text contributes that text followed by a newline;
    pages without extractable text are skipped.
    """

    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]

    return "".join(f"{content}\n" for content in page_texts if content)

def extract_qa_pairs(text):

    """
    Parse question/answer pairs out of ``text``.

    Expected layout: 'Question <number>:' followed by the question, then
    'Answer :' followed by the answer, which runs until the next
    'Question <number>:' or the end of the text. Matching is
    case-insensitive and spans line breaks.

    Returns a list of dicts with the stripped 'question_number', 'question'
    and 'answer' strings, in order of appearance.
    """

    pair_re = re.compile(
        r"Question\s*(\d+)\s*:\s*(.*?)\s*Answer\s*:\s*(.*?)(?=Question\s*\d+\s*:|$)",
        re.IGNORECASE | re.DOTALL
    )

    return [
        {
            "question_number": number.strip(),
            "question": question.strip(),
            "answer": answer.strip()
        }
        for number, question, answer in pair_re.findall(text)
    ]


def build_gold_standard():

    """
    Build the gold-standard Q&A file from all ``*_QA.pdf`` files.

    For every Q&A PDF in ``AGREEMENTS_DIR`` the agreement id is the file stem
    up to '_QA'. The PDF text is extracted, parsed into Q&A pairs and stored
    under that id. The resulting dictionary is ordered by the numeric part of
    the id (ids of the form 'Agreement_<number>'; anything else sorts as 0)
    and written to ``GOLD_STANDARD_JSON``.
    """

    def numeric_order(item):
        # item is an (agreement_id, qa_pairs) pair. Ids without an underscore
        # or with a non-numeric second part fall back to 0 instead of raising.
        parts = item[0].split('_')
        if len(parts) > 1 and parts[1].isdecimal():
            return int(parts[1])
        return 0

    gold_standard = {}

    # sorted() keeps the processing order reproducible, which also fixes the
    # relative order of ids that share the same sort key.
    for pdf_file in sorted(AGREEMENTS_DIR.glob("*_QA.pdf")):

        # Agreement identifier taken from the filename, e.g. "Agreement_N".
        agreement_id = pdf_file.stem.split("_QA")[0]
        print(f"Processing Q&A for {agreement_id} from {pdf_file.name}")

        # Extract text with pdfplumber, then parse the Q&A pairs.
        text = extract_text_from_pdf(pdf_file)
        gold_standard[agreement_id] = extract_qa_pairs(text)

    ordered_gold_standard = dict(sorted(gold_standard.items(), key=numeric_order))

    save_to_json(ordered_gold_standard, GOLD_STANDARD_JSON)



# GENERATIVE RETRIEVAL FUNCTIONS (FAISS only and FAISS + BM25 hybrid)

In [39]:
class FaissRetriever:
    """
    Dense retriever: sentence-transformer embeddings searched with a flat
    FAISS inner-product index over L2-normalised vectors.

    Exposes both ``build_index(chunks)`` and ``index(chunks)`` so it can be
    driven through the same ``index``/``search`` calls as the other
    retrievers in this file.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.embedder = SentenceTransformer(model_name)
        # Stored under a private name: an instance attribute called ``index``
        # would shadow the ``index`` method and make ``retriever.index(chunks)``
        # fail with "'NoneType' object is not callable".
        self._faiss_index = None
        self.chunk_texts = None

    def build_index(self, chunks: List[Dict[str, str]]):
        """Embed the 'content' of every chunk and (re)build the FAISS index."""
        self.chunk_texts = [chunk["content"] for chunk in chunks]
        embeddings = self.embedder.encode(self.chunk_texts, convert_to_numpy=True)
        # Normalise in place so that inner product equals cosine similarity.
        faiss.normalize_L2(embeddings)
        self._faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
        self._faiss_index.add(embeddings)

    def index(self, chunks: List[Dict[str, str]]):
        """Alias of ``build_index`` matching the other retrievers' interface."""
        self.build_index(chunks)

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, object]]:
        """
        Return up to ``top_k`` results as dicts with 'text' and 'score',
        best match first.

        Raises RuntimeError when called before an index has been built.
        """
        if self._faiss_index is None:
            raise RuntimeError("FaissRetriever.search() called before build_index()/index().")

        # Never ask for more neighbours than there are chunks: FAISS pads the
        # result with label -1, which would silently select chunk_texts[-1].
        k = min(top_k, len(self.chunk_texts))
        if k <= 0:
            return []

        q_embed = self.embedder.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(q_embed)
        scores, indices = self._faiss_index.search(q_embed, k)

        return [
            {"text": self.chunk_texts[idx], "score": float(score)}
            for score, idx in zip(scores[0], indices[0])
            if idx >= 0  # defensive: skip padding labels
        ]

class BM25Retriever:
    """
    Sparse BM25 retriever over the chunks' 'content'.

    Documents and queries go through the same tokenisation (lower-cased word
    characters), so a query term such as "Deposit?" matches "deposit" in a
    document.
    """

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        # Lower-case and keep runs of word characters; punctuation attached to
        # a word would otherwise prevent any match.
        return re.findall(r"\w+", text.lower())

    def index(self, chunks: List[Dict[str, str]]):
        """Tokenise every chunk and build the BM25 model."""
        from rank_bm25 import BM25Okapi
        self.docs = [chunk["content"] for chunk in chunks]
        self.tokenized_docs = [self._tokenize(doc) for doc in self.docs]
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, object]]:
        """Return up to ``top_k`` dicts with 'text' and 'score', best first."""
        scores = self.bm25.get_scores(self._tokenize(query))
        # max(..., 0) keeps a negative top_k from slicing "all but the last".
        ranked_idx = np.argsort(scores)[::-1][:max(top_k, 0)]
        return [{"text": self.docs[idx], "score": float(scores[idx])} for idx in ranked_idx]

class HybridRetriever:
    """
    Combine FAISS and BM25 by averaging their normalised scores per chunk.

    Both retrievers score every indexed chunk; each score list is min-max
    scaled to [0, 1], the two scaled scores of the same chunk are averaged,
    and the chunks with the highest averages are returned.
    """

    def __init__(self):
        self.faiss_retriever = FaissRetriever()
        self.bm25_retriever = BM25Retriever()
        self.chunk_texts = []

    def index(self, chunks: List[Dict[str, str]]):
        """Index the same chunks in both underlying retrievers."""
        self.faiss_retriever.build_index(chunks)
        self.bm25_retriever.index(chunks)
        # Kept to know how many chunks exist when searching.
        self.chunk_texts = [chunk["content"] for chunk in chunks]

    @staticmethod
    def _min_max_by_text(results: List[Dict[str, object]]) -> Dict[str, float]:
        """Map each result text to its min-max scaled score in [0, 1]."""
        if not results:
            return {}
        values = [item["score"] for item in results]
        low = min(values)
        span = max(values) - low
        scaled = {}
        for item in results:
            # When all scores are equal the retriever expresses no preference,
            # so it contributes 0 for every chunk.
            value = (item["score"] - low) / span if span > 0 else 0.0
            # Chunks with identical text collapse into one entry (best score).
            scaled[item["text"]] = max(value, scaled.get(item["text"], 0.0))
        return scaled

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, object]]:
        """Return up to ``top_k`` dicts with 'text' and fused 'score', best first."""
        n_docs = len(self.chunk_texts)
        if n_docs == 0 or top_k <= 0:
            return []

        # Ask each retriever for all chunks so that the scores being averaged
        # always belong to the same chunk (rank positions are not comparable).
        dense = self._min_max_by_text(self.faiss_retriever.search(query, n_docs))
        sparse = self._min_max_by_text(self.bm25_retriever.search(query, n_docs))

        fused = {}
        for text in list(dense) + list(sparse):
            if text not in fused:
                fused[text] = (dense.get(text, 0.0) + sparse.get(text, 0.0)) / 2.0

        # sorted() is stable, so ties keep the dense-retriever order.
        ranked = sorted(fused.items(), key=lambda pair: pair[1], reverse=True)[:top_k]
        return [{"text": text, "score": score} for text, score in ranked]


# GENERATIVE MODELS LOADING

In [40]:

def load_generative_pipelines():
    """
    Load the generative models as Hugging Face pipelines on GPU device 0.

    Returns a dict mapping a short model name to its pipeline.

    The checkpoints are loaded with the "text-generation" task: Mistral-7B is
    a decoder-only (causal) language model, which is not a supported model
    type for the encoder-decoder "text2text-generation" task.
    NOTE(review): "simmo/legal-llama-3" is presumed to be a causal Llama
    checkpoint as well - confirm against its model card.
    """

    model_ids = {
        "mistralai": "mistralai/Mistral-7B-Instruct-v0.2",
        "legal_llama": "simmo/legal-llama-3",
    }

    return {
        name: pipeline("text-generation", model=model_id, device=0)
        for name, model_id in model_ids.items()
    }

# RUNNING THE RETRIEVAL & QA EXPERIMENT & EVALUATION METRICS

In [41]:
# ----- Experiment for Generative Models -----
def run_gen_experiments(gold_data, processed_docs, gen_pipelines, retriever_dict, top_k=3):
    """
    Run every (retrieval strategy, generative model) combination on the
    gold-standard questions.

    For each agreement in ``gold_data`` that has processed chunks:
      - the chunks are token-chunked and indexed by each retriever
        (every retriever must expose ``index(chunks)`` and
        ``search(question, top_k=...)``);
      - for each question the top ``top_k`` chunks are retrieved and joined
        into a prompt;
      - each pipeline in ``gen_pipelines`` generates an answer.

    Returns a list of result dicts with the keys 'retrieval_strategy',
    'Model', 'agreement_id', 'question', 'gold_answer', 'pred_answer',
    'time' (retrieval + this model's generation, in seconds) and
    'model_time' (generation only).
    """
    results = []

    for agreement_id, qa_pairs in tqdm(gold_data.items(), desc="Processing Agreements"):

        if agreement_id not in processed_docs:
            print(f"Warning: No processed chunks for {agreement_id}")
            continue

        chunks = processed_docs[agreement_id]

        # Skip if chunks are empty or contain no useful text.
        if not chunks or all(not re.search(r'\w', chunk.get("content", "")) for chunk in chunks):
            print(f"Skipping {agreement_id} due to empty content.")
            continue

        # Further chunk if needed, using the file-wide chunk size.
        chunks = further_chunk_sections(chunks, max_tokens=MAX_CHUNK_TOKENS)

        for retrieval_strategy, retriever in retriever_dict.items():

            # Each strategy indexes the chunks of the current agreement.
            retriever.index(chunks)

            for qa in qa_pairs:

                question = qa["question"]
                gold_answer = qa["answer"]

                # Retrieval is timed once per question; it is shared by all models.
                retrieval_start = time.time()
                retrieved = retriever.search(question, top_k=top_k)
                retrieval_time = time.time() - retrieval_start

                # Combine the retrieved context into a single prompt.
                context = " ".join(item["text"] for item in retrieved)

                prompt = (
                            "Answer the question in one or two sentences and be direct as possible.\n"
                            "Do not repeat the question or the context; only provide the final answer.\n"
                            f"Question: {question}\n"
                            f"Context: {context}\n"
                            "Final Answer:"
                        )

                for model_name, gen_pipeline in gen_pipelines.items():
                    gen_start = time.time()

                    # max_new_tokens bounds only the generated continuation.
                    # max_length would also count the prompt tokens, and the
                    # prompt alone can be far longer than 200 tokens.
                    gen_output = gen_pipeline(prompt, max_new_tokens=200, truncation=True)

                    elapsed_model = time.time() - gen_start

                    pred_answer = gen_output[0]['generated_text']
                    # Causal-LM pipelines return the prompt followed by the
                    # continuation; keep only the newly generated answer.
                    if pred_answer.startswith(prompt):
                        pred_answer = pred_answer[len(prompt):]
                    pred_answer = pred_answer.strip()

                    results.append({
                        "retrieval_strategy": retrieval_strategy,
                        "Model": model_name,
                        "agreement_id": agreement_id,
                        "question": question,
                        "gold_answer": gold_answer,
                        "pred_answer": pred_answer,
                        # Per-model total: must not include the generation
                        # time of models that ran earlier in this loop.
                        "time": retrieval_time + elapsed_model,
                        "model_time": elapsed_model  # generation time only
                    })
    return results



##############################
#EVALUATION METRICS
##############################

def normalize_text(s):

    """
    Normalise a string for comparison: lower-case it, delete ASCII
    punctuation, collapse whitespace runs to single spaces and trim the ends.
    """

    s = s.lower()
    s = s.translate(str.maketrans("", "", string.punctuation))
    s = re.sub(r'\s+', ' ', s)
    # Trim last: removing punctuation can expose whitespace at the edges
    # (e.g. "hello !" -> "hello ").
    return s.strip()


def tokenize_text(s):

    """Return the whitespace-separated tokens of ``normalize_text(s)``."""

    normalized = normalize_text(s)
    return normalized.split()



def evaluate_gen(results: List[Dict[str, str]]) -> Dict:
    """
    Aggregate generation quality per (retrieval strategy, model) pair.

    Results are grouped by ``(res["retrieval_strategy"], res["Model"])``.
    For each group the function reports the ROUGE-L score, the mean BERTScore
    F value (lang="en") and the mean of the recorded 'time' values (0 when a
    result has no 'time').

    Returns a dict keyed by the (strategy, model) tuple with the entries
    'ROUGE-L', 'BERTScore' and 'Avg Time'.
    """
    from collections import defaultdict
    rouge = evaluate.load("rouge")

    buckets = defaultdict(lambda: {"preds": [], "golds": [], "times": []})
    for record in results:
        bucket = buckets[(record["retrieval_strategy"], record["Model"])]
        bucket["preds"].append(record["pred_answer"])
        bucket["golds"].append(record["gold_answer"])
        bucket["times"].append(record.get("time", 0))

    summary = {}
    for combo, bucket in buckets.items():
        predictions = bucket["preds"]
        references = bucket["golds"]

        # ROUGE-L over the whole group.
        rouge_l = rouge.compute(predictions=predictions, references=references)["rougeL"]

        # BERTScore as semantic similarity; only the F tensor is used.
        from bert_score import score as bertscore_score
        _, _, f_scores = bertscore_score(predictions, references, lang="en")

        summary[combo] = {
            "ROUGE-L": rouge_l,
            "BERTScore": float(torch.mean(f_scores)),
            "Avg Time": np.mean(bucket["times"])
        }
    return summary




# MAIN EXECUTION

In [None]:

# ----- Main Function for Generative Experiment -----




def plot_aggregated_metrics(df_agg, images_dir):
    """
    Bar chart of each metric per model, averaged over retrieval strategies.

    Only the metric columns that are actually present in ``df_agg`` are
    plotted; the chart is saved as ``chart_aggregated_metrics.png`` in
    ``images_dir`` (created if missing) and shown.
    """
    candidate_metrics = ["Exact Match", "F1", "Partial F1", "ROUGE-L", "BERTScore", "Semantic Similarity", "Avg Time", "Avg Confidence"]
    # Selecting a column that does not exist raises KeyError, so restrict the
    # list to what this experiment produced.
    metric_columns = [metric for metric in candidate_metrics if metric in df_agg.columns]
    if not metric_columns:
        print("plot_aggregated_metrics: no known metric columns found; skipping chart.")
        return

    images_dir = Path(images_dir)
    images_dir.mkdir(parents=True, exist_ok=True)

    # Group by Model and average over retrievers.
    models_avg = df_agg.groupby("Model")[metric_columns].mean().reset_index()

    # Long format: one row per (Model, Metric).
    df_models = pd.melt(models_avg, id_vars=["Model"], value_vars=metric_columns,
                        var_name="Metric", value_name="Value")

    plt.figure(figsize=(12, 6))
    for metric in metric_columns:
        subset = df_models[df_models["Metric"] == metric]
        plt.bar(subset["Model"] + " (" + metric + ")", subset["Value"])
    plt.xticks(rotation=45, ha="right")
    plt.title("Aggregated Metrics per QA Model (Averaged over Retrievers)")
    plt.ylabel("Value")
    plt.tight_layout()
    plt.savefig(images_dir / "chart_aggregated_metrics.png", dpi=300)
    plt.show()


def plot_metrics_grouped_by_model(df_agg, images_dir):
    """
    One grouped bar chart per metric: models on the x axis, one bar per
    retrieval strategy.

    The retrieval column may be named "Retriever" or "Retrieval Strategy";
    metrics missing from ``df_agg`` are skipped. The figure is saved as
    ``chart_grouped_by_model.png`` in ``images_dir`` (created if missing).
    """
    candidate_metrics = ["Exact Match", "F1", "Partial F1", "ROUGE-L", "BERTScore", "Semantic Similarity", "Avg Time", "Avg Confidence"]
    metrics_list = [metric for metric in candidate_metrics if metric in df_agg.columns]
    retriever_col = next(
        (name for name in ("Retriever", "Retrieval Strategy") if name in df_agg.columns),
        None,
    )
    if not metrics_list or retriever_col is None:
        print("plot_metrics_grouped_by_model: required columns missing; skipping chart.")
        return

    images_dir = Path(images_dir)
    images_dir.mkdir(parents=True, exist_ok=True)

    # squeeze=False always yields a 2-D array, also for a single metric.
    n_metrics = len(metrics_list)
    fig, axes = plt.subplots(n_metrics, 1, figsize=(10, 4 * n_metrics), squeeze=False)
    axes = axes.ravel()

    for ax, metric in zip(axes, metrics_list):
        # rows = Model, columns = retrieval strategy, values = the metric.
        pivot = df_agg.pivot(index="Model", columns=retriever_col, values=metric)

        pivot.plot(kind="bar", ax=ax)
        ax.set_title(f"{metric} by Model and Retrieval Strategy")
        ax.set_xlabel("Model")
        ax.set_ylabel(metric)
        ax.legend(title="Retriever")

    plt.tight_layout()
    plt.savefig(images_dir / "chart_grouped_by_model.png", dpi=300)
    plt.show()




def plot_pivoted_metrics(df_agg, images_dir):
    """
    Grid of bar charts (two per row), one per metric: retrieval strategies on
    the x axis, one bar per model.

    The retrieval column may be named "Retriever" or "Retrieval Strategy";
    metrics missing from ``df_agg`` are skipped. The figure is saved as
    ``chart_pivoted_metrics.png`` in ``images_dir`` (created if missing).
    """
    candidate_metrics = ["Exact Match", "F1", "Partial F1", "ROUGE-L", "BERTScore", "Semantic Similarity", "Avg Time", "Avg Confidence"]
    metrics_list = [metric for metric in candidate_metrics if metric in df_agg.columns]
    retriever_col = next(
        (name for name in ("Retriever", "Retrieval Strategy") if name in df_agg.columns),
        None,
    )
    if not metrics_list or retriever_col is None:
        print("plot_pivoted_metrics: required columns missing; skipping chart.")
        return

    images_dir = Path(images_dir)
    images_dir.mkdir(parents=True, exist_ok=True)

    n_cols = 2
    n_rows = (len(metrics_list) + n_cols - 1) // n_cols  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, metric in zip(axes, metrics_list):
        pivot = df_agg.pivot(index=retriever_col, columns="Model", values=metric)
        pivot.plot(kind="bar", ax=ax)
        ax.set_title(f"Average {metric}: QA Models vs. Retrieval Methods")
        ax.set_xlabel("Retriever")
        ax.set_ylabel(metric)
        ax.legend(title="Model")
        ax.set_xticklabels(pivot.index, rotation=0)

    # Remove the unused cell(s) of the grid.
    for spare in axes[len(metrics_list):]:
        fig.delaxes(spare)

    plt.tight_layout()
    plt.savefig(images_dir / "chart_pivoted_metrics.png", dpi=300)
    plt.show()

    
def plot_processing_time_line(df_detailed, images_dir):
    """
    Line plot of the 'time' column against the row index, one line per
    (retrieval strategy, model) combination.

    The retrieval column may be named "retriever" or "retrieval_strategy".
    The figure is saved as ``chart_processing_time_line.png`` in
    ``images_dir`` (created if missing).
    """
    retriever_col = next(
        (name for name in ("retriever", "retrieval_strategy") if name in df_detailed.columns),
        None,
    )
    if retriever_col is None:
        print("plot_processing_time_line: no retriever column found; skipping chart.")
        return

    images_dir = Path(images_dir)
    images_dir.mkdir(parents=True, exist_ok=True)

    plt.figure(figsize=(10,6))
    for (retriever, model), subset in df_detailed.groupby([retriever_col, "Model"]):
        label = f"{retriever}-{model}"
        plt.plot(subset.index, subset["time"], marker="o", label=label)
    plt.title("Time per QA Example by Retriever-Model Combination")
    plt.xlabel("Example Index")
    plt.ylabel("Time (seconds)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(images_dir / "chart_processing_time_line.png", dpi=300)
    plt.show()

def plot_scatter_with_best_fit(df, x_col, y_col, title, xlabel, ylabel, save_filename, images_dir):
    """
    Scatter ``df[y_col]`` against ``df[x_col]`` and overlay a degree-1
    least-squares line evaluated at 100 points between the smallest and
    largest x value. The figure is saved as ``images_dir / save_filename``
    and shown.
    """
    xs = df[x_col].values
    ys = df[y_col].values

    # Degree-1 polynomial fit evaluated on an evenly spaced x grid.
    fit_coefficients = np.polyfit(xs, ys, 1)
    grid_x = np.linspace(xs.min(), xs.max(), 100)
    grid_y = np.polyval(fit_coefficients, grid_x)

    plt.figure(figsize=(8,6))
    plt.scatter(xs, ys, alpha=0.6, label="Data points")
    plt.plot(grid_x, grid_y, color='red', label="Best-fit line")
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.tight_layout()

    target = images_dir / save_filename
    plt.savefig(target, dpi=300)
    plt.show()


def plot_boxplot_time(df_detailed, images_dir):
    """
    Box plot of the 'time' column per retriever-model combination.

    The retrieval column may be named "retriever" or "retrieval_strategy".
    ``df_detailed`` is not modified. The figure is saved as
    ``boxplot_time.png`` in ``images_dir`` (created if missing).
    """
    retriever_col = next(
        (name for name in ("retriever", "retrieval_strategy") if name in df_detailed.columns),
        None,
    )
    if retriever_col is None:
        print("plot_boxplot_time: no retriever column found; skipping chart.")
        return

    images_dir = Path(images_dir)
    images_dir.mkdir(parents=True, exist_ok=True)

    # Local series instead of a new column: the caller's DataFrame stays as is.
    combination = df_detailed[retriever_col] + "-" + df_detailed['Model']
    combinations = list(combination.unique())
    data = [df_detailed.loc[combination == comb, 'time'].values for comb in combinations]

    plt.figure(figsize=(10,6))
    plt.boxplot(data, patch_artist=True)
    plt.xlabel("Retriever-Model Combination")
    plt.ylabel("Time (seconds)")
    plt.title("Box Plot: Processing Time per Combination")
    # Boxes are drawn at positions 1..n; label them via xticks rather than the
    # boxplot ``labels`` keyword, which newer matplotlib releases deprecate.
    plt.xticks(range(1, len(combinations) + 1), combinations, rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(images_dir / "boxplot_time.png", dpi=300)
    plt.show()


def plot_cdf_time(df_detailed, images_dir):
    """
    Plot the empirical CDF of the 'time' column.

    The figure is saved as ``cdf_time.png`` in ``images_dir`` (created if
    missing) and shown.
    """
    images_dir = Path(images_dir)
    images_dir.mkdir(parents=True, exist_ok=True)

    sorted_time = np.sort(df_detailed['time'].values)
    n_samples = len(sorted_time)
    # Empirical CDF: the i-th smallest value (1-based) has i/n of the samples
    # at or below it, so the curve ends at 1.0.
    cdf = np.arange(1, n_samples + 1) / float(n_samples) if n_samples else np.array([])

    plt.figure(figsize=(8,6))
    plt.plot(sorted_time, cdf, marker=".", linestyle="none")
    plt.xlabel("Time (seconds)")
    plt.ylabel("CDF")
    plt.title("CDF of Processing Time for Q&A Examples")
    plt.tight_layout()
    plt.savefig(images_dir / "cdf_time.png", dpi=300)
    plt.show()




def main():
    """
    Run the full generative experiment end to end:

      1. convert the agreement PDFs and build the gold-standard JSON;
      2. load the processed chunks and the generative pipelines;
      3. run every retrieval strategy / model combination;
      4. save the detailed results and the aggregated metrics as CSV;
      5. draw the charts into ``IMAGES_DIR``.
    """

    # 1. Preprocess and build gold standard.
    process_all_agreement_pdfs()
    build_gold_standard()

    with open(GOLD_STANDARD_JSON, "r", encoding="utf-8") as f:
        gold_data = json.load(f)

    print("Gold data keys:", list(gold_data.keys()))

    processed_docs = {}
    for json_file in PROCESSED_DIR.glob("Agreement_*.json"):
        agreement_id = json_file.stem
        with open(json_file, "r", encoding="utf-8") as f:
            processed_docs[agreement_id] = json.load(f)

    # 2. Load generative QA pipelines.
    gen_pipelines = load_generative_pipelines()

    # 3. Initialize retrieval strategies: FAISS only, and FAISS + BM25 hybrid.
    retrieval_strategies = {
        "faiss": FaissRetriever(),
        "hybrid": HybridRetriever()
    }

    # 4. Run experiments using the generative models.
    results = run_gen_experiments(gold_data, processed_docs, gen_pipelines, retrieval_strategies, top_k=3)
    if not results:
        # Nothing matched (e.g. no agreement has both Q&A pairs and chunks);
        # results[0] below would raise IndexError.
        print("No results were generated; nothing to save, evaluate or plot.")
        return
    print("Generated results keys:", results[0].keys())

    # Save detailed results.
    df_detailed = pd.DataFrame(results)
    detailed_csv_path = BASE_DIR / "detailed_results_gen.csv"
    df_detailed.to_csv(detailed_csv_path, index=False)
    print(f"Detailed generative results saved to {detailed_csv_path}")

    # 5. Evaluate the generative experiment.
    metrics = evaluate_gen(results)
    metric_rows = []

    for (strategy, model), scores in metrics.items():
        row = {"Retrieval Strategy": strategy, "Model": model}
        row.update(scores)
        metric_rows.append(row)

    df_agg = pd.DataFrame(metric_rows)
    agg_csv_path = BASE_DIR / "aggregated_metrics_gen.csv"
    df_agg.to_csv(agg_csv_path, index=False)
    print(f"Aggregated generative metrics saved to {agg_csv_path}")

    print(df_agg.dtypes)
    print(df_agg.head())

    # The charts are saved into IMAGES_DIR, which may not exist yet.
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)

    plot_aggregated_metrics(df_agg, IMAGES_DIR)
    plot_metrics_grouped_by_model(df_agg, IMAGES_DIR)
    plot_pivoted_metrics(df_agg, IMAGES_DIR)
    plot_processing_time_line(df_detailed, IMAGES_DIR)

    # Scatter plots need per-example columns; each plot is drawn only when
    # both of its columns are present in the detailed results.
    scatter_specs = [
        ("confidence", "BERTScore",
         "Scatter Plot: BERTScore vs Confidence with Best-fit Line",
         "Confidence Score", "BERTScore", "scatter_bert_sem_sim_conf.png"),
        ("confidence", "Semantic Similarity",
         "Scatter Plot: Semantic Similarity vs Confidence with Best-fit Line",
         "Confidence Score", "Semantic Similarity", "scatter_sem_sim_conf.png"),
        ("confidence", "F1",
         "Scatter Plot: F1 vs Confidence with Best-fit Line",
         "Confidence Score", "F1 Score", "scatter_f1_conf.png"),
    ]

    for x_col, y_col, title, xlabel, ylabel, filename in scatter_specs:
        if x_col in df_detailed.columns and y_col in df_detailed.columns:
            plot_scatter_with_best_fit(df_detailed, x_col, y_col,
                                       title, xlabel, ylabel, filename, IMAGES_DIR)
        else:
            print(f"Skipping {filename}: columns '{x_col}' and/or '{y_col}' are not in the detailed results.")
    







# Run the whole experiment when this file is executed as the main module.
if __name__ == "__main__":
    main()

# Download all files