In [8]:
import os
import json
import numpy as np
import re
from difflib import SequenceMatcher
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd


# Root directory that evaluate_all_jsons walks. The COMPLETED_RUNS_DIR environment
# variable overrides the default below, so the notebook can be pointed at another
# location without editing this cell.
completed_runs_dir = os.environ.get(
    "COMPLETED_RUNS_DIR",
    "/cs/student/projects2/aisd/2024/amanivan/snlp/completed_runs_2",
)

def extract_k_from_folder(folder_name):
    """Return the integer written after the final underscore of ``folder_name``.

    The name has to end in ``_<digits>`` (``"9_5"`` gives ``5``). ``None`` is
    returned for any name that does not end that way.
    """
    trailing_number = re.search(r'_(\d+)$', folder_name)
    if trailing_number is None:
        return None
    return int(trailing_number.group(1))

def is_relevant(retrieved_text, ground_truth_list, threshold=0.6):
    """Tell whether ``retrieved_text`` is similar enough to any ground-truth string.

    Args:
        retrieved_text: Text of one retrieved chunk.
        ground_truth_list: Iterable of ground-truth strings to compare against.
        threshold: Similarity that must be strictly exceeded for a match.

    Returns:
        True as soon as one ground-truth string has a ``SequenceMatcher`` ratio
        above ``threshold``; False otherwise (including for an empty list).
    """
    for gt in ground_truth_list:
        matcher = SequenceMatcher(None, retrieved_text, gt)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio(),
        # so a pair that fails either bound cannot pass the exact comparison.
        # Checking them first skips the expensive ratio() for clear non-matches
        # without changing the outcome.
        if (
            matcher.real_quick_ratio() > threshold
            and matcher.quick_ratio() > threshold
            and matcher.ratio() > threshold
        ):
            return True
    return False

def evaluate_ranking(sorted_chunks, ground_truth, k, n):
    """Compute Precision@K and Recall@K for one query.

    Args:
        sorted_chunks: Retrieved chunk texts, best first.
        ground_truth: List of ground-truth answer strings.
        k: Cut-off used for the metrics; also the precision denominator.
        n: Upper bound on how many chunks are looked at. Only the first
            ``min(n, k)`` chunks are scored.

    Returns:
        Dict with ``"Precision@K"`` (relevant chunks in the window divided by
        ``k``) and ``"Recall@K"`` (ground-truth answers matched by at least one
        chunk in the window, divided by the number of answers). A metric whose
        denominator would be zero is reported as ``0.0``.
    """
    window = sorted_chunks[:n][:k]

    # hits[i][j] is True when chunk i matches ground-truth answer j.
    hits = [[is_relevant(chunk, [answer]) for answer in ground_truth] for chunk in window]

    relevant_chunks = sum(1 for row in hits if any(row))
    # Recall is counted per answer, not per chunk: several chunks matching the
    # same answer must not push recall above 1.
    recovered_answers = sum(1 for column in zip(*hits) if any(column))

    precision = relevant_chunks / k if k > 0 else 0.0
    recall = recovered_answers / len(ground_truth) if ground_truth else 0.0
    return {
        "Precision@K": precision,
        "Recall@K": recall
    }

def _average_metrics(per_query_results):
    """Average each metric over a list of per-query metric dicts (NaN when the list is empty)."""
    if not per_query_results:
        # No queries in the file: report the metrics as missing instead of
        # failing on per_query_results[0].
        return {"Precision@K": float("nan"), "Recall@K": float("nan")}
    return {
        key: np.mean([res[key] for res in per_query_results])
        for key in per_query_results[0]
    }


def process_json_file(file_path, k, n):
    """Evaluate every query stored in one JSON file and average the metrics.

    The file must contain a list of entries. Each entry needs a ``"snippets"``
    list whose items have an ``"answer"`` key. ``"retrieved_chunks_unranked"``
    and ``"retrieved_chunks_ranked"`` are optional lists of dicts with a
    ``"text"`` key; ranked chunks are ordered by ``"cross_encoder_score"``
    (missing scores count as 0), highest first.

    Args:
        file_path: Path of the JSON file.
        k: Cut-off forwarded to ``evaluate_ranking``.
        n: Top-N limit forwarded to ``evaluate_ranking``.

    Returns:
        ``{"unranked": {...}, "ranked": {...}}`` where each inner dict maps a
        metric name to its mean over all entries. For a file with no entries
        both inner dicts hold NaN for ``"Precision@K"`` and ``"Recall@K"``.
    """
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    unranked_results, ranked_results = [], []

    for entry in data:
        ground_truth_answers = [s["answer"] for s in entry["snippets"]]
        unranked_chunks = [chunk["text"] for chunk in entry.get("retrieved_chunks_unranked", [])]
        ranked_chunks = sorted(
            entry.get("retrieved_chunks_ranked", []),
            key=lambda x: x.get("cross_encoder_score", 0),
            reverse=True,
        )
        ranked_chunks_text = [chunk["text"] for chunk in ranked_chunks]

        unranked_results.append(evaluate_ranking(unranked_chunks, ground_truth_answers, k, n))
        ranked_results.append(evaluate_ranking(ranked_chunks_text, ground_truth_answers, k, n))

    return {
        "unranked": _average_metrics(unranked_results),
        "ranked": _average_metrics(ranked_results),
    }

def evaluate_all_jsons(n):
    """Evaluate every ``sampled_queries_*.json`` file below ``completed_runs_dir``.

    A folder is treated as a run when its name ends in ``_<k>`` and the part
    before the first underscore is an integer (the "combo" id). Other folders
    are skipped. This includes the root itself when its name merely ends in a
    number, such as ``completed_runs_2``.

    Args:
        n: Top-N limit forwarded to ``process_json_file``.

    Returns:
        DataFrame with one row per evaluated file and the columns ``folder``,
        ``dataset``, ``k``, ``combo``, ``Precision@K`` and ``Recall@K``. The
        two metrics are taken from the "ranked" results. Rows follow sorted
        folder and file order. The columns are present even when nothing was
        evaluated.
    """
    prefix, suffix = "sampled_queries_", ".json"
    columns = ["folder", "dataset", "k", "combo", "Precision@K", "Recall@K"]
    evaluation_results = []

    for root, dirs, files in os.walk(completed_runs_dir):
        dirs.sort()  # os.walk honours in-place edits, making traversal order deterministic

        folder_name = os.path.basename(root)
        k = extract_k_from_folder(folder_name)
        if k is None:
            continue
        try:
            combo = int(folder_name.split("_")[0])
        except ValueError:
            # Name ends in a number but does not start with a combo id.
            continue

        for file in sorted(files):
            if not (file.startswith(prefix) and file.endswith(suffix)):
                continue
            # Strip exactly one leading prefix and one trailing suffix.
            dataset = file[len(prefix):-len(suffix)]
            results = process_json_file(os.path.join(root, file), k, n)
            evaluation_results.append({
                "folder": folder_name,
                "dataset": dataset,
                "k": k,
                "combo": combo,
                "Precision@K": results["ranked"]["Precision@K"],
                "Recall@K": results["ranked"]["Recall@K"],
            })

    return pd.DataFrame(evaluation_results, columns=columns)

# Run the evaluation loop; later cells read the resulting DataFrame from `recursive_results`.
recursive_results = evaluate_all_jsons(n=4)  # n is the top-N limit (4 here); change as needed



In [9]:
# def evaluate_all_jsons(n):
#     """ Loops through all JSON files in completed_runs and evaluates them using the correct K. """
#     evaluation_results = []

#     for root, dirs, files in os.walk(completed_runs_dir):
#         folder_name = os.path.basename(root)  # Get the folder name (e.g., "9_5")
#         k = extract_k_from_folder(folder_name)  # Extract K dynamically

#         if k is None:
#             continue  # Skip folders that don't match the pattern `x_y`

#         for file in files:
#             if file.startswith("sampled_queries_") and file.endswith(".json"):
#                 file_path = os.path.join(root, file)
#                 results = process_json_file(file_path, k, n)
                
#                 evaluation_results.append({
#                     "folder": folder_name,  # Store folder name for hover interaction
#                     "k": k,
#                     "Precision@K": results["ranked"]["Precision@K"],  # Using ranked results
#                     "Recall@K": results["ranked"]["Recall@K"],
#                     # "MRR": results["ranked"]["MRR"],
#                 })

#     # Convert to DataFrame
#     df = pd.DataFrame(evaluation_results)
    
#     # Generate interactive scatter plot
#     # plot_evaluation_results_from_df(df)
#     return df

# # Run the evaluation loop with automatic visualization
# recursive_results = evaluate_all_jsons(n=3)  # Set top-N to 3, change as needed

In [20]:
import plotly.express as px
import pandas as pd
import os

def plot_evaluation_results_from_df(df):
    """Show an interactive Precision@K versus Recall@K scatter plot for ``df``.

    Points are coloured by the ``k`` column, and ``folder`` and ``k`` are added
    to the hover tooltip. The figure is displayed and nothing is returned.
    """
    scatter_options = dict(
        x="Precision@K",
        y="Recall@K",
        color="k",
        title="Interactive Scatter Plot of Precision vs Recall (Hover Shows Folder Name)",
        labels={"k": "K Value"},
        hover_data={"folder": True, "k": True},
    )
    figure = px.scatter(df, **scatter_options)

    # Thin black outline around every marker.
    marker_outline = {"width": 1, "color": "Black"}
    figure.update_traces(marker={"line": marker_outline})

    # Fixed canvas size in pixels.
    figure.update_layout(width=1000, height=800)

    figure.show()

# Needs `recursive_results` from the evaluation cell (evaluate_all_jsons) to exist in the kernel.
plot_evaluation_results_from_df(recursive_results)



In [11]:
import sys
!pip install nbformat ipython notebook


Defaulting to user installation because normal site-packages is not writeable


In [19]:
import plotly.express as px

# Display names keyed by combo id (the integer before the first "_" of a run folder name).
# NOTE(review): nothing in the code visible here reads this dict. prepare_longform_data
# builds its own mapping and spells ids 9 and 10 "DSBERT" rather than "DSbert" — confirm
# the intended spelling and keep a single mapping.
combo_labels = {
    7: "RCTS_SBERT_Cosine",
    8: "RCTS_SBERT_BM25",
    9: "RCTS_DSbert_Cosine",
    10: "RCTS_DSbert_BM25",
    11: "RCTS_GTE_Cosine",
    12: "RCTS_GTE_BM25"
}

def prepare_longform_data(df):
    """Return a long-form copy of the results with one row per (run, metric).

    ``combo`` is recomputed from the integer before the first underscore of
    ``folder`` and mapped to a readable ``combo_label``. The ``Precision@K``
    and ``Recall@K`` columns are then unpivoted into ``Metric`` / ``Score``.
    The input frame is left untouched.
    """
    label_by_combo = {
        7: "RCTS_SBERT_Cosine",
        8: "RCTS_SBERT_BM25",
        9: "RCTS_DSBERT_Cosine",
        10: "RCTS_DSBERT_BM25",
        11: "RCTS_GTE_Cosine",
        12: "RCTS_GTE_BM25",
    }

    combo_ids = df["folder"].apply(lambda name: int(name.split("_")[0]))
    wide = df.assign(combo=combo_ids)
    wide = wide.assign(combo_label=wide["combo"].map(label_by_combo))

    # Wide -> long: each metric column becomes its own set of rows.
    return wide.melt(
        id_vars=["dataset", "combo", "combo_label", "k", "folder"],
        value_vars=["Precision@K", "Recall@K"],
        var_name="Metric",
        value_name="Score",
    )

def plot_metric_by_k_separate(long_df):
    """Draw, for every dataset, one Precision@K and one Recall@K line chart over ``k``.

    ``long_df`` is the long-form frame with ``dataset``, ``combo_label``, ``k``,
    ``folder``, ``Metric`` and ``Score`` columns. Each chart has one line per
    ``combo_label``. Figures are shown and nothing is returned.
    """
    # (value in the "Metric" column, y-axis label), in the order the figures are shown.
    metric_axes = (("Precision@K", "Precision"), ("Recall@K", "Recall"))

    for dataset in long_df["dataset"].unique():
        rows_for_dataset = long_df[long_df["dataset"] == dataset]
        # Sorted so the points of each line are connected in increasing k.
        ordered = rows_for_dataset.sort_values(by=["combo_label", "k"])

        print(f"📊 Plotting for dataset: {dataset}")

        for metric, axis_label in metric_axes:
            figure = px.line(
                ordered[ordered["Metric"] == metric],
                x="k", y="Score",
                color="combo_label",
                markers=True,
                title=f"{metric} for {dataset} for range of K-values",
                labels={"k": "K Value", "Score": axis_label},
                hover_data={"folder": True, "k": True},
            )
            figure.update_layout(width=1000, height=500)
            figure.show()

# #  Run evaluation and plotting
# results_df = evaluate_all_jsons(n=3)  #change n as needed
# plot_metric_per_dataset(results_df)


# Reshape the evaluation results (`recursive_results`, produced by the evaluation cell)
# into long form, then draw the per-dataset Precision@K / Recall@K charts.
long_df = prepare_longform_data(recursive_results)
plot_metric_by_k_separate(long_df)

📊 Plotting for dataset: maud


📊 Plotting for dataset: contractnli


📊 Plotting for dataset: cuad


📊 Plotting for dataset: privacy_qa
