In [1]:
# Mount Google Drive at /content/drive so the files stored there can be read
# by the cells below. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import json
import numpy as np
import re
from difflib import SequenceMatcher
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.graph_objects as go

In [29]:
# --- Configuration (also read by the plotting cells further down) -----------
# Defined once, up front, so these names exist even when no experiment folder
# is found and are not rebuilt on every loop iteration.
base_folder = "/content/drive/My Drive/SNLP/Response_eval"

# Response types -> colour. Not referenced in this cell; kept because it is a
# notebook-level name that other cells may rely on.
colors = {
    "baseline": "blue",
    "CoT": "green",
    "manually_written": "red"
}

# Response types -> plotly line dash style used for that type's trace.
models = {
    "baseline": "solid",
    "CoT": "dash",
    "manually_written": "dot"
}

k_values = ["k1", "k3", "k5", "k10"]
metrics = ["bert_f1", "rouge_recall", "faithfulness", "answer_relevancy"]

# One sub-folder per experiment. Sorted because os.listdir() returns entries
# in arbitrary order, which would make the processing order non-deterministic.
experiment_folders = sorted(
    f for f in os.listdir(base_folder) if os.path.isdir(os.path.join(base_folder, f))
)


def get_metric_value(k_entry, metric):
    """Return the score for `metric` from a single `data[model][k]` entry.

    Args:
        k_entry: The mapping stored under one response type and one k value.
        metric: Name of the metric to look up (one of `metrics`).

    Returns:
        The value stored directly under `metric` when that key is present;
        otherwise the value found under `metric` inside the stringified
        dictionary stored at `k_entry["ragas_metrics"]`.

    Raises:
        KeyError: If the metric is in neither place.
    """
    if metric in k_entry:
        return k_entry[metric]
    # NOTE(review): eval() executes whatever the results file contains. That is
    # acceptable only while these files are produced by our own pipeline; if the
    # string is always a plain dict literal, prefer ast.literal_eval().
    return eval(k_entry["ragas_metrics"])[metric]


for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    print(f"Checking folder: {experiment_path}")  # Debug print
    print(f"Found JSON files: {json_files}")  # Debug print

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)
        print(f"Processing JSON file: {json_file_path}")  # Debug print

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # The dataset name is the file name without the fixed results suffix.
        dataset_name = json_file.replace("_gpt_evaluation_results_reorganized_by_type.json", "")

        # One figure per metric for this dataset, one trace per response type.
        for metric in metrics:
            fig = go.Figure()

            for model, line_style in models.items():
                y_values = [get_metric_value(data[model][k], metric) for k in k_values]

                fig.add_trace(go.Scatter(
                    x=k_values, y=y_values,
                    mode="lines+markers",
                    line=dict(dash=line_style),
                    name=f"{model}"
                ))

            fig.update_layout(
                title=f"{dataset_name} - {metric.capitalize()} vs K",
                xaxis_title="K value",
                yaxis_title=metric.capitalize(),
                hovermode="x unified"
            )

            fig.show()  # Show each plot separately

Checking folder: /content/drive/My Drive/SNLP/Response_eval/gpt_4o_mini
Found JSON files: ['cuad_gpt_evaluation_results_reorganized_by_type.json', 'contractnli_gpt_evaluation_results_reorganized_by_type.json', 'maud_gpt_evaluation_results_reorganized_by_type.json', 'privacyqa_gpt_evaluation_results_reorganized_by_type.json']
Processing JSON file: /content/drive/My Drive/SNLP/Response_eval/gpt_4o_mini/cuad_gpt_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/My Drive/SNLP/Response_eval/gpt_4o_mini/contractnli_gpt_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/My Drive/SNLP/Response_eval/gpt_4o_mini/maud_gpt_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/My Drive/SNLP/Response_eval/gpt_4o_mini/privacyqa_gpt_evaluation_results_reorganized_by_type.json


### **Plot all 4 types (contractnli, cuad, maud, privacy) on the same plot for each metric, for one response type at a time**

In [30]:
# Comparison plots for each metric across all datasets, showing one response
# prompt type at a time.

# Response type to plot; must be one of the keys of `models`
# ("baseline", "CoT", "manually_written").
response_type = "baseline"
# Upper-case only the first character so "baseline" reads "Baseline" in the
# title while "CoT" keeps its casing.
response_label = response_type[:1].upper() + response_type[1:]

# Read every results file once instead of once per metric: the file contents
# do not depend on the metric being plotted.
dataset_results = []  # list of (dataset_name, parsed JSON) pairs
for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    # Sorted because os.listdir() order is arbitrary, and the trace order
    # decides which default colour each dataset gets.
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        dataset_name = json_file.replace("_gpt_evaluation_results_reorganized_by_type.json", "")
        dataset_results.append((dataset_name, data))

for metric in metrics:
    fig = go.Figure()

    # One trace per dataset, all for the selected response type.
    for dataset_name, data in dataset_results:
        y_values = []
        for k in k_values:
            k_entry = data[response_type][k]
            if metric in k_entry:
                y_values.append(k_entry[metric])
            else:
                # Metric is stored inside the stringified "ragas_metrics" dict.
                # NOTE(review): eval() executes whatever the file contains; if the
                # string is always a plain dict literal, prefer ast.literal_eval().
                y_values.append(eval(k_entry["ragas_metrics"])[metric])

        fig.add_trace(go.Scatter(
            x=k_values, y=y_values,
            mode="lines+markers",
            name=dataset_name
        ))

    # Update layout for the metric-specific comparison across datasets
    fig.update_layout(
        title=f"Comparison of {metric.capitalize()} Across Datasets Using {response_label} Response only",
        xaxis_title="K value",
        yaxis_title=metric.capitalize(),
        hovermode="x unified"
    )

    fig.show()  # Show the plot

### **Plot all 4 types (contractnli, cuad, maud, privacy) and all 3 response types (baseline, CoT, manually written) on the same plot for each metric**

In [31]:
# Comparison across datasets: one plot per metric, with every dataset and every
# response prompt type drawn in the same figure.

# Read every results file once instead of once per metric: the file contents
# do not depend on the metric being plotted.
dataset_results = []  # list of (dataset_name, parsed JSON) pairs
for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    # Sorted because os.listdir() order is arbitrary, and the trace order
    # decides legend order and default colours.
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        dataset_name = json_file.replace("_gpt_evaluation_results_reorganized_by_type.json", "")
        dataset_results.append((dataset_name, data))

for metric in metrics:
    fig = go.Figure()

    for dataset_name, data in dataset_results:
        # One trace per (dataset, response type); the dash style encodes the type.
        for model, line_style in models.items():
            y_values = []
            for k in k_values:
                k_entry = data[model][k]
                if metric in k_entry:
                    y_values.append(k_entry[metric])
                else:
                    # Metric is stored inside the stringified "ragas_metrics" dict.
                    # NOTE(review): eval() executes whatever the file contains; if the
                    # string is always a plain dict literal, prefer ast.literal_eval().
                    y_values.append(eval(k_entry["ragas_metrics"])[metric])

            fig.add_trace(go.Scatter(
                x=k_values, y=y_values,
                mode="lines+markers",
                line=dict(dash=line_style),
                name=f"{dataset_name} - {model}"
            ))

    fig.update_layout(
        title=f"Comparison of {metric.capitalize()} Across Datasets",
        xaxis_title="K value",
        yaxis_title=metric.capitalize(),
        hovermode="x unified"
    )

    fig.show()

### **Plotting the average scores (across domains) for each metric**

In [32]:
# Average every metric over all results files found in each experiment folder,
# then plot the averages against k.
#
# `dataset_averages` is keyed by experiment folder name (not by dataset): each
# JSON file inside a folder contributes one sample per (response type, k,
# metric), and those samples are averaged together.

# Raw per-file scores: experiment folder -> response type -> k -> metric -> list.
# Kept separate from the averages so no structure changes type part-way through.
collected_scores = {}

for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    # Sorted because os.listdir() returns entries in arbitrary order.
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    folder_scores = {
        model: {k: {metric: [] for metric in metrics} for k in k_values}
        for model in models
    }

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for model in models:
            for k in k_values:
                k_entry = data[model][k]
                for metric in metrics:
                    if metric in k_entry:
                        value = k_entry[metric]
                    else:
                        # Metric is stored inside the stringified "ragas_metrics" dict.
                        # NOTE(review): eval() executes whatever the file contains; if the
                        # string is always a plain dict literal, prefer ast.literal_eval().
                        value = eval(k_entry["ragas_metrics"])[metric]
                    folder_scores[model][k][metric].append(value)

    collected_scores[experiment_folder] = folder_scores

# Mean of the collected samples, with the same nesting as `collected_scores`.
dataset_averages = {
    experiment_name: {
        model: {
            k: {metric: np.mean(folder_scores[model][k][metric]) for metric in metrics}
            for k in k_values
        }
        for model in models
    }
    for experiment_name, folder_scores in collected_scores.items()
}


# Comparison of average scores across domains by metric
for metric in metrics:
    fig = go.Figure()

    for experiment_name in dataset_averages:
        for model, line_style in models.items():
            y_values = [dataset_averages[experiment_name][model][k][metric] for k in k_values]

            fig.add_trace(go.Scatter(
                x=k_values, y=y_values,
                mode="lines+markers",
                line=dict(dash=line_style),
                name=f"{experiment_name} - {model}"
            ))

    # The title states that these are averages, so this figure is not confused
    # with the per-dataset comparison plot, which previously used the same title.
    fig.update_layout(
        title=f"Average {metric.capitalize()} Across Domains",
        xaxis_title="K value",
        yaxis_title=metric.capitalize(),
        hovermode="x unified"
    )

    fig.show()

In [23]:
# Display the averaged scores for inspection. The dict is nested as
# experiment folder -> response type -> k value -> metric -> mean score.
dataset_averages

{'gpt_4o_mini': {'baseline': {'k1': {'bert_f1': np.float64(0.6756790645574171),
    'rouge_recall': np.float64(0.23319962722333895),
    'faithfulness': np.float64(0.61475),
    'answer_relevancy': np.float64(0.383475)},
   'k3': {'bert_f1': np.float64(0.7064165109672498),
    'rouge_recall': np.float64(0.14159081951908503),
    'faithfulness': np.float64(0.79445),
    'answer_relevancy': np.float64(0.60285)},
   'k5': {'bert_f1': np.float64(0.7062153512439162),
    'rouge_recall': np.float64(0.10204207196866405),
    'faithfulness': np.float64(0.8328),
    'answer_relevancy': np.float64(0.67945)},
   'k10': {'bert_f1': np.float64(0.7059749974341122),
    'rouge_recall': np.float64(0.0598606874629207),
    'faithfulness': np.float64(0.846075),
    'answer_relevancy': np.float64(0.734925)}},
  'CoT': {'k1': {'bert_f1': np.float64(0.657868018047404),
    'rouge_recall': np.float64(0.16143429164836362),
    'faithfulness': np.float64(0.555775),
    'answer_relevancy': np.float64(0.3530749