In [1]:
# Colab-only setup: attach Google Drive to the runtime's filesystem so the
# result files under /content/drive can be read by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os
import json
import numpy as np
import re
from difflib import SequenceMatcher
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import itertools



In [7]:
# Root folder on Drive. Every sub-directory is treated as one experiment and is
# expected to contain one evaluation-result JSON file per dataset.
base_folder = "/content/drive/MyDrive/SNLP/Plot"

# os.listdir returns entries in arbitrary order; sort so that plot order (and
# therefore trace colours in later cells) is reproducible between runs.
experiment_folders = sorted(
    f for f in os.listdir(base_folder) if os.path.isdir(os.path.join(base_folder, f))
)

# --- Shared plotting configuration (also read by later cells) ----------------
# Defined once, outside the loops, so it exists even when no experiment folder
# is found.

# Colour per prompt type.
# NOTE(review): not applied to any trace in this cell; kept in case it is
# referenced elsewhere.
colors = {
    "baseline": "blue",
    "CoT": "green",
    "manually_written": "red"
}

# Prompt type (top-level key of each JSON file) -> plotly line dash style.
models = {
    "baseline": "solid",
    "CoT": "dash",
    "manually_written": "dot"
}

# Second-level keys of each JSON file, used as the x axis.
k_values = ["k1", "k3", "k5", "k10"]
metrics = ["bert_f1", "rouge_recall", "faithfulness", "answer_relevancy"]

for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    print(f"Checking folder: {experiment_path}")  # Debug print
    print(f"Found JSON files: {json_files}")  # Debug print

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)
        print(f"Processing JSON file: {json_file_path}")  # Debug print

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Strip "_<llm>_evaluation_results_reorganized_by_type.json" for any
        # LLM tag (e.g. "gpt", "llama3"), not only "gpt", so every file yields
        # a short dataset name. Names that do not match are left unchanged.
        dataset_name = re.sub(
            r"_[^_]+_evaluation_results_reorganized_by_type\.json$", "", json_file
        )

        # One figure per metric for this dataset, one trace per prompt type.
        for metric in metrics:
            fig = go.Figure()

            for model, line_style in models.items():
                y_values = []
                for k in k_values:
                    entry = data[model][k]
                    if metric in entry:
                        y_values.append(entry[metric])
                    else:
                        # Metric is stored inside "ragas_metrics", a stringified dict.
                        # SECURITY NOTE(review): eval() executes whatever expression
                        # the file contains. Only run this on result files you
                        # produced yourself; if the string is a plain dict literal,
                        # prefer ast.literal_eval.
                        y_values.append(eval(entry["ragas_metrics"])[metric])

                fig.add_trace(go.Scatter(
                    x=k_values, y=y_values,
                    mode="lines+markers",
                    line=dict(dash=line_style),
                    name=f"{model}"
                ))

            # The experiment folder is part of the title because the same
            # dataset name occurs once per experiment.
            fig.update_layout(
                title=f"{dataset_name} ({experiment_folder}) - {metric.capitalize()} vs K",
                xaxis_title="K value",
                yaxis_title=metric.capitalize(),
                hovermode="x unified"
            )

            fig.show()  # Show each plot separately

Checking folder: /content/drive/MyDrive/SNLP/Plot/GPT-4o-mini
Found JSON files: ['contractnli_gpt_evaluation_results_reorganized_by_type.json', 'cuad_gpt_evaluation_results_reorganized_by_type.json', 'maud_gpt_evaluation_results_reorganized_by_type.json', 'privacyqa_gpt_evaluation_results_reorganized_by_type.json']
Processing JSON file: /content/drive/MyDrive/SNLP/Plot/GPT-4o-mini/contractnli_gpt_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/MyDrive/SNLP/Plot/GPT-4o-mini/cuad_gpt_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/MyDrive/SNLP/Plot/GPT-4o-mini/maud_gpt_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/MyDrive/SNLP/Plot/GPT-4o-mini/privacyqa_gpt_evaluation_results_reorganized_by_type.json


Checking folder: /content/drive/MyDrive/SNLP/Plot/Llama-3(8B)
Found JSON files: ['contractnli_llama3_evaluation_results_reorganized_by_type.json', 'cuad_llama3_evaluation_results_reorganized_by_type.json', 'privacyqa_llama3_evaluation_results_reorganized_by_type.json', 'maud_llama3_evaluation_results_reorganized_by_type.json']
Processing JSON file: /content/drive/MyDrive/SNLP/Plot/Llama-3(8B)/contractnli_llama3_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/MyDrive/SNLP/Plot/Llama-3(8B)/cuad_llama3_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/MyDrive/SNLP/Plot/Llama-3(8B)/privacyqa_llama3_evaluation_results_reorganized_by_type.json


Processing JSON file: /content/drive/MyDrive/SNLP/Plot/Llama-3(8B)/maud_llama3_evaluation_results_reorganized_by_type.json


### **Plot all 4 datasets (contractnli, cuad, maud, privacyqa) on the same plot for each metric, one response type at a time**

In [26]:
metrics = ["bert_f1", "rouge_recall", "faithfulness", "answer_relevancy"]
k_values = ["k1", "k3", "k5", "k10"]

# Read every result file once (instead of once per metric) and keep only the
# "baseline" section together with the legend label for its trace.
baseline_results = []  # list of (trace label, data["baseline"])
for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    # Sorted because os.listdir order is arbitrary; keeps trace order stable.
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Strip "_<llm>_evaluation_results_reorganized_by_type.json" for any
        # LLM tag (e.g. "gpt", "llama3"), not only "gpt".
        dataset_name = re.sub(
            r"_[^_]+_evaluation_results_reorganized_by_type\.json$", "", json_file
        )

        # All experiments share one figure, so the experiment folder must be
        # part of the label to tell otherwise identically named traces apart.
        baseline_results.append((f"{experiment_folder} - {dataset_name}", data["baseline"]))

# Comparison Plots for Each Metric Across All Datasets (showing one response prompt at a time)
for metric in metrics:
    fig = go.Figure()

    for trace_name, baseline in baseline_results:
        y_values = []
        for k in k_values:
            # Extract from baseline model for consistency
            entry = baseline[k]
            if metric in entry:
                y_values.append(entry[metric])
            else:
                # Metric is stored inside "ragas_metrics", a stringified dict.
                # SECURITY NOTE(review): eval() executes whatever expression the
                # file contains. Only run this on result files you produced
                # yourself; if the string is a plain dict literal, prefer
                # ast.literal_eval.
                y_values.append(eval(entry["ragas_metrics"])[metric])

        fig.add_trace(go.Scatter(
            x=k_values, y=y_values,
            mode="lines+markers",
            name=trace_name
        ))

    # Update layout for the metric-specific comparison across datasets
    fig.update_layout(
        title=f"Comparison of {metric.capitalize()} Across Datasets Using Baseline Response only",
        xaxis_title="K value",
        yaxis_title=metric.capitalize(),
        hovermode="x unified"
    )

    fig.show()  # Show the plot

### **Plot all 4 datasets (contractnli, cuad, maud, privacyqa) and all 3 response types (baseline, CoT, manually written) on the same plot for each metric**

In [27]:
# Comparison Across Datasets (4 plots, one per metric, all domains and prompt types shown in same plot)
# Relies on `metrics`, `k_values`, `models`, `experiment_folders` and
# `base_folder` defined by earlier cells.

# Read every result file once (instead of once per metric).
loaded_results = []  # list of (legend prefix, parsed JSON content)
for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    # Sorted because os.listdir order is arbitrary; keeps trace order stable.
    json_files = sorted(f for f in os.listdir(experiment_path) if f.endswith(".json"))

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Strip "_<llm>_evaluation_results_reorganized_by_type.json" for any
        # LLM tag (e.g. "gpt", "llama3"), not only "gpt".
        dataset_name = re.sub(
            r"_[^_]+_evaluation_results_reorganized_by_type\.json$", "", json_file
        )

        # All experiments share one figure, so the experiment folder must be
        # part of the label to tell otherwise identically named traces apart.
        loaded_results.append((f"{experiment_folder} - {dataset_name}", data))

for metric in metrics:
    fig = go.Figure()

    for label_prefix, data in loaded_results:
        # **Plot all 3 models in the same figure**
        for model, line_style in models.items():
            y_values = []
            for k in k_values:
                entry = data[model][k]
                if metric in entry:
                    y_values.append(entry[metric])
                else:
                    # Metric is stored inside "ragas_metrics", a stringified dict.
                    # SECURITY NOTE(review): eval() executes whatever expression
                    # the file contains. Only run this on result files you
                    # produced yourself; if the string is a plain dict literal,
                    # prefer ast.literal_eval.
                    y_values.append(eval(entry["ragas_metrics"])[metric])

            fig.add_trace(go.Scatter(
                x=k_values, y=y_values,
                mode="lines+markers",
                line=dict(dash=line_style),
                name=f"{label_prefix} - {model}"
            ))

    fig.update_layout(
        title=f"Comparison of {metric.capitalize()} Across Datasets",
        xaxis_title="K value",
        yaxis_title=metric.capitalize(),
        hovermode="x unified"
    )

    fig.show()

### **Plotting the average scores (across domains) for each metric**

In [8]:
# Averaged results, read by later cells:
#   dataset_averages[experiment folder][prompt type][k][metric] -> mean score
# Relies on `metrics`, `k_values`, `models`, `experiment_folders` and
# `base_folder` defined by earlier cells.
dataset_averages = {}

# **🔹 Compute Average Scores Per Experiment Folder**
for experiment_folder in experiment_folders:
    experiment_path = os.path.join(base_folder, experiment_folder)
    json_files = [f for f in os.listdir(experiment_path) if f.endswith(".json")]

    if not json_files:
        # Nothing to average: np.mean([]) would return NaN with a
        # RuntimeWarning and put an empty trace on every plot.
        print(f"Skipping folder without JSON files: {experiment_path}")
        continue

    # The key is the experiment folder name; the values are averaged over all
    # JSON files (one per dataset) found in that folder.
    dataset_name = experiment_folder

    # Raw scores are gathered in a separate structure so that
    # `dataset_averages` only ever holds finished means.
    collected = {model: {k: {metric: [] for metric in metrics} for k in k_values} for model in models}

    for json_file in json_files:
        json_file_path = os.path.join(experiment_path, json_file)

        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for model in models:
            for k in k_values:
                entry = data[model][k]
                for metric in metrics:
                    if metric in entry:
                        collected[model][k][metric].append(entry[metric])
                    else:
                        # Metric is stored inside "ragas_metrics", a stringified dict.
                        # SECURITY NOTE(review): eval() executes whatever expression
                        # the file contains. Only run this on result files you
                        # produced yourself; if the string is a plain dict literal,
                        # prefer ast.literal_eval.
                        collected[model][k][metric].append(eval(entry["ragas_metrics"])[metric])

    # Compute the mean values
    dataset_averages[dataset_name] = {
        model: {
            k: {metric: np.mean(collected[model][k][metric]) for metric in metrics}
            for k in k_values
        }
        for model in models
    }


# Comparison of average scores across domains by metric
for metric in metrics:
    fig = go.Figure()

    for dataset_name in dataset_averages:
        for model, line_style in models.items():
            y_values = [dataset_averages[dataset_name][model][k][metric] for k in k_values]

            fig.add_trace(go.Scatter(
                x=k_values, y=y_values,
                mode="lines+markers",
                line=dict(dash=line_style),
                name=f"{dataset_name} - {model}"
            ))

    # Title states that these are averages, so the figure cannot be confused
    # with the per-dataset comparison plots.
    fig.update_layout(
        title=f"Average {metric.capitalize()} Across Datasets",
        xaxis_title="K value",
        yaxis_title=metric.capitalize(),
        hovermode="x unified"
    )

    fig.show()

In [19]:
# 2x2 grid of the averaged scores: one subplot per metric, one trace per
# (experiment folder, prompt type) pair.
# Relies on `dataset_averages`, `models` and `k_values` from earlier cells.

# Use your 4 metrics (this order also fixes the order of the subplot titles)
metrics = ["faithfulness", "bert_f1", "answer_relevancy", "rouge_recall"]
metric_titles = {
    "faithfulness": "Faithfulness",
    "bert_f1": "BERT F1 Score",
    "answer_relevancy": "Answer Relevancy",
    "rouge_recall": "ROUGE_Recall"
}

# Subplot positions as (row, col); must match the order of `metrics` above
metric_pos = {
    "faithfulness": (1, 1),
    "bert_f1": (1, 2),
    "answer_relevancy": (2, 1),
    "rouge_recall": (2, 2)
}

# Create 2x2 subplot figure
fig = make_subplots(
    rows=2, cols=2,
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
    subplot_titles=[metric_titles[m] for m in metrics]
)

# Color palette (cycles, so colours repeat after 8 distinct legend entries)
colors = itertools.cycle([
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
    "#9467bd", "#8c564b", "#e377c2", "#7f7f7f"
])

# One fixed colour per legend label, shared by all four subplots
color_map = {}
# Labels that already have a legend entry (each label is drawn once per subplot)
legend_tracker = set()

# Display-only renames. `display_name_map` was previously used without ever
# being defined (NameError on a fresh kernel). It now carries the rename that
# used to be applied by popping/re-inserting the key in `dataset_averages`,
# so the averages computed by the earlier cell are no longer mutated here.
display_name_map = {
    "manually_written": "custom_crafted_prompt"
}

display_model_map = {
    "manually_written": "custom_crafted_prompt"
}

for dataset_name in dataset_averages:
    display_dataset_name = display_name_map.get(dataset_name, dataset_name)

    for model in models:
        display_model = display_model_map.get(model, model)
        legend_label = f"{display_dataset_name} - {display_model}"

        if legend_label not in color_map:
            color_map[legend_label] = next(colors)

        for metric in metrics:
            row, col = metric_pos[metric]
            y_values = [dataset_averages[dataset_name][model][k][metric] for k in k_values]

            # Only the first trace of a label gets a legend entry; otherwise
            # the legend would list every label once per subplot.
            show_legend = legend_label not in legend_tracker

            fig.add_trace(
                go.Scatter(
                    x=k_values,
                    y=y_values,
                    mode="lines+markers",
                    name=legend_label,
                    line=dict(color=color_map[legend_label]),
                    showlegend=show_legend
                ),
                row=row,
                col=col
            )

            legend_tracker.add(legend_label)


# Update layout: no figure title, horizontal legend centred below the grid
fig.update_layout(
    height=800,
    width=1000,
    title=None,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.25,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=40, b=100)
)

fig.show()
