<a href="https://colab.research.google.com/github/kenyuisme/model-evaluation-notebook/blob/main/Prompt_Evaluation_Log.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import os
import pandas as pd

def initialize_registry_question(registry, question):
    """
    Make sure the results CSV for a registry/question pair is present on disk.

    The file lives at ``data/<registry>/<question>.csv``. The registry folder
    is created when absent, and a missing CSV is written containing only the
    header row. A one-line status message is printed in both cases.

    Parameters
    ----------
    registry : str
        Name of the registry sub-folder under ``data``.
    question : str
        Question identifier; used as the CSV file name (without extension).

    Returns
    -------
    str
        Path to the question's CSV file, including the ``.csv`` suffix.
    """
    registry_dir = os.path.join("data", registry)
    csv_path = os.path.join(registry_dir, question) + ".csv"

    os.makedirs(registry_dir, exist_ok=True)

    # Guard clause: an existing file is left untouched.
    if os.path.isfile(csv_path):
        print(f"CSV already exists: {csv_path}")
        return csv_path

    # First use of this question: write a header-only CSV.
    header = ["Prompt Name", "Precision", "Recall", "F1", "Cohen's Kappa", "Notes"]
    pd.DataFrame(columns=header).to_csv(csv_path, index=False)
    print(f"Initialized new CSV at: {csv_path}")
    return csv_path


In [20]:
def add_prompt_result(registry, question, prompt_name, metrics, notes=""):
    """
    Append one prompt's evaluation metrics as a new row of the question's CSV.

    The CSV is created first (header only) if it does not exist yet.

    Parameters
    ----------
    registry : str
        Registry name, forwarded to ``initialize_registry_question``.
    question : str
        Question identifier, forwarded to ``initialize_registry_question``.
    prompt_name : str
        Value stored in the "Prompt Name" column.
    metrics : dict or None
        Mapping read with the keys "Precision", "Recall", "F1" and
        "Cohen's Kappa"; any missing key defaults to None. ``None`` is
        treated like an empty mapping.
    notes : str, optional
        Free-text value stored in the "Notes" column.
    """
    # initialize_registry_question already returns the full path including the
    # ".csv" suffix. Appending the suffix again would send rows to
    # "<question>.csv.csv", a file the viewer never reads.
    question_path = initialize_registry_question(registry, question)

    # Tolerate a missing metrics mapping instead of failing on `.get`.
    metrics = metrics or {}

    # Key order matches the header written by initialize_registry_question,
    # which matters because the row is appended without a header line.
    new_row_df = pd.DataFrame([{
        "Prompt Name": prompt_name,
        "Precision": metrics.get("Precision", None),
        "Recall": metrics.get("Recall", None),
        "F1": metrics.get("F1", None),
        "Cohen's Kappa": metrics.get("Cohen's Kappa", None),
        "Notes": notes
    }])

    # Append row without loading full CSV
    new_row_df.to_csv(question_path, mode='a', index=False, header=False)
    print(f"Appended prompt result to: {question_path}")


In [28]:
import matplotlib.pyplot as plt
import seaborn as sns

def view_prompt_results(registry, question, color_thresholds=(0.6, 0.8)):
    """
    Show every stored prompt result as a color-coded table.

    Reads the question's CSV (creating an empty one first if needed) and
    displays it with the metric columns shaded red / yellow / green and
    formatted to two decimals.

    Parameters
    ----------
    registry : str
        Registry name, forwarded to ``initialize_registry_question``.
    question : str
        Question identifier, forwarded to ``initialize_registry_question``.
    color_thresholds : sequence of two numbers, optional
        Values below the first entry are red, values below the second are
        yellow, everything else is green. Null cells get no background.
    """
    results = pd.read_csv(initialize_registry_question(registry, question))

    # Columns that receive both the shading and the numeric formatting.
    metric_cols = ['Precision', 'Recall', 'F1', "Cohen's Kappa"]

    def traffic_light(val):
        # Map one cell value to its CSS background rule.
        if pd.isnull(val):
            return ''
        if val < color_thresholds[0]:
            return 'background-color: #FFCDD2'  # Red
        if val < color_thresholds[1]:
            return 'background-color: #FFF9C4'  # Yellow
        return 'background-color: #C8E6C9'  # Green

    styler = results.style.map(traffic_light, subset=metric_cols)
    styler = styler.format("{:.2f}", subset=metric_cols)
    display(styler)


In [19]:
import pandas as pd
from datetime import datetime

folder_path = 'registry_A'

# The mock CSV is written under data/<folder_path>/, so that is the directory
# that has to exist. Creating only `folder_path` in the working directory left
# data/registry_A missing, which makes to_csv fail on a fresh kernel.
mock_registry_dir = os.path.join('data', folder_path)
os.makedirs(mock_registry_dir, exist_ok=True)


# Define mock data as list of dicts
# NOTE(review): these column names ("Prompt_Name", plus "Accuracy" and
# "Test_Date") differ from the header written by initialize_registry_question
# ("Prompt Name", no Accuracy/Test_Date). Confirm which schema is intended
# before appending rows to this file with add_prompt_result.
mock_data = [
    {
        "Prompt_Name": "Prompt_v1",
        "Precision": 0.85,
        "Recall": 0.90,
        "F1": 0.87,
        "Accuracy": 0.92,
        "Cohen's Kappa": 0.83,
        "Test_Date": "2025-07-16",
        "Notes": "Initial baseline prompt"
    },
    {
        "Prompt_Name": "Prompt_v2",
        "Precision": 0.88,
        "Recall": 0.85,
        "F1": 0.86,
        "Accuracy": 0.91,
        "Cohen's Kappa": 0.81,
        "Test_Date": "2025-07-17",
        "Notes": "Added more context in prompt"
    },
    {
        "Prompt_Name": "Prompt_v3",
        "Precision": 0.82,
        "Recall": 0.88,
        "F1": 0.85,
        "Accuracy": 0.89,
        "Cohen's Kappa": 0.79,
        "Test_Date": "2025-07-18",
        "Notes": "Tuned temperature for LLM"
    }
]

# Convert to DataFrame
mock_df = pd.DataFrame(mock_data)

# Save to CSV (overwrites any existing file for this question)
mock_df.to_csv(os.path.join(mock_registry_dir, 'comorbidity_diabetes.csv'), index=False)


In [9]:
# Registry / question pair selecting which results CSV to work with.
registry, question = "registry_A", "comorbidity_diabetes"

In [29]:
# Render the color-coded metrics table for the selected registry/question.
view_prompt_results(registry=registry, question=question)

CSV already exists: data/registry_A/comorbidity_diabetes.csv


Unnamed: 0,Prompt_Name,Precision,Recall,F1,Accuracy,Cohen's Kappa,Test_Date,Notes
0,Prompt_v1,0.85,0.9,0.87,0.92,0.83,2025-07-16,Initial baseline prompt
1,Prompt_v2,0.88,0.85,0.86,0.91,0.81,2025-07-17,Added more context in prompt
2,Prompt_v3,0.82,0.88,0.85,0.89,0.79,2025-07-18,Tuned temperature for LLM
