# Analysis for Prompting Evals

In [None]:
import json
import os

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import pandas as pd
import pyrootutils
import regex as re
import seaborn as sns
import sklearn.metrics as sk_metrics

In [None]:
# Case-insensitive pattern for a standalone "yes" or "no" (delimited by word
# boundaries), together with any adjacent non-letter characters. Only the word
# itself is captured, so `findall` returns just the "yes"/"no" strings.
YES_RE = re.compile(r"[^a-zA-Z]*\b(yes|no)\b[^a-zA-Z]*", re.IGNORECASE)


def extract_content(choices_list: list) -> str:
    """Return the message content of the first entry in ``choices_list``.

    The first entry is indexed with ``"message"`` and the result with
    ``"content"``; later entries are ignored.
    """
    first_choice = choices_list[0]
    message = first_choice["message"]
    return message["content"]


def extract_prediction(response: str) -> str:
    """Map a model response to a predicted label.

    All matches of ``YES_RE`` in ``response`` are collected and the last one
    decides the label; earlier matches are ignored.

    Args:
        response: Text of the model's reply. A non-string value (for example
            ``None`` or NaN standing in for a missing reply) is treated as
            containing no answer.

    Returns:
        ``"positive"`` if the last match is "yes" (any casing), ``"negative"``
        if it is "no", and ``"unknown"`` when there is no match at all.
    """
    # A missing reply would otherwise make `findall` raise a TypeError.
    if not isinstance(response, str):
        return "unknown"

    matches = YES_RE.findall(response)
    if not matches:
        return "unknown"

    # The pattern only captures "yes" or "no", so anything but "yes" is "no".
    return "positive" if matches[-1].lower() == "yes" else "negative"

In [None]:
# Glob patterns for the request (input) and response (result) files.
inputs_file_pattern = "*_inputs.jsonl"
results_file_pattern = "*_results.jsonl"

# Captures the leading "batch_..." identifier of a file name.
batch_id_re = re.compile(r"^(batch_\w+)_")

# Resolve the project root with pyrootutils, searching from the current working
# directory for the ".project-root" indicator.
PROJECT_ROOT = pyrootutils.find_root(
    indicator=".project-root",
    search_from=os.path.abspath(""),
)
GRAMMARS_DIR = PROJECT_ROOT / "data" / "grammars"

# Recursively collect every matching file below GRAMMARS_DIR.
input_files = [*GRAMMARS_DIR.rglob(inputs_file_pattern)]
results_files = [*GRAMMARS_DIR.rglob(results_file_pattern)]

def _load_batch_frames(files, id_pattern):
    """Read each JSONL file into a flattened DataFrame tagged with its batch id.

    Args:
        files: Iterable of path objects (each must expose ``.name``) pointing at
            JSON-lines files.
        id_pattern: Compiled regex whose first group captures the batch id from
            the file name.

    Returns:
        A list with one flattened DataFrame per file, each carrying a
        ``batch_id`` column.

    Raises:
        ValueError: If a file name does not match ``id_pattern``.
    """
    frames = []
    for path in files:
        id_match = id_pattern.search(path.name)
        if id_match is None:
            # Fail with the offending name rather than an AttributeError on None.
            raise ValueError(f"Cannot extract a batch id from file name: {path.name}")

        raw_df = pd.read_json(path, lines=True)
        # Round-trip through JSON records so `json_normalize` can flatten the
        # nested fields into dotted column names.
        records = json.loads(raw_df.to_json(orient="records"))
        flat_df = pd.json_normalize(records)
        flat_df["batch_id"] = id_match.group(1)
        frames.append(flat_df)
    return frames


inputs_dfs = _load_batch_frames(input_files, batch_id_re)
inputs_df = pd.concat(inputs_dfs, ignore_index=True)

results_dfs = _load_batch_frames(results_files, batch_id_re)
results_df = pd.concat(results_dfs, ignore_index=True)

# Metadata columns taken from the inputs, mapped to the shorter names used in
# the rest of the analysis.
metadata_columns = {
    "body.metadata.sample_type": "sample.type.ground_truth",  # ground-truth label for sample
    "body.metadata.sample": "sample",  # the sample itself
    "body.metadata.grammar_file": "grammar_file",  # grammar file used
    "body.metadata.model": "model",  # model used
    "body.metadata.n_shots": "n_shots",  # n_shots used
}

# Merge inputs and results on the batch_id and custom_id, then apply the
# shorter column names (a single rename suffices; repeating it is a no-op).
response_full_df = results_df.merge(
    inputs_df[["custom_id", "batch_id", *metadata_columns]],
    on=["batch_id", "custom_id"],
).rename(columns=metadata_columns)

# Pull the reply text out of each response's list of choices.
response_full_df["model_response"] = response_full_df["response.body.choices"].apply(
    extract_content
)

# Keep only the columns needed for the analysis; copy so that the column
# assignments below do not touch `response_full_df`.
response_df = response_full_df[
    [
        "sample",
        "sample.type.ground_truth",
        "model_response",
        "grammar_file",
        "model",
        "n_shots",
    ]
].copy()

response_df["sample.type.predicted"] = response_df["model_response"].apply(
    extract_prediction
)

# Sample length in whitespace-separated tokens. `split()` without an argument
# yields 0 for an empty sample and ignores repeated or surrounding whitespace,
# whereas `split(" ")` would report 1 for "" and count empty strings as tokens.
response_df["sample.length"] = response_df["sample"].apply(
    lambda s: len(str(s).split())
)

# An "unknown" prediction never equals a ground-truth label, so it is incorrect.
response_df["correct"] = (
    response_df["sample.type.ground_truth"] == response_df["sample.type.predicted"]
)

# Drop every row that has a missing value in any of the columns.
response_df = response_df.dropna()

# Columns converted to ordered categoricals, with their levels in order.
ordered_levels = {
    "n_shots": ["0", "2", "4", "8", "16", "32"],
    "sample.type.ground_truth": ["positive", "negative"],
    "sample.type.predicted": ["positive", "negative", "unknown"],
}
for column, levels in ordered_levels.items():
    response_df[column] = pd.Categorical(
        response_df[column],
        categories=levels,
        ordered=True,
    )

# Models get an unordered categorical with levels inferred from the data.
response_df["model"] = pd.Categorical(
    response_df["model"],
)

In [None]:
# Print a summary of the cleaned frame (columns, dtypes, non-null counts).
response_df.info()

## Accuracy by Model, Sample Type, and # of Shots

In [None]:
# Accuracy per ground-truth sample type and model, drawn semi-transparent.
ax = sns.lineplot(
    data=response_df,
    x="n_shots",
    y="correct",
    hue="sample.type.ground_truth",
    palette={"positive": "orange", "negative": "blue"},
    style="model",
    markers=True,
    alpha=0.35,
    err_kws={"alpha": 0.15},
    markersize=8,
)

# Accuracy per model over all samples, overlaid in solid black on the same axes.
sns.lineplot(
    data=response_df,
    x="n_shots",
    y="correct",
    style="model",
    color="black",
    linewidth=2,
    markers=True,
    ax=ax,
    legend=False,
    markersize=8,
)

_ = ax.set_ylabel("Mean Accuracy")
_ = ax.set_xlabel("# of Shots  [log scale]")

# Friendlier legend text. Labels are derived from the legend's own entries
# (rather than a fixed positional list) so every handle keeps the label that
# belongs to it, whatever models are present; unmapped labels such as the model
# names are shown unchanged, and the "model" subtitle is blanked.
legend_label_overrides = {
    "positive": "Positive samples",
    "negative": "Negative samples",
    "model": None,
}

handles, labels = ax.get_legend_handles_labels()
# The first entry is skipped in both lists (the hue subtitle is not shown).
new_labels = [legend_label_overrides.get(label, label) for label in labels[1:]]
for h in handles[1:]:
    # Legend markers inherit the faint plot alpha; make them fully opaque.
    h.set_alpha(1)
_ = ax.legend(handles[1:], new_labels)

_ = sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

In [None]:
# Debug aid: print every legend handle collected from the accuracy plot.
for legend_handle in handles:
    print(legend_handle)

## Histogram of Sample Lengths

In [None]:
# One 6x3-inch figure holding a single axes.
fig, ax = plt.subplots(figsize=(6, 3))

sample_type_palette = {"positive": "orange", "negative": "purple"}

# One-token-wide bins, colored by ground-truth sample type.
sns.histplot(
    data=response_df,
    x="sample.length",
    hue="sample.type.ground_truth",
    palette=sample_type_palette,
    binwidth=1,
    ax=ax,
)

_ = ax.get_legend().set_title("Sample type")

# Counts are shown on a logarithmic y-axis.
_ = ax.set_yscale("log")
_ = ax.set_xlabel("Sample length")