# Analysis for Prompting Evals

In [None]:
import json
import os

# import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import pandas as pd
import pyrootutils
import regex as re
import seaborn as sns
import sklearn.metrics as sk_metrics
import statsmodels.api as sm

In [None]:
# Case-insensitive pattern for a standalone "yes" or "no" token. The word is
# the only capture group, so `findall` returns just the matched words.
YES_RE = re.compile(
    r"[^a-zA-Z]*\b(yes|no)\b[^a-zA-Z]*",
    flags=re.IGNORECASE,
)


def extract_content(choices_list: list) -> "str | None":
    """Return the message content of the first entry in ``choices_list``.

    Args:
        choices_list: A sequence whose first element is a mapping with a
            ``"message"`` mapping that holds a ``"content"`` value.

    Returns:
        The value stored at ``choices_list[0]["message"]["content"]``, or
        ``None`` when that path cannot be followed (empty list, missing key,
        or a non-subscriptable value such as ``None``/NaN). In the failure
        case the offending value is printed so it can be inspected.
    """
    try:
        return choices_list[0]["message"]["content"]
    except (IndexError, KeyError, TypeError):
        # Only lookup failures are expected here; anything else is a real bug
        # and should propagate instead of being silently swallowed.
        print(choices_list)
        return None


def extract_prediction(response: str) -> str:
    """Map a free-text model response to a predicted label.

    The last standalone "yes"/"no" found by ``YES_RE`` decides the label, so
    a response that reasons first and answers at the end is classified by its
    final answer.

    Args:
        response: The model's text response. Non-string values (for example
            ``None`` or NaN for a missing response) are accepted and treated
            as having no answer.

    Returns:
        ``"positive"`` if the last match is "yes" (case-insensitive),
        ``"negative"`` if it is "no", and ``"unknown"`` if there is no match
        or ``response`` is not a string.
    """
    # A missing response cannot be searched; report it as unanswered rather
    # than letting the regex call raise a TypeError.
    if not isinstance(response, str):
        return "unknown"

    matches = YES_RE.findall(response)
    if not matches:
        return "unknown"

    return "positive" if matches[-1].lower() == "yes" else "negative"

In [None]:
PROJECT_ROOT = pyrootutils.find_root(
    search_from=os.path.abspath(""), indicator=".project-root"
)

GRAMMARS_DIR = PROJECT_ROOT / "data" / "grammars"

inputs_file_pattern = "*_inputs.jsonl"
results_file_pattern = "*_results.jsonl"

# The batch id is the leading "batch_..." part of the file name, up to the
# last underscore that still lets the pattern match.
batch_id_re = re.compile(r"^(batch_\w+)_")


def _load_batch_frames(jsonl_files):
    """Load JSON-lines batch files into one flattened DataFrame.

    Each file is read with pandas, round-tripped through JSON records so that
    nested objects can be flattened into dotted column names by
    ``pd.json_normalize``, and tagged with a ``batch_id`` column taken from
    its file name.

    Args:
        jsonl_files: Iterable of path objects pointing at ``.jsonl`` files.

    Returns:
        The concatenation of all per-file frames with a fresh integer index.

    Raises:
        ValueError: If no files were given, or a file name does not match
            ``batch_id_re``.
    """
    frames = []
    for jsonl_file in jsonl_files:
        id_match = batch_id_re.search(jsonl_file.name)
        if id_match is None:
            raise ValueError(
                f"Cannot extract a batch id from file name {jsonl_file.name!r}"
            )
        raw_df = pd.read_json(jsonl_file, lines=True)
        records = json.loads(raw_df.to_json(orient="records"))
        flat_df = pd.json_normalize(records)
        flat_df["batch_id"] = id_match.group(1)
        frames.append(flat_df)
    if not frames:
        raise ValueError("No batch files were found to load")
    return pd.concat(frames, ignore_index=True)


# find all input and result files in subdirectories of GRAMMARS_DIR
input_files = list(GRAMMARS_DIR.rglob(inputs_file_pattern))
results_files = list(GRAMMARS_DIR.rglob(results_file_pattern))

inputs_df = _load_batch_frames(input_files)
results_df = _load_batch_frames(results_files)

# Merge inputs and results on the batch_id and custom_id
response_full_df = results_df.merge(
    inputs_df[
        [
            "custom_id",
            "batch_id",
            "body.metadata.sample_type",  # ground-truth label for sample
            "body.metadata.sample",  # the sample itself
            "body.metadata.grammar_file",  # grammar file used
            "body.metadata.model",  # model used
            "body.metadata.n_shots",  # n_shots used
        ]
    ],
    on=["batch_id", "custom_id"],
)

# Shorten the flattened metadata column names.
response_full_df = response_full_df.rename(
    columns={
        "body.metadata.sample_type": "sample.type.ground_truth",
        "body.metadata.sample": "sample",
        "body.metadata.grammar_file": "grammar_file",
        "body.metadata.model": "model",
        "body.metadata.n_shots": "n_shots",
    }
)
response_full_df["model_response"] = response_full_df["response.body.choices"].apply(
    extract_content
)

# Filter out batches with fewer than 500 samples
response_full_df = response_full_df[
    response_full_df.groupby("batch_id")["sample"].transform("count") >= 500
]

# response_full_df is kept around: a later cell inspects its per-batch counts.
response_df = response_full_df[
    [
        "sample",
        "sample.type.ground_truth",
        "model_response",
        "grammar_file",
        "model",
        "n_shots",
    ]
].copy()

# extract_content yields None when a response has no readable message, so
# incomplete rows are dropped *before* predictions are parsed. The columns
# derived below never contain missing values, so this keeps the same rows as
# dropping afterwards would.
response_df = response_df.dropna()

response_df["sample.type.predicted"] = response_df["model_response"].apply(
    extract_prediction
)
# Sample length = number of space-separated tokens.
response_df["sample.length"] = response_df["sample"].apply(
    lambda s: len(str(s).split(" "))
)
response_df["correct"] = (
    response_df["sample.type.ground_truth"] == response_df["sample.type.predicted"]
)

# Ordered categoricals; values outside the listed categories become NaN.
response_df["n_shots"] = pd.Categorical(
    response_df["n_shots"],
    categories=["0", "2", "4", "8", "16", "32"],
    ordered=True,
)
response_df["sample.type.ground_truth"] = pd.Categorical(
    response_df["sample.type.ground_truth"],
    categories=["positive", "negative"],
    ordered=True,
)
response_df["sample.type.predicted"] = pd.Categorical(
    response_df["sample.type.predicted"],
    categories=["positive", "negative", "unknown"],
    ordered=True,
)

# Model names are stored with "_" where the displayed name uses "/".
response_df["model"] = response_df["model"].str.replace("_", "/", regex=False)
response_df["model"] = pd.Categorical(
    response_df["model"],
)


response_df.info()

In [None]:
# Number of non-null samples per batch_id in response_full_df (display only;
# nothing is filtered here).
samples_by_batch = response_full_df.groupby("batch_id")["sample"]
samples_by_batch.count()

In [None]:
# Number of non-null samples for every (grammar_file, model) pair; with
# observed=False unobserved categorical levels are listed as well.
count_keys = ["grammar_file", "model"]
response_df.groupby(count_keys, observed=False)["sample"].count()

Load grammar and sample statistics, and annotate the F1 scores with those values.

In [None]:
grammar_stats_pattern = "grammar_stats.json"
samples_stats_pattern = "filtered_samples_stats.json"

grammar_stats_files = list(GRAMMARS_DIR.rglob(grammar_stats_pattern))
samples_stats_files = list(GRAMMARS_DIR.rglob(samples_stats_pattern))


def _load_stats_frame(stats_files):
    """Read JSON stats files into a DataFrame with one row per file.

    Each file's JSON object becomes a row, with an added ``grammar_file``
    column set to the name of the directory containing the file. Files that
    are not valid JSON are reported with a printed message and skipped.

    Args:
        stats_files: Iterable of path objects pointing at JSON files.

    Returns:
        A DataFrame built from the successfully parsed files.
    """
    records = []
    for stats_file in stats_files:
        try:
            # Decode explicitly as UTF-8 instead of the platform default.
            record = json.loads(stats_file.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            print(f"Error reading {stats_file}")
            continue
        record["grammar_file"] = stats_file.parent.name
        records.append(record)
    return pd.DataFrame(records)


grammar_stats_df = _load_stats_frame(grammar_stats_files)
samples_stats_df = _load_stats_frame(samples_stats_files)

# Weighted F1 score for every (n_shots, model, grammar_file) group.
f1_df = (
    response_df.groupby(["n_shots", "model", "grammar_file"], observed=False)
    .apply(
        lambda group: sk_metrics.f1_score(
            group["sample.type.ground_truth"],
            group["sample.type.predicted"],
            average="weighted",
        ),
        include_groups=False,
    )
    .reset_index(name="f1_score")
)

# Annotate each F1 row with the grammar and sample statistics of its grammar.
f1_df = f1_df.join(
    grammar_stats_df.set_index("grammar_file"),
    on="grammar_file",
).join(
    samples_stats_df.set_index("grammar_file"),
    on="grammar_file",
)

# The statistics now live in f1_df; release the intermediate frames.
del grammar_stats_df, samples_stats_df

f1_df.info()

## Correlation Analysis

In [None]:
# Columns whose pairwise correlations are computed. f1_score is listed first
# so that it is the first row/column of the resulting matrix.
corr_columns = [
    "f1_score",
    "n_shots",
    "n_terminals",
    "n_nonterminals",
    "n_lexical_productions",
    "n_nonlexical_productions",
    "compression_ratio",
    "mean_positive_parses",
    # "median_positive_parses",  # no variance
    "mean_positive_depth",
    "median_positive_depth",
    "total_samples",
]

corr_input = f1_df[corr_columns].copy()
# n_shots is a categorical with string labels; cast it to integers explicitly
# (as the regression cell does) instead of relying on implicit coercion
# inside `corr`.
corr_input["n_shots"] = corr_input["n_shots"].astype(int)

corr_mat = corr_input.corr()

corr_mat

In [None]:
# Full correlation matrix as a heatmap; the colour range is pinned to the
# [-1, 1] interval.
heatmap_options = {"cmap": "vlag_r", "vmin": -1, "vmax": 1}
_ = sns.heatmap(corr_mat, **heatmap_options)

In [None]:
# Row 0 of corr_mat is the f1_score row (f1_score is the first column passed
# to `corr`); rank the variables by their correlation with it.
f1_correlations = (
    corr_mat.iloc[0].to_frame().sort_values(by="f1_score", ascending=False)
)

ax = sns.heatmap(f1_correlations, cmap="vlag_r", vmin=-1, vmax=1)
_ = ax.set_title("Correlation with F1 Score")

In [None]:
# Correlations with f1_score as a table, sorted from highest to lowest.
f1_corr_row = corr_mat.iloc[0]
f1_corr_row.to_frame().sort_values(by="f1_score", ascending=False)

## Multivariate Regression

In [None]:
# Columns removed before the regression; "coverage" and "total_samples" are
# deliberately kept because they are used as regressors.
excluded_columns = [
    "total_possible_samples",
    "uncompressed_size",
    "compressed_size",
    "grammar_file",
    "grammar_name",
]

f1_stats_df = f1_df.copy().drop(columns=excluded_columns)

# n_shots is categorical with string labels; the regression needs numbers.
f1_stats_df["n_shots"] = f1_stats_df["n_shots"].astype(int)

f1_stats_df.info()

In [None]:
# Regress the F1 score on the model, the number of shots and the grammar /
# sample statistics.
predictor_columns = [
    "model",
    "n_shots",
    "n_terminals",
    "n_nonterminals",
    "n_lexical_productions",
    "n_nonlexical_productions",
    "compression_ratio",
    "mean_positive_parses",
    "median_positive_parses",
    "mean_positive_depth",
    "median_positive_depth",
    "coverage",
    "total_samples",
]

# One-hot encode categorical predictors, dropping the first level of each.
X = pd.get_dummies(f1_stats_df[predictor_columns], drop_first=True, dtype=int)
# Add an intercept column.
# NOTE(review): the correlation cell marks median_positive_parses as having
# no variance; if that holds, add_constant's default has_constant="skip" will
# treat it as the existing constant and add no separate "const" column --
# confirm this is intended.
X = sm.add_constant(X)

Y = f1_stats_df["f1_score"]

model = sm.OLS(Y, X).fit()
print(model.summary())

## Accuracy by Model, Sample Type, and # of Shots

In [None]:
# Settings shared by both layers of the plot.
common_line_kwargs = dict(
    data=response_df,
    x="n_shots",
    y="correct",
    style="model",
    markers=True,
    markersize=8,
)

# Layer 1: faint curves split by ground-truth sample type.
ax = sns.lineplot(
    hue="sample.type.ground_truth",
    palette={"positive": "orange", "negative": "blue"},
    alpha=0.35,
    err_kws={"alpha": 0.15},
    **common_line_kwargs,
)

# Layer 2: bold black curves over all samples, drawn on the same axes and
# kept out of the legend.
sns.lineplot(
    color="black",
    linewidth=2,
    ax=ax,
    legend=False,
    **common_line_kwargs,
)

_ = ax.set_xlabel("# of Shots  [log scale]")
_ = ax.set_ylabel("Mean Accuracy")
_ = ax.set_ylim(-0.02, 1.02)

# baseline accuracy of guessing randomly is 50%
_ = ax.axhline(y=0.5, color="red", linestyle="--")

# The first layer is drawn semi-transparent; make its legend entries opaque
# so they stay readable.
legend_handles, legend_labels = ax.get_legend_handles_labels()
for handle in legend_handles:
    handle.set_alpha(1)
_ = ax.legend(legend_handles, legend_labels)

# Place the legend outside the axes, to the right.
_ = sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## F1 Score

In [None]:
# F1 score against the number of shots, one line per model.
fig, ax = plt.subplots(figsize=(5, 3.5))

# Dashed reference line at 0.5.
_ = ax.axhline(y=0.5, color="red", linestyle="--", alpha=0.5)

f1_line_kwargs = dict(
    x="n_shots",
    y="f1_score",
    hue="model",
    style="model",
    markers=True,
    markersize=8,
    linewidth=2,
)
sns.lineplot(data=f1_df, ax=ax, **f1_line_kwargs)

_ = ax.set(
    ylim=(-0.02, 1.02),
    xlabel="# of Shots [log scale]",
    ylabel="F1 Score",
)

# Place the legend outside the axes, to the right.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## F1 Score by Complexity

In [None]:
# F1 score against the compression ratio, restricted to gpt-4o-mini.
is_gpt4o_mini = f1_df["model"] == "gpt-4o-mini"

fig, ax = plt.subplots(figsize=(5, 3.5))

# Dashed reference line at 0.5.
_ = ax.axhline(y=0.5, color="red", linestyle="--", alpha=0.5)

sns.lineplot(
    data=f1_df[is_gpt4o_mini],
    x="compression_ratio",
    y="f1_score",
    style="model",
    color="black",
    markers=True,
    markersize=8,
    linewidth=2,
    ax=ax,
)

_ = ax.set(
    ylim=(-0.02, 1.02),
    xlabel="gzip Compression Ratio",
    ylabel="F1 Score",
)

In [None]:
# F1 score against the number of terminals, restricted to gpt-4o-mini.
gpt4o_mini_rows = f1_df[f1_df["model"] == "gpt-4o-mini"]

fig, ax = plt.subplots(figsize=(5, 3.5))

# Dashed reference line at 0.5.
_ = ax.axhline(y=0.5, color="red", linestyle="--", alpha=0.5)

sns.scatterplot(
    data=gpt4o_mini_rows,
    x="n_terminals",
    y="f1_score",
    style="model",
    color="black",
    ax=ax,
)

_ = ax.set(
    ylim=(-0.02, 1.02),
    xlabel="# of Terminals",
    ylabel="F1 Score",
)

In [None]:
# F1 score against the mean positive parse depth, restricted to gpt-4o-mini.
# The legend is suppressed; the model is named in the title instead.
depth_plot_rows = f1_df[f1_df["model"] == "gpt-4o-mini"]

fig, ax = plt.subplots(figsize=(5, 3.5))

# Dashed reference line at 0.5.
_ = ax.axhline(y=0.5, color="red", linestyle="--", alpha=0.5)

sns.scatterplot(
    data=depth_plot_rows,
    x="mean_positive_depth",
    y="f1_score",
    style="model",
    color="black",
    legend=False,
    ax=ax,
)

_ = ax.set(
    ylim=(-0.02, 1.02),
    xlabel="Mean Parse Depth",
    ylabel="F1 Score",
    title="gpt-4o-mini",
)

## Histogram of Sample Lengths

In [None]:
# Distribution of sample lengths, coloured by ground-truth sample type, with
# one bin per unit of length.
fig, ax = plt.subplots(figsize=(6, 3))

type_palette = {"positive": "orange", "negative": "blue"}
sns.histplot(
    data=response_df,
    x="sample.length",
    hue="sample.type.ground_truth",
    palette=type_palette,
    binwidth=1,
    ax=ax,
)

_ = ax.set_xlabel("Sample length")
_ = ax.get_legend().set_title("Sample type")