In [1]:
import json
import pandas as pd
import plotly.express as px
import os
import numpy as np
import scipy.stats
import re



In [2]:
def read_file(path):
    with open(path, "r") as f:
        data = f.read()
    return json.loads(data)["results"]


def short_name(name):
    if name == "gpt-3.5-turbo-1106":
        return "gpt-3.5-turbo"
    if "/" in name:
        name = name.split("/")[1]
    for k in ["-Instruct", "-preview"]:
        if k in name:
            name = name.split(k)[0]
    return name.lower()

def describe_df(df: pd.DataFrame, name="", print_max=10):
    print(f"Dataframe {name} with {df.shape[0]} rows and {df.shape[1]} columns:")
    maxlen = max(len(c) for c in df.columns) + 2
    for c in df.columns:
        a = np.array(df[c].unique())
        a.sort()
        if len(a) < print_max:
            print(f"{(c+':').ljust(maxlen)} {a}")
        else:
            print(f"{(c+':').ljust(maxlen)} {a[:10]}[...] ({len(a)} unique values)")


In [3]:
def load_and_process_files(file_paths):
    all_results = {}
    data = []
    data2 = []
    seen_keys = {}  # Track original keys and their source files

    for file_path in file_paths:
        with open(file_path) as f:
            results = json.loads(f.read())

        # Check for duplicate keys across files
        overlap = set(results.keys()) & set(all_results.keys())
        if overlap:
            raise ValueError(f"Duplicate keys found across files: {overlap}")

        all_results.update(results)

    for key, value in all_results.items():
        x = key.split("|")
        name, fromtype = x[0].split("---DESCRIPTION-")
        desc_model = x[1].split("---COMPARISON")[0]
        cmp_model = x[2]
        values = value["total_tallies"]
        desc_model = short_name(desc_model)
        desc_model = desc_model.replace("gpt3_5", "gpt-3.5-turbo").replace("gpt4", "gpt-4-1106")
        cmp_model = short_name(cmp_model)

        if cmp_model.startswith("Qwen1.5"):
            continue

        row = (name, fromtype, desc_model, cmp_model)
        data.append(row + ("human", values["Human"]))
        data.append(row + ("llm", values["LLM"]))
        data.append(row + ("invalid", values["Invalid"]))

        total = values["Human"] + values["LLM"]
        if total > 0:
            identifier = (desc_model, cmp_model, name, fromtype)
            if identifier in seen_keys:
                raise ValueError(
                    f"Duplicate combination found: {identifier}\n"
                    f"Key 1: {seen_keys[identifier]}\n"
                    f"Key 2: {key}"
                )
            seen_keys[identifier] = key
            data2.append(
                (
                    desc_model,
                    cmp_model,
                    name,
                    fromtype,
                    float(values["LLM"]) / total,
                    total,
                    values["LLM"],
                    key,
                )
            )

    df = pd.DataFrame(
        data, columns=["name", "ftype", "desc_model", "cmp_model", "rtype", "rvalue"]
    )
    df2 = pd.DataFrame(
        data2,
        columns=[
            "desc_model",
            "cmp_model",
            "name",
            "ftype",
            "value",
            "size",
            "llm_count",
            "original_key",
        ],
    )

    return df, df2


MERGED_FILES = ["merged_run_outputs/merged_llms.json", "merged_run_outputs/merged_humans_product-only_totals.json", "merged_run_outputs/merged_humans_movies.json"]
df, df2 = load_and_process_files(MERGED_FILES)

# delete everything that contains ftype prorgram_listing
df = df[~df.ftype.str.contains("listing")]
df2 = df2[~df2.ftype.str.contains("listing")]

### Now there are some duplicities, we need to combine them: sum size and llm_count, recompute value as llm_count / size
# df2 = df2.groupby(["desc_model", "cmp_model", "name", "ftype"]).agg({"size": "sum", "llm_count": "sum", "value": "first"}).reset_index()
# df2["value"] = df2["llm_count"] / df2["size"]


def compute_ci_and_p_level(row, null_hypothesis=0.5):
    alpha = row["llm_count"]
    beta = row["size"] - row["llm_count"]
    a, b = scipy.stats.beta.interval(0.95, alpha, beta)
    p_value = scipy.stats.binomtest(
        row["llm_count"], row["size"], p=null_hypothesis, alternative="two-sided"
    ).pvalue
    return row["value"] - a, b - row["value"], p_value


df2[["ci0", "ci1", "p_value"]] = df2.apply(
    lambda x: compute_ci_and_p_level(x), axis=1, result_type="expand"
)

OFFSET_NAMES = sorted(df2.cmp_model.unique())

df2["offset"] = df2.cmp_model.apply(
    lambda x: (OFFSET_NAMES.index(x) - len(OFFSET_NAMES) / 2) / (len(OFFSET_NAMES) * 2)
)


def make_title(x):
    if x["name"] == "product":
        if x["ftype"] == "from_json_details":
            return "product/details"
        elif x["ftype"] == "from_json_product_listing":
            return "product/listing"
        else:
            return "product/???"
    else:
        return x["name"]


df2["title"] = df2.apply(make_title, axis=1)

with np.printoptions(linewidth=120, precision=2):
    print("DF2:")
    print("size      is the number of VALID responses")
    print("llm_count is the number of preferences for LLM-generated responses")
    print(
        "value     is the fraction preference for LLM-generated alternatives, out of VALID"
    )
    print(
        "ci0, ci1  is the WIDTH of the 95% confidence interval LEFT and RIGHT of `value`"
    )
    print(
        "p_value   is the p-value for the null hypothesis that the discriminator is indifferent (p0=0.5, either side)"
    )
    print()
    describe_df(df2, "df2")
    print()
    describe_df(df, "df")
    print()

DF2:
size      is the number of VALID responses
llm_count is the number of preferences for LLM-generated responses
value     is the fraction preference for LLM-generated alternatives, out of VALID
ci0, ci1  is the WIDTH of the 95% confidence interval LEFT and RIGHT of `value`
p_value   is the p-value for the null hypothesis that the discriminator is indifferent (p0=0.5, either side)

Dataframe df2 with 81 rows and 13 columns:
desc_model:    ['gpt-3.5-turbo' 'gpt-4-1106' 'meta-llama-3.1-70b' 'mixtral-8x22b' 'qwen2.5-72b']
cmp_model:     ['gpt-3.5-turbo' 'gpt-4-1106' 'humans' 'meta-llama-3.1-70b' 'mixtral-8x22b' 'qwen2.5-72b']
name:          ['movie' 'paper' 'product']
ftype:         ['from_json_details' 'from_title_and_year' 'write_xml_paper_abstract_control_word_count']
value:         [0.28 0.29 0.31 0.36 0.42 0.46 0.47 0.48 0.48 0.49][...] (78 unique values)
size:          [ 75 143 152 154 163 166 167 170 172 174][...] (41 unique values)
llm_count:     [ 23  36  62  69  78  81  86  93

In [4]:
# Create a pivot table with cmp_model as rows and desc_model as columns
# First create a helper column combining name and ftype for products

df2_table = df2.copy()
df2_table['task'] = df2_table.apply(lambda x: 
    "mov" if x['name'] == 'movie' 
    else "pap" if x['name'] == 'paper'
    else 'p-d' if (x['name'] == 'product' and x['ftype'] == 'from_json_details')
    # else 'p-pl' if (x['name'] == 'product' and x['ftype'] == 'from_json_product_listing')
    else x['name'], axis=1)

# Add formatted column combining value and p-value
df2_table['formatted'] = df2_table.apply(lambda x: f"{x['value']:.3f} ({x['p_value']:.3f})", axis=1)
df2_table['formatted2'] = df2_table.apply(lambda x: f"{x['llm_count']} / {x['size']}", axis=1)

# Create pivot table
pivot = pd.pivot_table(df2_table, 
                      values='formatted',
                      index=['cmp_model'],
                      columns=['desc_model', 'task'],
                      aggfunc='sum',
                      fill_value="")
# Sort pivot2 with custom key to put "humans" first
pivot = pivot.reindex(sorted(pivot.index, key=lambda x: ("" if x == "humans" else x)))


pivot2 = pd.pivot_table(df2_table,
                      values='formatted2',
                      index=['cmp_model'],
                      columns=['desc_model', 'task'],
                      aggfunc='sum',
                      fill_value="")
# Sort pivot2 with custom key to put "humans" first
pivot2 = pivot2.reindex(sorted(pivot2.index, key=lambda x: ("" if x == "humans" else x)))

pd.set_option('display.max_rows', None, 'display.max_columns', None, 'display.precision', 3)

print("\nPivot table showing value and p-value for task counts:")

display(pivot)

print("\nPivot table showing absolute counts (preference-for-llm of total valid):")

display(pivot2)


Pivot table showing value and p-value for task counts:


desc_model,gpt-3.5-turbo,gpt-3.5-turbo,gpt-3.5-turbo,gpt-4-1106,gpt-4-1106,gpt-4-1106,meta-llama-3.1-70b,meta-llama-3.1-70b,meta-llama-3.1-70b,mixtral-8x22b,mixtral-8x22b,mixtral-8x22b,qwen2.5-72b,qwen2.5-72b,qwen2.5-72b
task,mov,p-d,pap,mov,p-d,pap,mov,p-d,pap,mov,p-d,pap,mov,p-d,pap
cmp_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
humans,0.480 (0.818),0.280 (0.000),0.462 (0.254),0.307 (0.001),0.290 (0.000),0.601 (0.000),,,,,,,,,
gpt-3.5-turbo,0.665 (0.000),0.714 (0.000),0.483 (0.738),0.701 (0.000),0.928 (0.000),0.692 (0.000),0.656 (0.000),0.882 (0.000),0.506 (0.936),0.746 (0.000),0.627 (0.000),0.526 (0.573),0.602 (0.000),0.949 (0.000),0.528 (0.531)
gpt-4-1106,0.466 (0.140),0.644 (0.000),0.566 (0.123),0.703 (0.000),0.898 (0.000),0.759 (0.000),0.488 (0.622),0.795 (0.000),0.500 (1.000),0.701 (0.000),0.636 (0.000),0.605 (0.008),0.585 (0.000),0.931 (0.000),0.606 (0.007)
meta-llama-3.1-70b,0.538 (0.098),0.555 (0.121),0.627 (0.001),0.736 (0.000),0.768 (0.000),0.831 (0.000),0.554 (0.018),0.641 (0.000),0.559 (0.123),0.708 (0.000),0.591 (0.008),0.629 (0.001),0.663 (0.000),0.749 (0.000),0.654 (0.000)
mixtral-8x22b,0.693 (0.000),0.759 (0.000),0.706 (0.000),0.724 (0.000),0.950 (0.000),0.803 (0.000),0.646 (0.000),0.909 (0.000),0.577 (0.045),0.755 (0.000),0.764 (0.000),0.636 (0.000),0.664 (0.000),0.977 (0.000),0.610 (0.004)
qwen2.5-72b,0.359 (0.000),0.627 (0.000),0.488 (0.816),0.612 (0.000),0.891 (0.000),0.777 (0.000),0.420 (0.000),0.727 (0.000),0.500 (1.000),0.695 (0.000),0.600 (0.004),0.565 (0.091),0.694 (0.000),0.927 (0.000),0.589 (0.018)



Pivot table showing absolute counts (preference-for-llm of total valid):


desc_model,gpt-3.5-turbo,gpt-3.5-turbo,gpt-3.5-turbo,gpt-4-1106,gpt-4-1106,gpt-4-1106,meta-llama-3.1-70b,meta-llama-3.1-70b,meta-llama-3.1-70b,mixtral-8x22b,mixtral-8x22b,mixtral-8x22b,qwen2.5-72b,qwen2.5-72b,qwen2.5-72b
task,mov,p-d,pap,mov,p-d,pap,mov,p-d,pap,mov,p-d,pap,mov,p-d,pap
cmp_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
humans,36 / 75,120 / 428,115 / 249,23 / 75,62 / 214,193 / 321,,,,,,,,,
gpt-3.5-turbo,254 / 382,150 / 210,69 / 143,251 / 358,167 / 180,429 / 620,254 / 387,194 / 220,78 / 154,279 / 374,138 / 220,81 / 154,236 / 392,204 / 215,86 / 163
gpt-4-1106,233 / 500,141 / 219,86 / 152,345 / 491,194 / 216,132 / 174,243 / 498,175 / 220,86 / 172,349 / 498,140 / 220,101 / 167,290 / 496,201 / 216,103 / 170
meta-llama-3.1-70b,269 / 500,122 / 220,104 / 166,368 / 500,169 / 220,152 / 183,277 / 500,141 / 220,104 / 186,354 / 500,130 / 220,117 / 186,329 / 496,164 / 219,121 / 185
mixtral-8x22b,323 / 466,167 / 220,115 / 163,315 / 435,209 / 220,147 / 183,292 / 452,200 / 220,105 / 182,342 / 453,168 / 220,117 / 184,302 / 455,215 / 220,111 / 182
qwen2.5-72b,179 / 498,138 / 220,81 / 166,306 / 500,196 / 220,143 / 184,210 / 500,160 / 220,93 / 186,347 / 499,132 / 220,105 / 186,347 / 500,204 / 220,109 / 185


In [5]:
# Format the pivot table data for LaTeX
def create_latex_table(
    df2_table,
    tasks=None,
    desc_models=None,
    cmp_models=None,
    with_common_defs=True,
    short_task_names=True,
):
    """
    Create a LaTeX table from the data with customizable filtering and formatting options.

    Args:
        df2_table: DataFrame with the source data
        tasks: List of task names to include (None = all)
        desc_models: List of description models to include (None = all)
        cmp_models: List of comparison models to include (None = all)
        with_common_defs: Whether to include color definitions
        short_task_names: Whether to use shortened task names
    """
    # Filter data if needed
    data = df2_table.copy()
    if tasks is not None:
        data = data[data["task"].isin(tasks)]
    if desc_models is not None:
        data = data[data["desc_model"].isin(desc_models)]
    if cmp_models is not None:
        data = data[data["cmp_model"].isin(cmp_models)]

    # Format items
    def fmt_item(value, p_value):
        if p_value >= 0.05:
            color = "\\textcolor{value-neutral}{"
            end_color = "}"
        elif value > 0.5:
            color = "\\textcolor{value-above}{"
            end_color = "}"
        else:
            color = "\\textcolor{value-below}{"
            end_color = "}"
        return f"{color}\\makecell[l]{{{value:.3f}\\\\{{\\footnotesize\\ \\ ({p_value:.3f})}}}}{end_color}"

    data["formatted"] = data.apply(lambda x: fmt_item(x["value"], x["p_value"]), axis=1)

    # Create pivot table
    latex_pivot = pd.pivot_table(
        data,
        values="formatted",
        index=["cmp_model"],
        columns=["desc_model", "task"],
        aggfunc="sum",
        fill_value="",
    )
    # Sort pivot2 with custom key to put "humans" first
    latex_pivot = latex_pivot.reindex(sorted(latex_pivot.index, key=lambda x: ("" if x == "humans" else x)))


    # Start building LaTeX string
    latex_str = ""
    if with_common_defs:
        latex_str = """
\\definecolor{value-above}{rgb}{0.7,0.1,0.1}
\\definecolor{value-below}{rgb}{0.1,0.1,0.7}
\\definecolor{value-neutral}{rgb}{0.3,0.3,0.3}

"""

    # Convert to LaTeX
    latex_str += latex_pivot.to_latex(
        column_format="l" + "p{1.0cm}" * len(latex_pivot.columns),
        multicolumn=True,
        multicolumn_format="c",
        header=True,
        bold_rows=True,
    )

    # Clean up LaTeX output
    latex_str = latex_str.replace("\\toprule", "\\hline")
    latex_str = latex_str.replace("\\midrule", "")
    latex_str = latex_str.replace("\\bottomrule", "\\hline")

    # Handle task names
    if short_task_names:
        replacements = {
            "mov": "{\\centering mov}",
            "p-d": "{\\centering p-d}",
            "p-pl": "{\\centering p-pl}",
            "pap": "{\\centering pap}",
        }
    else:
        replacements = {
            "mov": "{\\centering movie}",
            "p-d": "{\\centering prod. details}",
            "p-pl": "{\\centering prod. listing}",
            "pap": "{\\centering paper}",
        }

    for old, new in replacements.items():
        latex_str = latex_str.replace(old, new)

    # Add spacing between model groups
    model_groups = ["gpt-3.5-turbo", "gpt-4-1106", "meta-llama", "mixtral", "qwen"]
    for model in model_groups:
        latex_str = latex_str.replace(f"& {model}", f"\\\\[0.5em] & {model}")
    latex_str = latex_str.replace("meta-", "")

    # Replace headers
    latex_str = latex_str.replace("desc_model", "\\textbf{Generator}")
    latex_str = latex_str.replace("task", "\\textbf{Task}")
    latex_str = re.sub(r"cmp_model.*", r"\\\\[-2ex]\\hline\\\\[-2ex]", latex_str)
    return latex_str

print(create_latex_table(df2_table, tasks=["mov", "pap"], short_task_names=False))
print()
print(create_latex_table(df2_table, tasks=["p-d", "p-pl"], with_common_defs=False, short_task_names=False))


\definecolor{value-above}{rgb}{0.7,0.1,0.1}
\definecolor{value-below}{rgb}{0.1,0.1,0.7}
\definecolor{value-neutral}{rgb}{0.3,0.3,0.3}

\begin{tabular}{lp{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}p{1.0cm}}
\hline
\textbf{Generator} & \multicolumn{2}{c}{gpt-3.5-turbo} & \multicolumn{2}{c}{gpt-4-1106} & \multicolumn{2}{c}{llama-3.1-70b} & \multicolumn{2}{c}{mixtral-8x22b} & \multicolumn{2}{c}{qwen2.5-72b} \\
\textbf{Task} & {\centering movie} & {\centering paper} & {\centering movie} & {\centering paper} & {\centering movie} & {\centering paper} & {\centering movie} & {\centering paper} & {\centering movie} & {\centering paper} \\
\\[-2ex]\hline\\[-2ex]

\textbf{humans} & \textcolor{value-neutral}{\makecell[l]{0.480\\{\footnotesize\ \ (0.818)}}} & \textcolor{value-neutral}{\makecell[l]{0.462\\{\footnotesize\ \ (0.254)}}} & \textcolor{value-below}{\makecell[l]{0.307\\{\footnotesize\ \ (0.001)}}} & \textcolor{value-above}{\makecell[l]{0.601\\{\footnotesize\ \ (

In [6]:
s = df2.groupby(["name", "ftype", "desc_model", "cmp_model"]).size()
s[s > 1]

Series([], dtype: int64)

In [7]:
d = df2.groupby(["desc_model", "cmp_model", "title"], as_index=False)["size"].sum()
d

d = d[~d.title.str.contains("listing")]
d

Unnamed: 0,desc_model,cmp_model,title,size
0,gpt-3.5-turbo,gpt-3.5-turbo,movie,382
1,gpt-3.5-turbo,gpt-3.5-turbo,paper,143
2,gpt-3.5-turbo,gpt-3.5-turbo,product/details,210
3,gpt-3.5-turbo,gpt-4-1106,movie,500
4,gpt-3.5-turbo,gpt-4-1106,paper,152
5,gpt-3.5-turbo,gpt-4-1106,product/details,219
6,gpt-3.5-turbo,humans,movie,75
7,gpt-3.5-turbo,humans,paper,249
8,gpt-3.5-turbo,humans,product/details,428
9,gpt-3.5-turbo,meta-llama-3.1-70b,movie,500


In [9]:
for name, ftype in list(df[["name", "ftype"]].value_counts().index):
    print(name, "/", ftype)
    d = df2[(df2.ftype == ftype) & (df2.name == name)]
    d = d.groupby(["desc_model", "cmp_model"], as_index=False).agg({"value": "mean"})

    # Create pivot table
    tab = d.pivot(index="desc_model", columns="cmp_model", values="value")

    # Reorder columns to move 'humans' to the left
    if "humans" in tab.columns:
        columns = ["humans"] + [col for col in tab.columns if col != "humans"]
        tab = tab[columns]

    # Generate heatmap
    fig = px.imshow(
        tab,
        text_auto=".2f",
        color_continuous_scale="RdBu_r",
        height=700,
        color_continuous_midpoint=0.5,
    )
    fig.update_layout(
        plot_bgcolor="rgba(0, 0, 0, 0)", title=f"Heatmap for {name} / {ftype}"
    )
    fig.show()
    # save as pdf
    fig.write_image(f"visualizations/{name}_{ftype}_heatmap.pdf")

movie / from_title_and_year


FileNotFoundError: [Errno 2] No such file or directory: 'output/movie_from_title_and_year_heatmap.pdf'

In [None]:
df3 = df2[(df2.name == "product") & (df2.ftype == "from_json_details")]
df3

In [None]:
# Ensure df2 is not None and has the required column
if df2 is None:
    raise ValueError("df2 is None. Make sure it's properly defined.")

if "title" not in df2.columns:
    raise ValueError("Column 'title' is missing from df2.")

df = df2.copy()

# Ensure title column has valid data
if df["title"].isnull().all():
    raise ValueError("All values in 'title' column are NaN. Check the dataset.")

etype_order = {etype: i for i, etype in enumerate(df["title"].unique())}

# Ensure etype_order is not empty
if not etype_order:
    raise ValueError("etype_order is empty. The 'title' column may have no valid data.")

df["title_numeric"] = df["title"].map(etype_order)
df["xindex"] = df["title_numeric"] + df["offset"]

# Sort values for better plotting
df.sort_values("cmp_model", inplace=True)

# Define symbol map
syms = ["circle", "square", "diamond", "x", "triangle-left", "triangle-up", "triangle-down", "star", "pentagon"]
symbol_map = {name: s for name, s in zip(OFFSET_NAMES, syms)}

# Create scatter plot
fig = px.scatter(
    df,
    y="value",
    x="xindex",
    error_y="ci1",
    error_y_minus="ci0",
    symbol="cmp_model",
    color="cmp_model",
    facet_col="desc_model",
    facet_col_wrap=2,
    width=1400,
    height=600,  # Adjust height for two subplots
    labels={"xindex": "Dataset & Prompts", "gen_name": "Generator"},
    symbol_map=symbol_map,
)

# Ensure trace filtering does not fail
filtered_traces = []
for trace in fig.data:
    if hasattr(trace, "meta") and isinstance(trace.meta, dict) and "facet_row" in trace.meta:
        if trace.meta["facet_row"] < 2:
            filtered_traces.append(trace)

fig.data = filtered_traces

# Update trace markers and axes
fig.update_traces(marker=dict(size=10))
fig.update_yaxes(range=[0, 1], dtick=0.25)
fig.update_xaxes(
    tickvals=list(etype_order.values()),
    ticktext=list(etype_order.keys())
)

# Add horizontal lines
fig.add_hline(0.5, line_width=0.5)
fig.add_hline(1, line_width=1)
fig.add_hline(0.25, line_width=0.5, line_dash="10px 10px")
fig.add_hline(0.75, line_width=0.5, line_dash="10px 10px")

# Update axes and layout
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=False)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=False)

# Clean facet annotations
fig.for_each_annotation(lambda a: a.update(text=a.text.replace("=", " ")))

fig.show()


In [None]:
def compute_ci(value, size):
    alpha = value * size
    beta_ = (1 - value) * size
    a, b = scipy.stats.beta.interval(0.95, alpha, beta_)
    return value - a, b - value 

In [None]:
df = df2.copy()
df["cmp_model"] = df.cmp_model.apply(lambda x: "Humans" if x == "Humans" else "LLM")
def f1(x):
    return (x["value"] * x["size"]).sum() / x["size"].sum()
g = df.groupby(["desc_model", "cmp_model", "title"])
df = pd.DataFrame({"value": g.apply(f1, include_groups=False), "size": g["size"].sum()}).reset_index()
df[["ci0", "ci1"]] = df2.apply(lambda x: compute_ci(x["value"], x["size"]), axis=1, result_type='expand')
df["offset"] = df.cmp_model.apply(lambda x: -0.1 if x == "Humans" else 0)
df.sort_values("cmp_model")
etype_order = {etype: i for i, etype in sorted(enumerate(df["title"].unique()))}
df["title_numeric"] = df["title"].map(etype_order)
df["xindex"] = df["title_numeric"] + df["offset"]
syms = ["circle", "square", "diamond", "x", "triangle-left", "triangle-up", "triangle-down", "star", "pentagon"]
symbol_map = { name: s for name, s in zip(OFFSET_NAMES, syms) }
fig = px.scatter(df, y="value", x="xindex", error_y="ci1", error_y_minus="ci0", symbol="cmp_model", color="cmp_model", facet_col="desc_model", width=1400, height=400, labels={"xindex": "Dataset & Prompts", "gen_name": "Generator"}, symbol_map=symbol_map)
data = list(fig.data)
data.sort(key=lambda x: x.name)
fig.data = tuple(data)
fig.data
fig.update_traces(marker=dict(size=10))
fig.update_yaxes(range=[0, 1], dtick = 0.25)
fig.update_layout(
    plot_bgcolor="rgba(0, 0, 0, 0)",
)

fig.update_xaxes(
    tickvals=list(etype_order.values()),
    ticktext=list(etype_order.keys()))
fig.add_hline(0.5, line_width=0.5)
fig.add_hline(1, line_width=1)
fig.add_hline(0.25, line_width=0.5, line_dash="10px 10px")
fig.add_hline(0.75, line_width=0.5, line_dash="10px 10px")

fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=False)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=False)
fig.for_each_annotation(lambda a: a.update(text=a.text.replace("=", " ")))
#fig.update_layout(legend={"xanchor": "center", "x": 0.5, "y": 1.0})

In [None]:
df = df2.copy()
df["cmp_model"] = df.cmp_model.apply(lambda x: "Humans" if x == "Humans" else "LLM")
def f1(x):
    return (x["value"] * x["size"]).sum() / x["size"].sum()
g = df.groupby(["cmp_model", "title"])
df = pd.DataFrame({"value": g.apply(f1, include_groups=False), "size": g["size"].sum()}).reset_index()
df[["ci0", "ci1"]] = df2.apply(lambda x: compute_ci(x["value"], x["size"]), axis=1, result_type='expand')
df["offset"] = df.cmp_model.apply(lambda x: -0.1 if x == "Humans" else 0)
df.sort_values("cmp_model")
etype_order = {etype: i for i, etype in sorted(enumerate(df["title"].unique()))}
df["title_numeric"] = df["title"].map(etype_order)
df["xindex"] = df["title_numeric"] + df["offset"]
syms = ["circle", "square", "diamond", "x", "triangle-left", "triangle-up", "triangle-down", "star", "pentagon"]
symbol_map = { name: s for name, s in zip(OFFSET_NAMES, syms) }
fig = px.scatter(df, y="value", x="xindex", error_y="ci1", error_y_minus="ci0", symbol="cmp_model", color="cmp_model", width=500, height=400, labels={"xindex": "Dataset & Prompts", "gen_name": "Generator"}, symbol_map=symbol_map)
data = list(fig.data)
data.sort(key=lambda x: x.name)
fig.data = tuple(data)
fig.data
fig.update_traces(marker=dict(size=10))
fig.update_yaxes(range=[0, 1], dtick = 0.25)
fig.update_layout(
    plot_bgcolor="rgba(0, 0, 0, 0)",
)

fig.update_xaxes(
    tickvals=list(etype_order.values()),
    ticktext=list(etype_order.keys()))
fig.add_hline(0.5, line_width=0.5)
fig.add_hline(1, line_width=1)
fig.add_hline(0.25, line_width=0.5, line_dash="10px 10px")
fig.add_hline(0.75, line_width=0.5, line_dash="10px 10px")

fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=False)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=False)
fig.for_each_annotation(lambda a: a.update(text=a.text.replace("=", " ")))
#fig.update_layout(legend={"xanchor": "center", "x": 0.5, "y": 1.0})

In [None]:
df.offset