In [13]:
import pandas as pd
import numpy as np

# Load data

In [14]:
def _load_run(path):
    """Read one run's JSON results and add a 0/1 ``passed`` column.

    ``passed`` is 1 where the row's ``status`` equals "passed", else 0.
    """
    run = pd.read_json(path)
    run["passed"] = run["status"].eq("passed").astype("int")
    return run


df_1 = _load_run("experiment_1_run_1.json")
df_2 = _load_run("experiment_1_run_2.json")
df_3 = _load_run("experiment_1_run_3.json")

In [15]:
# Start from run 1, discard the columns not used in the analysis, and derive:
#   prompt - last character of the stripped `name`
#   passed - number of runs (0-3) in which the row passed
# NOTE(review): the sum aligns the three runs by row index, so it presumes
# every run lists the same rows in the same order - confirm.
df = (
    df_1
    .drop(
        columns=[
            "_id",
            "__v",
            "instructions_prompt",
            "collection",
            "answer",
            "log",
            "messages",
            "number_of_response_messages",
            "average_message_length",
            "comment",
        ]
    )
    .assign(
        prompt=lambda frame: frame["name"].str.strip().str[-1],
        passed=df_1["passed"] + df_2["passed"] + df_3["passed"],
    )
)

In [16]:
# How many rows passed in 0, 1, 2 or all 3 of the runs.
df.passed.value_counts()

passed
3    894
2     70
0     42
1     34
Name: count, dtype: int64

In [17]:
# Show which columns remain in the working frame.
df.columns

Index(['name', 'model', 'visualization', 'question_item', 'task', 'question',
       'dataset', 'status', 'passed', 'prompt'],
      dtype='object')

# Aggregate Runs

## Which questions are answered incorrectly in every run?

In [18]:
# Rows with a pass count of 0, i.e. not passed in any of the runs.
df.loc[
    df["passed"].eq(0),
    ["question_item", "visualization", "task", "model", "prompt", "question"],
]

Unnamed: 0,question_item,visualization,task,model,prompt,question
149,7,Bar Chart,Find Extremum,gpt-4o-mini,3,In which country is the average internet speed...
160,8,Bar Chart,Determine Range,gpt-4o,1,What is the range of the average internet spee...
164,8,Bar Chart,Determine Range,gpt-4o,2,What is the range of the average internet spee...
172,8,Bar Chart,Determine Range,gpt-4o,4,What is the range of the average internet spee...
177,8,Bar Chart,Determine Range,gpt-4o-mini,5,What is the range of the average internet spee...
184,9,Bar Chart,Make Comparisons,gpt-4o,2,How many countries in Asia is the average inte...
188,9,Bar Chart,Make Comparisons,gpt-4o,3,How many countries in Asia is the average inte...
192,9,Bar Chart,Make Comparisons,gpt-4o,4,How many countries in Asia is the average inte...
262,51,Bubble Chart,Find Anomalies,o1,1,Which city's metro system does lie outside the...
264,51,Bubble Chart,Find Anomalies,gpt-4o,2,Which city's metro system does lie outside the...


## Which questions are answered inconsistently? (sometimes right, sometimes wrong)

In [19]:
# Rows that passed in some runs but not all (pass count of 1 or 2).
# The row limit is raised only inside this block so the full list is shown.
with pd.option_context("display.max_rows", 1000):
    display(
        df.loc[
            df["passed"].isin([1, 2]),
            ["question_item", "visualization", "task", "model", "prompt", "question"],
        ]
    )

Unnamed: 0,question_item,visualization,task,model,prompt,question
49,18,100% Stacked Bar Chart,Make Comparisons,gpt-4o-mini,3,The percentage of silver medalists from Austra...
61,35,Area Chart,Retrieve Value,gpt-4o-mini,1,What was the average price of a pound of cofee...
141,7,Bar Chart,Find Extremum,gpt-4o-mini,1,In which country is the average internet speed...
144,7,Bar Chart,Find Extremum,gpt-4o,2,In which country is the average internet speed...
145,7,Bar Chart,Find Extremum,gpt-4o-mini,2,In which country is the average internet speed...
157,7,Bar Chart,Find Extremum,gpt-4o-mini,5,In which country is the average internet speed...
161,8,Bar Chart,Determine Range,gpt-4o-mini,1,What is the range of the average internet spee...
163,8,Bar Chart,Determine Range,o1-mini,1,What is the range of the average internet spee...
165,8,Bar Chart,Determine Range,gpt-4o-mini,2,What is the range of the average internet spee...
167,8,Bar Chart,Determine Range,o1-mini,2,What is the range of the average internet spee...


## Apply the final passed value (1 only if all three runs passed)

In [20]:
# Collapse the per-run results into one 0/1 flag: a row counts as passed only
# when it passed in every run.
#
# The pass count is recomputed from the per-run frames instead of being read
# back from df["passed"]. Reading it back made this cell non-idempotent: after
# the first execution the column holds 0/1, so a second execution would compare
# those flags against the run count and set every row to 0.
run_frames = [df_1, df_2, df_3]
run_pass_counts = (
    df_1["passed"] + df_2["passed"] + df_3["passed"]
).reindex(df.index)  # align to df's rows, as a column assignment would
df["passed"] = np.where(run_pass_counts == len(run_frames), 1, 0)
df.passed.value_counts()

passed
1    894
0    146
Name: count, dtype: int64

# Analysis
## By Vis, Model and Prompt Version

In [21]:
# Count failed (0) and passed (1) rows per visualization / model / prompt.
vis = (
    df.groupby(["visualization", "model", "prompt", "passed"])
    .size()
    .unstack(fill_value=0)
    # unstack only creates a column for outcomes present in the data; pin both
    # so the hit-rate below does not raise KeyError when every row passed (or
    # every row failed).
    .reindex(columns=[0, 1], fill_value=0)
)
vis.columns.name = None
# Share of rows in each group whose passed flag is 1.
vis["hit_rate"] = vis[1] / (vis[1] + vis[0])
vis = vis.reset_index()

vis.to_csv("df_vis.csv", index=False)
vis

Unnamed: 0,visualization,model,prompt,0,1,hit_rate
0,100% Stacked Bar Chart,gpt-4o,1,0,3,1.0
1,100% Stacked Bar Chart,gpt-4o,2,0,3,1.0
2,100% Stacked Bar Chart,gpt-4o,3,0,3,1.0
3,100% Stacked Bar Chart,gpt-4o,4,0,3,1.0
4,100% Stacked Bar Chart,gpt-4o,5,0,3,1.0
...,...,...,...,...,...,...
235,Treemap,o1-mini,1,0,3,1.0
236,Treemap,o1-mini,2,0,3,1.0
237,Treemap,o1-mini,3,0,3,1.0
238,Treemap,o1-mini,4,0,3,1.0


## By Task, Model and Prompt Version

In [22]:
# Count failed (0) and passed (1) rows per task / model / prompt.
task = (
    df.groupby(["task", "model", "prompt", "passed"])
    .size()
    .unstack(fill_value=0)
    # unstack only creates a column for outcomes present in the data; pin both
    # so the hit-rate below does not raise KeyError when every row passed (or
    # every row failed).
    .reindex(columns=[0, 1], fill_value=0)
)
task.columns.name = None
# Share of rows in each group whose passed flag is 1.
task["hit_rate"] = task[1] / (task[1] + task[0])
task = task.reset_index()

task.to_csv("df_task.csv", index=False)
task

Unnamed: 0,task,model,prompt,0,1,hit_rate
0,Determine Range,gpt-4o,1,1,4,0.800000
1,Determine Range,gpt-4o,2,1,4,0.800000
2,Determine Range,gpt-4o,3,1,4,0.800000
3,Determine Range,gpt-4o,4,1,4,0.800000
4,Determine Range,gpt-4o,5,0,5,1.000000
...,...,...,...,...,...,...
155,Retrieve Value,o1-mini,1,1,12,0.923077
156,Retrieve Value,o1-mini,2,1,12,0.923077
157,Retrieve Value,o1-mini,3,1,12,0.923077
158,Retrieve Value,o1-mini,4,1,12,0.923077


## By Difficulty, Model and Prompt Version

In [23]:
# Question items assigned to each difficulty level.
difficulty_mapping = {
    "easy": [
        1, 2, 4, 6, 7, 17, 20, 21, 23, 25, 32, 38, 42, 44, 56, 57, 61,
    ],
    "moderate": [
        3, 5, 8, 12, 14, 18, 19, 22, 27, 28, 29, 33, 34, 35, 48, 51, 52, 54, 59,
    ],
    "hard": [
        9, 10, 11, 15, 16, 24, 31, 36, 37, 40, 41, 45, 46, 47, 49, 53, 55, 60,
    ],
}

# Inverted view of the mapping: question item -> difficulty level.
lookup_table = {
    item: level
    for level, items in difficulty_mapping.items()
    for item in items
}

# Direct dict lookup per row; a question item missing from the mapping
# raises KeyError rather than being silently left unlabelled.
df["Difficulty"] = df["question_item"].apply(lookup_table.__getitem__)

In [24]:
# Count failed (0) and passed (1) rows per difficulty / model / prompt.
difficulty = (
    df.groupby(["Difficulty", "model", "prompt", "passed"])
    .size()
    .unstack(fill_value=0)
    # unstack only creates a column for outcomes present in the data; pin both
    # so the hit-rate below does not raise KeyError when every row passed (or
    # every row failed).
    .reindex(columns=[0, 1], fill_value=0)
)
difficulty.columns.name = None
# Share of rows in each group whose passed flag is 1.
difficulty["hit_rate"] = difficulty[1] / (difficulty[1] + difficulty[0])
difficulty = difficulty.reset_index()

difficulty.to_csv("df_difficulty.csv", index=False)
difficulty

Unnamed: 0,Difficulty,model,prompt,0,1,hit_rate
0,easy,gpt-4o,1,2,15,0.882353
1,easy,gpt-4o,2,3,14,0.823529
2,easy,gpt-4o,3,5,12,0.705882
3,easy,gpt-4o,4,3,14,0.823529
4,easy,gpt-4o,5,3,14,0.823529
5,easy,gpt-4o-mini,1,4,13,0.764706
6,easy,gpt-4o-mini,2,3,14,0.823529
7,easy,gpt-4o-mini,3,5,12,0.705882
8,easy,gpt-4o-mini,4,5,12,0.705882
9,easy,gpt-4o-mini,5,4,13,0.764706
