In [1]:
# Copyright 2023 InstaDeep Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import neptune
from tabulate import tabulate

from visualise_utils import (get_unique_description)
import pandas as pd

# Set the chart type and dataset to plot
chart_type = "config_table" 

dataset_dfs = {}
datasets = ["medqa", "mmlu", "pubmedqa", "cosmosqa", "ciar", "gpqa"]
run_range = [2244, 2538] #[2244, 2410]  # All: [2244, 2410]  # Faithful runs: [2244, 2323]
sparse_legend = True

# Initialize Neptune
API_TOKEN = os.environ["TM_NEPTUNE_API_TOKEN"]
project = neptune.init_project(
    project="InstaDeep/debatellm",
    mode="read-only",
)

# ADD TRUEM- to the beginning of the run ids
full_run_ids = [f"TRUEM-{run_id}" for run_id in range(run_range[0], run_range[1] + 1)]
runs_table_df = project.fetch_runs_table(id=full_run_ids).to_pandas()

# Set the index based on the custom names
runs_table_df = runs_table_df.set_index("sys/id")

# Filter out all runs not initiated by k.tessara
runs_table_df = runs_table_df[runs_table_df["sys/owner"].isin(["k.tessera", "ap.smit"])]

# Filter out all runs that where less than 80% completed
runs_table_df = runs_table_df[runs_table_df["eval/percent_complete"] >= 100.0]

# Filter out all runs that are not from the current dataset
runs_table_df = runs_table_df[runs_table_df["config/dataset/eval_dataset"].isin(datasets)]

# Discard all keys with monitoring in their name.
runs_table_df = runs_table_df[
    [key for key in runs_table_df.keys() if "eval/" in key or "config/" in key]
]
# dataset_dfs[dataset] = runs_table_df


https://app.neptune.ai/InstaDeep/debatellm/


In [2]:
pd.set_option("display.precision", 2)

results = {"mmlu" : [], "pubmedqa" : [], "medqa" : []}
for unique_id in runs_table_df.index:
    
    score = runs_table_df["eval/score/total_acc"][unique_id]
    cost = runs_table_df["eval/total_cost"][unique_id]
    dataset = runs_table_df["config/dataset/eval_dataset"][unique_id]
    id = unique_id

    description = get_unique_description(
        runs_table_df, unique_id, include_prompts=True
    )

    examples = runs_table_df["config/system/use_few_shot_examples"][unique_id]
    incorrectly_parsed_Agent_0 = runs_table_df["eval/Agent_0/any_incorrectly_parsed_answer"][unique_id]
    
    # Round score and cost
    score = round(score, 2)
    cost = round(cost, 2)
    system_name, description = description.split(" - ", 1)
    if "single_agent" in description: 
        system_name = "Single Agent"

    if "1:1" in description:
        print("REMOVED: ", unique_id)
        continue

    #Remove the Improved Multi-Persona Experiments
    if 'TRUEM-2359' in unique_id:
        continue

    description, agent_prompt = description.split(", agent prompt: ", 1)
    description, debate_prompt = description.split(", debate prompt: ", 1)

    if not pd.isna(examples) and examples: 
        agent_prompt = agent_prompt +" + FS"

    if system_name == "Single Agent":
        debate_prompt = "-"

    if "PaLM" in description:
        agent = "PaLM"
    else:
        agent = "GPT3.5"

    results[dataset].append((system_name, debate_prompt, agent_prompt, description, agent, unique_id, score, cost, incorrectly_parsed_Agent_0))


REMOVED:  TRUEM-2475
REMOVED:  TRUEM-2474
REMOVED:  TRUEM-2384


In [3]:
[(key, len(results[key])) for key in results.keys()]


[('mmlu', 71), ('pubmedqa', 52), ('medqa', 55)]

In [4]:
results_df = {}
for dataset in datasets:
    column_names = ["System Name", "Debate Prompt", "Agent Prompt", "Config", "Agents", "NeptuneID", "Score (%s)" %dataset.upper(), "Cost \$ (%s)" %dataset.upper(), "Agent0: Incorrectly Parsed"]
    
    df = pd.DataFrame(results[dataset], columns = column_names).sort_values(["System Name", "Debate Prompt", "Agent Prompt", "Agents", "Score (%s)" %dataset.upper()])
    df["ID (%s)" % dataset.upper()] = df["NeptuneID"]
    df["IP (%s)" % dataset.upper()] = df["Agent0: Incorrectly Parsed"]
    
    df.drop(columns=["NeptuneID", "Agent0: Incorrectly Parsed"],inplace=True)
    results_df[dataset] = df.drop_duplicates(subset=["System Name", "Debate Prompt", "Agent Prompt", "Config", "Agents"], keep='last') # sorted in increasing score order, so this is the largest score
    print("Dataset = ", dataset, "Original shape = ", df.shape, "Shape after merging = ", results_df[dataset].shape)

Dataset =  medqa Original shape =  (55, 9) Shape after merging =  (50, 9)
Dataset =  mmlu Original shape =  (71, 9) Shape after merging =  (50, 9)
Dataset =  pubmedqa Original shape =  (52, 9) Shape after merging =  (50, 9)


KeyError: 'cosmosqa'

In [None]:
# results_df = {}
# for dataset in datasets:
#     column_names = ["System Name", "Debate Prompt", "Agent Prompt", "Config", "Agents", "NeptuneID", "Score (%s)" %dataset.upper(), "Cost \$ (%s)" %dataset.upper()]
    
#     df = pd.DataFrame(results[dataset], columns = column_names).sort_values(["System Name", "Debate Prompt", "Agent Prompt", "Agents", "Score (%s)" %dataset.upper()])
#     df["ID (%s)" % dataset.upper()] = df["NeptuneID"]
#     df.drop(columns=["NeptuneID"],inplace=True)
#     results_df[dataset] = df.drop_duplicates(subset=["System Name", "Debate Prompt", "Agent Prompt", "Config", "Agents"], keep='last') # sorted in increasing score order, so this is the largest score
#     print("Dataset = ", dataset, "Original shape = ", df.shape, "Shape after merging = ", results_df[dataset].shape)

In [None]:
temp = pd.merge(results_df["medqa"], results_df["pubmedqa"], how="outer", on=["System Name", "Debate Prompt", "Agent Prompt", "Config", "Agents"])
temp.shape

(50, 13)

In [None]:
merged = pd.merge(temp, results_df["mmlu"], how="outer", on=["System Name", "Debate Prompt", "Agent Prompt", "Config", "Agents"]).sort_values(["System Name", "Debate Prompt"])
temp.shape, results_df["mmlu"].shape, merged.shape

((50, 13), (50, 9), (50, 17))

In [None]:
merged[merged["System Name"] == "Multi-Persona"].head()

Unnamed: 0,System Name,Debate Prompt,Agent Prompt,Config,Agents,Score (MEDQA),Cost \$ (MEDQA),ID (MEDQA),IP (MEDQA),Score (PUBMEDQA),Cost \$ (PUBMEDQA),ID (PUBMEDQA),IP (PUBMEDQA),Score (MMLU),Cost \$ (MMLU),ID (MMLU),IP (MMLU)
22,Multi-Persona,tsinghua_ma_debate,angel,2 rounds max,GPT3.5,0.49,14.27,TRUEM-2320,,0.57,7.15,TRUEM-2402,,0.63,0.33,TRUEM-2489,
23,Multi-Persona,tsinghua_ma_debate,angel,4 rounds max,GPT3.5,0.5,14.7,TRUEM-2319,,0.6,9.52,TRUEM-2404,,0.67,0.33,TRUEM-2491,
24,Multi-Persona,tsinghua_ma_debate,angel,3 rounds max,GPT3.5,0.51,14.6,TRUEM-2318,,0.59,8.49,TRUEM-2403,,0.63,0.34,TRUEM-2490,


In [None]:
merged.to_csv("ip_file.csv")

In [None]:
merged = merged[['System Name', 'Debate Prompt', 'Agent Prompt', 'Config', 'Agents', 'Score (MEDQA)', 'Cost \$ (MEDQA)', 'ID (MEDQA)', 'Score (MMLU)', 'Cost \$ (MMLU)', 'ID (MMLU)', 'Score (PUBMEDQA)', 'Cost \$ (PUBMEDQA)', 'ID (PUBMEDQA)']]
# merged = merged[['System Name', 'Debate Prompt', 'Agent Prompt', 'Config', 'Agents', 'Score (MEDQA)', 'Cost \$ (MEDQA)', 'Score (MMLU)', 'Cost \$ (MMLU)', 'Score (PUBMEDQA)', 'Cost \$ (PUBMEDQA)']]
merged

Unnamed: 0,System Name,Debate Prompt,Agent Prompt,Config,Agents,Score (MEDQA),Cost \$ (MEDQA),ID (MEDQA),Score (MMLU),Cost \$ (MMLU),ID (MMLU),Score (PUBMEDQA),Cost \$ (PUBMEDQA),ID (PUBMEDQA)
0,ChatEval,chateval_ma_debate,cot,"3 rounds, one_by_one",GPT3.5,0.53,34.81,TRUEM-2441,0.7,0.73,TRUEM-2493,0.76,12.6,TRUEM-2406
1,ChatEval,chateval_ma_debate,cot,"2 rounds, simultaneous_talk",GPT3.5,0.54,19.9,TRUEM-2302,0.73,0.43,TRUEM-2494,0.74,7.27,TRUEM-2407
2,ChatEval,chateval_ma_debate,cot,"3 rounds, simultaneous_talk_with_summarizer",GPT3.5,0.55,36.78,TRUEM-2429,0.73,0.84,TRUEM-2497,0.77,14.01,TRUEM-2410
3,ChatEval,chateval_ma_debate,cot,"2 rounds, one_by_one",GPT3.5,0.55,20.55,TRUEM-2414,0.69,16.28,TRUEM-2362,0.76,7.64,TRUEM-2405
4,ChatEval,chateval_ma_debate,cot,"2 rounds, simultaneous_talk_with_summarizer",GPT3.5,0.55,22.58,TRUEM-2304,0.7,0.53,TRUEM-2496,0.72,8.76,TRUEM-2409
5,ChatEval,chateval_ma_debate,cot,"3 rounds, simultaneous_talk",GPT3.5,0.57,33.26,TRUEM-2303,0.83,0.72,TRUEM-2495,0.77,12.14,TRUEM-2408
6,Ensemble Refinement,er_debate,er_few_shot,"3:9 - ER, GPT",GPT3.5,0.53,53.57,TRUEM-2271,0.63,1.26,TRUEM-2469,0.72,21.0,TRUEM-2439
7,Ensemble Refinement,er_debate,er_few_shot,"3:1 - ER, GPT",GPT3.5,0.53,17.83,TRUEM-2269,0.67,0.42,TRUEM-2467,0.72,7.0,TRUEM-2437
8,Ensemble Refinement,er_debate,er_few_shot,"5:0 - self_consistency, GPT",GPT3.5,0.53,22.28,TRUEM-2261,0.7,0.53,TRUEM-2459,0.7,8.75,TRUEM-2430
9,Ensemble Refinement,er_debate,er_few_shot,"5:0 - self_consistency, PaLM",PaLM,0.46,6.36,TRUEM-2260,0.5,0.15,TRUEM-2458,0.76,2.5,TRUEM-2428


In [None]:
merged.to_csv("file.csv")

In [None]:
df = merged

In [None]:
debate_prompts = {
    "chateval_ma_debate":"CE MAD",
    "er_debate" : "ER MAD",
    "er_debate" : "ER MAD",
    "er_debate_cot" : "ER MAD CoT", 
    "tsinghua_ma_debate" : "MP MAD",
    "-" : "-",
    "google_ma_debate" : "SoM MAD"    
}
agent_prompts={"cot": "CoT",
               "er_few_shot" : "FS",
               "er_few_shot + FS" : "FS+EG",
               "er_cot": "CoT",
               "er_cot + FS": "FS-CoT",
               "angel" : "ANGEL+DEVIL",
               "cot": "CoT",
               "simple": "SIMPLE",
               "spp_original" : "SPP"
}

In [None]:
df["Debate Prompt"] = df["Debate Prompt"].map(lambda x: debate_prompts[x])
df["Agent Prompt"] = df["Agent Prompt"].map(lambda x: agent_prompts[x])


In [None]:
#df["Debate Config"] = 
df["Config"] = df["Config"].str.replace(", GPT","")\
            .str.replace("GPT","")\
            .str.replace(", PaLM","")\
            .str.replace("PaLM","")\
            .str.replace(" - ER","")\
            .str.replace("1:0 - single agent", "")\
            .str.replace("1:0 - single_agent", "")\
            .str.replace("5:0 - self_consistency", "self consistency: reasoning=5")\
            .str.replace("3:1", "reasoning=3, aggregation=1")\
            .str.replace("3:9", "reasoning=3, aggregation=9")\
            .str.replace("_", " ")
df.sort_values(by=["Debate Prompt", "Agent Prompt", "Agents"], inplace=True)
df

Unnamed: 0,System Name,Debate Prompt,Agent Prompt,Config,Agents,Score (MEDQA),Cost \$ (MEDQA),ID (MEDQA),Score (MMLU),Cost \$ (MMLU),ID (MMLU),Score (PUBMEDQA),Cost \$ (PUBMEDQA),ID (PUBMEDQA)
25,Single Agent,-,CoT,,GPT3.5,0.51,4.46,TRUEM-2245,0.65,3.82,TRUEM-2307,0.77,1.75,TRUEM-2389
27,Single Agent,-,CoT,,GPT3.5,0.49,4.46,TRUEM-2249,0.67,0.11,TRUEM-2447,0.74,1.75,TRUEM-2479
26,Single Agent,-,CoT,,PaLM,0.14,1.28,TRUEM-2248,0.3,0.03,TRUEM-2445,0.42,0.5,TRUEM-2388
28,Single Agent,-,CoT,,PaLM,0.17,1.27,TRUEM-2247,0.33,0.03,TRUEM-2442,0.38,0.5,TRUEM-2505
31,Single Agent,-,FS,,GPT3.5,0.53,4.46,TRUEM-2253,0.7,0.11,TRUEM-2451,0.71,1.75,TRUEM-2421
32,Single Agent,-,FS,,PaLM,0.46,1.27,TRUEM-2252,0.5,0.03,TRUEM-2450,0.75,0.5,TRUEM-2419
33,Single Agent,-,FS+EG,,GPT3.5,0.54,6.37,TRUEM-2255,0.8,0.11,TRUEM-2453,0.7,2.61,TRUEM-2423
34,Single Agent,-,FS+EG,,PaLM,0.47,1.91,TRUEM-2254,0.7,0.03,TRUEM-2452,0.65,0.86,TRUEM-2422
29,Single Agent,-,FS-CoT,,GPT3.5,0.56,6.37,TRUEM-2251,0.7,0.15,TRUEM-2449,0.79,2.6,TRUEM-2417
30,Single Agent,-,FS-CoT,,PaLM,0.49,1.91,TRUEM-2250,0.63,0.04,TRUEM-2448,0.71,0.86,TRUEM-2513


In [None]:
df = df[['System Name', 'Debate Prompt', 'Agent Prompt', 'Config', 'Agents', 'Score (MEDQA)', 'Cost \$ (MEDQA)', 'Score (MMLU)', 'Cost \$ (MMLU)', 'Score (PUBMEDQA)', 'Cost \$ (PUBMEDQA)']]

In [6]:
table = df.to_latex(float_format="{:0.2f}".format, index=False, longtable=True, sparsify=True, multirow=True)
print(table)

\begin{longtable}{lllllrrlr}
\toprule
System Name & Debate Prompt & Agent Prompt & Config & Agents & Score (PUBMEDQA) & Cost \$ (PUBMEDQA) & ID (PUBMEDQA) & IP (PUBMEDQA) \\
\midrule
\endfirsthead
\toprule
System Name & Debate Prompt & Agent Prompt & Config & Agents & Score (PUBMEDQA) & Cost \$ (PUBMEDQA) & ID (PUBMEDQA) & IP (PUBMEDQA) \\
\midrule
\endhead
\midrule
\multicolumn{9}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
ChatEval & chateval_ma_debate & cot & 2 rounds, simultaneous_talk_with_summarizer & GPT3.5 & 0.72 & 8.76 & TRUEM-2409 & 0.00 \\
ChatEval & chateval_ma_debate & cot & 2 rounds, simultaneous_talk & GPT3.5 & 0.74 & 7.27 & TRUEM-2407 & 0.00 \\
ChatEval & chateval_ma_debate & cot & 3 rounds, one_by_one & GPT3.5 & 0.76 & 12.60 & TRUEM-2406 & 0.00 \\
ChatEval & chateval_ma_debate & cot & 2 rounds, one_by_one & GPT3.5 & 0.76 & 7.64 & TRUEM-2405 & 0.00 \\
ChatEval & chateval_ma_debate & cot & 3 rounds, simultaneous_talk_with_summarizer & GPT3.5 