In [None]:
!pip install json5

In [None]:
import collections

import json5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from google.colab import drive
# from google.colab import userdata


# Mount Google Drive; the data paths defined below all live under /content/drive.
drive.mount('/content/drive')

In [None]:
# Root folder of the benchmark release on the mounted shared drive.
BASEDIR = '/content/drive/Shareddrives/Curie/benchmarks/public_release'
# JSON file with the aggregated evaluation results read by the cells below.
meta_results_path = "/".join(
    [BASEDIR, "eval_results", "up_to_date", "meta_results.json"]
)

# Tables

## Table 1

In [None]:
# Metrics kept for Table 1, keyed by (task, prompt). Text-generation prompts
# are scored with ROUGE-Lsum and BERTScore-F1; biogr and pdb have their own
# task-specific metric.
_TABLE1_TEXT_METRICS = ("rougeLsum", "bert_f1")
_TABLE1_TEXT_PROMPTS = [
    ("dft", "extract_structure_data_1_shot"),
    ("dft", "extract_dft_metadata_1_shot"),
    ("dft", "write_code_for_paper_0_shot"),
    ("mpv", "mat_paper_to_property_1_shot"),
    ("hfd", "derivation_prompt"),
    ("hfe", "extract_hamiltonian_0_shot"),
    ("qecc_65", "describe_code_in_paper"),
    ("geo", "extract_dataset_from_geo_papers_0_shot"),
]

# Each entry gets its own list so that mutating one does not affect the others.
TASKS_2_METRICS = {
    task_prompt: list(_TABLE1_TEXT_METRICS)
    for task_prompt in _TABLE1_TEXT_PROMPTS
}
TASKS_2_METRICS[("biogr", "georeference_image_0_shot")] = ["iou"]
TASKS_2_METRICS[("pdb", "reconstruct_protein_amino_acid_sequence_0_shot")] = [
    "identity_ratio"
]

# Load the nested results JSON (closing the file afterwards) and flatten it so
# that every leaf value becomes one row whose index label is the "$$"-joined
# path of keys leading to that leaf.
with open(meta_results_path, "r") as results_file:
  all_results = json5.load(results_file)
df = pd.json_normalize(all_results, sep="$$").transpose()

# Add separate columns for task, prompt, model, and example. The first four
# path components are read in that order; each label is split only once.
key_parts = [label.split("$$") for label in df.index]
df["task"] = [parts[0] for parts in key_parts]
df["prompt"] = [parts[1] for parts in key_parts]
df["model"] = [parts[2] for parts in key_parts]
df["example"] = [parts[3] for parts in key_parts]
def extract_metric(x):
  """Returns the fifth "$$"-separated field of `x.name`.

  Args:
    x: An object whose `name` attribute is a "$$"-joined key (for example a
      row passed by `DataFrame.apply(..., axis=1)`).

  Returns:
    The fifth field of the key, or the placeholder "xxxxx" when the key has
    four fields or fewer.
  """
  fields = x.name.split("$$")
  return fields[4] if len(fields) > 4 else "xxxxx"
df["metric"] = df.apply(extract_metric, axis=1)
df["value"] = df[0]

# Update the names (plain substring replacement, no regex).
df["task"] = df["task"].str.replace("mpve", "mpv", regex=False)

# Composite keys used for filtering and grouping.
df["task_prompt"] = df["task"] + "_" + df["prompt"]
df["task_prompt_metric"] = df["task_prompt"] + "_" + df["metric"]

df_len = len(df)

# Delete extra tasks, prompts, and metrics: keep only the combinations listed
# in TASKS_2_METRICS.
task_prompt_metric_keep = [
    "_".join([task, prompt, metric])
    for (task, prompt), metrics in TASKS_2_METRICS.items()
    for metric in metrics
]

df = df[df["task_prompt_metric"].isin(task_prompt_metric_keep)]
print(f"{df_len-len(df)} rows contain extra tasks and are dropped.")
df_len = len(df)

# Delete non numerical entries. The coerced numbers are stored back so that
# "value" is a numeric column (not object dtype) when it is aggregated below.
df = df.assign(value=pd.to_numeric(df["value"], errors="coerce")).dropna(
    subset=["value"]
)
print(f'{df_len-len(df)} rows contain non numerical entries and are dropped.')
df_len = len(df)

print(f"The final dataframe has {len(df)} rows.")

# Mean and standard deviation per (model, task, metric). The std of a
# single-row group is NaN and is reported as 0.
grouped = (
    df.groupby(["model", "task", "metric"])["value"]
    .agg(["mean", "std"])
    .fillna(0)
)

In [None]:
def print_row(model, table=None):
  """Formats one model's Table 1 scores as the cells of a LaTeX table row.

  The row holds rougeLsum and bert_f1 for dft, mpv, hfd, hfe, qecc_65 and geo
  (in that order), followed by iou for biogr and identity_ratio for pdb. Every
  score is the "mean" column rounded to two decimals. A score that is missing
  from the table is rendered as "-" so that the row always has 14 cells and
  the columns stay aligned.

  NOTE: a later cell of this notebook defines another `print_row` for Table 2,
  which replaces this one once it has been run.

  Args:
    model: Model identifier as it appears in the first index level.
    table: DataFrame indexed by (model, task, metric) with a "mean" column.
      Defaults to the notebook-level `grouped` frame.

  Returns:
    A string of the form "& c1 & c2 ... \\\\" terminated by a newline.
  """
  if table is None:
    table = grouped

  def format_cell(task, metric, report_missing):
    """Returns the rounded mean as text, or "-" when the entry is absent."""
    try:
      val = table.loc[(model, task, metric), "mean"]
    except KeyError:
      if report_missing:
        print(f'Missing metric: {metric}, task: {task}, model: {model}')
      return "-"
    return str(round(val, 2))

  metrics_strings = []
  for task in [
      "dft",
      "mpv",
      "hfd",
      "hfe",
      "qecc_65",
      "geo",
  ]:
    for metric in ["rougeLsum", "bert_f1"]:
      metrics_strings.append(format_cell(task, metric, report_missing=True))

  # Absent biogr/pdb entries are filled with "-" without a message.
  metrics_strings.append(format_cell('biogr', "iou", report_missing=False))
  metrics_strings.append(
      format_cell('pdb', "identity_ratio", report_missing=False)
  )

  return "& " + " & ".join(metrics_strings) + " \\\\\n"


# Begin LaTeX table
# The LaTeX source is written with raw strings so that every backslash is
# literal; this avoids the invalid-escape warnings that sequences such as
# "\c" or "\m" trigger in ordinary string literals. The assembled text is
# the same as before.
latex = r"""
\begin{table*}[!t]

\centering

\small
\setlength{\tabcolsep}{4pt}
\resizebox{\textwidth}{!}{\begin{tabular}{l | c c | c c |c c | c c | c c | c c | c | c }

\toprule

\multirow{2}{*}{\bf Method} & \multicolumn{2}{c|}{\bf \data\ DFT} & \multicolumn{2}{c|}{\bf \data\ MPV} &
\multicolumn{2}{c|}{\bf \data\ HFD} &
\multicolumn{2}{c|}{\bf \data\ HFE} &
\multicolumn{2}{c|}{\bf \data\ QECC} &
\multicolumn{2}{c|}{\bf \data\ GEO} &
{\bf \data\ BIOGR} &
{\bf \data\ PDB}
\\

& R-L & B-F1 & R-L & B-F1 & R-L & B-F1 & R-L & B-F1 & R-L & B-F1 & R-L & B-F1 & IoU & ID_{r} \\

\midrule
\multicolumn{15}{c}{\textit{Zero-shot Open Weight LLMs}} \\
\midrule

Mixtral % \cite{Mixtral}
"""
latex += print_row('mixtral-gcp')

# Remaining open-weight models: (row label, model key in the results).
open_weight_rows = [
    (r"Command-R$+$ %\cite{CommandR+} ", 'command-r-plus'),
    (r"LongLLaMa %\cite{LongLLaMa} ", 'longllama'),
]
for row_label, row_model in open_weight_rows:
  latex += row_label + "\n"
  latex += print_row(row_model)

# The commented-out LaTeX line below ends in a single literal backslash.
latex += r"""

\midrule
\multicolumn{15}{c}{\textit{Zero-shot Closed Weight LLMs}} \\
\midrule


%Gemini 1.0 Pro \cite{team2023gemini1.0} \
Gemini 1.0 Pro %\cite{team2023gemini}
"""
latex += print_row('gemini-1.0-pro')

# Remaining closed-weight models: (row label, model key in the results).
closed_weight_rows = [
    (r"GPT-4o %\cite{gpt4orelease} ", 'gpt-4o'),
    (r"Gemini 1.5 Pro %\cite{reid2024gemini1.5} ", 'gemini-1.5-pro-latest'),
    (r"Gemini 1.5 Flash %\cite{reid2024gemini1.5} ", 'gemini-1.5-flash-latest'),
    (r"Claude 3 (Opus) %\cite{claude3} ", 'claude-3-opus-20240229'),
]
for row_label, row_model in closed_weight_rows:
  latex += row_label + "\n"
  latex += print_row(row_model)

latex += r"""
\bottomrule
\end{tabular}}
\vspace{-2mm}
\caption{\textbf{Results comparing performance of all models on all tasks based on automated metrics} R-L: Rouge-L, and B-F1:BertScore-F1. The avg. performance of all 3 DFT tasks are reported under DFT. All models support a context length of 32k or more. BIOGR has multimodal inputs which is unsupported by the chosen open models. Blue highlights the highest values.}
\label{tab:main_results}
\vspace{-3mm}
\end{table*}"""

print(latex)

## Table 2

In [None]:
# Table 2 reports the three LMSim scores (F1, precision, recall) for each of
# the (task, prompt) pairs below. This rebinds TASKS_2_METRICS from Table 1.
_TABLE2_LMSIM_METRICS = ("LMSim-F1", "LMSim-Pr", "LMSim-Re")
_TABLE2_PROMPTS = [
    ("dft", "extract_structure_data_1_shot"),
    ("dft", "extract_dft_metadata_1_shot"),
    ("mpv", "mat_paper_to_property_1_shot"),
    ("mpv", "mat_paper_to_property_1_shot_exclude_trivia"),
    ("mpv", "mat_paper_to_property_1_shot_bandgap_refractive"),
]

# Each entry gets its own list so that mutating one does not affect the others.
TASKS_2_METRICS = {
    task_prompt: list(_TABLE2_LMSIM_METRICS) for task_prompt in _TABLE2_PROMPTS
}

# Reload the results (closing the file afterwards) and flatten them so that
# every leaf value is one row indexed by its "$$"-joined key path.
with open(meta_results_path, "r") as results_file:
  all_results = json5.load(results_file)
df = pd.json_normalize(all_results, sep="$$").transpose()


# Add separate columns for task, prompt, model, example, metric, and value.
# Each index label is split once; the first four components are read in order.
key_parts = [label.split("$$") for label in df.index]
df["task"] = [parts[0] for parts in key_parts]
df["prompt"] = [parts[1] for parts in key_parts]
df["model"] = [parts[2] for parts in key_parts]
df["example"] = [parts[3] for parts in key_parts]
# extract_metric is defined in the Table 1 cell above.
df["metric"] = df.apply(extract_metric, axis=1)
df["value"] = df[0]

# Update the namings (plain substring replacement, no regex).
df["task"] = df["task"].str.replace("mpve", "mpv", regex=False)


df["task_prompt"] = df["task"] + "_" + df["prompt"]
df["task_prompt_metric"] = df["task_prompt"] + "_" + df["metric"]

df_len = len(df)

# Delete extra tasks, prompts, and metrics: keep only the combinations listed
# in TASKS_2_METRICS.
task_prompt_metric_keep = [
    "_".join([task, prompt, metric])
    for (task, prompt), metrics in TASKS_2_METRICS.items()
    for metric in metrics
]

df = df[df["task_prompt_metric"].isin(task_prompt_metric_keep)]
print(f"{df_len-len(df)} rows contain extra tasks and are dropped.")
df_len = len(df)

# Coerce the scores to numbers so that a non-numeric payload cannot break the
# aggregation below. Such entries become NaN, which mean/std skip; the rows
# are kept, so a group without any numeric score is still reported (as 0).
df = df.assign(value=pd.to_numeric(df["value"], errors="coerce"))
print(
    f"{int(df['value'].isna().sum())} rows contain non numerical entries and"
    " are ignored by the aggregation."
)

print(f"The final dataframe has {len(df)} rows.")

# Mean and standard deviation per (model, task_prompt, metric); NaN is
# reported as 0.
grouped = (
    df.groupby(["model", "task_prompt", "metric"])["value"]
    .agg(["mean", "std"])
    .fillna(0)
)

In [None]:
# Generate a LaTeX table for precision, recall, and F1 score.

def print_row(model, table=None):
  """Formats one model's Table 2 scores as the cells of a LaTeX table row.

  For each of the five task/prompt combinations the row holds LMSim-Pr,
  LMSim-Re and LMSim-F1 (in that order), taken from the "mean" column,
  multiplied by 100 and rounded to two decimals. A score that is missing from
  the table is reported on stdout and rendered as "-" so that the row always
  has 15 cells and the columns stay aligned.

  NOTE: this definition replaces the Table 1 `print_row` defined earlier in
  the notebook.

  Args:
    model: Model identifier as it appears in the first index level.
    table: DataFrame indexed by (model, task_prompt, metric) with a "mean"
      column. Defaults to the notebook-level `grouped` frame.

  Returns:
    A string of the form "& c1 & c2 ... \\\\" terminated by a newline.
  """
  if table is None:
    table = grouped

  metrics_strings = []
  for task_prompt in [
      "dft_extract_structure_data_1_shot",
      "dft_extract_dft_metadata_1_shot",
      "mpv_mat_paper_to_property_1_shot",
      "mpv_mat_paper_to_property_1_shot_exclude_trivia",
      "mpv_mat_paper_to_property_1_shot_bandgap_refractive",
  ]:
    for metric in ["LMSim-Pr", "LMSim-Re", "LMSim-F1"]:
      try:
        val = table.loc[(model, task_prompt, metric), "mean"]
      except KeyError:
        print(f'Missing metric: {metric}, task_prompt: {task_prompt}, model: {model}')
        # Placeholder keeps the remaining cells under the right headers.
        metrics_strings.append("-")
      else:
        metrics_strings.append(str(round(100*val, 2)))

  return "& " + " & ".join(metrics_strings) + " \\\\\n"


# Begin LaTeX table
# The LaTeX source is written with raw strings so that every backslash is
# literal; this avoids the invalid-escape warnings that sequences such as
# "\c" or "\m" trigger in ordinary string literals. The assembled text is
# the same as before.
# NOTE(review): the tabular below declares 16 columns (l + 15 c) while the
# section headings span \multicolumn{15}; confirm this is intended.
latex = r"""
\begin{table*}[!th]
\centering
\small
\setlength{\tabcolsep}{4pt}
\resizebox{\textwidth}{!}{\begin{tabular}{l | c c c | c c c |c c c | c c c | c c c }
\toprule

\multirow{2}{*}{\bf Model} &
\multicolumn{3}{c|}{\bf \data\ DFT-S} &
\multicolumn{3}{c|}{\bf \data\ DFT-P} &
\multicolumn{3}{c|}{\bf \data\ MPV} &
\multicolumn{3}{c|}{\bf \data\ MPV-non-trivial} &
\multicolumn{3}{c}{\bf \data\ MPV-specific}
\\

& Pr. & Rec. & F1 & Pr. & Rec. & F1 & Pr. & Rec. & F1 & Pr. & Rec. & F1 & Pr. & Rec. & F1  \\

\midrule
\multicolumn{15}{c}{\textit{Zero-shot Open Weight LLMs}} \\
\midrule
Mixtral %\cite{Mixtral}
"""
latex += print_row('mixtral-gcp')

# Remaining open-weight models: (row label, model key in the results).
open_weight_rows = [
    (r"Command-R$+$ %\cite{CommandR+} ", 'command-r-plus'),
    (r"LongLLaMa %\cite{LongLLaMa} ", 'longllama'),
]
for row_label, row_model in open_weight_rows:
  latex += row_label + "\n"
  latex += print_row(row_model)

# The commented-out LaTeX line below ends in a single literal backslash.
latex += r"""

\midrule
\multicolumn{15}{c}{\textit{Zero-shot Closed Weight LLMs}} \\
\midrule


%Gemini 1.0 Pro \cite{team2023gemini1.0} \
Gemini 1.0 Pro %\cite{team2023gemini}
"""
latex += print_row('gemini-1.0-pro')

# Remaining closed-weight models: (row label, model key in the results).
closed_weight_rows = [
    (r"GPT-4o %\cite{gpt4orelease} ", 'gpt-4o'),
    (r"Gemini 1.5 Pro %\cite{reid2024gemini1.5} ", 'gemini-1.5-pro-latest'),
    (r"Gemini 1.5 Flash %\cite{reid2024gemini1.5} ", 'gemini-1.5-flash-latest'),
    (r"Claude 3 (Opus) %\cite{claude3} ", 'claude-3-opus-20240229'),
]
for row_label, row_model in closed_weight_rows:
  latex += row_label + "\n"
  latex += print_row(row_model)

latex += r"""
\bottomrule
\end{tabular}}
\vspace{-1.5mm}
\caption{\textbf{Comparing performance using \lmsim.} On sub-tasks requiring exhaustive retrieval of information we use \lmsim \ based similarity to compute compute F1 scores for finer grained assessment on materials science. We also include 2 ablations for the MPV task where we ask the LLM to retrieve non-trivial or specific property values (refractive index and optical bandgap) for materials. Command-R$+$ responses on MPV papers were incomplete leading to invalid json dictionaries. We find the precision and recall values to match human evaluations on the MPV tasks for Gemini 1.5 pro and GPT-4o.}
\label{tab:matsci_results}
\vspace{-2mm}
\end{table*}"""

print(latex)

# Figures

In [None]:
# Metrics kept for the figures, keyed by (task, prompt). The original note
# here says the first metric of each list is assumed to be the main score.
# NOTE(review): the figure cells below select their score through
# TASKS_2_METRICS_FIGS instead - confirm which of the two is authoritative.

_FIG_LMSIM_METRICS = ["LMSim-F1", "LMSim-Pr", "LMSim-Re"]
_FIG_TEXT_METRICS = ["rougeLsum", "bert_f1"]

# `+` and list() build a fresh list for every entry.
TASKS_2_METRICS = {
    ("dft", "extract_structure_data_1_shot"): _FIG_LMSIM_METRICS + _FIG_TEXT_METRICS,
    ("dft", "extract_dft_metadata_1_shot"): _FIG_LMSIM_METRICS + _FIG_TEXT_METRICS,
    ("dft", "write_code_for_paper_0_shot"): list(_FIG_TEXT_METRICS),
    ("mpv", "mat_paper_to_property_1_shot"): _FIG_LMSIM_METRICS + _FIG_TEXT_METRICS,
    ("hfd", "derivation_prompt"): list(_FIG_TEXT_METRICS),
    ("hfe", "extract_hamiltonian_0_shot"): list(_FIG_TEXT_METRICS),
    ("qecc_65", "describe_code_in_paper"): list(_FIG_TEXT_METRICS),
    ("geo", "extract_dataset_from_geo_papers_0_shot"): list(_FIG_TEXT_METRICS),
    ("biogr", "georeference_image_0_shot"): ["iou"],
    ("pdb", "reconstruct_protein_amino_acid_sequence_0_shot"): ["identity_ratio"],
}
# Display title for each task key. The plotting functions below use the keys
# as the x-axis order and the values as the tick labels.
TASKS_2_TITLES = {
    "dft": "DFT",
    "mpv": "MPVE",
    "hfd": "HFD",
    "hfe": "HFE",
    "qecc_65": "QECC",
    "geo": "GEO",
    "biogr": "BIOGR",
    "pdb": "PDB",
}

# Example identifiers that are removed from the results because of license
# issues. The filter below matches on the "example" column only, without
# checking the task.
GEO_BAD_LICENSE = [
    "4d5c098bd142b2356e5485f7e3786255aa636073",
    "40cb6b737064b0881c536512d61817dbe79a3da4",
    "bb871818b3e903ba70b5e90929a575cd018e0b2b",
    "cea27d393e1b9dcfab5f8f9f2fbd33e5bdd96e76",
    "e10fd24fe75f5c10d54698a0d141dc9151cb2535",
    "32ecb10ab170fa193ea879e1f63ce8ae5d7b9f34",
    "bfde3d73c1df8980a0c3915e627236ce818d42c7",
    "00f8c2660ea4795d25e8e801fc831bb9dcf64022",
    "5a7a518d77aee623be48bbee6538fdbd77c26238",
    "375bbe6ac4c3f3c16ddddaea2464f9f2e112e00a",
    "8349632fbb06bbce22012097f1030d1c53a8e57b",
    "467f0fdc420f5cd8996c0b2b1eb33a3dcda93c5e",
    "5c0e2e83c0d8d4e5d86ab77bea49c62ac77ab9e9",
]

In [None]:
# Load the meta_results.json and convert it to dataframe

# The file is closed as soon as it has been parsed. After flattening, every
# leaf value is one row indexed by its "$$"-joined key path.
with open(meta_results_path, 'r') as results_file:
  all_results = json5.load(results_file)
df = pd.json_normalize(all_results, sep='$$').transpose()

# Add separate columns for task, prompt, model, example, metric, and value.
# Each index label is split once; the first four components are read in order.
key_parts = [label.split('$$') for label in df.index]
df['task'] = [parts[0] for parts in key_parts]
df['prompt'] = [parts[1] for parts in key_parts]
df['model'] = [parts[2] for parts in key_parts]
df['example'] = [parts[3] for parts in key_parts]
# extract_metric is defined in the Table 1 cell above.
df["metric"] = df.apply(extract_metric, axis=1)
df['value'] = df[0]

# Update the namings. regex=False makes these literal substring replacements
# regardless of the pandas default, so the "." in model names is not treated
# as a wildcard.
df['task'] = df['task'].str.replace('mpve', 'mpv', regex=False)
model_display_names = {
    'gpt-4o': 'GPT-4o',
    'claude-3-opus-20240229': 'Claude 3 (Opus)',
    'gemini-1.5-pro-latest': 'Gemini 1.5 Pro',
    'gemini-1.0-pro': 'Gemini 1.0 Pro',
    'gemini-1.5-flash-latest': 'Gemini 1.5 Flash',
    'longllama': 'LongLLaMA',
    'mixtral-gcp': 'Mixtral-8x7b',
    'command-r-plus': 'Command R+',
}
for raw_name, display_name in model_display_names.items():
  df['model'] = df['model'].str.replace(raw_name, display_name, regex=False)

df['task_prompt'] = df['task'] + '_' + df['prompt']
df['task_prompt_metric'] = df['task_prompt'] + '_' + df['metric']

df_len = len(df)
print(f'The dataframe has \033[1m{df_len}\033[0m rows.')

# Delete extra tasks, prompts, and metrics: keep only the combinations listed
# in TASKS_2_METRICS.
task_prompt_metric_keep = [
    '_'.join([task, prompt, metric])
    for (task, prompt), metrics in TASKS_2_METRICS.items()
    for metric in metrics
]

df = df[df['task_prompt_metric'].isin(task_prompt_metric_keep)]
print(f'\033[1m{df_len-len(df)}\033[0m rows contain extra tasks/prompts/metrics and are dropped.')
df_len = len(df)

# Delete non numerical entries. The coerced numbers are stored back so that
# "value" is a numeric column for the arithmetic and plots below.
df = df.assign(value=pd.to_numeric(df['value'], errors='coerce')).dropna(
    subset=['value']
)
print(f'\033[1m{df_len-len(df)}\033[0m rows contain non numerical entries and are dropped.')
df_len = len(df)

# Delete geo examples with license issues. The trailing copy() makes df an
# independent frame so that the in-place update below is unambiguous.
df = df[~df['example'].isin(GEO_BAD_LICENSE)].copy()
print(f'\033[1m{df_len-len(df)}\033[0m rows contain license issues and are dropped.')
df_len = len(df)

# Divide the RougeLsum by 100
df.loc[df['metric'] == 'rougeLsum', 'value'] /= 100

print(f'The final dataframe has \033[1m{len(df)}\033[0m rows.')

In [None]:
# Sanity check: count the non-null entries of every column, per task.
counts = df.groupby('task').count()

# The tasks present in df must be exactly the tasks named in TASKS_2_METRICS.
expected_tasks = {task for task, _prompt in TASKS_2_METRICS}
assert set(counts.index) == expected_tasks

counts

In [None]:
# Sanity check: list the distinct metrics that remain for each task after
# filtering, to compare against the metrics requested in TASKS_2_METRICS.

df.groupby('task')['metric'].unique()

## Figure 6: barplots for all tasks

In [None]:
# Order in which models appear in the figures (display names as produced by
# the renaming step in the loading cell).
model_order = [
    'Claude 3 (Opus)',
    'Gemini 1.5 Pro',
    'GPT-4o',
    'Gemini 1.5 Flash',
    'Gemini 1.0 Pro',
    'Command R+',
    'Mixtral-8x7b',
    'LongLLaMA',
]
# Eight colours, passed as the palette wherever model_order sets the order.
palette = [
    '#D9886C',
    '#E7B7A0',
    '#B3D1DF',
    '#89A8C0',
    '#6DA78B',
    '#346A67',
    '#E0C085',
    '#C49B3C',
]

# NOTE(review): palette2 is not referenced anywhere in the visible notebook.
palette2 = [
    '#58508d',
    '#bc5090',
    '#ff6361',
    '#ffa600',
]

# Three colours, used for the difficulty hue of Figure 8.
palette3 = [
    '#BA68C8',
    '#7986CB',
    '#B2DFDB',
]

# The single metric reported as "Score" for each task in the figures.
TASKS_2_METRICS_FIGS = {
    "dft": "rougeLsum",
    "mpv": "rougeLsum",
    "hfd": "rougeLsum",
    "hfe": "rougeLsum",
    "qecc_65": "rougeLsum",
    "geo": "rougeLsum",
    "biogr": "iou",
    "pdb": "identity_ratio",
}

# Six colours, used for the benchmark hue of Figure 1.
palette4 = [
    '#ffa600',
    '#58508d',
    '#bc5090',
    '#ff6361',
    '#7986CB',
    '#B2DFDB',
]

In [None]:
# Font sizes shared by the figures below.
label_font_size = 16
tick_label_size = 14
# NOTE(review): legend_font_size is not referenced in the visible notebook.
legend_font_size = 14

def _draw_task_barplot(df_plot, y, n_tasks, legend_kwargs, ylim=None):
  """Draws the grouped per-task bar plot shared by plot_metric and plot_score.

  Args:
    df_plot: Rows to plot; must contain the columns 'task', 'model' and `y`.
    y: Name of the column holding the plotted values.
    n_tasks: Number of consecutive bars that receive the same hatch pattern.
    legend_kwargs: Keyword arguments forwarded to `plt.legend`.
    ylim: Optional upper limit of the y-axis; the lower limit is then 0.

  Returns:
    The matplotlib Axes the bars were drawn on.
  """
  sns.set_theme(style='whitegrid')
  plt.figure(figsize=(20, 6))

  ax = sns.barplot(
      x='task',
      y=y,
      estimator=pd.Series.mean,
      errorbar=('pi', 50),
      data=df_plot,
      hue='model',
      palette=palette,
      capsize=0.05,
      errwidth=0.75,
      hue_order=model_order,
      # Concrete lists instead of dict views for the order and tick labels.
      order=list(TASKS_2_TITLES.keys()),
  )

  ax.set_xlabel('Task', fontsize=label_font_size)
  ax.set_ylabel('Score', fontsize=label_font_size)
  ax.tick_params(labelsize=tick_label_size)
  ax.set_xticklabels(list(TASKS_2_TITLES.values()), rotation=0)

  # Add hatch patterns to specific bars. Each pattern is repeated n_tasks
  # times; zip() stops at the shorter sequence, so bars beyond these seven
  # groups keep no hatch.
  # NOTE(review): this assumes ax.patches lists all bars of one model before
  # those of the next - confirm for the installed seaborn version.
  hatch_per_group = ['o', '', '', '', '//', '', '\\']
  hatches = [hatch for hatch in hatch_per_group for _ in range(n_tasks)]
  for bar, hatch in zip(ax.patches, hatches):
    bar.set_hatch(hatch)

  plt.legend(**legend_kwargs)
  if ylim is not None:
    plt.ylim([0, ylim])
  return ax


def plot_metric(metric, df, ylim=None):
  """Shows a bar plot of one metric, grouped by task and coloured by model.

  Args:
    metric: Value of the 'metric' column to plot.
    df: Results frame with 'task', 'model', 'metric' and 'value' columns.
    ylim: Optional upper limit of the y-axis.
  """
  # copy() so that adding a column does not write into a slice of `df`.
  df_metric = df[df['metric'] == metric].copy()
  df_metric[metric] = df_metric['value']

  _draw_task_barplot(
      df_metric,
      y=metric,
      n_tasks=len(df['task'].unique()),
      legend_kwargs=dict(loc='upper left', bbox_to_anchor=(1, 1)),
      ylim=ylim,
  )
  plt.show()


def plot_score(df, ylim=None):
  """Shows and saves (Figure_6.png) the per-task score of every model.

  For each task only the metric named in TASKS_2_METRICS_FIGS is plotted.

  Args:
    df: Results frame with 'task', 'model', 'metric' and 'value' columns.
    ylim: Optional upper limit of the y-axis.
  """
  df_score = pd.concat([
      df[(df['task'] == task) & (df['metric'] == metric)]
      for task, metric in TASKS_2_METRICS_FIGS.items()
  ])
  df_score['Score'] = df_score['value']

  _draw_task_barplot(
      df_score,
      y='Score',
      n_tasks=len(df['task'].unique()),
      legend_kwargs=dict(loc='upper left', ncol=8),
      ylim=ylim,
  )
  plt.savefig('Figure_6.png', format='png', dpi=300)
  plt.show()

In [None]:
# Draw Figure 6 with the y-axis limited to [0, 0.9]; plot_score also saves it
# as Figure_6.png.
plot_score(df, 0.9)

## Figure 2: average over tasks

In [None]:
# Keep, for every task, only the rows of the metric that serves as its score.
df_list = [
    df[(df['task'] == task) & (df['metric'] == metric)]
    for task, metric in TASKS_2_METRICS_FIGS.items()
]
df_score = pd.concat(df_list)
df_score['Score'] = df_score['value']

# One row per model and one ('mean', task) column per task, followed by the
# row-wise average over the task columns and the model name as a column.
per_model_task = df_score.groupby(['model', 'task'])['Score']
df_mean = per_model_task.agg(["mean"]).unstack()
df_mean['Average Score'] = df_mean.mean(axis=1)
df_mean['model'] = df_mean.index


In [None]:
# If a cell is NaN, we don't have any prediction for that domain with the model.
# Display the per-task means together with the 'Average Score' column.
df_mean

In [None]:
# Figure 2: horizontal bars of each model's average score, in model_order.
ax = sns.barplot(
    data=df_mean,
    y='model',
    x='Average Score',
    order=model_order,
    palette=palette,
)

ax.set_xlabel('Average Score', fontsize=label_font_size)
ax.set_ylabel('', fontsize=label_font_size)
ax.tick_params(labelsize=tick_label_size)
ax.set_xlim(0, 0.3)
plt.xticks(fontsize=12)
plt.tight_layout()
plt.savefig('Figure_2.png', format='png', dpi=300)
plt.show()


## Figure 1: Compare models on CURIE and other benchmarks (Gemini 1.0 vs Gemini 1.5; Claude 3 Opus, Gemini 1.5 Pro and GPT-4o)

In [None]:
# https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf
# NOTE: the next cell rebinds `metrics` and `other_benchmarks`, so the frame
# built here is replaced before anything is plotted.

# CURIE average score of each model, rescaled from [0, 1] to [0, 100].
curie_scores = {
    name: float(
        df_mean[df_mean['model'] == name]['Average Score'].values[0]
    ) * 100
    for name in ('Gemini 1.0 Pro', 'Gemini 1.5 Pro')
}

metrics = {
    1: {
        'Model': 'Gemini 1.0 Pro',
        'CURIE': curie_scores['Gemini 1.0 Pro'],
        'DROP': 74.9,  # Variable shots
        'GPQA': 27.9,  # 4-shot
        'MMLU': 71.8,  # 5-shot
    },
    2: {
        'Model': 'Gemini 1.5 Pro',
        'CURIE': curie_scores['Gemini 1.5 Pro'],
        'DROP': 74.1,  # Variable shots
        'GPQA': 46.2,  # 0-shot
        'MMLU': 85.9,  # 5-shot
    },
}

# One row per model, one column per benchmark.
other_benchmarks = pd.DataFrame(metrics).transpose()

In [None]:
# https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf
# NOTE(review): the GPT-4o entries for GPQA, MMLU-pro and MathVista are
# identical to the Claude 3 (Opus) entries - verify them against their sources.
# NOTE(review): the shot-count comments are kept from the original cell -
# confirm they apply to these benchmarks.

# CURIE average score of each model, rescaled from [0, 1] to [0, 100].
curie_scores = {
    name: float(
        df_mean[df_mean['model'] == name]['Average Score'].values[0]
    ) * 100
    for name in ('Claude 3 (Opus)', 'Gemini 1.5 Pro', 'GPT-4o')
}

metrics = {
    1: {
        'Model': 'Claude 3 (Opus)',
        'CURIE': curie_scores['Claude 3 (Opus)'],
        'ZeroScrolls': 39.07,  # Variable shots
        'GPQA': 50.4,  # 4-shot
        'MMLU-pro': 76.12,  # 5-shot
        'MathVista': 50.5,
        'RULER': 89,
    },
    2: {
        'Model': 'Gemini 1.5 Pro',
        'CURIE': curie_scores['Gemini 1.5 Pro'],
        'ZeroScrolls': np.nan,
        'GPQA': 46.2,  # 4-shot
        'MMLU-pro': 69.03,  # 5-shot
        'MathVista': 63.9,
        'RULER': 95.5,
    },
    3: {
        'Model': 'GPT-4o',
        'CURIE': curie_scores['GPT-4o'],
        'ZeroScrolls': 41.67,  # Variable shots
        'GPQA': 50.4,  # 4-shot
        'MMLU-pro': 76.12,  # 5-shot
        'MathVista': 50.5,
        'RULER': np.nan,
    },
}

# One row per model, one column per benchmark; NaN marks a missing score.
other_benchmarks = pd.DataFrame(metrics).transpose()

In [None]:
# Inspect the model names available in df_mean (the lookups above match on them).
df_mean['model']

In [None]:
# Inspect the wide benchmark table before it is melted for plotting.
other_benchmarks

In [None]:
# Reshape from one column per benchmark to long format (Model, Benchmark,
# Score). This rebinds other_benchmarks, so re-run the cell that builds the
# wide table before running this cell again.
other_benchmarks = pd.melt(
    other_benchmarks,
    id_vars=['Model'],
    var_name='Benchmark',
    value_name='Score',
)

In [None]:
# Figure 1: one group of bars per model, one bar per benchmark.
benchmark_order = ['CURIE', 'ZeroScrolls', 'GPQA', 'MathVista', 'MMLU-pro', 'RULER']
ax = sns.barplot(
    data=other_benchmarks,
    x='Model',
    y='Score',
    hue='Benchmark',
    hue_order=benchmark_order,
    palette=palette4,
)

# Get the legend handles and labels
handles, labels = ax.get_legend_handles_labels()

# Modify the legend labels: append what each benchmark measures.
labels_info = {
    'RULER': 'Long Context',
    'MMLU-pro': 'Understanding',
    'MathVista': 'Reasoning in Visual Context',
    'GPQA': 'Science Expertise',
    'ZeroScrolls': "Long Context",
    'CURIE': 'Science + Long Context (ours)',
}
labels = [f'{benchmark}: {labels_info[benchmark]}' for benchmark in labels]

# Set the legend labels; the legend sits above the axes in two columns.
ax.legend(
    handles,
    labels,
    loc='upper center',
    ncol=2,
    bbox_to_anchor=(0.5, 1.35),
    borderaxespad=0.,
)
ax.set_xlabel('')

plt.tight_layout()

plt.savefig('Figure_1.png', format='png', dpi=300)
plt.show()

## Figure 8: separate results by difficulty levels

In [None]:
import json5
import pandas as pd

In [None]:
# Load the difficulty level json file (closing it after parsing).
# Expected layout, as consumed below: {task: {example_id: difficulty}}.

with open(f"{BASEDIR}/data/difficulty_levels.json") as difficulty_file:
  difficulty_json = json5.load(difficulty_file)

# One small frame per task with the columns difficulty / task / example.
df_lists = []
for task, difficulties in difficulty_json.items():
  df_lists.append(
      pd.DataFrame({
          'difficulty': list(difficulties.values()),
          'task': task,
          'example': list(difficulties.keys()),
      })
  )

df_diff = pd.concat(df_lists)
# Align task names with the ones used in the results frame. Only the task
# column is rewritten, so difficulty and example values are never touched.
df_diff['task'] = df_diff['task'].replace({'qecc_85': 'qecc_65', 'mpve': 'mpv'})
# Join key used to attach difficulties to the scores.
df_diff['task_example'] = df_diff['task'] + '_' + df_diff['example']

In [None]:
# Add difficulty levels to the df_score

# Build the same "<task>_<example>" key as df_diff and left-join on it, so
# every score row is kept. Both frames carry 'task' and 'example', so after
# the merge those columns get the suffixes _x (scores) and _y (difficulties).
df_score['task_example'] = df_score['task'] + '_' + df_score['example']
df_score = df_score.merge(df_diff, how='left', on=['task_example'])

# Quick check that they merged properly: count rows per difficulty value.
collections.Counter(df_score['difficulty'].tolist())

In [None]:
# Average scores over all tasks, separated by example difficulties
# .str.lower() leaves missing difficulties (rows left unmatched by the left
# merge) as NaN instead of raising on a non-string value.
df_score['difficulty'] = df_score['difficulty'].str.lower()


def _mean_scores_for_difficulty(level):
  """Returns per-model, per-task mean scores for one difficulty level.

  Args:
    level: Lower-case difficulty label to select ('easy', 'medium' or 'hard').

  Returns:
    A frame indexed by model with one ('mean', task) column per task, plus an
    'Average Score' column (row-wise mean over the task columns) and a 'model'
    column.
  """
  per_task = (
      df_score[df_score['difficulty'] == level]
      .groupby(['model', 'task_x'])['Score']
      .agg(["mean"])
      .unstack()
  )
  per_task['Average Score'] = per_task.mean(axis=1)
  per_task['model'] = per_task.index
  return per_task


df_mean_hard = _mean_scores_for_difficulty('hard')
df_mean_medium = _mean_scores_for_difficulty('medium')
df_mean_easy = _mean_scores_for_difficulty('easy')

In [None]:
# Mean score per (model, difficulty), pivoted so that each difficulty is a
# column. NOTE: this rebinds df_mean, replacing the per-task averages above.
difficulty_means = (
    df_score.groupby(['model', 'difficulty'])['Score']
    .agg(['mean'])
    .unstack()
    .reset_index()
)

# Flatten the two-level column labels: ('model', '') -> 'model_',
# ('mean', 'easy') -> 'mean_easy', and so on.
difficulty_means.columns = ['_'.join(levels) for levels in difficulty_means.columns]
difficulty_means = difficulty_means.rename(
    columns={
        'model_': 'model',
        'mean_easy': 'Easy',
        'mean_medium': 'Medium',
        'mean_hard': 'Hard',
    }
)

# Long format for plotting: one row per (model, difficulty).
df_mean = difficulty_means.melt(
    id_vars=['model'],
    value_vars=['Easy', 'Medium', 'Hard'],
    var_name='difficulty',
    value_name='mean score',
)

In [None]:
# Figure 8: mean score per model, with one bar per difficulty level.
plt.figure(figsize=(12, 5))
ax = sns.barplot(
    data=df_mean,
    x='model',
    y='mean score',
    hue='difficulty',
    order=model_order,
    palette=palette3,
)
plt.tight_layout()
plt.savefig('Figure_8.png', format='png', dpi=300)
plt.show()

## Figure 35: identity ratio versus protein sequence length

In [None]:
import os
import glob
# Rows of the PDB task. copy() makes this an independent frame, so the columns
# added to it in the next cell are not written into a slice of df.
df_pdb = df[df['task'] == 'pdb'].copy()

In [None]:
gt_pattern = f"{BASEDIR}/data/pdb/ground_truth/*.json"

# Read every ground-truth file (closing each one after parsing) and record the
# length of its 'sequence' entry, keyed by the file name up to '.json'.
gt_pdb = {}
for path in glob.glob(gt_pattern):
  example = os.path.basename(path).split('.json')[0]
  with open(path, 'r') as gt_file:
    gt_json = json5.load(gt_file)
  gt_pdb[example] = {
      'json': gt_json,
      'sequence_length': len(gt_json['sequence']),
  }

# Attach the sequence length to every PDB row (an example without a ground
# truth file raises KeyError) and bin the lengths into 20 quantile bins.
# assign() returns new frames instead of writing into df_pdb in place.
df_pdb = df_pdb.assign(
    sequence_length=df_pdb['example'].map(
        lambda example_id: gt_pdb[example_id]['sequence_length']
    )
)
df_pdb = df_pdb.assign(
    sequence_length_bins=pd.qcut(df_pdb['sequence_length'], q=20)
)

# Set the theme once, then create a single figure. The duplicate
# plt.figure call used to leave a stray empty figure behind.
sns.set_theme(style='darkgrid', font_scale=1.5)
plt.figure(figsize=(21, 7))

sns.boxplot(
    df_pdb[df_pdb['metric'] == 'identity_ratio'],
    y='value',
    x='sequence_length_bins',
)
plt.ylim([0, 1])
plt.xticks(rotation=90)
plt.savefig('pdb_supp.png', format='png', bbox_inches='tight', dpi=300)

plt.show()