# Results

This notebook shows how to parse the result files so that the reported numbers can be reproduced.

In [1]:
import os
import sys
import pickle

import pandas as pd
import numpy as np

In [2]:
from perturbqa import load_de, auc_per_gene

## Evaluating individual predictions

Example of parsing individual files in `results.zip`

In [3]:
# Keep only the "test" entry of what load_de returns for "k562"; each item is
# read as item["label"] further down.
labels = load_de("k562")["test"]

In [4]:
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# open result files obtained from a source you trust.
with open("../results/gears/k562.pkl", "rb") as f:
    preds = pickle.load(f)

In [5]:
preds.keys()

dict_keys(['key', 'de_pred', 'de_true', 'dir_pred', 'dir_true'])

In [6]:
# The keys may be stored under either "key" or "de_key"; "key" wins when present.
de_key = preds["key"] if "key" in preds else preds["de_key"]
de_pred, de_true = preds["de_pred"], preds["de_true"]

In [7]:
# Score the stored predictions against the "de_true" values saved in the same file.
auc_per_gene(de_key, de_pred, de_true)

0.5451993240694406

In [8]:
# Same call, but with the truth values rebuilt from `labels` instead of the
# pickle's "de_true"; the identical score in the output indicates that the two
# sources agree for this file.
auc_per_gene(de_key, de_pred, [item["label"] for item in labels])

0.5451993240694406

## Summary results

Example code for producing Table 1 (without uncertainties).

In [9]:
fp = "../results/summary.csv"

In [10]:
df_results = pd.read_csv(fp)

In [11]:
def format_row(mean, is_best):
    """Render one metric value as an inline-math LaTeX table cell.

    Args:
        mean: Numeric value to show; formatted with two decimal places.
        is_best: When truthy, the number is wrapped in ``\\textbf{...}``.

    Returns:
        A string of the form ``$0.62$`` or ``$\\textbf{0.62}$``.
    """
    value = f"{mean:.2f}"
    if is_best:
        value = f"\\textbf{{{value}}}"
    return f"${value}$"

In [12]:
# Each pair is (model key matched against the "model" column of the results
# table, LaTeX label printed in the table). A (None, None) pair is not a model:
# it marks where a \cmidrule is printed between groups of rows.
names, latex_names = (
    list(column)
    for column in zip(*[
        ("MLP", "\\textsc{Mlp}"),
        ("GAT", "\\textsc{Gat}"),
        ("GEARS", "\\textsc{Gears}"),
        (None, None),
        ("GenePT-Gene", "\\textsc{GenePt-Gene}"),
        ("GenePT-Protein", "\\textsc{GenePt-Prot}"),
        ("LLM-NoCoT", "\\textsc{Llm} (No CoT)"),
        ("LLM-0-Shot", "\\textsc{Llm} (No retrieval)"),
        ("LLM-Prompt", "Retrieval (No \\textsc{Llm})"),
        (None, None),
        ("LLM-FewShot", "\\ours{}"),
    ])
)

# Values matched against the "task" and "dataset" columns; the dataset order is
# also the column order of the printed table.
tasks = ["de", "dir"]
datasets = [
    "k562_gw",
    "rpe1_essential",
    "hepg2",
    "jurkat",
    "k562_gw_path",
]

In [13]:
df = df_results

# model_metrics[(dataset, task, model)] -> mean of the "AUC" column over the
#                                          rows matching that combination
# best_metrics[(dataset, task)]         -> highest such mean across all models
model_metrics = {}
best_metrics = {}
for dataset in datasets:
    for task in tasks:
        means = []
        for model in names:
            if model is None:  # None entries are table separators, not models
                continue
            selected = df.loc[
                (df["dataset"] == dataset)
                & (df["task"] == task)
                & (df["model"] == model),
                "AUC",
            ]
            if len(selected) == 0:
                # Surface missing results instead of letting a NaN slip by unnoticed.
                print(f"no rows for dataset={dataset} task={task} model={model}")
            # The mean of an empty selection is NaN; float() keeps it as NaN so
            # the gap stays visible in model_metrics.
            mean = float(selected.mean())
            model_metrics[(dataset, task, model)] = mean
            means.append(mean)
        # Rank only real values: max() over a list that contains NaN depends on
        # element order, and max() of an empty list raises ValueError.
        valid_means = [m for m in means if not np.isnan(m)]
        best_metrics[(dataset, task)] = max(valid_means) if valid_means else float("nan")

In [14]:
# Print the LaTeX table body: one group of rows per task, one row per model.
for task in tasks:
    for model, latex_name in zip(names, latex_names):
        if model is None:
            # Separator entry: print a partial rule instead of a data row.
            print("\\cmidrule(l{\\tabcolsep}){1-7}")
            continue
        cells = []
        for dataset in datasets:
            score = model_metrics[(dataset, task, model)]
            # Bold every score that np.isclose considers equal to the best one
            # for this dataset/task (absolute tolerance 0.005).
            bold = np.isclose(score, best_metrics[(dataset, task)], atol=0.005)
            cells.append(f"& {format_row(score, bold)}")
        # Leading "&" leaves the first table column empty.
        print("&" + latex_name + "".join(cells) + "\\\\")
    print("\\midrule\\midrule")

&\textsc{Mlp}& $0.48$& $0.51$& $0.51$& $0.49$& $0.49$\\
&\textsc{Gat}& $0.53$& $0.55$& $0.58$& $0.51$& $0.55$\\
&\textsc{Gears}& $0.55$& $0.49$& $0.48$& $0.51$& $0.50$\\
\cmidrule(l{\tabcolsep}){1-7}
&\textsc{GenePt-Gene}& $0.57$& $0.55$& $0.56$& $0.55$& $0.58$\\
&\textsc{GenePt-Prot}& $0.59$& $0.56$& $0.54$& $0.55$& $0.59$\\
&\textsc{Llm} (No CoT)& $0.52$& $0.52$& $0.51$& $0.52$& $0.51$\\
&\textsc{Llm} (No retrieval)& $0.52$& $0.48$& $0.49$& $0.49$& $0.50$\\
&Retrieval (No \textsc{Llm})& $0.58$& $\textbf{0.59}$& $0.56$& $0.56$& $\textbf{0.66}$\\
\cmidrule(l{\tabcolsep}){1-7}
&\ours{}& $\textbf{0.62}$& $\textbf{0.59}$& $\textbf{0.62}$& $\textbf{0.59}$& $0.62$\\
\midrule\midrule
&\textsc{Mlp}& $0.53$& $0.51$& $0.49$& $0.45$& $0.48$\\
&\textsc{Gat}& $0.59$& $0.60$& $0.63$& $0.58$& $0.51$\\
&\textsc{Gears}& $0.67$& $0.61$& $0.52$& $0.52$& $\textbf{0.70}$\\
\cmidrule(l{\tabcolsep}){1-7}
&\textsc{GenePt-Gene}& $0.49$& $0.61$& $0.61$& $0.57$& $0.54$\\
&\textsc{GenePt-Prot}& $0.55$& $0.61$& $