In [1]:
import pandas as pd
from pathlib import Path
import os.path
import re
import numpy as np

In [2]:
# Directories holding the single-run and the cross-validation result CSVs.
SINGLE_RES_ROOT = Path("../data/multi_modal_ppi_results", "single_run")
CV_RES_ROOT = Path("../data/multi_modal_ppi_results", "cross_validation")
# Directory that receives every CSV written by this notebook.
PUB_RES_ROOT = Path("../results/")

# Both run types use the same per-dataset file names.
SINGLE_HPRD_PATH = SINGLE_RES_ROOT.joinpath("hprd_run.csv")
SINGLE_BIOINFER_PATH = SINGLE_RES_ROOT.joinpath("bioinfer_run.csv")
CV_HPRD_PATH = CV_RES_ROOT.joinpath("hprd_run.csv")
CV_BIOINFER_PATH = CV_RES_ROOT.joinpath("bioinfer_run.csv")

In [3]:
# Load the four result tables; each one is indexed by its "Name" column.
single_hprd_df, single_bioinfer_df, cv_hprd_df, cv_bioinfer_df = (
    pd.read_csv(csv_path, index_col="Name")
    for csv_path in (
        SINGLE_HPRD_PATH,
        SINGLE_BIOINFER_PATH,
        CV_HPRD_PATH,
        CV_BIOINFER_PATH,
    )
)

In [4]:
# Write the public copies of the result tables: the columns listed here are
# removed before each table is saved under PUB_RES_ROOT.
single_run_dropped_columns = [
    "Source Name",
    "best_checkpoint_0fold",
    "dataset_csv_path",
    "dataset_feature_tsv_path",
    "dataset_pdb_path",
    "User",
]
# Cross-validation tables carry one checkpoint column per fold (0-4).
cv_run_dropped_columns = (
    ["Source Name"]
    + [f"best_checkpoint_{fold}fold" for fold in range(5)]
    + ["dataset_csv_path", "dataset_feature_tsv_path", "dataset_pdb_path", "User"]
)

public_exports = [
    (single_hprd_df, single_run_dropped_columns, "single_hprd_results.csv"),
    (single_bioinfer_df, single_run_dropped_columns, "single_bioinfer_results.csv"),
    (cv_hprd_df, cv_run_dropped_columns, "cv_hprd_results.csv"),
    (cv_bioinfer_df, cv_run_dropped_columns, "cv_bioinfer_results.csv"),
]
for results_df, dropped_columns, file_name in public_exports:
    results_df.drop(columns=dropped_columns).to_csv(PUB_RES_ROOT / file_name)

In [5]:
def get_num_data_version(name: str, pat=re.compile(r"gene_feature_v(\d)_log_pca(\d+).*")) -> str:
    """Build a short version tag from the path of a numerical-feature file.

    The base name of ``name`` is matched (from its start) against ``pat`` and
    the first two captured groups are combined as ``"Ver<group1>_<group2>"``,
    e.g. ``"dir/gene_feature_v2_log_pca100.tsv"`` -> ``"Ver2_100"``.

    Args:
        name: Path of the feature file, or a missing value
            (``None``, NaN, ``pd.NA``).
        pat: Compiled regular expression with at least two groups.

    Returns:
        The version tag, or ``""`` when ``name`` is a missing value.

    Raises:
        ValueError: If the base name of ``name`` does not match ``pat``.
    """
    # pd.isna covers None, float NaN and pd.NA; strings are checked first so
    # that they are never handed to the missing-value test.
    if name is None or (not isinstance(name, str) and pd.isna(name)):
        return ""

    base_name = os.path.basename(name)
    match = pat.match(base_name)
    if match is None:
        # Name the offending input instead of failing later on `None.groups()`.
        raise ValueError(
            f"Cannot derive a data version from {name!r} ({type(name).__name__}): "
            f"base name {base_name!r} does not match {pat.pattern!r}"
        )

    version_digit, pca_digits = match.group(1, 2)
    return f"Ver{version_digit}_{pca_digits}"

In [6]:
# Substring of the "model__target_" value that switches each modality flag on.
modality_markers = {
    "text_modality": "Text",
    "graph_modality": "Graph",
    "numerical_modality": "Num",
}

# Add the derived columns to every result table, in place.
for df, dataset_name in [
    (single_hprd_df, "hprd"),
    (single_bioinfer_df, "bioinfer"),
    (cv_hprd_df, "hprd"),
    (cv_bioinfer_df, "bioinfer"),
]:
    df["numerical_version"] = df["dataset_feature_tsv_path"].apply(get_num_data_version)
    for modality_column, marker in modality_markers.items():
        # `marker` is bound as a default so each lambda keeps its own substring.
        df[modality_column] = df["model__target_"].apply(
            lambda target, marker=marker: marker in target
        )
    df["dataset"] = dataset_name

In [7]:
# Model names, in the order they are iterated when building result tables.
MODULE_CHOICES = [
    "TextModule", "GraphModule", "NumModule",
    "TextAndGraphModule", "TextAndNumModule", "GraphAndNumModule",
    "TextAndGraphAndNumModule",
]

# Base metric names; every metric has a "<name>_mean" and a "<name>_std" column.
METRICS = ["val/f1", "test/acc", "test/prec", "test/rec", "test/f1", "test/auroc"]

# Flattened column names: mean then std for each metric, in METRICS order.
METRICS_CHOICES = [
    f"{metric}_{stat}"
    for metric in METRICS
    for stat in ("mean", "std")
]
PIVOT_METRIC = "val/f1_mean"

# Columns kept in the "main part" exports: run description first, metrics last.
KEY = [
    "dataset",
    "text_modality",
    "graph_modality",
    "numerical_modality",
    "numerical_version",
    "model_with_intermediate_layer",
    "model_with_tensorfusion_network",
    "model_with_lowrank_tensorfusion_network",
    *METRICS_CHOICES,
]

In [8]:
# Save the KEY columns of every result table, with the row order reversed.
main_part_exports = {
    "main_part_single_hprd_run.csv": single_hprd_df,
    "main_part_single_bioinfer_run.csv": single_bioinfer_df,
    "main_part_cv_hprd_run.csv": cv_hprd_df,
    "main_part_cv_bioinfer_run.csv": cv_bioinfer_df,
}
for file_name, results_df in main_part_exports.items():
    main_part = results_df[KEY]
    main_part.iloc[::-1].to_csv(PUB_RES_ROOT / file_name)

In [9]:
def summarize_results(df: pd.DataFrame, MODULE_CHOICES=MODULE_CHOICES, PIVOT_METRIC=PIVOT_METRIC, METRICS=METRICS, with_std=True):
    """Print a Markdown table with the best run of every model variant.

    For each entry of ``MODULE_CHOICES`` the runs whose index label contains
    that module name (and starts with its first letter) are selected, and the
    run with the highest ``PIVOT_METRIC`` is reported. For
    ``"TextAndGraphAndNumModule"`` three rows are written: TensorFusion,
    LowrankTensorFusion and Concat (the runs flagged as neither of the two).

    Args:
        df: Result table indexed by run name. Needs ``PIVOT_METRIC`` and a
            ``"<metric>_mean"`` column per metric, a ``"<metric>_std"`` column
            per metric when ``with_std`` is true, and the
            ``model_with_tensorfusion_network`` /
            ``model_with_lowrank_tensorfusion_network`` flag columns.
        MODULE_CHOICES: Module names to report, in row order.
        PIVOT_METRIC: Column whose maximum picks the reported run.
        METRICS: Base metric names, one table column each.
        with_std: Append ``(±std)`` to every cell when true.

    Returns:
        None. The table is written to stdout; values are shown multiplied by
        100 with two decimals. A variant with no run (or with no non-NaN
        pivot value) gets ``n/a`` cells instead of raising.
    """
    def _runs_of(module):
        # Literal substring test: the module name is never read as a regex,
        # and missing index labels simply do not match.
        names = df.index.str
        selected = names.contains(module, regex=False, na=False) & names.startswith(module[0], na=False)
        return df[selected]

    def _best_run_cells(runs):
        # A positional index keeps the lookup a single row even when several
        # runs share the same name.
        candidates = runs.reset_index(drop=True)
        pivot_scores = candidates[PIVOT_METRIC].dropna()
        if pivot_scores.empty:
            return "".join("n/a | " for _ in METRICS)

        best_run = candidates.loc[pivot_scores.idxmax()]
        cells = []
        for metric in METRICS:
            mean = best_run[f"{metric}_mean"]
            cell = f"{100*mean:.2f} "
            if with_std:
                std = best_run[f"{metric}_std"]
                cell += f"(±{100*std:.2f}) | "
            else:
                # Leaves two spaces before the bar, exactly as in the tables
                # this function printed before.
                cell += " | "
            cells.append(cell)
        return "".join(cells)

    lines = [
        "| Model | " + " | ".join(METRICS) + " |",
        "| :--- | " + " | ".join(["---:" for _ in METRICS]) + " |",
    ]
    for module in MODULE_CHOICES:
        label = module.replace("Module", "").replace("And", " & ")
        runs = _runs_of(module)
        if module == "TextAndGraphAndNumModule":
            # `.eq(True)` treats missing flags as "not set".
            uses_tensorfusion = runs["model_with_tensorfusion_network"].eq(True)
            uses_lowrank = runs["model_with_lowrank_tensorfusion_network"].eq(True)
            variants = [
                (f"{label} (TensorFusion)", runs[uses_tensorfusion]),
                (f"{label} (LowrankTensorFusion)", runs[uses_lowrank]),
                # Every run without a tensor-fusion flag is a concat model.
                (f"{label} (Concat)", runs[~(uses_tensorfusion | uses_lowrank)]),
            ]
        else:
            variants = [(label, runs)]

        for row_label, variant_runs in variants:
            row = f"| {row_label} | " + _best_run_cells(variant_runs)
            lines.append(row.rstrip())

    # The text ends with a newline of its own, so print leaves a blank line
    # after the table.
    print("\n".join(lines) + "\n")

In [10]:
# Single-run tables; with_std=False leaves the "(±std)" part out of each cell.
print("Single run: Result for hprd", end="\n\n")
summarize_results(single_hprd_df, with_std=False)

print("-"*30, "Single run: Result for bioinfer", sep="\n", end="\n\n")
summarize_results(single_bioinfer_df, with_std=False)

Single run: Result for hprd

| Model | val/f1 | test/acc | test/prec | test/rec | test/f1 | test/auroc |
| :--- | ---: | ---: | ---: | ---: | ---: | ---: |
| Text | 81.82  | 97.31  | 93.33  | 70.00  | 80.00  | 97.56  |
| Graph | 0.00  | 85.00  | 0.00  | 0.00  | 0.00  | 47.71  |
| Num | 10.81  | 83.85  | 7.69  | 10.00  | 8.70  | 48.45  |
| Text & Graph | 85.71  | 96.92  | 87.50  | 70.00  | 77.78  | 98.31  |
| Text & Num | 85.71  | 94.62  | 71.43  | 50.00  | 58.82  | 95.75  |
| Graph & Num | 18.18  | 82.31  | 3.57  | 5.00  | 4.17  | 47.96  |
| Text & Graph & Num (TensorFusion) | 78.26  | 93.46  | 56.52  | 65.00  | 60.47  | nan  |
| Text & Graph & Num (LowrankTensorFusion) | 69.57  | 93.85  | 60.00  | 60.00  | 60.00  | nan  |
| Text & Graph & Num (Concat) | 90.91  | 96.54  | 82.35  | 70.00  | 75.68  | 98.00  |

------------------------------
Single run: Result for bioinfer

| Model | val/f1 | test/acc | test/prec | test/rec | test/f1 | test/auroc |
| :--- | ---: | ---: | ---: | ---: | ---

In [11]:
# Cross-validation tables; the default with_std=True adds "(±std)" to each cell.
print("CV run: Result for hprd", end="\n\n")
summarize_results(cv_hprd_df)

print("-"*30, "CV run: Result for bioinfer", sep="\n", end="\n\n")
summarize_results(cv_bioinfer_df)

CV run: Result for hprd

| Model | val/f1 | test/acc | test/prec | test/rec | test/f1 | test/auroc |
| :--- | ---: | ---: | ---: | ---: | ---: | ---: |
| Text | 78.14 (±4.80) | 97.60 (±1.37) | 80.92 (±18.47) | 70.85 (±16.59) | 73.48 (±12.15) | 94.45 (±8.05) |
| Graph | 10.60 (±5.43) | 67.05 (±20.25) | 2.57 (±2.28) | 27.13 (±22.83) | 4.68 (±4.14) | 51.51 (±8.02) |
| Num | 18.07 (±8.53) | 69.69 (±19.87) | 7.67 (±2.70) | 37.82 (±18.00) | 11.45 (±1.98) | 55.81 (±9.57) |
| Text & Graph | 74.68 (±4.99) | 97.36 (±0.75) | 75.39 (±10.89) | 67.78 (±19.24) | 68.99 (±9.81) | 92.31 (±7.11) |
| Text & Num | 78.47 (±7.98) | 97.60 (±0.99) | 80.30 (±15.33) | 69.33 (±16.20) | 72.18 (±11.11) | 96.24 (±5.15) |
| Graph & Num | 18.32 (±6.81) | 68.68 (±9.15) | 4.00 (±2.66) | 30.43 (±21.32) | 7.06 (±4.72) | 48.14 (±6.93) |
| Text & Graph & Num (TensorFusion) | 79.45 (±5.86) | 97.29 (±0.78) | 77.69 (±20.12) | 71.50 (±17.29) | 70.19 (±6.82) | nan (±nan) |
| Text & Graph & Num (LowrankTensorFusion) | 69.60 (±4.3