# Try committee model

We found that the following three models generally perform without significant differences:
- D-MPNN/CGR
- XGB/FP
- LogReg/FP

If we use these three models as a committee (i.e., average the predicted probabilities of all three), do we get better predictions?

In [46]:
import pathlib
import sys
# NOTE: "__file__" is a string literal here, not the `__file__` variable.
# Path("__file__").absolute() therefore evaluates to <cwd>/__file__, and parents[1] is the
# parent of the current working directory, which is what gets appended to sys.path.
# Presumably this makes the `src` package importable when the notebook is run from its
# own directory -- TODO confirm.
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

from sklearn.metrics import average_precision_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb

from src.util.definitions import PRED_DIR, DATA_ROOT
from utils import get_runs_as_list

In [23]:
# Ground-truth records; the prediction files are joined onto this frame by row index further down.
true_records_csv = DATA_ROOT / "synferm_dataset_2023-09-05_40018records.csv"
df_true = pd.read_csv(true_records_csv)

In [5]:
# Collect the runs returned for jobType "hparam_best" and flatten their (nested) config
# and summary dicts into one table with one row per run.
summary_list, config_list, tag_list, name_list = get_runs_as_list(
    filters={"jobType": "hparam_best"}
)
configs_flat = pd.json_normalize(config_list)
summaries_flat = pd.json_normalize(summary_list)
df_all = configs_flat.merge(summaries_flat, left_index=True, right_index=True)

df_all["tags"] = tag_list
df_all["run_id"] = name_list
# run group = run name with everything after the last "_" removed
df_all["run_group"] = [run_name.rsplit("_", maxsplit=1)[0] for run_name in name_list]

# human-readable label "<model name>/<features joined by +>", with "None" shown as "CGR"
feature_label = df_all["decoder.global_features"].str.join("+").str.replace("None", "CGR")
df_all["Model+Features"] = df_all["name"] + "/" + feature_label

In [6]:
# list, for every tag (i.e. split), the set of experiment ids that have runs
experiments_by_tag = df_all.groupby("tags")[["experiment_id"]].agg(set)
for tag, experiment_ids in experiments_by_tag["experiment_id"].items():
    print(tag, "-->", experiment_ids)

('0D',) --> {'JG1128', 'JG1109', 'JG1117', 'JG1160', 'JG1106', 'JG1100', 'JG1116', 'JG1135', 'JG1115', 'JG1131'}
('0D_1.25',) --> {'JG1147', 'JG1141', 'JG1159', 'JG1153'}
('0D_10',) --> {'JG1138', 'JG1150', 'JG1156', 'JG1144'}
('0D_2.5',) --> {'JG1140', 'JG1152', 'JG1146', 'JG1158'}
('0D_20',) --> {'JG1143', 'JG1137', 'JG1149', 'JG1155'}
('0D_40',) --> {'JG1148', 'JG1136', 'JG1142', 'JG1154'}
('0D_5',) --> {'JG1145', 'JG1151', 'JG1157', 'JG1139'}
('1D',) --> {'JG1123', 'JG1118', 'JG1125', 'JG1104', 'JG1129', 'JG1126', 'JG1121', 'JG1101', 'JG1132'}
('2D',) --> {'JG1119', 'JG1122', 'JG1102', 'JG1112', 'JG1105', 'JG1127', 'JG1124', 'JG1130', 'JG1133'}
('3D',) --> {'JG1111', 'JG1134', 'JG1103', 'JG1113', 'JG1120', 'JG1107', 'JG1108', 'JG1110', 'JG1114'}


In [54]:
# Experiments that form the committee, keyed by data split.
# Switch splits by changing COMMITTEE_SPLIT instead of commenting lines in and out.
COMMITTEE_EXP_IDS = {
    "1D": ["JG1101", "JG1129", "JG1125"],  # D-MPNN/CGR, XGB/FP, FFN/FP for 1D (LogReg/FP will be JG1186 when trained)
    "2D": ["JG1102", "JG1130", "JG1112"],  # D-MPNN/CGR, XGB/FP, LogReg/FP for 2D
    "3D": ["JG1103", "JG1111", "JG1108"],  # D-MPNN/CGR, XGB/FP, LogReg/FP for 3D
}

# we look at the 1D split first
COMMITTEE_SPLIT = "1D"
exp_ids = COMMITTEE_EXP_IDS[COMMITTEE_SPLIT]
df_exps = df_all.loc[df_all.experiment_id.isin(exp_ids)]

# Report ids that matched no run, so a typo or a missing experiment does not
# silently shrink the committee.
missing_exp_ids = sorted(set(exp_ids) - set(df_exps.experiment_id))
if missing_exp_ids:
    print(f"No runs found for experiment(s): {missing_exp_ids}")

In [55]:
# Compute the macro-averaged average precision for every selected run, separately for
# its validation and test predictions.
#
# Scores are collected in two parallel forms:
#   - avg_precision: flat list of scores in iteration order (run by run, "val" before
#     "test"); kept so that code relying on the plain list keeps working
#   - avg_precision_records / df_avg_precision: one entry per score, labeled with run_id,
#     experiment_id and split, so val/test scores and the individual runs can be told apart
avg_precision = []
avg_precision_records = []
for _, exp in df_exps.iterrows():
    run_pred_dir = PRED_DIR / exp.run_id
    pred_files = {
        "val": run_pred_dir / "val_preds_last.csv",
        "test": run_pred_dir / "test_preds_last.csv",
    }
    task = exp["training.task"]

    for name, file in pred_files.items():
        # first we check if predicted values are available
        if not file.is_file():
            print(f"{name} predictions not found for {exp.run_id} ({exp.experiment_id})")
            continue

        # import predictions and combine with ground truth (joined on the row index of df_true)
        df = pd.read_csv(file, index_col="idx").merge(df_true, how="left", left_index=True, right_index=True)

        # extract predictions
        if task in ["multilabel", "binary"]:
            y_prob = df[[f"pred_{k}" for k in range(len(exp["target_names"]))]].to_numpy()
            y_hat = (y_prob > 0.5).astype(np.int_)
            y_true = df[exp["target_names"]].to_numpy()
        elif task == "multiclass":
            # NOTE(review): `le` is not defined in any cell visible above this one;
            # presumably a LabelEncoder fitted on the "major_A-C" classes -- confirm it is
            # created before this cell runs, otherwise this branch raises NameError.
            y_prob = df[[f"pred_{k}" for k in range(len(le.classes_))]].to_numpy()
            y_hat = np.argmax(y_prob, axis=1)
            y_true = le.transform(df["major_A-C"].to_numpy())
        else:
            raise ValueError("Unexpected run_type")

        # calculate additional metrics
        if task == "multilabel":
            score = average_precision_score(y_true, y_prob, average="macro")
            avg_precision.append(score)
            avg_precision_records.append(
                {
                    "run_id": exp.run_id,
                    "experiment_id": exp.experiment_id,
                    "split": name,
                    "avg_precision": score,
                }
            )
        elif task == "binary":
            # currently not needed
            ...
        elif task == "multiclass":
            # currently not needed
            ...

# labeled view of the scores; explicit columns keep the frame well-formed even if no
# prediction file was found
df_avg_precision = pd.DataFrame(
    avg_precision_records,
    columns=["run_id", "experiment_id", "split", "avg_precision"],
)

val predictions not found for 2023-10-25-135320_203870_fold8 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold8 (JG1129)
val predictions not found for 2023-10-25-135320_203870_fold7 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold7 (JG1129)
val predictions not found for 2023-10-25-135320_203870_fold6 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold6 (JG1129)
val predictions not found for 2023-10-25-135320_203870_fold5 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold5 (JG1129)
val predictions not found for 2023-10-25-135320_203870_fold4 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold4 (JG1129)
val predictions not found for 2023-10-25-135320_203870_fold3 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold3 (JG1129)
val predictions not found for 2023-10-25-135320_203870_fold2 (JG1129)
test predictions not found for 2023-10-25-135320_203870_fold2 (JG1129)
val predictio

In [40]:
# Flat list of macro average precision scores collected by the loop above (multilabel runs
# only). Validation and test scores of all runs with available prediction files are mixed
# together in iteration order: run by run, "val" before "test".
avg_precision

[0.8606858092918435,
 0.939935396961079,
 0.9426246803901227,
 0.6814280094921755,
 0.8718325828935735,
 0.7668198187203904,
 0.9078859310546591,
 0.8893935999766921,
 0.8986881863030569,
 0.8705959829509929,
 0.899278411641179,
 0.9166458771759424,
 0.819170328677186,
 0.84844715283895,
 0.9238185780887608,
 0.8505691027622345,
 0.8236051543044508,
 0.9009267951065478,
 0.8431993605491148,
 0.9322388093134286,
 0.9295661740412372,
 0.7930076255136319,
 0.898223033144368,
 0.7435044059802235,
 0.8845574774689853,
 0.8964382073648984,
 0.8873453813268656,
 0.8638725342304555,
 0.9078190376000951,
 0.9165069250661829,
 0.8677211437984745,
 0.8590571179849403,
 0.9309577431735795,
 0.8803839873246893,
 0.8617059920625434,
 0.8892900297074432]