# Analyze Hyperparameter Sweeps

In [1]:
import pathlib
import sys
# Make the project package importable for the `src...` import in this cell.
# NOTE: "__file__" is a quoted string literal here, not the module attribute, so
# `.absolute()` resolves it against the current working directory and `parents[1]`
# is the parent of the working directory. Presumably the notebook is expected to be
# run from a directory one level below the repository root — TODO confirm.
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb

from src.util.definitions import PRED_DIR, DATA_ROOT

In [28]:
def get_runs_as_list(project="jugoetz/synferm-predictions", filters=None):
    """
    Query runs through the wandb API and collect their metadata in parallel lists.

    Args:
        project (str): Project path handed to `wandb.Api().runs`.
        filters (dict, optional): Filter query handed to `wandb.Api().runs`.
            `None` (the default) is treated as an empty filter `{}`.

    Returns:
        tuple: Four lists with one entry per run, in this order:
            summary_list (the `_json_dict` of each run summary),
            config_list (each run config without keys starting with `_`),
            tag_list (the tags of each run),
            name_list (the name of each run).
    """
    # Use a fresh dict per call instead of a mutable default argument, so that no
    # dict object is shared between independent calls.
    if filters is None:
        filters = {}

    api = wandb.Api()
    runs = api.runs(project, filters=filters)
    summary_list, config_list, tag_list, name_list = [], [], [], []
    for run in runs:
        # .summary contains output keys/values for
        # metrics such as accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        tag_list.append(run.tags)

        # .name is the human-readable name of the run.
        name_list.append(run.name)
    # NOTE: tags come before names in the returned tuple.
    return summary_list, config_list, tag_list, name_list

In [10]:
# Fetch the metadata of all runs whose job type is "hparam_optimization".
summary_list, config_list, tag_list, name_list = get_runs_as_list(
    filters={"jobType": "hparam_optimization"},
)

In [11]:
# Flatten the (possibly nested) run configs into one row per run and attach each run's tags.
run_df = pd.json_normalize(config_list).assign(tags=tag_list)

In [12]:
# Preview the first five runs.
run_df.head(n=5)

Unnamed: 0,name,run_id,run_group,num_labels,accelerator,target_names,data_hash_key,experiment_id,label_binarizer,atom_feature_size,...,decoder.penalty,decoder.out_sigmoid,decoder.gamma,decoder.max_depth,decoder.reg_alpha,decoder.reg_lambda,decoder.n_estimators,decoder.learning_rate,decoder.colsample_bytree,tags
0,GraphSAGE,2023-10-23-182059_353419_fold6,2023-10-23-182059_353419,3,gpu,"[binary_A, binary_B, binary_C]",8ece12b7,JG1121,LabelBinarizer(),66,...,,,,,,,,,,[1D]
1,GraphSAGE,2023-10-23-182059_353419_fold5,2023-10-23-182059_353419,3,gpu,"[binary_A, binary_B, binary_C]",8ece12b7,JG1121,LabelBinarizer(),66,...,,,,,,,,,,[1D]
2,GraphSAGE,2023-10-23-182059_353419_fold4,2023-10-23-182059_353419,3,gpu,"[binary_A, binary_B, binary_C]",8ece12b7,JG1121,LabelBinarizer(),66,...,,,,,,,,,,[1D]
3,GraphSAGE,2023-10-23-182059_353419_fold3,2023-10-23-182059_353419,3,gpu,"[binary_A, binary_B, binary_C]",8ece12b7,JG1121,LabelBinarizer(),66,...,,,,,,,,,,[1D]
4,GraphSAGE,2023-10-23-182059_353419_fold2,2023-10-23-182059_353419,3,gpu,"[binary_A, binary_B, binary_C]",8ece12b7,JG1121,LabelBinarizer(),66,...,,,,,,,,,,[1D]


In [18]:
# List every distinct configuration once, ordered by experiment id.
architecture_cols = [
    "name",
    "tags",
    "decoder.global_features",
    "training.task",
    "experiment_id",
]
# NOTE(review): DataFrame.applymap is deprecated in newer pandas releases in favor of
# DataFrame.map — confirm the pandas version in use before switching.
(
    run_df[architecture_cols]
    # lists are unhashable; turn them into tuples so rows can be de-duplicated
    .applymap(lambda cell: tuple(cell) if isinstance(cell, list) else cell)
    .drop_duplicates()
    .sort_values(by="experiment_id")
)

Unnamed: 0,name,tags,decoder.global_features,training.task,experiment_id
3248,D-MPNN,"(0D,)","(None,)",multilabel,JG1100
3068,D-MPNN,"(1D,)","(None,)",multilabel,JG1101
2888,D-MPNN,"(2D,)","(None,)",multilabel,JG1102
2708,D-MPNN,"(3D,)","(None,)",multilabel,JG1103
1717,GCN,"(1D,)","(None,)",multilabel,JG1104
1537,GCN,"(2D,)","(None,)",multilabel,JG1105
2618,LogisticRegression,"(0D,)","(RDKit,)",multilabel,JG1106
2167,LogisticRegression,"(3D,)","(RDKit,)",multilabel,JG1107
1897,LogisticRegression,"(3D,)","(FP,)",multilabel,JG1108
2438,XGB,"(0D,)","(RDKit,)",multilabel,JG1109


In [19]:
# Drop every column that holds exactly one distinct value across all runs.
# Lists are converted to tuples first because they cannot be hashed for counting.
# NOTE(review): nunique() ignores NaN by default, so a column with one value plus
# missing entries is also dropped, while an all-NaN column is kept — confirm this is intended.
nunique = run_df.applymap(
    lambda cell: tuple(cell) if isinstance(cell, list) else cell
).nunique()
cols_to_drop = nunique.loc[nunique.eq(1)].index
run_df = run_df.drop(columns=cols_to_drop)

In [22]:
# Show the runs of a single hyperparameter sweep, identified by its experiment id.
run_df.loc[run_df["experiment_id"].eq("JG1111")]

Unnamed: 0,name,run_id,run_group,accelerator,data_hash_key,experiment_id,global_feature_size,decoder.depth,decoder.hidden_size,decoder.dropout_ratio,...,encoder.dropout_ratio,optimizer.lr,optimizer.lr_scheduler.lr_min,global_featurizer_state_dict_path,decoder.C,decoder.gamma,decoder.reg_alpha,decoder.reg_lambda,decoder.learning_rate,tags
1987,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,2.600161,0.013567,0.000100,0.028978,[3D]
1988,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,2.600161,0.013567,0.000100,0.028978,[3D]
1989,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,2.600161,0.013567,0.000100,0.028978,[3D]
1990,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,2.600161,0.013567,0.000100,0.028978,[3D]
1991,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,2.600161,0.013567,0.000100,0.028978,[3D]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2162,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,0.224077,1.702015,0.099603,0.014645,[3D]
2163,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,0.224077,1.702015,0.099603,0.014645,[3D]
2164,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,0.224077,1.702015,0.099603,0.014645,[3D]
2165,XGB,,,cpu,,JG1111,3072,,,,...,,,,,,0.224077,1.702015,0.099603,0.014645,[3D]


In [29]:
# Fetch the runs with job type "hparam_best" whose created_at is greater than 2023-10-17T00.
best_summary_list, best_config_list, best_tag_list, best_name_list = get_runs_as_list(
    filters={
        "$and": [
            {"created_at": {"$gt": "2023-10-17T00"}},
            {"jobType": "hparam_best"},
        ]
    }
)

In [30]:
# Combine the flattened configs and summaries of the best runs row by row (index-aligned),
# then attach each run's tags.
best_run_df = pd.merge(
    pd.json_normalize(best_config_list),
    pd.json_normalize(best_summary_list),
    left_index=True,
    right_index=True,
).assign(tags=best_tag_list)

In [31]:
# Display the table of best runs (configs merged with summaries, plus tags).
best_run_df

Unnamed: 0,name,run_id,run_group,num_labels,accelerator,target_names,data_hash_key,experiment_id,label_binarizer,atom_feature_size,...,test/loss_target_binary_C,train/loss_target_binary_B,val/loss_target_binary_B,val/loss_target_binary_A,test/loss_target_binary_B,val/loss_target_binary_C,train/loss_target_binary_A,train/loss_target_binary_C,test/loss_target_binary_A,tags
0,FFN,2023-10-22-144317_348898_fold8,2023-10-22-144317_348898,3,gpu,"[binary_A, binary_B, binary_C]",bf5dd14a,JG1120,LabelBinarizer(),66,...,,,,,,,,,,[3D]
1,FFN,2023-10-22-144317_348898_fold7,2023-10-22-144317_348898,3,gpu,"[binary_A, binary_B, binary_C]",bf5dd14a,JG1120,LabelBinarizer(),66,...,,,,,,,,,,[3D]
2,FFN,2023-10-22-144317_348898_fold6,2023-10-22-144317_348898,3,gpu,"[binary_A, binary_B, binary_C]",bf5dd14a,JG1120,LabelBinarizer(),66,...,,,,,,,,,,[3D]
3,FFN,2023-10-22-144317_348898_fold5,2023-10-22-144317_348898,3,gpu,"[binary_A, binary_B, binary_C]",bf5dd14a,JG1120,LabelBinarizer(),66,...,,,,,,,,,,[3D]
4,FFN,2023-10-22-144317_348898_fold4,2023-10-22-144317_348898,3,gpu,"[binary_A, binary_B, binary_C]",bf5dd14a,JG1120,LabelBinarizer(),66,...,,,,,,,,,,[3D]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,D-MPNN,2023-10-18-113347_394236_fold4,2023-10-18-113347_394236,3,gpu,"[binary_A, binary_B, binary_C]",3b8ba5c1,JG1100,LabelBinarizer(),66,...,,,,,,,,,,[0D]
185,D-MPNN,2023-10-18-113347_394236_fold3,2023-10-18-113347_394236,3,gpu,"[binary_A, binary_B, binary_C]",3b8ba5c1,JG1100,LabelBinarizer(),66,...,,,,,,,,,,[0D]
186,D-MPNN,2023-10-18-113347_394236_fold2,2023-10-18-113347_394236,3,gpu,"[binary_A, binary_B, binary_C]",3b8ba5c1,JG1100,LabelBinarizer(),66,...,,,,,,,,,,[0D]
187,D-MPNN,2023-10-18-113347_394236_fold1,2023-10-18-113347_394236,3,gpu,"[binary_A, binary_B, binary_C]",3b8ba5c1,JG1100,LabelBinarizer(),66,...,,,,,,,,,,[0D]


In [32]:
# Descriptive statistics of the validation loss of the best runs, per experiment.
(
    best_run_df
    .groupby("experiment_id")["val/loss"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
JG1100,9.0,0.227589,0.006173,0.219106,0.22436,0.227037,0.228742,0.23876
JG1101,9.0,0.371147,0.068625,0.271207,0.32789,0.382616,0.40382,0.4996
JG1102,9.0,0.449028,0.093637,0.261492,0.413861,0.455403,0.482736,0.618112
JG1103,9.0,0.469643,0.0424,0.412094,0.428986,0.480422,0.500554,0.521829
JG1104,9.0,0.588049,0.020449,0.550459,0.580384,0.588077,0.599272,0.614311
JG1105,9.0,0.60356,0.04211,0.50021,0.599311,0.616588,0.617894,0.645441
JG1106,9.0,0.309784,0.007382,0.299517,0.305575,0.3114,0.313191,0.321878
JG1107,9.0,0.479645,0.046854,0.410849,0.433613,0.496879,0.519891,0.528661
JG1108,9.0,0.460134,0.052531,0.347846,0.436043,0.462797,0.496642,0.522606
JG1109,9.0,0.216459,0.005935,0.207523,0.214061,0.216056,0.219318,0.225274
