# Analyze Hyperparameter Sweeps

In [126]:
import pathlib
import sys
# NOTE: "__file__" is a quoted string literal here, i.e. just a placeholder file
# name, not the module attribute. Made absolute it is resolved against the current
# working directory, so parents[1] is the directory one level above the working
# directory. That directory is appended to sys.path, presumably so that the
# `src` package imported below resolves - confirm against the repo layout.
placeholder_path = pathlib.Path("__file__").absolute()
sys.path.append(str(placeholder_path.parents[1]))

from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb

from src.util.definitions import PRED_DIR, DATA_ROOT

In [127]:
# Open a client for the W&B public API and query the project for the runs
# whose job type is "hparam_optimization".
api = wandb.Api()
runs = api.runs(
    "jugoetz/synferm-predictions",
    filters={"jobType": "hparam_optimization"},
)

In [128]:
# Walk over the fetched runs once and collect four parallel lists
# (entry i of every list belongs to the same run).
summary_list = []
config_list = []
name_list = []
tag_list = []

for run in runs:
    # Output keys/values of the run (metrics such as accuracy);
    # ._json_dict is used to omit large files.
    summary_list.append(run.summary._json_dict)

    # Hyperparameters of the run, without the special keys that start with "_".
    config_list.append(
        dict(
            (key, value)
            for key, value in run.config.items()
            if not key.startswith('_')
        )
    )

    # Tags attached to the run.
    tag_list.append(run.tags)

    # Human-readable name of the run.
    name_list.append(run.name)

In [129]:
# One row per run: the config dicts are flattened into columns (nested keys
# become dotted names such as "decoder.depth"), and the run tags are attached
# as a trailing "tags" column.
run_df = pd.json_normalize(config_list).assign(tags=tag_list)

In [130]:
# inspect which (flattened) config columns are available in run_df
run_df.columns

Index(['name', 'run_id', 'run_group', 'num_labels', 'accelerator',
       'target_names', 'data_hash_key', 'label_binarizer', 'atom_feature_size',
       'bond_feature_size', 'global_feature_size', 'decoder.depth',
       'decoder.out_bias', 'decoder.activation', 'decoder.hidden_bias',
       'decoder.hidden_size', 'decoder.out_sigmoid', 'decoder.dropout_ratio',
       'decoder.global_features', 'decoder.global_features_file',
       'encoder.bias', 'encoder.depth', 'encoder.reaction',
       'encoder.activation', 'encoder.graph_type', 'encoder.aggregation',
       'encoder.featurizers', 'encoder.hidden_size', 'encoder.dropout_ratio',
       'training.task', 'training.max_epochs', 'optimizer.lr',
       'optimizer.lr_scheduler.epochs', 'optimizer.lr_scheduler.lr_min',
       'optimizer.lr_scheduler.lr_warmup_step',
       'optimizer.lr_scheduler.scheduler_name', 'optimizer.weight_decay',
       'decoder.C', 'decoder.solver', 'decoder.penalty', 'decoder.gamma',
       'decoder.max_depth

In [131]:
# Show the distinct configurations: one row per unique combination of the
# columns listed in architecture_cols, sorted by those same columns.
architecture_cols = ["name", "tags", "decoder.global_features", "training.task"]
(
    run_df[architecture_cols]
    # list-valued cells are unhashable; convert them to tuples so that
    # duplicates can be detected and the rows can be sorted
    .applymap(lambda cell: tuple(cell) if isinstance(cell, list) else cell)
    .drop_duplicates()
    .sort_values(by=architecture_cols)
)

Unnamed: 0,name,tags,decoder.global_features,training.task
1710,D-MPNN,"(0D,)","(None,)",multilabel
1530,D-MPNN,"(1D,)","(None,)",multilabel
1350,D-MPNN,"(2D,)","(None,)",multilabel
1170,D-MPNN,"(3D,)","(None,)",multilabel
180,GCN,"(1D,)","(None,)",multilabel
0,GCN,"(2D,)","(None,)",multilabel
1080,LogisticRegression,"(0D,)","(RDKit,)",multilabel
360,LogisticRegression,"(3D,)","(FP,)",multilabel
630,LogisticRegression,"(3D,)","(RDKit,)",multilabel
900,XGB,"(0D,)","(RDKit,)",multilabel


In [91]:
# Reduce run_df to the columns that differ between runs: count the distinct
# values per column (lists are turned into tuples first so they are hashable)
# and drop every column with exactly one distinct value.
# NOTE(review): nunique() ignores NaN by default, so a column holding a single
# value in some runs and NaN in the others is also counted as 1 and dropped,
# while an all-NaN column (count 0) is kept - confirm this is intended.
nunique = (
    run_df
    .applymap(lambda cell: tuple(cell) if isinstance(cell, list) else cell)
    .nunique()
)
cols_to_drop = nunique.index[nunique == 1]
run_df = run_df.drop(columns=cols_to_drop)

In [92]:
# Select all runs that belong to one hyperparameter sweep, identified by
# model name, global-feature setting and first tag.
is_gcn = run_df["name"] == "GCN"
# global features given as a list whose only entry is the string "None"
has_no_global_features = run_df["decoder.global_features"].apply(
    lambda features: features == ["None"]
)
first_tag_is_1d = run_df["tags"].str.get(0) == "1D"

run_df.loc[is_gcn & has_no_global_features & first_tag_is_1d]

Unnamed: 0,name,run_id,run_group,accelerator,global_feature_size,decoder.depth,decoder.hidden_size,decoder.dropout_ratio,decoder.global_features,encoder.depth,...,encoder.hidden_size,encoder.dropout_ratio,optimizer.lr,optimizer.lr_scheduler.lr_min,decoder.C,decoder.gamma,decoder.reg_alpha,decoder.reg_lambda,decoder.learning_rate,tags
180,GCN,2023-10-12-031835_949707_fold8,2023-10-12-031835_949707,gpu,0,3.0,88.0,9.192166e-07,[None],3.0,...,799.0,9.192166e-07,0.000050,0.000005,,,,,,[1D]
181,GCN,2023-10-12-031835_949707_fold7,2023-10-12-031835_949707,gpu,0,3.0,88.0,9.192166e-07,[None],3.0,...,799.0,9.192166e-07,0.000050,0.000005,,,,,,[1D]
182,GCN,2023-10-12-031835_949707_fold6,2023-10-12-031835_949707,gpu,0,3.0,88.0,9.192166e-07,[None],3.0,...,799.0,9.192166e-07,0.000050,0.000005,,,,,,[1D]
183,GCN,2023-10-12-031835_949707_fold5,2023-10-12-031835_949707,gpu,0,3.0,88.0,9.192166e-07,[None],3.0,...,799.0,9.192166e-07,0.000050,0.000005,,,,,,[1D]
184,GCN,2023-10-12-031835_949707_fold4,2023-10-12-031835_949707,gpu,0,3.0,88.0,9.192166e-07,[None],3.0,...,799.0,9.192166e-07,0.000050,0.000005,,,,,,[1D]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,GCN,2023-10-11-173138_393712_fold4,2023-10-11-173138_393712,gpu,0,2.0,124.0,9.192166e-07,[None],3.0,...,63.0,9.192166e-07,0.001545,0.000154,,,,,,[1D]
356,GCN,2023-10-11-173138_393712_fold3,2023-10-11-173138_393712,gpu,0,2.0,124.0,9.192166e-07,[None],3.0,...,63.0,9.192166e-07,0.001545,0.000154,,,,,,[1D]
357,GCN,2023-10-11-173138_393712_fold2,2023-10-11-173138_393712,gpu,0,2.0,124.0,9.192166e-07,[None],3.0,...,63.0,9.192166e-07,0.001545,0.000154,,,,,,[1D]
358,GCN,2023-10-11-173138_393712_fold1,2023-10-11-173138_393712,gpu,0,2.0,124.0,9.192166e-07,[None],3.0,...,63.0,9.192166e-07,0.001545,0.000154,,,,,,[1D]
