# Analyze Hyperparameter Sweeps
Investigate best hyperparameters across different experiments.

In [None]:
import pathlib
import sys
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import wandb

from src.util.definitions import PRED_DIR, DATA_ROOT
from utils import get_runs_as_list

In [None]:
# Fetch every W&B run whose job type is "hparam_optimization".
# get_runs_as_list comes from the local utils module; judging by the names it
# is unpacked into, it returns per-run summaries, configs, tags and names
# (TODO confirm against utils).
summary_list, config_list, tag_list, name_list = get_runs_as_list(
    filters={"jobType": "hparam_optimization"}
)

# Flatten the nested config/summary records into columns and join them
# side by side on the positional index, so every row holds one run.
run_df = pd.merge(
    pd.json_normalize(config_list),
    pd.json_normalize(summary_list),
    left_index=True,
    right_index=True,
)
run_df["tags"] = tag_list

run_df.head()

In [None]:
# see the unique configurations
# List cells are turned into tuples first because drop_duplicates() needs
# hashable values.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map).
architecture_cols = [
    "name",
    "tags",
    "decoder.global_features",
    "training.task",
    "experiment_id",
]
(
    run_df[architecture_cols]
    .applymap(lambda cell: tuple(cell) if isinstance(cell, list) else cell)
    .drop_duplicates()
    .sort_values(by="experiment_id")
)

In [None]:
# reduce df to columns with differences
# Count distinct values per column (lists are converted to tuples so they can
# be hashed). nunique() ignores missing values by default, so a column holding
# one value plus NaNs counts as constant, while an all-NaN column counts 0 and
# is kept.
nunique = run_df.applymap(
    lambda cell: tuple(cell) if isinstance(cell, list) else cell
).nunique()
cols_to_drop = nunique[nunique.eq(1)].index
run_df = run_df.drop(columns=cols_to_drop)

In [None]:
# Select all runs belonging to one hyperparameter sweep (experiment JG1486)
# and show which columns are available for them.
run_df[run_df["experiment_id"] == "JG1486"].columns

In [None]:
# Runs of the sweep to visualise; the parallel-coordinates plots below read df_plot.
df_plot = run_df[run_df["experiment_id"] == "JG1130"]

In [None]:
# FFN hparams
# Parallel-coordinates plot with one line per row of df_plot. Lines are colored
# by the "val/avgPrecision_macro" column, with the color range fixed to 0.85-1.0.
fig = go.Figure(
    data=go.Parcoords(
        line={
            "color": df_plot["val/avgPrecision_macro"],
            "colorscale": "Electric",
            "showscale": True,
            "cmin": 0.85,
            "cmax": 1.0,
        },
        dimensions=[
            {
                "range": [16, 512],
                "label": "dec.hidden_size",
                "values": df_plot["decoder.hidden_size"],
            },
            {
                "range": [1, 3],
                "label": "dec.depth",
                "values": df_plot["decoder.depth"],
            },
            {
                "range": [5e-5, 5e-3],
                "label": "learning_rate",
                "values": df_plot["optimizer.lr"],
                "tickformat": ".1e",
            },
            # Add more variables as needed
        ],
    )
)
fig.show()


In [None]:
# XGB hparams
# Parallel-coordinates plot with one line per row of df_plot. Lines are colored
# by the "val/avgPrecision_macro" column, with the color range fixed to 0.93-1.0.
# All four axes share the same layout, so they are generated from
# (label, column, lower bound, upper bound) tuples.
fig = go.Figure(
    data=go.Parcoords(
        line={
            "color": df_plot["val/avgPrecision_macro"],
            "colorscale": "Electric",
            "showscale": True,
            "cmin": 0.93,
            "cmax": 1.0,
        },
        dimensions=[
            {
                "range": [lower, upper],
                "label": axis_label,
                "values": df_plot[column],
                "tickformat": ".1e",
            }
            for axis_label, column, lower, upper in (
                ("gamma", "decoder.gamma", 1e-4, 10),
                ("learning_rate", "decoder.learning_rate", 1e-3, 1),
                ("L1 regularization", "decoder.reg_alpha", 1e-4, 10),
                ("L2 regularization", "decoder.reg_lambda", 1e-4, 10),
            )
        ],
    )
)
fig.show()


In [None]:
# just look at the best hparams now
# Query W&B again, this time for runs with job type "hparam_best", and rebuild
# run_df from those runs (the previous contents of run_df are overwritten).
summary_list, config_list, tag_list, name_list = get_runs_as_list(
    filters={"jobType": "hparam_best"}
)

run_df = pd.merge(
    pd.json_normalize(config_list),
    pd.json_normalize(summary_list),
    left_index=True,
    right_index=True,
)
run_df["tags"] = tag_list
run_df["run_id"] = name_list
# The run group is the run name with everything after its last underscore removed.
run_df["run_group"] = [run_name.rsplit("_", maxsplit=1)[0] for run_name in name_list]
# Plot label: "<model name>/<global features joined by '+'>", where the text
# "None" in the joined features is displayed as "CGR".
run_df["Model+Features"] = (
    run_df["name"]
    + "/"
    + run_df["decoder.global_features"].str.join("+").str.replace("None", "CGR")
)

run_df.head()

In [None]:
# only look at the hyperparameters now
# Keep the first row per experiment_id and restrict it to the tunable
# hyperparameter columns of all model families.
hparams = run_df.drop_duplicates(subset="experiment_id").loc[
    :,
    [
        "experiment_id",
        # all GNNs
        "encoder.depth",
        "encoder.hidden_size",
        # all NNs
        "decoder.depth",
        "decoder.hidden_size",
        "optimizer.lr",
        # XGB
        "decoder.gamma",
        "decoder.learning_rate",
        "decoder.reg_alpha",
        "decoder.reg_lambda",
        # Logistic Regression
        "decoder.C",
    ],
]
hparams.head()

In [None]:
# Show the hyperparameters of one experiment; squeeze() turns a single-row
# result into a Series so the values are listed one per line.
hparams.query("experiment_id == 'JG1131'").squeeze()

In [None]:
def get_hparams(experiment_id):
    """
    Return the tunable hyperparameters used by the "hparam_best" runs of an experiment.

    Queries the W&B project "jugoetz/synferm-predictions" for runs whose
    ``config.experiment_id`` equals `experiment_id` and whose job type is
    "hparam_best". The config of the first matching run is flattened to
    dot-separated keys and reduced to the hyperparameters that are tunable for
    that run's model type (``config["name"]``).

    Args:
        experiment_id (str): Experiment identifier, e.g. "JG1486".

    Returns:
        dict: Flattened config key (e.g. "decoder.depth") -> value, restricted
            to the tunable hyperparameters of the run's model type.

    Raises:
        IndexError: If no run matches the query.
        KeyError: If the run config has no "name" entry or the model name is
            not one of the known model types.
    """
    # All GNN models share the same tunable encoder/decoder/optimizer settings.
    gnn_hparams = [
        "encoder.depth",
        "encoder.hidden_size",
        "decoder.depth",
        "decoder.hidden_size",
        "optimizer.lr",
    ]
    flexible_hparams = {
        "D-MPNN": gnn_hparams,
        "GCN": gnn_hparams,
        "AttentiveFP": gnn_hparams,
        "GraphSAGE": gnn_hparams,
        "FFN": [
            "decoder.depth",
            "decoder.hidden_size",
            "optimizer.lr",
        ],
        "XGB": [
            "decoder.gamma",
            "decoder.learning_rate",
            "decoder.reg_alpha",
            "decoder.reg_lambda",
        ],
        "LogisticRegression": [
            "decoder.C",
        ],
    }
    api = wandb.Api()
    runs = api.runs(
        "jugoetz/synferm-predictions",
        filters={"$and": [{"config.experiment_id": experiment_id}, {"jobType": "hparam_best"}]},
    )

    # we use the fact that all runs with one experiment_id have identical
    # hparams, so only the first run is inspected
    try:
        first_run = runs[0]
    except IndexError as err:
        # Same exception type as before, but say which experiment had no runs.
        raise IndexError(
            f"no 'hparam_best' run found for experiment_id {experiment_id!r}"
        ) from err

    run_config = first_run.config
    tunable_keys = flexible_hparams[run_config["name"]]
    # json_normalize flattens nested dicts into "outer.inner" keys; the single
    # resulting record is the flattened config.
    flat_config = pd.json_normalize(run_config, sep=".").to_dict(orient="records")[0]

    return {key: value for key, value in flat_config.items() if key in tunable_keys}

In [None]:
# Spot-check: fetch the tunable hyperparameters of one experiment (queries W&B).
get_hparams("JG1486")

In [None]:
# Collect the tunable hyperparameters of experiments JG1100 ... JG1159, one
# record per experiment (each get_hparams call queries W&B).
data = [
    {"experiment_id": exp_id, **get_hparams(exp_id)}
    for exp_id in (f"JG11{i:02}" for i in range(60))
]
# Hyperparameters a model does not have become missing values in the frame;
# convert_dtypes() switches the columns to pandas' nullable dtypes.
hparams = pd.DataFrame(data).convert_dtypes()

In [None]:
# NOTE(review): hparams already went through convert_dtypes() when it was built
# from `data`, so this second call looks redundant; confirm before removing.
hparams = hparams.convert_dtypes()

In [None]:
# look at LogReg runs only
# Rows with a value for "decoder.C"; columns that contain any missing value
# for these rows are then dropped.
logreg = hparams.loc[hparams["decoder.C"].notna()].dropna(axis="columns")
logreg

In [None]:
# look at XGB runs only
# Rows with a value for "decoder.gamma"; columns that contain any missing value
# for these rows are then dropped.
xgb = hparams.loc[hparams["decoder.gamma"].notna()].dropna(axis="columns")
xgb

In [None]:
# Parallel-coordinates plot of the XGB hyperparameters, one line per row,
# grouped/colored by experiment_id.
pd.plotting.parallel_coordinates(xgb, class_column="experiment_id")

In [None]:
# look at FFN runs only
# FFN rows are those that have an optimizer learning rate but no encoder depth;
# columns that contain any missing value for these rows are then dropped.
ffn = hparams.loc[
    hparams["optimizer.lr"].notna() & hparams["encoder.depth"].isna()
].dropna(axis="columns")
ffn

In [None]:
# look at (all) GNN runs only
# Rows with a value for "encoder.depth"; columns that contain any missing value
# for these rows are then dropped.
gnn = hparams.loc[hparams["encoder.depth"].notna()].dropna(axis="columns")
gnn

In [None]:
# Parallel-coordinates plot of the GNN hyperparameters, one line per row,
# grouped/colored by experiment_id.
pd.plotting.parallel_coordinates(gnn, class_column="experiment_id")