# Setup

In [1]:
import torch

if 'google.colab' in str(get_ipython()):
  print('Running on Colab')
  running_on_colab = True
else:
  print('Not running on Colab')
  running_on_colab = False

if running_on_colab:
    print(torch.__version__)
    !pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
    !pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
    !pip install -q git+https://github.com/snap-stanford/deepsnap.git
    !pip install pyarrow
    !pip install fastparquet

    from google.colab import drive
    drive.mount('/content/drive')
    filepath = '/content/drive/MyDrive/GCNN/'
    data_folder = filepath+"graph_data/"
    experiments_folder = filepath+"experiments/merged_types_experiment/"

else:
    data_folder = "../../../data/processed/graph_data_nohubs/merged_types/"
    experiments_folder = "../../../data/experiments/design_space_merged_experiment/"

Not running on Colab


In [2]:
import base_model, training_utils
import pandas as pd
import itertools
import datetime
from sklearn.metrics import roc_auc_score

# Load data

In [3]:
path = data_folder + "split_dataset/"
original_train_data, original_val_data = training_utils.load_data(path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Experiments

## Run a single experiment

In [5]:
def run_experiment(params, train_set, val_set):

    # Initialize node features
    train_set = training_utils.initialize_features(
        train_set, params["feature_type"], params["feature_dim"])
    val_set = training_utils.initialize_features(
        val_set, params["feature_type"], params["feature_dim"])
    train_set.to(device)
    val_set.to(device)

    # Initialize model
    model = base_model.base_model(
        params, train_set.metadata(), params["supervision_types"])
    model = model.to(device)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=params['lr'], weight_decay=params["weight_decay"])
    train_losses = []
    val_losses = []
    train_scores = []
    val_scores = []

    metric = roc_auc_score
    epochs = params["epochs"]

    early_stopper = training_utils.EarlyStopper(
        params["patience"], params["delta"])
    for epoch in range(epochs):
        train_loss = training_utils.train(model, optimizer, train_set)
        val_loss = training_utils.get_val_loss(model, val_set)

        train_score = training_utils.test(model, train_set, metric)
        val_score = training_utils.test(model, val_set, metric)

        train_losses.append(train_loss)
        train_scores.append(train_score)

        val_scores.append(val_score)
        val_losses.append(val_loss)

        if early_stopper.early_stop(val_loss):
            print("Early stopping")
            break

    val_auc = training_utils.test(model, val_set, roc_auc_score)
    curve_data = [train_losses, val_losses, train_scores, val_scores]

    return val_auc, model, curve_data

## Run a grid of experiments

In [6]:
from sklearn.model_selection import ParameterGrid
def perform_hyperparameter_search(param_grid, train_set, val_set):
  
  default = {
      "hidden_channels":[32],
      "conv_type":["SAGEConv"],
      "batch_norm": [True],
      "dropout":[0.1],
      "micro_aggregation":["mean"],
      "macro_aggregation":["mean"],
      "layer_connectivity":[None],
      "L2_norm":[False],
      "pre_process_layers":[0],
      "msg_passing_layers":[2],
      "post_process_layers":[0],
      "normalize_output":[False],
      "jumping_knowledge":[False],

      "feature_dim":[10],
      "feature_type":["random"],
      "supervision_types":[[('gene_protein', 'gda', 'disease')]],

      'weight_decay': [1e-3],
      'lr': [0.001],
      'epochs':[400],
      "patience":[10],
      "delta":[0.1]
  }

  for arg in default:
    if arg not in param_grid:
      param_grid[arg] = default[arg]

  grid = ParameterGrid(param_grid)

  auc_results = []
  models = []

  for eid,params in enumerate(grid):
    # Launch a training experiment using the current set of parameters
    val_auc,current_model,curve_data = run_experiment(
                   params,
                   train_set,
                   val_set)
    
    params["auc"] = val_auc
    params["curve_data"] = curve_data

    auc_results.append(params)
    models.append(current_model)

    print(f"Validation AUC: {round(val_auc,2)}. Iteration: {eid+1} of {grid.__len__()}")

  return auc_results, models

## Run multiple grids of experiments

In [21]:
def run_multiple_grids(grid_list,train_data,val_data):
    all_results = []
    all_models = []

    for i,grid in enumerate(grid_list):
        print(f"Experiment grid {i+1} of {len(grid_list)}")
        experiment_results, models = perform_hyperparameter_search(grid, train_data,val_data)
        results_df = pd.DataFrame(experiment_results)
        all_results.append(results_df)
        all_models.append(models)

    final_results = pd.concat(all_results).reset_index(drop=True)
    final_models = list(itertools.chain(*all_models))

    date = datetime.datetime.now()
    fdate = date.strftime("%d_%m_%y__%H_%M_%S")
    fname = experiments_folder+"experiment_"+fdate+".parquet"
    final_results.to_parquet(fname)

    for i,model in enumerate(final_models):
        model_name = f"experiment_{i}"
        training_utils.save_model(model,experiments_folder,model_name)

In [38]:
default_grid = {
    "hidden_channels":[32],
    "conv_type":["SAGEConv"],
    "batch_norm": [True],
    "dropout":[0.1],
    "micro_aggregation":["sum"],
    "macro_aggregation":["sum"],
    "layer_connectivity":[None],
    "L2_norm":[True],
    "pre_process_layers":[0],
    "msg_passing_layers":[2],
    "post_process_layers":[1],
    "normalize_output":[False],
    "jumping_knowledge":[False],

    "feature_dim":[10],
    "feature_type":["ones"],
    "supervision_types":[[('gene_protein', 'gda', 'disease')]],

    'weight_decay': [1e-3],
    'lr': [0.001],
    'epochs':[400],
    "patience":[10],
    "delta":[0.1]
}

grid_list = []

grid_1 = {"hidden_channels":[32,64,128],"micro_aggregation":["sum","mean","max"],"macro_aggregation":["sum","mean","max"],"feature_dim":[10,50,100],"feature_type":["ones","random"]}
grid_list.append(default_grid|grid_1)

grid_2 = {"hidden_channels":[32],"conv_type":["SAGEConv","GATConv"],"micro_aggregation":["sum","mean","max"],"macro_aggregation":["sum","mean","max"],"feature_type":["ones","random"]}
grid_list.append(default_grid|grid_2)

grid_3 = {"layer_connectivity":[None,"skipsum"],"msg_passing_layers":[2,3,4,5],"jumping_knowledge":[False,True]}
grid_list.append(default_grid|grid_3)

grid_4 = {"L2_norm":[True,False],"conv_type":["SAGEConv","GATConv"],"normalize_output":[True,False]}
grid_list.append(default_grid|grid_4)

grid_5 = {"pre_process_layers":[0,1,2],"post_process_layers":[0,1,2],"msg_passing_layers":[0,1,2],"feature_type":["ones","random"]}
grid_list.append(default_grid|grid_5)

grid_6 = {"batch_norm":[True,False],"dropout":[0,0.1,0.01]}
grid_list.append(default_grid|grid_6)

num_experiments = sum([len(list(itertools.chain(*grid.values()))) for grid in grid_list])

In [39]:
print(f"Running {num_experiments} experiments ...")
run_multiple_grids(grid_list,original_train_data,original_val_data)

Running 159 experiments ...
Experiment 1 of 6


KeyboardInterrupt: 