# Setup

In [1]:
import torch

if 'google.colab' in str(get_ipython()):
  print('Running on Colab')
  running_on_colab = True
else:
  print('Not running on Colab')
  running_on_colab = False

if running_on_colab:
    print(torch.__version__)
    !pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
    !pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
    !pip install -q git+https://github.com/snap-stanford/deepsnap.git
    !pip install pyarrow
    !pip install fastparquet

    from google.colab import drive
    drive.mount('/content/drive')
    filepath = '/content/drive/MyDrive/GCNN/'
    data_folder = filepath+"graph_data/"
    experiments_folder = filepath+"experiments/merged_types_experiments/"

    import sys
    sys.path.append(filepath + "run_in_colab")

else:
    data_folder = "../../../data/processed/graph_data_nohubs/merged_types/"
    experiments_folder = "../../../data/experiments/design_space_merged_experiment/"

Not running on Colab


In [2]:
import base_model, colab_utils
import pandas as pd
import itertools
import datetime
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from torch_geometric import seed_everything

# Load data

In [3]:
import re

# Fix the global seed through PyG's seed_everything before anything random runs.
seed = 5
seed_everything(seed)

# Inputs and outputs of this run are kept in a seed-specific sub-folder.
seed_suffix = f"seed_{seed}/"
path = data_folder + "split_dataset/" + seed_suffix

# Make this cell safe to re-run: drop any "seed_<n>/" suffix left by a
# previous execution before appending the current one, so the folder never
# grows into ".../seed_5/seed_5/" (or ".../seed_5/seed_7/" after changing seed).
experiments_folder = re.sub(r"(?:seed_\d+/)+$", "", experiments_folder) + seed_suffix

# load_data returns a pair whose first element holds the (train, validation)
# datasets; the second element is not used in this notebook.
datasets, _ = colab_utils.load_data(path)
original_train_data, original_val_data = datasets

# The full graph is loaded separately from the train/validation splits.
full_set = torch.load(path + "full_dataset.pt")

# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Experiments

## Run a single experiment

In [58]:
def run_experiment(params, train_set, val_set,negative_sampler):
    """Train one model configuration and score it on the validation split.

    Reads the notebook-level globals ``path`` (only for ``"lsa"`` features)
    and ``device``.

    Args:
        params: Mapping with one value per hyperparameter. Keys read directly
            here: ``"feature_type"``, ``"feature_dim"``,
            ``"supervision_types"``, ``"lr"``, ``"weight_decay"``,
            ``"epochs"``, ``"patience"``, ``"delta"``, ``"sample_epochs"``
            and ``"sample_ratio"``. The whole mapping is also handed to
            ``base_model.base_model``.
        train_set: Training graph. Must expose ``metadata()``, ``to()`` and
            the ``("gene_protein", "gda", "disease")`` edge store with an
            ``edge_label_index`` entry (presumably a PyG ``HeteroData`` --
            TODO confirm).
        val_set: Validation graph, same kind of object as ``train_set``.
        negative_sampler: Object exposing
            ``get_labeled_tensors(positive_index, "corrupt_both")``, which
            must return ``(edge_label_index, edge_label)``.

    Returns:
        Tuple ``(val_auc, model, curve_data)``: the value of
        ``colab_utils.test`` on the validation set after training, the
        trained model, and
        ``[train_losses, val_losses, train_scores, val_scores]`` with one
        entry per completed epoch.

    Raises:
        KeyError: If ``params`` lacks one of the keys listed above.
    """
    import copy

    # Work on shallow copies. The loop below overwrites the training
    # supervision labels; without a copy those writes (and the device move)
    # could land on the caller's dataset, so a later experiment would treat
    # the previous run's sampled negatives as part of its positive pool.
    train_set = copy.copy(train_set)
    val_set = copy.copy(val_set)

    # Initialize node features; only "lsa" features need the data path.
    feature_args = [params["feature_type"], params["feature_dim"]]
    if params["feature_type"] == "lsa":
        feature_args.append(path)
    train_set = colab_utils.initialize_features(train_set, *feature_args)
    val_set = colab_utils.initialize_features(val_set, *feature_args)

    train_set = train_set.to(device)
    val_set = val_set.to(device)

    # Initialize model
    model = base_model.base_model(
        params, train_set.metadata(), params["supervision_types"]
    )
    model = model.to(device)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"]
    )
    train_losses = []
    val_losses = []
    train_scores = []
    val_scores = []

    epochs = params["epochs"]
    gda_edge = ("gene_protein", "gda", "disease")

    early_stopper = colab_utils.EarlyStopper(params["patience"], params["delta"])

    # Pool of supervision edges, captured once before any resampling.
    train_label_index = train_set[gda_edge]["edge_label_index"]
    num_candidates = train_label_index.shape[1]
    sample_size = round(params["sample_ratio"] * num_candidates)

    for epoch in tqdm(range(epochs)):
        # Resample supervision links every k epochs
        if epoch % params["sample_epochs"] == 0:
            # NOTE(review): randint draws with replacement, so the same edge
            # can appear more than once in a sample -- confirm this is
            # intended (torch.randperm would give a duplicate-free subset).
            sample_index = torch.randint(high=num_candidates, size=(sample_size,))
            positive_sample = train_label_index[:, sample_index]

            new_train_label_index, new_train_label = negative_sampler.get_labeled_tensors(
                positive_sample, "corrupt_both"
            )
            train_set[gda_edge]["edge_label_index"] = new_train_label_index
            train_set[gda_edge]["edge_label"] = new_train_label

        train_loss = colab_utils.train(model, optimizer, train_set)
        val_loss = colab_utils.get_val_loss(model, val_set)

        train_score = colab_utils.test(model, train_set)
        val_score = colab_utils.test(model, val_set)

        train_losses.append(train_loss)
        train_scores.append(train_score)

        val_scores.append(val_score)
        val_losses.append(val_loss)

        # Early stopping is driven by the validation loss.
        if early_stopper.early_stop(val_loss):
            print("Early stopping")
            break

    # Scored again after the loop so a value exists even when epochs == 0.
    val_auc = colab_utils.test(model, val_set)
    curve_data = [train_losses, val_losses, train_scores, val_scores]

    return val_auc, model, curve_data

## Run a grid of experiments

In [59]:
from sklearn.model_selection import ParameterGrid
def perform_hyperparameter_search(param_grid, train_set, val_set,negative_sampler):
    """Run ``run_experiment`` once for every combination in ``param_grid``.

    Args:
        param_grid: Mapping from hyperparameter name to the list of values to
            try. Missing hyperparameters are filled in from the defaults
            below. The mapping passed in is not modified.
        train_set: Training dataset, forwarded unchanged to every run.
        val_set: Validation dataset, forwarded unchanged to every run.
        negative_sampler: Sampler forwarded unchanged to every run.

    Returns:
        Tuple ``(auc_results, models)``. ``auc_results`` holds one dict per
        run: the parameter combination plus ``"auc"`` and ``"curve_data"``.
        ``models`` holds the trained models in the same order.
    """
    default = {
        "hidden_channels": [32],
        "conv_type": ["SAGEConv"],
        "batch_norm": [True],
        "dropout": [0.1],
        "micro_aggregation": ["mean"],
        "macro_aggregation": ["mean"],
        "layer_connectivity": [None],
        "L2_norm": [False],
        "pre_process_layers": [0],
        "msg_passing_layers": [2],
        "post_process_layers": [0],
        "normalize_output": [False],
        "jumping_knowledge": [False],

        "feature_dim": [10],
        "feature_type": ["random"],
        "supervision_types": [[('gene_protein', 'gda', 'disease')]],

        'weight_decay': [1e-3],
        'lr': [0.001],
        'epochs': [400],
        "patience": [10],
        "delta": [0.1],

        # run_experiment indexes these two keys unconditionally, so a grid
        # that omits them used to fail with KeyError. The values mirror the
        # ones used by the notebook's single-configuration test grid.
        "sample_epochs": [10],
        "sample_ratio": [0.8],
    }

    # Caller-supplied entries win over defaults. Building a new dict keeps
    # the caller's grid untouched, so it can be reused or inspected later.
    full_grid = {**default, **param_grid}

    grid = ParameterGrid(full_grid)
    total_runs = len(grid)

    auc_results = []
    models = []

    for eid, params in enumerate(grid):
        # Launch a training experiment using the current set of parameters
        val_auc, current_model, curve_data = run_experiment(
            params,
            train_set,
            val_set,
            negative_sampler,
        )

        # Store the outcome next to the parameters that produced it.
        params["auc"] = val_auc
        params["curve_data"] = curve_data

        auc_results.append(params)
        models.append(current_model)

        print(f"Validation AUC: {round(val_auc,2)}. Iteration: {eid+1} of {total_runs}")

    return auc_results, models

In [60]:
# Single-configuration grid: every hyperparameter has exactly one candidate.
test_grid = dict(
    # Model hyperparameters (the full parameter set is handed to base_model).
    hidden_channels=[32],
    conv_type=["SAGEConv"],
    batch_norm=[True],
    dropout=[0.1],
    micro_aggregation=["mean"],
    macro_aggregation=["max"],
    layer_connectivity=[None],
    L2_norm=[True],
    pre_process_layers=[1],
    msg_passing_layers=[4],
    post_process_layers=[1],
    normalize_output=[False],
    jumping_knowledge=[True],
    heads=[2],
    # Node features and supervised edge type.
    feature_dim=[10],
    feature_type=["lsa"],
    supervision_types=[[("gene_protein", "gda", "disease")]],
    # Optimizer, epoch budget and early stopping.
    weight_decay=[1e-3],
    lr=[0.001],
    epochs=[400],
    patience=[10],
    delta=[0.1],
    # Supervision-link resampling: period in epochs and fraction drawn.
    sample_epochs=[10],
    sample_ratio=[0.8],
)

# The sampler is built from the full graph, the supervised edge type and the
# "degree_gda" attribute of each endpoint node type.
negative_sampler = colab_utils.NegativeSampler(
    full_set,
    ("gene_protein", "gda", "disease"),
    full_set["gene_protein"]["degree_gda"],
    full_set["disease"]["degree_gda"],
)
auc, models = perform_hyperparameter_search(
    test_grid, original_train_data, original_val_data, negative_sampler
)

100%|██████████| 400/400 [03:46<00:00,  1.76it/s]

Validation AUC: 0.82. Iteration: 1 of 1





# Leftover scratch experiments (TODO: tidy up or remove)

This run used 4 layers but without the resampling (surprising result).

In [22]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,max,mean,4,1,1,0.6


In [7]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,4,1,1,0.819


This run used 4 layers with jumping knowledge (JK) and no pre-process layer.

In [17]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,4,1,0,0.669


This run used 4 layers with jumping knowledge (JK), without skipsum.

In [15]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,4,1,1,0.826


In [13]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,4,1,1,0.817


In [11]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,4,1,1,0.795


In [9]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,2,1,1,0.788


In [7]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,2,1,1,0.794


In [8]:
# Tabulate the search results and list them best-AUC-first, keeping only the
# aggregation and layer-count columns next to the score.
aver = pd.DataFrame(auc)
aver.sort_values(by="auc", ascending=False)[
    [
        "macro_aggregation",
        "micro_aggregation",
        "msg_passing_layers",
        "post_process_layers",
        "pre_process_layers",
        "auc",
    ]
]

Unnamed: 0,macro_aggregation,micro_aggregation,msg_passing_layers,post_process_layers,pre_process_layers,auc
0,sum,mean,2,1,1,0.794


## Run multiple grids of experiments

In [6]:
def run_multiple_grids(grid_list,train_data,val_data,sampler=None):
    """Run several hyperparameter grids in sequence, saving after each one.

    After every grid the accumulated results table is rewritten to a single
    timestamped parquet file and the state dict of each newly trained model
    is written to ``model_<index>_<timestamp>.pth``. Both go to the
    notebook-level ``experiments_folder``.

    Args:
        grid_list: Sequence of parameter grids, each accepted by
            ``perform_hyperparameter_search``.
        train_data: Training dataset forwarded to every search.
        val_data: Validation dataset forwarded to every search.
        sampler: Negative sampler forwarded to every search. When omitted,
            the notebook-level ``negative_sampler`` is used so existing
            three-argument calls keep working.

    Returns:
        None. Results are only written to disk.
    """
    import os

    # perform_hyperparameter_search requires a sampler as fourth argument;
    # the previous three-argument call raised TypeError.
    if sampler is None:
        sampler = negative_sampler

    all_results = []
    all_models = []

    date = datetime.datetime.now()
    fdate = date.strftime("%d_%m_%y__%H_%M_%S")
    df_name = experiments_folder+"experiment_"+fdate+".parquet"

    # Make sure the output folder exists before the first save.
    out_dir = os.path.dirname(df_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for grid_idx, grid in enumerate(grid_list):
        print(f"Experiment grid {grid_idx+1} of {len(grid_list)}")
        experiment_results, models = perform_hyperparameter_search(
            grid, train_data, val_data, sampler
        )
        all_results.append(pd.DataFrame(experiment_results))

        # Rewrite the full table each time so an interrupted session still
        # leaves every finished grid on disk.
        current_results = pd.concat(all_results).reset_index(drop=True)

        print(f"Saving results from grid {grid_idx+1} ...")
        current_results.to_parquet(df_name)

        # Save only the models trained in this grid. Numbering continues
        # from the models already saved, so file names match what a full
        # re-save of every model would produce.
        for model_idx, model in enumerate(models, start=len(all_models)):
            fname = f"model_{model_idx}_{fdate}"
            torch.save(model.state_dict(), f"{experiments_folder}{fname}.pth")
        all_models.extend(models)

In [75]:
# Baseline configuration. Every experiment grid below overrides a few of
# these entries and inherits the rest.
default_grid = {
    "hidden_channels": [32],
    "conv_type": ["SAGEConv"],
    "batch_norm": [True],
    "dropout": [0.1],
    "micro_aggregation": ["sum"],
    "macro_aggregation": ["sum"],
    "layer_connectivity": [None],
    "L2_norm": [True],
    "pre_process_layers": [0],
    "msg_passing_layers": [2],
    "post_process_layers": [1],
    "normalize_output": [False],
    "jumping_knowledge": [False],
    "heads": [2],

    "feature_dim": [10],
    "feature_type": ["ones"],
    "supervision_types": [[('gene_protein', 'gda', 'disease')]],

    'weight_decay': [1e-3],
    'lr': [0.001],
    'epochs': [400],
    "patience": [10],
    "delta": [0.1],

    # run_experiment reads both keys from every parameter set; without them
    # each run fails with KeyError. Values mirror the notebook's test grid.
    "sample_epochs": [10],
    "sample_ratio": [0.8],

    "experiment_name": ["default_experiment"]
}

grid_list = []

grid_1 = {"experiment_name": ["sage_aggr_type"],
          "micro_aggregation": ["sum", "mean", "max"],
          "macro_aggregation": ["sum", "mean", "max"],
          "feature_type": ["random", "ones"]
          }
grid_list.append(default_grid | grid_1)

# NOTE(review): this grid used to be identical to grid_1 apart from its name,
# so the SAGEConv sweep ran twice. conv_type is now set to match the
# experiment name -- confirm this is the intended comparison.
grid_2 = {"experiment_name": ["GAT_aggr_type"],
          "conv_type": ["GATConv"],
          "micro_aggregation": ["sum", "mean", "max"],
          "macro_aggregation": ["sum", "mean", "max"],
          "feature_type": ["random", "ones"]}
grid_list.append(default_grid | grid_2)

grid_3 = {"experiment_name": ["channels_vs_feature"],
          "hidden_channels": [32, 64, 128],
          "feature_dim": [10, 50, 100],
          "feature_type": ["ones", "random"]}
grid_list.append(default_grid | grid_3)

grid_4 = {"experiment_name": ["layers_connectivity"],
          "layer_connectivity": [None, "skipsum"],
          "msg_passing_layers": [2, 3, 4, 5],
          "jumping_knowledge": [False, True],
          }
grid_list.append(default_grid | grid_4)

grid_5 = {"experiment_name": ["normalization"],
          "L2_norm": [True, False],
          "normalize_output": [True, False],
          "conv_type": ["SAGEConv", "GATConv"]
          }
grid_list.append(default_grid | grid_5)

grid_6 = {"experiment_name": ["pre_post_process"],
          "pre_process_layers": [0, 1, 2],
          "post_process_layers": [0, 1, 2],
          "normalize_output": [True, False],
          "feature_type": ["ones", "random"]}
grid_list.append(default_grid | grid_6)

grid_7 = {"experiment_name": ["regularization"],
          "batch_norm": [True, False],
          "dropout": [0, 0.1, 0.01],
          "conv_type": ["SAGEConv", "GATConv"]
          }
# This grid was defined but never queued, so it was silently skipped.
grid_list.append(default_grid | grid_7)

# Total number of runs: for each grid, the product of its candidate-list
# lengths, summed over all queued grids.
num_experiments = int(sum(
    np.prod([len(val) for val in grid.values()]) for grid in grid_list))

In [69]:
# Launch every queued grid. run_multiple_grids writes the results table and
# the model weights to disk after each grid completes.
print(f"Running {num_experiments} experiments ...")
run_multiple_grids(
    grid_list,
    original_train_data,
    original_val_data,
)

Running 2 experiments ...
Experiment grid 1 of 2
Validation AUC: 0.9. Iteration: 1 of 1
Saving results from grid 1 ...
Experiment grid 2 of 2
Validation AUC: 0.9. Iteration: 1 of 1
Saving results from grid 2 ...
