# Setup

In [1]:
import torch

if 'google.colab' in str(get_ipython()):
  print('Running on Colab')
  running_on_colab = True
else:
  print('Not running on Colab')
  running_on_colab = False

if running_on_colab:
    print(torch.__version__)
    !pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
    !pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
    !pip install -q git+https://github.com/snap-stanford/deepsnap.git

    from google.colab import drive
    drive.mount('/content/drive')
    filepath = '/content/drive/MyDrive/GCNN/'
    data_folder = filepath+"graph_data/split_dataset/"
    feature_folder = data_folder
    experiments_folder = filepath+"experiments/hyperparameter_tuning/"
    import sys
    sys.path.append(filepath + "run_in_colab")

else:
    data_folder = "../../../data/processed/graph_data_nohubs/merged_types/split_dataset/"
    experiments_folder = "../../../reports/model_selection/hyperparameter_tuning/"
    feature_folder = "../../../data/processed/feature_data/"

Not running on Colab


In [6]:
from datetime import date, datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterSampler
from torch_geometric import seed_everything

import colab_utils
import sage_lsa

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Fixed seed for the whole notebook.
seed_everything(0)

In [3]:
seeds = [4, 5, 6, 7, 8]

# One entry per seed, in the same order as `seeds`; each entry is whatever
# colab_utils.load_data returns as its first value for that seed's folder.
data = []
for seed in seeds:
    seed_folder = data_folder+f"seed_{seed}/"
    datasets, node_map = colab_utils.load_data(seed_folder)
    data += [datasets]

# The full dataset is read from the folder of the last seed in the list.
last_seed_file = data_folder+f"seed_{seeds[-1]}/full_dataset.pt"
full_set = torch.load(last_seed_file)

In [4]:
def run_experiment(params, train_set, val_set,negative_sampler,feature_folder=feature_folder,
                   epochs=400, patience=10, delta=0.1):
    """Train one model with a single hyperparameter configuration and evaluate it.

    Args:
        params: mapping with the keys "first_layer_dropout", "lr" and
            "weight_decay".
        train_set: training graph; must expose ``metadata()`` and a
            ("gene_protein", "gda", "disease") edge store holding
            ``edge_label_index``.
        val_set: validation graph handed to the loss and test helpers.
        negative_sampler: object whose
            ``get_labeled_tensors(index, "corrupt_both")`` returns an
            ``(edge_label_index, edge_label)`` pair.
        feature_folder: folder passed to ``colab_utils.initialize_features``.
        epochs: maximum number of training epochs (must be at least 1).
        patience: forwarded to ``colab_utils.EarlyStopper``.
        delta: forwarded to ``colab_utils.EarlyStopper``.

    Returns:
        Tuple ``(val_auc, final_val_CE, final_train_CE)``: the value returned
        by ``colab_utils.test`` on the validation set after training, followed
        by the last recorded validation loss and the last recorded training
        loss.

    Raises:
        ValueError: if ``epochs`` is smaller than 1, since no loss would be
            recorded.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    supervision_edge = ("gene_protein", "gda", "disease")

    # Initialize node features
    train_set = colab_utils.initialize_features(train_set, "lsa_scaled", 32, feature_folder)
    val_set = colab_utils.initialize_features(val_set, "lsa_scaled", 32, feature_folder)

    # Rebind the result so the code is correct whether .to() moves in place
    # or returns a new object.
    train_set = train_set.to(device)
    val_set = val_set.to(device)

    # Initialize model
    model = sage_lsa.Model(train_set.metadata(), [supervision_edge], params["first_layer_dropout"])
    model = model.to(device)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"]
    )
    train_losses = []
    val_losses = []

    early_stopper = colab_utils.EarlyStopper(patience, delta)

    # NOTE(review): the loop below overwrites edge_label_index on train_set.
    # If initialize_features returns the caller's object rather than a copy,
    # a later call with the same dataset would start from the resampled index
    # instead of the original one - confirm against colab_utils.
    train_label_index = train_set[supervision_edge]["edge_label_index"]

    for epoch in range(epochs):
        # Resample negative supervision links every epoch
        new_train_label_index, new_train_label = negative_sampler.get_labeled_tensors(
            train_label_index.cpu(), "corrupt_both"
        )
        train_set[supervision_edge]["edge_label_index"] = new_train_label_index.to(device)
        train_set[supervision_edge]["edge_label"] = new_train_label.to(device)

        train_loss = colab_utils.train(model, optimizer, train_set)
        val_loss = colab_utils.get_val_loss(model, val_set)

        train_losses.append(train_loss)
        # The validation loss must be recorded: it is reported as
        # final_val_CE below, and an empty list would raise IndexError.
        val_losses.append(val_loss)

        if early_stopper.early_stop(val_loss):
            print("Early stopping")
            break

    val_auc = colab_utils.test(model, val_set)
    final_val_CE = val_losses[-1]
    final_train_CE = train_losses[-1]

    return val_auc, final_val_CE, final_train_CE

def run_multiple_seeds(datasets,experiment_params,negative_sampler):
    """Run the same experiment on every seed split and aggregate the metrics.

    Args:
        datasets: iterable of ``(train_data, val_data)`` pairs, one per seed.
        experiment_params: hyperparameters forwarded to ``run_experiment``.
        negative_sampler: sampler forwarded to ``run_experiment``.

    Returns:
        A list ``[mean_auc, val_CE, train_CE]`` where each element is a
        ``(mean, std)`` tuple computed across the seeds.
    """
    def _mean_and_std(values):
        # Summarise one metric across seeds as (mean, standard deviation).
        return (np.mean(values), np.std(values))

    # One (auc, validation loss, training loss) triple per seed, in input order.
    per_seed = [
        run_experiment(experiment_params, train_split, val_split, negative_sampler)
        for train_split, val_split in datasets
    ]

    auc_values = [auc for auc, _, _ in per_seed]
    val_ce_values = [val_ce for _, val_ce, _ in per_seed]
    train_ce_values = [train_ce for _, _, train_ce in per_seed]

    return [
        _mean_and_std(auc_values),
        _mean_and_std(val_ce_values),
        _mean_and_std(train_ce_values),
    ]

In [7]:
# Candidate values for each hyperparameter.
grid_params = {
    "first_layer_dropout": [0.2,0.3,0.5],
    "weight_decay": [1e-3,1e-2,1e-1],
    "lr": [1e-3,1e-2,1e-1],
}

# Randomly draw 10 configurations from the grid instead of trying all of them.
grid = ParameterSampler(grid_params, n_iter=10)

negative_sampler = colab_utils.NegativeSampler(full_set,("gene_protein","gda","disease"),full_set["gene_protein"]["degree_gda"],full_set["disease"]["degree_gda"])

# One record per configuration: the sampled hyperparameters plus the
# mean / std of every metric across seeds.
results = []
for params in grid:
    mean_auc, val_CE, train_CE = run_multiple_seeds(data,params,negative_sampler)
    # Build a fresh dict so the sampled configuration is not mutated, and
    # actually store it - otherwise the saved table would be empty.
    results.append({
        **params,
        "mean_auc": mean_auc[0],
        "std": mean_auc[1],
        "mean_val_CE": val_CE[0],
        "val_CE_std": val_CE[1],
        "mean_train_CE": train_CE[0],
        "train_CE_std": train_CE[1],
    })

results_df = pd.DataFrame(results)

# A datetime (not a date) is needed because the format includes hours and
# minutes; strftime must be called on an instance.
fdate = datetime.now().strftime("%d_%m_%y__%H_%M")

results_df.to_csv(experiments_folder + "random_grid_search" + fdate + ".csv")

KeyboardInterrupt: 