In [1]:
import torch
import numpy
import sklearn
import random
import time
import torch.nn.functional as F
from IPython.display import Javascript
from torch.nn import Linear
from sklearn import preprocessing
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GraphConv, global_mean_pool
import matplotlib.pyplot as plt

# Seed the random number generators so that repeated runs are comparable.
# NOTE: `random.seed` has to be *called*. Assigning a number to it (as this
# line previously did) replaces the function with an int, seeds nothing, and
# makes `random.seed(...)` unusable for the rest of the session.
SEED = 88888888
random.seed(SEED)        # Python RNG (used by random.shuffle on the data list)
numpy.random.seed(SEED)  # legacy global NumPy RNG
torch.manual_seed(SEED)  # default torch generator

In [2]:
# --- Devices -----------------------------------------------------------------
cuda0 = torch.device('cuda:0')
cpu = torch.device('cpu')
# Use the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs (moving the model to 'cuda:0' fails without CUDA).
device = cuda0 if torch.cuda.is_available() else cpu

# --- Input files -------------------------------------------------------------
# Common root of all input files; the three paths below resolve to exactly the
# same strings as before, only the shared prefix is factored out.
DATA_DIR = '/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training'
edges_fn = f'{DATA_DIR}/input_rewired_network/random_rewired_edges.txt'
node_features_fn = f'{DATA_DIR}/input/node_features.txt'
graph_targets_fn = f'{DATA_DIR}/input/graph_targets.txt'

# --- Hyper-parameters --------------------------------------------------------
INPUT_CHANNELS = 1     # features per node fed to the first GraphConv layer
OUTPUT_CHANNELS = 26   # size of the final Linear layer; presumably the number of target classes -- TODO confirm against graph_targets_fn
HIDDEN_CHANNELS = 64   # width of every hidden GraphConv layer
BATCH_SIZE = 64        # graphs per mini-batch in the main training loop
EPOCHS = 500 #set this to 200 - 2000
# When True the batch-size benchmark runs instead of data preparation.
# NOTE: the benchmark/data-preparation cell flips this flag after it runs, and
# the training cell is guarded by the flipped value.
BENCHMARKING = False

In [3]:
def read_reactome_graph(edges_fn, node_features_fn):
    """Read a whitespace-delimited edge list into two parallel index lists.

    Every non-blank line of ``edges_fn`` must start with two integer node ids.
    The ids are 1-based (R convention) and are shifted to 0-based Python
    indices. Any further columns on a line are ignored.

    Args:
        edges_fn: Path of the edge-list text file.
        node_features_fn: Not used by this function; kept in the signature so
            existing callers keep working.

    Returns:
        A tuple ``(edge_v1, edge_v2)`` of equally long lists, where
        ``edge_v1[i]`` and ``edge_v2[i]`` are the 0-based endpoints of edge i.

    Raises:
        ValueError: If an id field is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    edge_v1 = []
    edge_v2 = []

    # The context manager closes the handle deterministically; iterating over
    # a bare open() call leaves that to the garbage collector.
    with open(edges_fn, 'r') as edges_file:
        for line in edges_file:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            edge_v1.append(int(fields[0]) - 1)  # R index -> Python index
            edge_v2.append(int(fields[1]) - 1)  # R index -> Python index

    return edge_v1, edge_v2

In [4]:
# Parse the edge list once; both endpoint lists are reused by later cells.
edge_v1, edge_v2 = read_reactome_graph(edges_fn, node_features_fn)
# Show both endpoint lists, one list per output line.
print(edge_v1, edge_v2, sep='\n')

[5502, 5061, 1504, 1869, 3218, 234, 4952, 6660, 2, 1020, 6425, 7645, 7137, 2486, 6527, 1845, 1837, 57, 1662, 6228, 674, 6961, 2874, 7823, 878, 6222, 2217, 1950, 352, 6159, 3994, 3492, 6372, 5727, 3998, 3163, 7240, 3499, 3712, 6282, 6269, 6013, 4048, 3174, 2436, 7105, 6585, 7484, 332, 2177, 6880, 1563, 2160, 143, 4850, 6179, 6849, 3466, 1883, 2348, 1360, 577, 7264, 3073, 6255, 3719, 3723, 3648, 3173, 943, 7015, 7146, 1018, 7277, 7591, 4809, 7424, 5897, 2815, 2059, 3741, 2881, 368, 1822, 3146, 2131, 5922, 1990, 5474, 1981, 6406, 3498, 1265, 3067, 84, 7367, 4176, 2900, 2316, 5188, 5445, 2288, 1490, 6013, 392, 7838, 6518, 2273, 5849, 3315, 2826, 2422, 3124, 5, 7409, 7588, 3639, 5988, 1698, 3539, 40, 3239, 5794, 4467, 752, 3899, 7184, 3046, 4236, 4997, 4073, 7161, 920, 1428, 1470, 3192, 682, 4063, 3926, 604, 2177, 470, 2508, 4918, 4951, 5356, 127, 5645, 1787, 1368, 4286, 1955, 1099, 757, 4561, 3697, 6978, 1472, 6467, 1769, 3517, 3910, 5789, 4979, 3177, 394, 4001, 4371, 1428, 5259, 1327, 375

In [5]:
def build_scratch_loader(batch_size):
  """Build a shuffled loader over the MUTAG graphs with one node feature each.

  For every graph only column 1 of its node-feature matrix is kept, reshaped
  to a single-column float tensor; labels and edges are passed through.

  Args:
    batch_size: Number of graphs per mini-batch.

  Returns:
    A ``DataLoader`` (shuffle=True) over the rebuilt ``Data`` objects.
  """
  dataset = TUDataset(root='data/TUDataset', name='MUTAG')
  data_list = []
  for graph_obj in dataset:
    # graph_obj.x is expected to be a tensor here; torch.tensor(<tensor>)
    # copy-constructs with a UserWarning, so take the independent copy with
    # detach().clone() instead and then cast.
    x = graph_obj.x[:, 1].detach().clone().to(torch.float)
    x = x.unsqueeze(1)  # (n_nodes,) -> (n_nodes, 1)
    y = graph_obj.y
    edge_index = graph_obj.edge_index
    data_list.append(Data(x = x, y = y, edge_index = edge_index))

  loader = DataLoader(data_list,batch_size=batch_size,shuffle=True)

  return loader

In [6]:
def build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn):
    """Build one ``Data`` graph per row of the node-feature file.

    All graphs share the same ``edge_index`` tensor; they differ only in their
    node features (one row of the feature file each) and their label.

    Args:
        edge_v1: Source endpoint of every edge (0-based node indices).
        edge_v2: Target endpoint of every edge, parallel to ``edge_v1``.
        node_features_fn: Numeric text file read with ``numpy.loadtxt``; row i
            holds one value per node for graph i.
        graph_targets_fn: Text file with one label per graph, read as strings
            and integer-encoded with ``LabelEncoder``. The fitted encoder is
            local to this function and is not returned.

    Returns:
        List of ``Data`` objects with ``x`` of shape (n_nodes, 1), ``y`` of
        shape (1,), and the shared ``edge_index``.

    Raises:
        ValueError: If the number of feature rows and labels differ, or if an
            edge refers to a node index outside the feature columns.
    """
    edge_index = torch.tensor([edge_v1, edge_v2], dtype = torch.long)
    # ndmin=2 keeps a one-sample file as shape (1, n_nodes) instead of 1-D,
    # which would break the row indexing below.
    feature_v = numpy.loadtxt(node_features_fn, ndmin=2)
    # atleast_1d: a file with a single label would otherwise load as 0-d.
    target_v = numpy.atleast_1d(numpy.loadtxt(graph_targets_fn,dtype=str,delimiter=","))

    target_encoder = sklearn.preprocessing.LabelEncoder()
    target_v = target_encoder.fit_transform(target_v)

    print(len(feature_v))
    print(len(target_v))

    # Without this check surplus labels are silently ignored and missing
    # labels only surface as an IndexError part-way through the loop.
    if len(feature_v) != len(target_v):
        raise ValueError(
            f'{len(feature_v)} feature rows in {node_features_fn} but '
            f'{len(target_v)} targets in {graph_targets_fn}'
        )

    # Every edge endpoint must address an existing node (feature column).
    n_nodes = feature_v.shape[1]
    if edge_index.numel() > 0 and int(edge_index.max()) >= n_nodes:
        raise ValueError(
            f'edge list refers to node index {int(edge_index.max())} but each '
            f'graph only has {n_nodes} nodes'
        )

    data_list = []
    for row_idx in range(len(feature_v)):
        features = feature_v[row_idx,:]
        x = torch.tensor(features,dtype=torch.float)
        x = x.unsqueeze(1)  # (n_nodes,) -> (n_nodes, 1): one feature per node
        y = torch.tensor([target_v[row_idx]])
        data_list.append(Data(x = x, y = y, edge_index = edge_index))

    return data_list

def build_reactome_graph_loader(data_list,batch_size):
    """Wrap ``data_list`` in a shuffling ``DataLoader`` with the given batch size."""
    return DataLoader(data_list, shuffle=True, batch_size=batch_size)

In [7]:

class GNN(torch.nn.Module):
    """Graph classifier: three GraphConv layers, mean-pool readout, linear head.

    Layer widths are INPUT_CHANNELS -> hidden -> hidden -> hidden ->
    OUTPUT_CHANNELS, with the input/output sizes taken from the module-level
    constants.
    """

    def __init__(self, hidden_channels):
        """Create the layers; ``hidden_channels`` is the width of each conv layer."""
        super().__init__()

        self.conv1 = GraphConv(INPUT_CHANNELS, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, OUTPUT_CHANNELS)

    def forward(self, x, edge_index, batch, edge_weight=None):
        """Return one row of unnormalised class scores per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Edge endpoints handed to every GraphConv layer.
            batch: Graph id of every node, used by the pooling step.
            edge_weight: Optional edge weights forwarded to the conv layers.
        """
        # Node embeddings: ReLU follows the first two convolutions only.
        h = self.conv1(x, edge_index, edge_weight).relu()
        h = self.conv2(h, edge_index, edge_weight).relu()
        h = self.conv3(h, edge_index, edge_weight)

        # Readout: average node embeddings per graph -> [batch_size, hidden_channels]
        pooled = global_mean_pool(h, batch)

        # Classifier head; dropout is only active while self.training is True.
        pooled = F.dropout(pooled, training=self.training)
        return self.lin(pooled)

In [8]:
# Build the network on the target device, plus the optimiser and loss that
# train() and test() read as module-level globals.
model = GNN(hidden_channels=HIDDEN_CHANNELS).to(device)
optimizer = torch.optim.Adam(params=model.parameters())
criterion = torch.nn.CrossEntropyLoss()

def train(loader,device):
  """Run one optimisation pass over ``loader`` with the global model.

  Uses the module-level ``model``, ``criterion`` and ``optimizer``; one
  parameter update is made per mini-batch. Returns nothing.

  Args:
    loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``, ``y``.
    device: Device the batch tensors are moved to before the forward pass.
  """
  model.train()  # training mode (dropout active)

  for graphs in loader:
    # Move only the tensors the model and the loss need onto the device.
    node_feats, edges, graph_ids, labels = (
        tensor.to(device)
        for tensor in (graphs.x, graphs.edge_index, graphs.batch, graphs.y)
    )

    logits = model(node_feats, edges, graph_ids)  # forward pass
    batch_loss = criterion(logits, labels)

    batch_loss.backward()   # accumulate gradients
    optimizer.step()        # apply the update
    optimizer.zero_grad()   # clear gradients before the next batch

def test(loader,device):
  """Return the classification accuracy of the global ``model`` on ``loader``.

  Args:
    loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
      ``y``, and a ``dataset`` attribute whose length is the sample count.
    device: Device the batch tensors are moved to before the forward pass.

  Returns:
    Fraction of graphs whose arg-max prediction equals the label.

  Raises:
    ZeroDivisionError: If ``loader.dataset`` is empty.
  """
  model.eval()  # evaluation mode (dropout disabled)

  correct = 0
  # Evaluation needs no gradients: no_grad() avoids building the autograd
  # graph for every batch, which saves memory and time. The loss that used to
  # be computed here was never read, so it is no longer evaluated.
  with torch.no_grad():
    for batch in loader:  # Iterate in batches over the training/test dataset.
      x = batch.x.to(device)
      e = batch.edge_index.to(device)
      b = batch.batch.to(device)
      y = batch.y.to(device)
      out = model(x, e, b)  # Perform a single forward pass.
      pred = out.argmax(dim=1)  # Use the class with the highest score.
      correct += int((pred == y).sum())  # Check against ground-truth labels.
  return correct / len(loader.dataset)  # Derive ratio of correct predictions.

In [9]:
# Accumulates one accuracy value per line; written to disk by the training cell.
acc_str = ''
if BENCHMARKING:
  # --- Batch-size benchmark: time one training pass and one evaluation pass
  # for each candidate batch size. ---
  test_b_sizes = [1,8,16,32,64,128]

  # The graphs do not depend on the batch size, so parse the input files once
  # instead of once per loop iteration.
  data_list = build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn)

  def _timed(fn, *args):
    """Call fn(*args) and return (result, elapsed_seconds).

    CUDA kernels are launched asynchronously, so the device is synchronised
    before each clock reading; otherwise work queued by one phase would be
    charged to the next one.
    """
    if device.type == 'cuda':
      torch.cuda.synchronize(device)
    started = time.perf_counter()
    result = fn(*args)
    if device.type == 'cuda':
      torch.cuda.synchronize(device)
    return result, time.perf_counter() - started

  for test_b_size in test_b_sizes:
    print(f'Executing training routine with batch size = {test_b_size}')
    test_batch_size_data_loader = build_reactome_graph_loader(data_list,test_b_size)

    _, training_time = _timed(train, test_batch_size_data_loader, device)
    train_acc, test_time = _timed(test, test_batch_size_data_loader, device)

    acc_str += f'{train_acc:.4f}\n'
    print(f'Batch Size: {test_b_size}')
    print(f'Training Time: {training_time}')
    print(f'Test Time: {test_time}')
    print(f'Accuracy: {train_acc}')

  # The flag is flipped on exit. The training cell is guarded by the same flag,
  # so it is skipped after a benchmark run.
  BENCHMARKING = False
else:
  # --- Data preparation for the full training run. ---
  data_list = build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn)
  random.shuffle(data_list)

  # Flipped on exit so that the training cell (guarded by this flag) runs.
  # NOTE: re-running this cell toggles the flag again.
  BENCHMARKING = True

6288
6288


In [10]:
# Full training run. The guard relies on the previous cell having flipped
# BENCHMARKING to True after preparing `data_list`.
if BENCHMARKING:
  fold_size = 911  # only referenced by the disabled k-fold split below
  fold = 'full_dataset'
  # Disabled k-fold split (expects an integer `fold`), kept for reference:
  #   train_data_list = data_list[:fold_size * (fold - 1)] + data_list[fold_size * fold:]
  #   test_data_list = data_list[fold_size * (fold - 1):fold_size * fold]
  train_data_list = data_list

  print(f'Number of training graphs: {len(train_data_list)}')
  #print(f'Number of test graphs: {len(test_data_list)}')
  train_data_loader = build_reactome_graph_loader(train_data_list,BATCH_SIZE)
  #test_data_loader = build_reactome_graph_loader(test_data_list,BATCH_SIZE)
  for epoch in range(EPOCHS):
    train(train_data_loader,device)
    train_acc = test(train_data_loader,device)
    #test_acc = test(test_data_loader,device)
    # One accuracy per line. The newline used to live in the commented-out
    # test-accuracy suffix, so all values were fused into a single string.
    acc_str += f'{train_acc:.4f}\n'
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')#', Test Acc: {test_acc:.4f}')

  # Both artefacts go into the same directory.
  output_dir = "/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training/GNN/rewired_network"

  training_acc_fn = F"graph_classification_acc_rewired10_{fold}.txt"
  path = F"{output_dir}/{training_acc_fn}"
  with open(path, 'w') as writefile:
      writefile.write(acc_str)

  model_save_name = F"trained_pytorch_model_rewired10_fold_{fold}.pt"
  # The '/' separator was missing here, which glued the file name onto the
  # directory name and saved the model one level up under a wrong name.
  path = F"{output_dir}/{model_save_name}"
  torch.save(model.state_dict(), path)
  print(F"model saved as {path}")

Number of training graphs: 6288




Epoch: 000, Train Acc: 0.4823
Epoch: 001, Train Acc: 0.5259
Epoch: 002, Train Acc: 0.5337
Epoch: 003, Train Acc: 0.5253
Epoch: 004, Train Acc: 0.5383
Epoch: 005, Train Acc: 0.5623
Epoch: 006, Train Acc: 0.5701
Epoch: 007, Train Acc: 0.5829
Epoch: 008, Train Acc: 0.5784
Epoch: 009, Train Acc: 0.5903
Epoch: 010, Train Acc: 0.5984
Epoch: 011, Train Acc: 0.5824
Epoch: 012, Train Acc: 0.6037
Epoch: 013, Train Acc: 0.5989
Epoch: 014, Train Acc: 0.6056
Epoch: 015, Train Acc: 0.6053
Epoch: 016, Train Acc: 0.6059
Epoch: 017, Train Acc: 0.6069
Epoch: 018, Train Acc: 0.6070
Epoch: 019, Train Acc: 0.5918
Epoch: 020, Train Acc: 0.6088
Epoch: 021, Train Acc: 0.6048
Epoch: 022, Train Acc: 0.6089
Epoch: 023, Train Acc: 0.6128
Epoch: 024, Train Acc: 0.6169
Epoch: 025, Train Acc: 0.6175
Epoch: 026, Train Acc: 0.6204
Epoch: 027, Train Acc: 0.6193
Epoch: 028, Train Acc: 0.6143
Epoch: 029, Train Acc: 0.6066
Epoch: 030, Train Acc: 0.6247
Epoch: 031, Train Acc: 0.6244
Epoch: 032, Train Acc: 0.6233
Epoch: 033

Epoch: 274, Train Acc: 0.8119
Epoch: 275, Train Acc: 0.8015
Epoch: 276, Train Acc: 0.8057
Epoch: 277, Train Acc: 0.8098
Epoch: 278, Train Acc: 0.7971
Epoch: 279, Train Acc: 0.8162
Epoch: 280, Train Acc: 0.8117
Epoch: 281, Train Acc: 0.8085
Epoch: 282, Train Acc: 0.8163
Epoch: 283, Train Acc: 0.8026
Epoch: 284, Train Acc: 0.8195
Epoch: 285, Train Acc: 0.8080
Epoch: 286, Train Acc: 0.8211
Epoch: 287, Train Acc: 0.8090
Epoch: 288, Train Acc: 0.8047
Epoch: 289, Train Acc: 0.8115
Epoch: 290, Train Acc: 0.8152
Epoch: 291, Train Acc: 0.8061
Epoch: 292, Train Acc: 0.8165
Epoch: 293, Train Acc: 0.7977
Epoch: 294, Train Acc: 0.8170
Epoch: 295, Train Acc: 0.8022
Epoch: 296, Train Acc: 0.7915
Epoch: 297, Train Acc: 0.8152
Epoch: 298, Train Acc: 0.8179
Epoch: 299, Train Acc: 0.8216
Epoch: 300, Train Acc: 0.8219
Epoch: 301, Train Acc: 0.8173
Epoch: 302, Train Acc: 0.8206
Epoch: 303, Train Acc: 0.8271
Epoch: 304, Train Acc: 0.8217
Epoch: 305, Train Acc: 0.8103
Epoch: 306, Train Acc: 0.8228
Epoch: 307