In [1]:
import torch
import numpy
import sklearn
import random
import time
import torch.nn.functional as F
from IPython.display import Javascript
from torch.nn import Linear
from sklearn import preprocessing
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GraphConv, global_mean_pool
import matplotlib.pyplot as plt

# Seed every RNG the notebook draws from so runs are repeatable.
# `random.seed` is a function and has to be *called*: assigning an integer to
# it replaces the function on the module and leaves the generator unseeded.
SEED = 88888888
random.seed(SEED)        # used by random.shuffle on the graph list
numpy.random.seed(SEED)  # NumPy global generator
torch.manual_seed(SEED)  # weight initialisation, dropout, DataLoader shuffling

In [2]:
# Device handles. Constructing a torch.device does not touch the hardware, so
# both handles can always be created; `device` is the one actually used and
# falls back to the CPU when no CUDA device is available.
cuda0 = torch.device('cuda:0')
cpu = torch.device('cpu')
device = cuda0 if torch.cuda.is_available() else cpu

# Input files. All of them live under one data directory.
# NOTE(review): this is an absolute, user-specific path - adjust per machine.
DATA_DIR = '/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training'
edges_fn = f'{DATA_DIR}/input/edges.txt'
node_features_fn = f'{DATA_DIR}/input_shuffled_feature/shuffled_graph_features.txt'
graph_targets_fn = f'{DATA_DIR}/input/graph_targets.txt'

# magic numbers
INPUT_CHANNELS = 1     # width of each node's feature vector fed to the first conv layer
OUTPUT_CHANNELS = 26   # width of the classifier output (presumably the number of target classes - confirm)
HIDDEN_CHANNELS = 64   # width of the hidden GraphConv layers
BATCH_SIZE = 64        # graphs per mini-batch in the main training loop
EPOCHS = 500 #set this to 200 - 2000
# Selects between the batch-size benchmark and the normal data-preparation path.
# NOTE: a later cell reassigns this flag, so its value changes as the notebook runs.
BENCHMARKING = False

In [3]:
def read_reactome_graph(edges_fn, node_features_fn):
    """Read an edge-list file and return its two endpoint columns.

    Each non-blank line must start with two whitespace-separated integer node
    ids; anything after the second field is ignored. Ids are shifted down by
    one (the file uses 1-based indices, Python uses 0-based).

    Args:
        edges_fn: Path of the edge-list text file.
        node_features_fn: Unused; kept so existing call sites keep working.

    Returns:
        A tuple ``(edge_v1, edge_v2)`` of equal-length lists holding the
        0-based first and second endpoint of every edge, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    edge_v1 = []
    edge_v2 = []

    # The context manager closes the handle even when a line fails to parse.
    with open(edges_fn, 'r') as edges_file:
        for line in edges_file:
            data = line.split()
            if not data:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            edge_v1.append(int(data[0]) - 1)  # convert R idx to python idx
            edge_v2.append(int(data[1]) - 1)  # " "

    return edge_v1, edge_v2

In [4]:
# Load the edge list once; the two endpoint lists are reused by later cells.
edge_v1, edge_v2 = read_reactome_graph(edges_fn, node_features_fn)
# Echo both endpoint lists, one list per output line, as a visual sanity check.
print(edge_v1, edge_v2, sep='\n')

def build_scratch_loader(batch_size):
    """Build a shuffled loader over the MUTAG dataset with one feature per node.

    Every graph is rebuilt keeping only column 1 of its node-feature matrix
    (as a float column vector); labels and connectivity are passed through.

    Args:
        batch_size: Number of graphs per mini-batch.

    Returns:
        A ``DataLoader`` over the rebuilt graphs with ``shuffle=True``.
    """
    dataset = TUDataset(root='data/TUDataset', name='MUTAG')
    data_list = []
    for graph_obj in dataset:
        # Copy the selected column with clone().detach() instead of
        # torch.tensor(<tensor>), which PyTorch discourages for copy
        # construction. Assumes graph_obj.x is already a tensor - TODO confirm.
        x = graph_obj.x[:, 1].clone().detach().to(torch.float)
        x = x.unsqueeze(1)  # column vector: one feature per node
        data_list.append(Data(x=x, y=graph_obj.y, edge_index=graph_obj.edge_index))

    return DataLoader(data_list, batch_size=batch_size, shuffle=True)

[5574, 2230, 2389, 2390, 2392, 2393, 2398, 496, 495, 4426, 3147, 3146, 3144, 4972, 4584, 4584, 4972, 5842, 4974, 4976, 6897, 5873, 2402, 6898, 2403, 6899, 6901, 2402, 6902, 6900, 6902, 1523, 1520, 2224, 5840, 5444, 2225, 5441, 2218, 5442, 5840, 2224, 2222, 2223, 2226, 2217, 2228, 3171, 3170, 3172, 3172, 6755, 6757, 6111, 6756, 6104, 6109, 6105, 6108, 2830, 2831, 1394, 6109, 1395, 2830, 6109, 5238, 6795, 2227, 2228, 6843, 3172, 6106, 2830, 3145, 4975, 1220, 2218, 1222, 5437, 2227, 1989, 1991, 1990, 1992, 4446, 5293, 132, 129, 142, 1388, 136, 1387, 3120, 4905, 2670, 4906, 4904, 4903, 2670, 6920, 2669, 2667, 5846, 5845, 2680, 2666, 2673, 2664, 2407, 2663, 2665, 2674, 2662, 2656, 2657, 2655, 2654, 2653, 7451, 7450, 6784, 2651, 2652, 2661, 2658, 2660, 2659, 2671, 2675, 2578, 2577, 2672, 2676, 3119, 1367, 3118, 4902, 3121, 5516, 3124, 2777, 3123, 2778, 2875, 145, 1386, 1393, 3694, 1385, 4890, 4891, 1391, 3118, 6890, 133, 6037, 144, 143, 134, 4890, 145, 133, 130, 131, 132, 3118, 6888, 2941, 4

In [5]:
def build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn):
    """Build one ``Data`` graph per row of the feature file.

    All graphs share the same connectivity (``edge_v1`` -> ``edge_v2``). Row
    ``i`` of the feature file supplies the node features of graph ``i`` as a
    float column vector, and entry ``i`` of the targets file supplies its
    label, integer-encoded with a ``LabelEncoder``.

    Args:
        edge_v1: First endpoint of every edge (0-based node indices).
        edge_v2: Second endpoint of every edge (0-based node indices).
        node_features_fn: Path of a numeric text file readable by
            ``numpy.loadtxt``; one row per graph.
        graph_targets_fn: Path of a comma-delimited text file of string labels.

    Returns:
        A list of ``Data`` objects, one per feature row, in file order.

    Raises:
        ValueError: If the number of feature rows and targets differ.
    """
    edge_index = torch.tensor([edge_v1, edge_v2], dtype=torch.long)
    # ndmin keeps a single-graph input as one row / one label instead of
    # letting loadtxt collapse it to a lower-rank array.
    feature_v = numpy.loadtxt(node_features_fn, ndmin=2)
    target_v = numpy.loadtxt(graph_targets_fn, dtype=str, delimiter=",", ndmin=1)

    target_encoder = sklearn.preprocessing.LabelEncoder()
    target_v = target_encoder.fit_transform(target_v)

    print(len(feature_v))
    print(len(target_v))

    # Fail loudly instead of silently dropping targets or running off the end.
    if len(feature_v) != len(target_v):
        raise ValueError(
            f'feature rows ({len(feature_v)}) and targets ({len(target_v)}) differ in length'
        )

    data_list = []
    for features, target in zip(feature_v, target_v):
        x = torch.tensor(features, dtype=torch.float)
        x = x.unsqueeze(1)  # one scalar feature per node
        y = torch.tensor([target])
        data_list.append(Data(x=x, y=y, edge_index=edge_index))

    return data_list

def build_reactome_graph_loader(data_list,batch_size):
    """Wrap ``data_list`` in a ``DataLoader`` that shuffles and batches it.

    Args:
        data_list: Graph objects to iterate over.
        batch_size: Number of graphs per mini-batch.

    Returns:
        The configured ``DataLoader`` (``shuffle=True``).
    """
    return DataLoader(data_list, shuffle=True, batch_size=batch_size)

In [6]:

class GNN(torch.nn.Module):
    """Graph classifier: three GraphConv layers, mean-pool readout, linear head.

    Layer widths come from the module-level ``INPUT_CHANNELS`` and
    ``OUTPUT_CHANNELS`` constants plus the ``hidden_channels`` argument.
    """

    def __init__(self, hidden_channels):
        """Create the three convolution layers and the classifier head."""
        super().__init__()

        self.conv1 = GraphConv(INPUT_CHANNELS, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, OUTPUT_CHANNELS)

    def forward(self, x, edge_index, batch, edge_weight=None):
        """Return one row of class scores per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity.
            batch: Assignment of every node to its graph in the mini-batch.
            edge_weight: Optional edge weights forwarded to every conv layer.
        """
        # Node embeddings: ReLU after the first two convolutions only.
        hidden = F.relu(self.conv1(x, edge_index, edge_weight))
        hidden = F.relu(self.conv2(hidden, edge_index, edge_weight))
        hidden = self.conv3(hidden, edge_index, edge_weight)

        # Readout: average node embeddings per graph -> [batch_size, hidden_channels]
        pooled = global_mean_pool(hidden, batch)

        # Classifier head; dropout is only active in training mode.
        pooled = F.dropout(pooled, training=self.training)
        return self.lin(pooled)

In [7]:
# Instantiate the network directly on the target device (Module.to returns the module).
model = GNN(hidden_channels=HIDDEN_CHANNELS).to(device)
# Adam with its default hyper-parameters over every trainable weight.
optimizer = torch.optim.Adam(params=model.parameters())
# Multi-class objective applied to the raw scores the model returns.
criterion = torch.nn.CrossEntropyLoss()

def train(loader,device):
    """Run one optimisation pass over ``loader``.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``; each
    mini-batch is moved to ``device``, scored, and followed by one optimizer
    step. Returns nothing.
    """
    model.train()

    for graphs in loader:
        # Move the four tensors the model and loss need onto the target device.
        node_features, connectivity, graph_ids, targets = (
            tensor.to(device)
            for tensor in (graphs.x, graphs.edge_index, graphs.batch, graphs.y)
        )

        scores = model(node_features, connectivity, graph_ids)  # forward pass
        criterion(scores, targets).backward()                   # loss + gradients
        optimizer.step()                                        # parameter update
        optimizer.zero_grad()                                   # reset for the next batch

def test(loader,device):
    """Return the classification accuracy of the module-level ``model`` on ``loader``.

    Args:
        loader: Loader yielding graph mini-batches; ``loader.dataset`` is used
            for the total number of graphs.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Fraction of graphs whose highest-scoring class equals the label.
    """
    model.eval()

    correct = 0
    # Evaluation needs no gradients: skipping autograd bookkeeping saves memory
    # and time. The loss is not needed either, only the predictions.
    with torch.no_grad():
        for batch in loader:  # Iterate in batches over the training/test dataset.
            x = batch.x.to(device)
            e = batch.edge_index.to(device)
            b = batch.batch.to(device)
            y = batch.y.to(device)
            out = model(x, e, b)  # Perform a single forward pass.
            pred = out.argmax(dim=1)  # Use the class with the highest score.
            correct += int((pred == y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.

In [8]:
# Accumulates the accuracy log as text; later written to disk by the training cell.
acc_str = ''
# Two mutually exclusive modes, selected by the BENCHMARKING flag:
#   * True  -> time one training pass + one evaluation pass per batch size.
#   * False -> build and shuffle the dataset for the full training run.
# GOTCHA: both branches reassign BENCHMARKING, so re-running this cell
# switches mode, and the following `if(BENCHMARKING)` cell only trains after
# the else-branch has run.
if(BENCHMARKING):

  # Batch sizes to compare.
  test_b_sizes = [1,8,16,32,64,128]

  for test_b_size in test_b_sizes:
    print(f'Executing training routine with batch size = {test_b_size}')
    # The dataset is re-read from disk for every batch size.
    data_list = build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn)
    test_batch_size_data_loader = build_reactome_graph_loader(data_list,test_b_size)
  
    # Wall-clock time of a single training pass (train() uses the module-level
    # model, so weights carry over from one batch size to the next).
    start = time.time()
    train(test_batch_size_data_loader,device)
    end = time.time()
    training_time = end - start

    # Wall-clock time of one evaluation pass over the same (training) data.
    start = time.time()
    train_acc = test(test_batch_size_data_loader,device)
    end = time.time()
    test_time = end - start

    acc_str += f'{train_acc:.4f}\n'
    print(f'Batch Size: {test_b_size}')
    print(f'Training Time: {training_time}')
    print(f'Test Time: {test_time}')
    print(f'Accuracy: {train_acc}')
    # Cleared inside the loop; the loop itself still runs for every batch size.
    BENCHMARKING = False
else:
  # Normal path: load every graph and shuffle their order in place.
  data_list = build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn)
  random.shuffle(data_list)

  # Setting the flag here is what enables the `if(BENCHMARKING)` training cell below.
  BENCHMARKING = True

6288
6288


In [None]:
# Full training run: train on every graph, log per-epoch training accuracy,
# then save the accuracy log and the trained weights.
# The guard reads the BENCHMARKING flag as left by the preceding cell.
if(BENCHMARKING):
  fold_size = 911  # only needed by the k-fold split kept below for reference
  fold = 'full_dataset'
  # k-fold split (disabled; `fold` would have to be an integer fold index):
  #   train_data_list = data_list[:fold_size * (fold - 1)] + data_list[fold_size * fold:]
  #   test_data_list = data_list[fold_size * (fold - 1):fold_size * fold]
  train_data_list = data_list

  print(f'Number of training graphs: {len(train_data_list)}')
  #print(f'Number of test graphs: {len(test_data_list)}')
  train_data_loader = build_reactome_graph_loader(train_data_list,BATCH_SIZE)
  #test_data_loader = build_reactome_graph_loader(test_data_list,BATCH_SIZE)
  for epoch in range(EPOCHS):
    train(train_data_loader,device)
    train_acc = test(train_data_loader,device)
    #test_acc = test(test_data_loader,device)
    # One accuracy per line. Without the newline the values run together in
    # the log file (e.g. "0.27640.2764...") and cannot be parsed back.
    acc_str += f'{train_acc:.4f}\n'  # with a test split: f'{train_acc:.4f},{test_acc:.4f}\n'
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')#', Test Acc: {test_acc:.4f}')

  # Persist the accuracy log.
  training_acc_fn = F"graph_classification_acc_rewired10_{fold}.txt"
  path = F"/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training/GNN/Shuffled_Features/{training_acc_fn}"
  with open(path, 'w') as writefile:
      writefile.write(acc_str)
  # Persist the trained weights (state_dict only, not the whole module).
  model_save_name = F"trained_pytorch_model_rewired10_fold_{fold}.pt"
  path = F"/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training/GNN/Shuffled_Features/{model_save_name}"
  torch.save(model.state_dict(), path)
  print(F"model saved as {path}")

Number of training graphs: 6288




Epoch: 000, Train Acc: 0.2764
Epoch: 001, Train Acc: 0.2764
Epoch: 002, Train Acc: 0.2764
Epoch: 003, Train Acc: 0.2764
Epoch: 004, Train Acc: 0.2764
Epoch: 005, Train Acc: 0.2764
Epoch: 006, Train Acc: 0.2764
Epoch: 007, Train Acc: 0.2764
Epoch: 008, Train Acc: 0.2764
Epoch: 009, Train Acc: 0.2764
Epoch: 010, Train Acc: 0.2764
Epoch: 011, Train Acc: 0.2764
Epoch: 012, Train Acc: 0.2764
Epoch: 013, Train Acc: 0.2764
Epoch: 014, Train Acc: 0.2764
Epoch: 015, Train Acc: 0.2764
Epoch: 016, Train Acc: 0.2764
Epoch: 017, Train Acc: 0.2764
Epoch: 018, Train Acc: 0.2764
Epoch: 019, Train Acc: 0.2764
Epoch: 020, Train Acc: 0.2764
Epoch: 021, Train Acc: 0.2764
Epoch: 022, Train Acc: 0.2764
Epoch: 023, Train Acc: 0.2764
Epoch: 024, Train Acc: 0.2764
Epoch: 025, Train Acc: 0.2764
Epoch: 026, Train Acc: 0.2764
Epoch: 027, Train Acc: 0.2764
Epoch: 028, Train Acc: 0.2764
Epoch: 029, Train Acc: 0.2764
Epoch: 030, Train Acc: 0.2764
Epoch: 031, Train Acc: 0.2764
Epoch: 032, Train Acc: 0.2764
Epoch: 033