In [1]:
import os.path as op
import random
import time

import matplotlib.pyplot as plt
import numpy
import sklearn
import torch
import torch.nn.functional as nn_func
from sklearn import preprocessing
from sklearn.metrics import adjusted_rand_score
from torch.nn import Linear
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GraphConv, global_mean_pool

# Seed every random number generator this notebook imports so that repeated
# "Restart & Run All" executions start from the same state.
# NOTE: the seed function must be *called*; assigning to `random.seed`
# would replace the function with an integer and seed nothing.
SEED = 88888888
random.seed(SEED)
numpy.random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# --- File locations ----------------------------------------------------------
# Every input/output file of this run lives under one data directory; keep the
# root in a single place so the run can be re-pointed with one edit.
DATA_ROOT = '/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training'
edges_fn = f'{DATA_ROOT}/input_summation/edges.txt'
node_features_fn = f'{DATA_ROOT}/input_summation/node_features.txt'
graph_targets_fn = f'{DATA_ROOT}/input_summation/graph_targets.txt'
output_fn = f'{DATA_ROOT}/output/gnn_predictions_summation.tsv'

# Fail fast (before any expensive work) when an input file is missing.
features_exist = op.exists(node_features_fn)
targets_exist = op.exists(graph_targets_fn)
edges_exist = op.exists(edges_fn)

print(f'features exist: {features_exist},'
      f' targets exist: {targets_exist},'
      f' edges exist: {edges_exist}')
assert features_exist, f'node feature file not found: {node_features_fn}'
assert targets_exist, f'graph target file not found: {graph_targets_fn}'
assert edges_exist, f'edge file not found: {edges_fn}'

# magic numbers
INPUT_CHANNELS = 1  # input width of the first GraphConv layer (features per node)
OUTPUT_CHANNELS = 26  # output width of the final Linear layer (one logit per class)
HIDDEN_CHANNELS = 64  # width of the hidden GraphConv layers
BATCH_SIZE = 64  # graphs per mini-batch handed to the DataLoader
EPOCHS = 500  # upper bound on training epochs; set this to 200 - 2000
BENCHMARKING = False  # NOTE(review): not read anywhere in the visible cells - confirm it is still needed

features exist: True, targets exist: True, edges exist: True


In [3]:
class GNN(torch.nn.Module):
    """Graph-level classifier: three GraphConv layers, mean pooling, linear head.

    The input width is taken from the module-level ``INPUT_CHANNELS`` and the
    number of output logits from ``OUTPUT_CHANNELS``.
    """

    def __init__(self, hidden_channels):
        """Create the layers.

        Args:
            hidden_channels: width shared by all three graph convolutions and
                by the input of the final linear layer.
        """
        super().__init__()

        # Attribute names double as state_dict keys, so they are kept stable.
        self.conv1 = GraphConv(INPUT_CHANNELS, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, OUTPUT_CHANNELS)

    def forward(self, x, edge_index, batch, edge_weight=None):
        """Return one row of class logits per graph in the batch.

        Args:
            x: node feature tensor.
            edge_index: edge list tensor passed to every convolution.
            batch: per-node graph assignment used by the pooling step.
            edge_weight: optional edge weights forwarded to every convolution.
        """
        # Node embeddings: ReLU follows the first two convolutions only.
        hidden = self.conv1(x, edge_index, edge_weight).relu()
        hidden = self.conv2(hidden, edge_index, edge_weight).relu()
        hidden = self.conv3(hidden, edge_index, edge_weight)

        # Readout: average node embeddings per graph -> [batch_size, hidden_channels]
        pooled = global_mean_pool(hidden, batch)

        # Classifier head; dropout is only active while the module is in training mode.
        pooled = nn_func.dropout(pooled, training=self.training)
        return self.lin(pooled)


def read_reactome_graph(e_fn):
    """Read a whitespace-separated edge list and return it as two index lists.

    Each non-blank line must start with two integer node ids. The ids in the
    file are 1-based (R convention) and are shifted to 0-based Python indices.
    Any columns after the first two are ignored; blank lines are skipped.

    Args:
        e_fn: path of the edge list file.

    Returns:
        Tuple ``(e_v1, e_v2)`` of equal-length lists holding the first and
        second endpoint of every edge, in file order.

    Raises:
        ValueError: if an endpoint is not an integer.
        IndexError: if a non-blank line has fewer than two columns.
    """
    e_v1 = []
    e_v2 = []

    # Context manager guarantees the handle is closed even if parsing fails.
    with open(e_fn, 'r') as edge_file:
        for line in edge_file:
            dt = line.split()
            if not dt:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            e_v1.append(int(dt[0]) - 1)  # subtracting to convert R idx to python idx
            e_v2.append(int(dt[1]) - 1)  # " "

    return e_v1, e_v2

In [4]:
def build_reactome_graph_datalist(e_v1, e_v2, n_fn, g_fn):
    """Build one ``Data`` graph per sample from a feature matrix and label file.

    Every row of the feature file is one sample; each value in the row becomes
    the single feature of one node. All samples share the same ``edge_index``
    tensor. String labels are converted to integer class ids with a
    ``LabelEncoder`` fitted on the labels of this file; the encoder itself is
    not returned, so the id-to-label mapping is not available to the caller.

    Args:
        e_v1: first endpoint of every edge (0-based node indices).
        e_v2: second endpoint of every edge (0-based node indices).
        n_fn: path of the numeric, whitespace-separated feature file.
        g_fn: path of the label file, one label per line.

    Returns:
        List of ``Data`` objects, one per feature row, in file order.

    Raises:
        ValueError: if the feature and label files disagree on sample count.
    """
    edge_index = torch.tensor([e_v1, e_v2], dtype=torch.long)
    # ndmin keeps the expected rank when a file holds a single sample; without
    # it loadtxt squeezes the array and the row indexing below would break.
    feature_v = numpy.loadtxt(n_fn, ndmin=2)
    target_v = numpy.loadtxt(g_fn, dtype=str, delimiter=",", ndmin=1)

    # A mismatch would otherwise be silently truncated or raise a bare IndexError.
    if len(feature_v) != len(target_v):
        raise ValueError(
            f'{n_fn} has {len(feature_v)} rows but {g_fn} has '
            f'{len(target_v)} targets; expected one target per feature row')

    target_encoder = sklearn.preprocessing.LabelEncoder()
    target_v = target_encoder.fit_transform(target_v)

    d_list = []
    for features, target in zip(feature_v, target_v):
        # One scalar feature per node: shape [num_nodes] -> [num_nodes, 1].
        x = torch.tensor(features, dtype=torch.float).unsqueeze(1)
        y = torch.tensor([target])
        d_list.append(Data(x=x, y=y, edge_index=edge_index))

    return d_list


def build_reactome_graph_loader(d_list, batch_size, shuffle=False):
    """Wrap a list of graphs in a mini-batching ``DataLoader``.

    Args:
        d_list: list of graph ``Data`` objects.
        batch_size: number of graphs per mini-batch.
        shuffle: whether the loader reshuffles the graphs every epoch.
            Defaults to ``False`` so batches follow the order of ``d_list``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(d_list, batch_size=batch_size, shuffle=shuffle)


def train(loader, dv):
    """Run one optimisation pass over ``loader`` and return the accuracy.

    Relies on the module-level ``model``, ``criterion`` and ``optimizer``.
    The accuracy is measured on the same forward passes used for the updates,
    i.e. with the model in training mode.

    Args:
        loader: iterable of mini-batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``; must also expose ``dataset`` for its length.
        dv: device the batch tensors are moved to.

    Returns:
        Fraction of graphs in ``loader.dataset`` that were predicted correctly.
    """
    model.train()

    n_correct = 0
    for graphs in loader:
        node_feats = graphs.x.to(dv)
        edges = graphs.edge_index.to(dv)
        graph_ids = graphs.batch.to(dv)
        labels = graphs.y.to(dv)

        # Forward pass, loss, then one optimiser step.
        logits = model(node_feats, edges, graph_ids)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # reset so the next batch starts from clean gradients

        # The predicted class is the index of the largest logit.
        n_correct += (logits.argmax(dim=1) == labels).sum().item()

    return n_correct / len(loader.dataset)


def test(loader, dv):
    """Evaluate the global ``model`` on ``loader`` and save its predictions.

    Prints the collected targets and predictions, writes them as two
    tab-separated integer columns to the module-level ``output_fn`` and
    reports the adjusted Rand index between them.

    Args:
        loader: iterable of mini-batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``.
        dv: device the batch tensors are moved to.

    Returns:
        The adjusted Rand index of predictions versus targets.
    """
    model.eval()

    targets = []
    predictions = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch in loader:  # Iterate in batches over the test dataset.
            x = batch.x.to(dv)
            e = batch.edge_index.to(dv)
            b = batch.batch.to(dv)
            y = batch.y.to(dv)
            targets += y.tolist()
            out = model(x, e, b)  # Perform a single forward pass.
            pred = out.argmax(dim=1)  # Use the class with highest probability.
            predictions += pred.tolist()
    print(targets)
    print(predictions)
    # NOTE: with numpy's default `comments` setting the header line is written
    # with a leading '# '.
    numpy.savetxt(output_fn, numpy.transpose([targets, predictions]),
                  fmt='%d', delimiter='\t', header='target\tprediction')
    ari = adjusted_rand_score(targets, predictions)
    print(f'ari: {ari}')
    return ari

In [None]:
def change_key(self, old, new):
    """Rename key ``old`` to ``new`` in place, keeping the item order.

    Every item is popped from the front and re-inserted at the back exactly
    once, so the original order is restored after a full cycle. ``self`` must
    provide ``popitem`` accepting a positional "last" flag (as
    ``collections.OrderedDict`` does). Nothing is renamed when ``old`` is
    absent.
    """
    remaining = len(self)
    while remaining > 0:
        key, value = self.popitem(False)
        if old == key:
            key = new
        self[key] = value
        remaining -= 1


# --- Model, optimiser and loss ------------------------------------------------
# train() and test() read `model`, `optimizer` and `criterion` as globals.
(edge_v1, edge_v2) = read_reactome_graph(edges_fn)
model = GNN(hidden_channels=HIDDEN_CHANNELS)
device = cpu = torch.device('cpu')
# Keep the parameters on the same device the batches are moved to, so the
# script still works if `device` is changed.
model.to(device)

optimizer = torch.optim.AdamW(model.parameters())
criterion = torch.nn.CrossEntropyLoss()
acc_str = ''

# --- Training on the full dataset ---------------------------------------------
data_list = build_reactome_graph_datalist(edge_v1, edge_v2, node_features_fn, graph_targets_fn)
print(len(data_list))
# retrain model for fine tuning transfer learning
train_data_list = data_list
print(len(train_data_list))
print(f'Number of training graphs: {len(train_data_list)}')
train_data_loader = build_reactome_graph_loader(train_data_list, BATCH_SIZE)
for epoch in range(EPOCHS):
    train_acc = train(train_data_loader, device)
    print(f'Epoch: {epoch}, Train Acc: {train_acc}')
    # One accuracy per line; without the separator the values run together
    # and the log file cannot be parsed.
    acc_str += f'{train_acc:.4f}\n'
    if train_acc == 1.0:
        # Every training graph is classified correctly; stop early.
        break

# --- Persist the per-epoch training accuracy -----------------------------------
gnn_output_dir = '/mnt/home/yuankeji/RanceLab/reticula_new/reticula/data/GEO_model_training/GNN/Summation'
training_acc_fn = 'summation_graph_classification_acc_full_dataset.txt'
path = f'{gnn_output_dir}/{training_acc_fn}'
with open(path, 'w') as writefile:
    writefile.write(acc_str)

# --- Evaluation -----------------------------------------------------------------
# NOTE: the evaluation set is the same list the model was trained on, so the
# reported ARI measures fit to the training data, not generalisation.
test_data_list = data_list
print(len(test_data_list))
print(f'Number of test graphs: {len(test_data_list)}')

test_data_loader = build_reactome_graph_loader(test_data_list, BATCH_SIZE)
test_ari = test(test_data_loader, device)
print(f'test_ari: {test_ari}')

# --- Save the trained weights ---------------------------------------------------
model_save_name = 'summation_fully_trained_pytorch_GEO_model_training_gnn_model.pt'
path = f'{gnn_output_dir}/{model_save_name}'
torch.save(model.state_dict(), path)
print(f'model saved as {path}')

6295
6295
Number of training graphs: 6295




Epoch: 0, Train Acc: 0.22081016679904686
Epoch: 1, Train Acc: 0.24463860206513105
Epoch: 2, Train Acc: 0.2581413820492454
Epoch: 3, Train Acc: 0.264813343923749
Epoch: 4, Train Acc: 0.2681493248610008
Epoch: 5, Train Acc: 0.27768069896743447
Epoch: 6, Train Acc: 0.28228752978554406
Epoch: 7, Train Acc: 0.2942017474185862
Epoch: 8, Train Acc: 0.29864972200158857
Epoch: 9, Train Acc: 0.30325655281969816
Epoch: 10, Train Acc: 0.30945194598888004
Epoch: 11, Train Acc: 0.3173947577442415
Epoch: 12, Train Acc: 0.32867355043685464
Epoch: 13, Train Acc: 0.33804606830818107
Epoch: 14, Train Acc: 0.35409054805401113
Epoch: 15, Train Acc: 0.356791104050834
Epoch: 16, Train Acc: 0.3787132644956315
Epoch: 17, Train Acc: 0.3925337569499603
Epoch: 18, Train Acc: 0.40365369340746626
Epoch: 19, Train Acc: 0.40778395552025415
Epoch: 20, Train Acc: 0.4103256552819698
Epoch: 21, Train Acc: 0.42748212867355045
Epoch: 22, Train Acc: 0.42875297855440825
Epoch: 23, Train Acc: 0.43272438443208894
Epoch: 24, Tr