In [41]:
import pandas as pd
import numpy as np
import pyg
import scipy.sparse as sp
import torch
from torch_geometric.data import Data
#from torch_geometric.loader import DataLoader
import torch_geometric

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
import torch.nn.functional as F

from pathcnn_notebook_utils import data_prep_pathcnn_paper,create_overlap_adjacency,prepare_vanilla_loader,prepare_cnn_loader




In [27]:
pca_exp = pd.read_excel("/Users/alexandermollers/Documents/GitHub/survival_analysis/data/multiomics/PCA_EXP.xlsx", header=None)
pca_cnv = pd.read_excel("/Users/alexandermollers/Documents/GitHub/survival_analysis/data/multiomics/PCA_CNV.xlsx", header=None)
pca_mt = pd.read_excel("/Users/alexandermollers/Documents/GitHub/survival_analysis/data/multiomics/PCA_MT.xlsx", header=None)
ordered_pathways = pd.read_excel("/Users/alexandermollers/Documents/GitHub/survival_analysis/data/multiomics/ordered_pathway_146_2pc.xlsx")
clinical = pd.read_excel("/Users/alexandermollers/Documents/GitHub/survival_analysis/data/multiomics/Clinical.xlsx")
pathway_mask =pd.read_csv("/Users/alexandermollers/Documents/GitHub/survival_analysis/data/pathway_mask.csv")

In [28]:
kegg_pathway_mask = pathway_mask[pathway_mask["Unnamed: 0"].str.contains("KEGG")].copy()
kegg_pathway_mask.rename(columns={'Unnamed: 0':'pathway'}, inplace=True)
kegg_pathway_mask["pathway"] = kegg_pathway_mask["pathway"].str.replace(r'KEGG_','')
cnn_pthws = ordered_pathways.set_index("TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY").join(kegg_pathway_mask.set_index("pathway"))

### Data Preparation

In [42]:
ind,v = create_overlap_adjacency(cnn_pthws) #edge indexes for gnn, edge weights calculated based on pathway overlap
prepared_data, age, outcomes = data_prep_pathcnn_paper(pca_exp,pca_cnv,pca_mt,clinical)

### Data Splits

In [30]:
n_samples = len(prepared_data)
indices = np.arange(n_samples)
X_train, X_test, y_train, y_test,train_ind,test_ind = train_test_split(prepared_data, outcomes,indices,
                                                    stratify=outcomes, 
                                                    test_size=0.2)

X_train, X_validation, y_train, y_validation,train_ind,val_ind = train_test_split(X_train, y_train,train_ind,
                                                    stratify=y_train, 
                                                    test_size=0.2)

age_train = torch.tensor(age[train_ind])
age_val = torch.tensor(age[val_ind])
age_test = torch.tensor(age[test_ind])



In [31]:
van_loader_params = {'batch_size': 64,
          'shuffle': True}

cnn_loader_params = {'batch_size': 64,
          'shuffle': True}

van_train_loader = prepare_vanilla_loader(X_train,age_train,y_train,van_loader_params)
van_val_loader = prepare_vanilla_loader(X_validation,age_val,y_validation,van_loader_params)
van_test_loader = prepare_vanilla_loader(X_test,age_test,y_test,van_loader_params)

img_rows, img_cols = 146, 6
cnn_train_loader = prepare_cnn_loader(X_train,age_train,y_train,img_rows,img_cols,cnn_loader_params)
cnn_val_loader = prepare_cnn_loader(X_validation,age_val,y_validation,img_rows,img_cols,cnn_loader_params)
cnn_test_loader = prepare_cnn_loader(X_test,age_test,y_test,img_rows,img_cols,cnn_loader_params)


train_data_list = [Data(x = X_train[i],edge_index = ind,edge_weights = v, age = age_train[i], y = y_train[i]) for i in range(len(X_train))]
val_data_list = [Data(x = X_validation[i],edge_index = ind,edge_weights = v, age = age_val[i], y = y_validation[i]) for i in range(len(X_validation))]
test_data_list = [Data(x = X_test[i],edge_index = ind,edge_weights = v,age = age_test[i], y = y_test[i]) for i in range(len(X_test))]

gnn_train_loader = torch_geometric.loader.DataLoader(train_data_list, batch_size=1)
gnn_val_loader = torch_geometric.loader.DataLoader(val_data_list, batch_size=1)
gnn_test_loader = torch_geometric.loader.DataLoader(test_data_list, batch_size=1)

## Vanilla NN

In [32]:
class VanillaNN(nn.Module):

    def __init__(self):
        super(VanillaNN, self).__init__()
        
        self.fc1 = nn.Linear(876, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(65, 32)
        self.fc4 = nn.Linear(32, 2)
        

    def forward(self, x,clinical_vars):
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x_cat = torch.cat((x, clinical_vars.unsqueeze(-1)),1)
        x = F.relu(self.fc3(x_cat))
        x = F.relu(self.fc4(x))
        
        return x
                          
                          



In [33]:

def main_vanilla(model,train_loader,val_loader):
    

    # init model
    model = cpath_md_lg(5567,860, 100, 30,args,mask = pathway_mask)
    if args.cuda:
        model.cuda()
        # init optimizer
    optimizer = optim.Adam(model.parameters(),lr=args.lr)

    for epoch in range(1, args.epochs + 1):
        loss_train_score, c_index_train_score,pll_train = train(args, model, cpath_train_loader, epoch, optimizer)
        loss_val_score,c_index_val_score,pll_val = validate(args, cpath_val_loader, model, epoch)
        test_conc_metric,test_pll = test(args, model, cpath_test_loader)

        epoch_dict = {

            'epoch': epoch + 1,
            'lr': args.lr,
            'gp_mean': args.gp_mean,
            'gp_var': args.gp_var,
            'loss_train_score': loss_train_score,
            'train_pll': pll_train,
            'ctrain_score': c_index_train_score,
            'cval_score': c_index_val_score,
            'loss_val_score': loss_val_score,
            'val_pll': pll_val,
            'test_conc_metric': test_conc_metric,
            'test_pll': test_pll,
            'state_dict': model.state_dict()
        }
        is_best = loss_val_score < best_loss_val_score
        best_loss_val_score = min(loss_val_score, best_loss_val_score)
        if args.save_best_model & is_best:
                save_checkpoint(
                    epoch_dict,
                    is_best,
                    filename=os.path.join(
                        args.save_dir,
                        'bayesian_{}.pth'.format(args.arch)))

        epoch_dict.pop("state_dict", None)
        epoch_log = {k: [v] for k, v in epoch_dict.items()}
        epoch_log_df = pd.DataFrame.from_dict(epoch_log, orient="columns")
        log_file_name = args.arch + '_logs.csv'
        log_path = os.path.join(args.log_dir, log_file_name)
        epoch_log_df.to_csv(log_path, mode='a', header=not os.path.exists(log_path),index = False)

    return epoch_log_df.to_string(header=False, index=False)

In [34]:
torch.manual_seed(2)
from IPython.display import Javascript

model = VanillaNN()
model.double()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0006,betas=(0.9, 0.999))

class_weights=[1,4.2]
class_weights=torch.tensor(class_weights,dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights,reduction='mean')

def train_van(loader):
    model.train()

    for x,clin_vars,y in loader:  # Iterate in batches over the training dataset.
         out = model(x, clin_vars)  # Perform a single forward pass.
         loss = criterion(out.float(), y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test_van(loader):
     model.eval()

     correct = 0
     predicted = []
     true_label = []
     for x,clin_vars,y in loader:  # Iterate in batches over the training/test dataset.
         out = model(x,clin_vars)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         predicted.extend(pred.detach().cpu().numpy().tolist())
         true_label.extend(y.detach().cpu().numpy().tolist())
         correct += int((pred == y).sum())  # Check against ground-truth labels.
   
     test_auc = roc_auc_score(true_label, predicted)
     print(test_auc)
    
          
        
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 50):
    train_van(van_train_loader)
    train_acc = test_van(van_train_loader)
    test_acc = test_van(van_test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')



0.6361969111969111
0.6247582205029014
Epoch: 001, Train Acc: 0.4645, Test Acc: 0.4483
0.6368725868725869
0.5938104448742747
Epoch: 002, Train Acc: 0.8361, Test Acc: 0.7931
0.6402509652509653
0.5938104448742747
Epoch: 003, Train Acc: 0.8415, Test Acc: 0.7931
0.6797297297297297
0.5831721470019342
Epoch: 004, Train Acc: 0.8525, Test Acc: 0.7759
0.7233590733590733
0.6982591876208898
Epoch: 005, Train Acc: 0.8525, Test Acc: 0.7931
0.8004826254826255
0.6769825918762089
Epoch: 006, Train Acc: 0.8361, Test Acc: 0.7586
0.8482625482625482
0.6663442940038684
Epoch: 007, Train Acc: 0.8251, Test Acc: 0.7414
0.8557915057915058
0.7224371373307543
Epoch: 008, Train Acc: 0.8197, Test Acc: 0.7759
0.8659266409266408
0.7117988394584138
Epoch: 009, Train Acc: 0.8361, Test Acc: 0.7586
0.903088803088803
0.7224371373307543
Epoch: 010, Train Acc: 0.8962, Test Acc: 0.7759
0.9132239382239381
0.6982591876208898
Epoch: 011, Train Acc: 0.9126, Test Acc: 0.7931
0.9132239382239381
0.6876208897485493
Epoch: 012, Train

In [35]:
def main():
    global best_loss_val_score

    pathway_mask = pd.read_csv("../data/pathway_mask.csv", index_col=0).values
    pathway_mask = torch.from_numpy(pathway_mask).type(torch.FloatTensor)
    if args.cuda:
        pathway_mask=pathway_mask.to(device = 'cuda')

    train_data = pd.read_csv("../data/train.csv")
    X_train_np = train_data.drop(["SAMPLE_ID", "OS_MONTHS", "OS_EVENT", "AGE"], axis=1).values
    tb_train = train_data.loc[:, ["OS_MONTHS"]].values
    e_train = train_data.loc[:, ["OS_EVENT"]].values
    clinical_vars_train = train_data.loc[:, ["AGE"]].values

    val_data = pd.read_csv("../data/validation.csv")
    X_val_np = val_data.drop(["SAMPLE_ID", "OS_MONTHS", "OS_EVENT", "AGE"], axis=1).values
    tb_val = val_data.loc[:, ["OS_MONTHS"]].values
    e_val = val_data.loc[:, ["OS_EVENT"]].values
    clinical_vars_val = val_data.loc[:, ["AGE"]].values

    test_data = pd.read_csv("../data/test.csv")
    X_test_np = test_data.drop(["SAMPLE_ID", "OS_MONTHS", "OS_EVENT", "AGE"], axis=1).values
    tb_test = test_data.loc[:, ["OS_MONTHS"]].values
    e_test = test_data.loc[:, ["OS_EVENT"]].values
    clinical_vars_test = test_data.loc[:, ["AGE"]].values

    cpath_train_dataset = cpath_dataset(X_train_np,
                                        clinical_vars_train,
                                        tb_train,
                                        e_train)

    cpath_val_dataset = cpath_dataset(X_val_np,
                                      clinical_vars_val,
                                      tb_val,
                                      e_val)
    cpath_test_dataset = cpath_dataset(X_test_np,
                                      clinical_vars_test,
                                      tb_test,
                                      e_test)

    # import data
    cpath_train_loader = torch.utils.data.DataLoader(cpath_train_dataset,
                                                     batch_size=len(cpath_train_dataset),
                                                     shuffle=True,
                                                     num_workers=0)

    cpath_val_loader = torch.utils.data.DataLoader(cpath_val_dataset,
                                                   batch_size=len(cpath_val_dataset),
                                                   shuffle=False,
                                                   num_workers=0)
    cpath_test_loader = torch.utils.data.DataLoader(cpath_test_dataset,
                                                   batch_size=len(cpath_test_dataset),
                                                   shuffle=False,
                                                   num_workers=0)


    # init model
    model = cpath_md_lg(5567,860, 100, 30,args,mask = pathway_mask)
    if args.cuda:
        model.cuda()
        # init optimizer
    optimizer = optim.Adam(model.parameters(),lr=args.lr)

    for epoch in range(1, args.epochs + 1):
        loss_train_score, c_index_train_score,pll_train = train(args, model, cpath_train_loader, epoch, optimizer)
        loss_val_score,c_index_val_score,pll_val = validate(args, cpath_val_loader, model, epoch)
        test_conc_metric,test_pll = test(args, model, cpath_test_loader)

        epoch_dict = {

            'epoch': epoch + 1,
            'lr': args.lr,
            'gp_mean': args.gp_mean,
            'gp_var': args.gp_var,
            'loss_train_score': loss_train_score,
            'train_pll': pll_train,
            'ctrain_score': c_index_train_score,
            'cval_score': c_index_val_score,
            'loss_val_score': loss_val_score,
            'val_pll': pll_val,
            'test_conc_metric': test_conc_metric,
            'test_pll': test_pll,
            'state_dict': model.state_dict()
        }
        is_best = loss_val_score < best_loss_val_score
        best_loss_val_score = min(loss_val_score, best_loss_val_score)
        if args.save_best_model & is_best:
                save_checkpoint(
                    epoch_dict,
                    is_best,
                    filename=os.path.join(
                        args.save_dir,
                        'bayesian_{}.pth'.format(args.arch)))

        epoch_dict.pop("state_dict", None)
        epoch_log = {k: [v] for k, v in epoch_dict.items()}
        epoch_log_df = pd.DataFrame.from_dict(epoch_log, orient="columns")
        log_file_name = args.arch + '_logs.csv'
        log_path = os.path.join(args.log_dir, log_file_name)
        epoch_log_df.to_csv(log_path, mode='a', header=not os.path.exists(log_path),index = False)

    return epoch_log_df.to_string(header=False, index=False)

### PATH CNN

In [36]:
class PathCNN(nn.Module):

    def __init__(self):
        super(PathCNN, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 32, kernel_size = (3,3),padding = "same")
        self.conv2 = nn.Conv2d(32, 64, kernel_size = (3,3),padding = "same")
        self.dropout1 = nn.Dropout(0.25)
        self.flat = nn.Flatten()

        self.fc1 = nn.Linear(6913, 64)  
        self.dropout2 = nn.Dropout(0.5)
        self.lin = nn.Linear(64, 2)

    def forward(self, x,clinical_vars):
        
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, (4, 2))
        x = self.dropout1(x)
        x = self.flat(x)
        x_cat = torch.cat((x, clinical_vars.unsqueeze(-1)),1)
        x = F.relu(self.fc1(x_cat))
        x = self.dropout2(x)
        x = self.lin(x)
        
        return x
                          
                          



In [37]:
pathcnn = PathCNN()
pathcnn.double()



PathCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (dropout1): Dropout(p=0.25, inplace=False)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=6913, out_features=64, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)

In [38]:
from IPython.display import Javascript

model = PathCNN()
model.double()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001,betas=(0.9, 0.999))

class_weights=[1,4.2]
class_weights=torch.tensor(class_weights,dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights,reduction='mean')

def train_cnn(loader):
    model.train()

    for x,clin_vars,y in loader:  # Iterate in batches over the training dataset.
         out = model(x, clin_vars)  # Perform a single forward pass.
         loss = criterion(out.float(), y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test_cnn(loader):
     model.eval()

     correct = 0
     predicted = []
     true_label = []
     for x,clin_vars,y in loader:  # Iterate in batches over the training/test dataset.
         out = model(x,clin_vars)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         predicted.extend(pred.detach().cpu().numpy().tolist())
         true_label.extend(y.detach().cpu().numpy().tolist())
         correct += int((pred == y).sum())  # Check against ground-truth labels.
   
     test_auc = roc_auc_score(true_label, predicted)
     print(test_auc)
    
          
        
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 30):
    train_cnn(cnn_train_loader)
    train_acc = test_cnn(cnn_train_loader)
    test_acc = test_cnn(cnn_test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')



0.6024131274131274
0.5473887814313347
Epoch: 001, Train Acc: 0.4098, Test Acc: 0.3793
0.6994208494208494
0.6905222437137332
Epoch: 002, Train Acc: 0.7432, Test Acc: 0.7241
0.6745173745173745
0.6179883945841393
Epoch: 003, Train Acc: 0.8087, Test Acc: 0.7759
0.7046332046332047
0.6315280464216635
Epoch: 004, Train Acc: 0.7869, Test Acc: 0.7414
0.7708494208494208
0.6663442940038684
Epoch: 005, Train Acc: 0.7705, Test Acc: 0.7414
0.7640926640926642
0.6450676982591876
Epoch: 006, Train Acc: 0.7596, Test Acc: 0.7069
0.7716216216216216
0.6344294003868471
Epoch: 007, Train Acc: 0.7541, Test Acc: 0.6897
0.7926640926640929
0.6450676982591876
Epoch: 008, Train Acc: 0.7705, Test Acc: 0.7069
0.7828185328185329
0.6315280464216635
Epoch: 009, Train Acc: 0.8251, Test Acc: 0.7414
0.797876447876448
0.620889748549323
Epoch: 010, Train Acc: 0.8142, Test Acc: 0.7241
0.7716216216216216
0.6450676982591876
Epoch: 011, Train Acc: 0.7541, Test Acc: 0.7069
0.7664092664092663
0.6934235976789168
Epoch: 012, Train 

In [39]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(6, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.dense = Linear(hidden_channels+1, hidden_channels+1)
        self.lin = Linear(hidden_channels+1, 2)

    def forward(self, x, edge_index,edge_weights,clinical_vars,batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index,edge_weights)
        x = x.relu()
        x = self.conv2(x, edge_index,edge_weights)
        x = x.relu()
        x = self.conv3(x, edge_index,edge_weights)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]
        

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x_cat = torch.cat((x, clinical_vars.unsqueeze(1)),1)
        x = self.lin(x_cat)
        return x

In [40]:
for d in train_loader:
    testd = d
#model = GCN(hidden_channels=64)
#model.double()
#l = model(testd.x, testd.edge_index,testd.edge_weights,testd.age, testd.batch)

NameError: name 'train_loader' is not defined

In [None]:
from IPython.display import Javascript

model = GCN(hidden_channels=64)
model.double()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

class_weights=compute_class_weight(class_weight = 'balanced',classes = np.unique(y_train),y = y_train.numpy())
class_weights=torch.tensor(class_weights,dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights,reduction='mean')

def train(loader):
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index,data.edge_weights,data.age, data.batch)  # Perform a single forward pass.
         loss = criterion(out.float(), data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     predicted = []
     true_label = []
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.edge_weights,data.age,data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         predicted.append(pred.detach().cpu().numpy().item())
         true_label.append(data.y.detach().cpu().numpy().item())
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
   
     test_auc = roc_auc_score(true_label, predicted)
     print(test_auc)
    
          
        
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 100):
    train(train_loader)
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')





In [None]:
conv1 = GCNConv(6, hidden_channels)
conv2 = GCNConv(hidden_channels, hidden_channels)
conv3 = GCNConv(hidden_channels, hidden_channels)
lin = Linear(hidden_channels+1, 2)

In [None]:
 # 1. Obtain node embeddings 
x = conv1(testd.x, testd.edge_index,testd.edge_weights)
x = x.relu()
x = conv2(x, testd.edge_index,testd.edge_weights)
x = x.relu()
x = conv3(x, testd.edge_index,testd.edge_weights)

# 2. Readout layer
x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

In [None]:
testd.x.shape

In [None]:
model(testd.x, testd.edge_index, testd.batch)

In [None]:
testtens = torch.squeeze(torch.tensor(testd.x))

In [None]:
model.double()

In [None]:
model = GCN(hidden_channels=64)

In [None]:
model.parameters

In [None]:
model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.

In [None]:
len(X_train)

In [None]:
l.sum()

In [None]:
t = fastConnectionDf(cnn_pthws)

In [None]:
t.values

In [None]:
#kegg_pathway_mask["pathway"] = kegg_pathway_mask["pathway"].str.replace(r'REACTOME_','')

In [None]:
kegg_pathway_mask["pathway"].value_counts()

In [None]:
p.set_index("TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY").join()

In [None]:
d = pd.DataFrame(l).set_index("Unnamed: 0").join(p.set_index("TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY"))

In [None]:
pd.DataFrame(l)

In [None]:
l