In [18]:
from CoxPASNet.coxpasnet.DataLoader import load_data, load_pathway
from CoxPASNet.coxpasnet.Train import trainCoxPASNet
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from CoxPASNet.coxpasnet.Survival_CostFunc_CIndex import R_set, neg_par_log_likelihood, c_index
from sksurv.metrics import concordance_index_censored
import torch.optim as optim
import time

from src.data_prep.torch_datasets import cpath_dataset
from src.models.variational_layers.variational_layer import HorseshoeLayer_out_mask
from src.models.variational_layers.linear_reparam import LinearReparam

from torch.nn.parameter import Parameter

from src.data_prep.load_data import load_cpath_data
from src.models.loss_functions.loss_functions import partial_ll_loss

from numpy.random import normal
from numpy import sin
import numpy as np



In [3]:
dtype = torch.FloatTensor
''' Net Settings'''
In_Nodes = 5567 ###number of genes
Pathway_Nodes = 860 ###number of pathways
Hidden_Nodes = 100 ###number of hidden nodes
Out_Nodes = 30 ###number of hidden nodes in the last hidden layer
''' Initialize '''
Initial_Learning_Rate = [0.03] #[0.03, 0.01, 0.001, 0.00075]
L2_Lambda = [0.01]  #[0.1, 0.01, 0.005, 0.001]
num_epochs = 10 #3000 ###for grid search
Num_EPOCHS = 15 #20000 ###for training
###sub-network setup
Dropout_Rate = [0.7,0.5]

In [4]:
''' load data and pathway '''
pathway_mask = load_pathway("../data/pathway_mask.csv", dtype)

x_train, ytime_train, yevent_train, age_train = load_data("../data/train.csv", dtype)
x_valid, ytime_valid, yevent_valid, age_valid = load_data("../data/validation.csv", dtype)
x_test, ytime_test, yevent_test, age_test = load_data("../data/test.csv", dtype)



### Testing Horseshoe Layer

In [None]:
x1 = normal(0,1,2000)
x2 = normal(0,1,2000)
x3 = normal(0,1,2000)
x4 = normal(0,1,2000)
features  = np.array([x1,x2,x3,x4])
features = np.transpose(features)
y = x1 + x2

x_df = pd.DataFrame(features, columns=['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4'])
y_df = pd.DataFrame(y, columns=["label"])

x_tens = torch.tensor(features)
y_tens = torch.tensor(y)

In [None]:
class Horseshoe_params:
    def __init__(self):
        self.horseshoe_scale = None
        self.global_cauchy_scale = 2.4
        self.weight_cauchy_scale = 15
        self.beta_rho_scale = -5.
        self.log_tau_mean = None
        self.log_tau_rho_scale = -5.
        self.bias_rho_scale = -5.
        self.log_v_mean = None
        self.log_v_rho_scale = -5.

hs_parameters = Horseshoe_params()

class hsreg(nn.Module):
    def __init__(self, In_Nodes, Pathway_Nodes,Hidden_Nodes,mask):
        super(hsreg, self).__init__()
        # activation
        self.tanh = nn.Tanh()
        # layers
        self.fc1 = HorseshoeLayer_out_mask(In_Nodes, Pathway_Nodes, hs_parameters, mask=mask)
        self.fc2 = LinearReparam(in_features=Pathway_Nodes,
                                out_features=Hidden_Nodes,
                                prior_means=np.full((Hidden_Nodes, Pathway_Nodes), 0),
                                prior_variances=np.full((Hidden_Nodes, Pathway_Nodes), 0.2),
                                posterior_mu_init=np.full((Hidden_Nodes, Pathway_Nodes), 0.5),
                                posterior_rho_init=np.full((Hidden_Nodes, Pathway_Nodes), -3.),
                                bias=False,
                                )

        # layers including kl_divergence
        self.kl_list = [self.fc1,self.fc2]

    def forward(self, x):
        x = self.tanh(self.fc1(x))
        x = self.fc2(x.squeeze(0),return_kl = False)
       

        return x

    def kl_divergence(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.kl_divergence()
        return KLD

In [None]:
i_mask = torch.ones(2,4)
#i_mask[1][0] = 0 
#i_mask[1][1] = 0 
#i_mask[0][2] = 0 
#i_mask[0][3] = 0 




i_mask

In [None]:
model = hsreg(4, 2, 1, mask=i_mask)
model.double()
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss = nn.MSELoss()


for epoch in range(1, 500):
                # measure data loading time
         
                output_ = []
                for mc_run in range(200):
                    output = model(x_tens.double())
                    output_.append(output)
                output = torch.mean(torch.stack(output_), dim=0)
                loss_crit_metric = loss(output[1],y_tens)
                scaled_kl = model.kl_divergence()  # should these things be batchsize / dataset?
                total_loss = loss_crit_metric + scaled_kl
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

                model.fc1.analytic_update()
                print(total_loss)
                

In [191]:
y_pred = model(x_tens)

In [192]:
loss(y_pred,y_tens)

tensor(2.1912, dtype=torch.float64, grad_fn=<MseLossBackward0>)

In [193]:
for name, param in model.fc1.named_parameters():
    print (name,param)

beta_mean Parameter containing:
tensor([[-0.0062, -0.0049, -0.1367, -0.0019],
        [-0.0024, -0.0638, -0.1001,  0.0562]], dtype=torch.float64,
       requires_grad=True)
beta_rho Parameter containing:
tensor([[-4.0109, -4.0109, -4.0109, -4.0109],
        [-4.0109, -4.0109, -4.0109, -4.0109]], dtype=torch.float64,
       requires_grad=True)
log_tau_mean Parameter containing:
tensor([3.4030, 2.0818], dtype=torch.float64, requires_grad=True)
log_tau_rho Parameter containing:
tensor([-4.0109, -4.0109], dtype=torch.float64, requires_grad=True)
bias_mean Parameter containing:
tensor([[-5.2339e-05, -7.0357e-04]], dtype=torch.float64, requires_grad=True)
bias_rho Parameter containing:
tensor([[-4.0109, -4.0109]], dtype=torch.float64, requires_grad=True)
log_v_mean Parameter containing:
tensor([1.7385], dtype=torch.float64, requires_grad=True)
log_v_rho Parameter containing:
tensor([[-4.0109]], dtype=torch.float64, requires_grad=True)


In [77]:
y_tens

tensor([ 5.2566e-01, -9.0565e-01,  1.7676e+00, -2.9764e-01,  1.2536e+00,
         1.4750e+00,  1.5163e+00, -1.0396e+00, -6.7744e-01, -8.2130e-02,
         1.2070e+00, -3.3554e-01, -8.1102e-01, -1.7456e+00,  9.4410e-01,
         2.9364e-01, -4.2814e-01, -7.7999e-01,  1.0572e+00,  1.8253e-01,
        -6.4868e-01, -1.1367e-01,  8.8010e-01,  1.7157e+00,  2.0295e-01,
         9.9686e-01,  8.2852e-01, -1.5172e+00,  6.9478e-01,  2.7035e-01,
         1.6998e-02,  1.0618e+00,  9.3060e-02,  1.4571e+00,  1.7141e-02,
         1.6347e+00, -1.8463e+00,  1.9802e+00, -5.9827e-01, -1.4648e+00,
         6.7629e-01, -2.8161e-01, -1.3147e+00, -3.1017e-01,  1.7263e+00,
        -1.6633e+00, -1.9120e+00,  1.2116e+00,  1.6744e-01, -5.0710e-01,
        -7.0748e-01,  2.5065e-01, -7.1036e-01, -1.1587e+00,  1.1562e+00,
        -3.0361e-01,  1.5003e+00,  1.4350e+00, -3.6483e-01,  1.1510e+00,
        -3.7250e-01, -9.3640e-01,  7.0712e-01,  6.1646e-01,  9.3286e-01,
        -4.8368e-01, -1.4867e-02,  7.2249e-01,  4.7

In [2]:
cpath_train_loader,cpath_test_loader,cpath_val_loader,pathway_mask = load_cpath_data(cuda = False)

In [10]:
class Horseshoe_params:
    def __init__(self):
        self.horseshoe_scale = None
        self.global_cauchy_scale = 1.
        self.weight_cauchy_scale = 1.
        self.beta_rho_scale = -5.
        self.log_tau_mean = None
        self.log_tau_rho_scale = -5.
        self.bias_rho_scale = -5.
        self.log_v_mean = None
        self.log_v_rho_scale = -5.

hs_parameters = Horseshoe_params()

class hs_nn(nn.Module):
    def __init__(self, In_Nodes, Pathway_Nodes,Hidden_Nodes,mask):
        super(hs_nn, self).__init__()
        # activation
        self.tanh = nn.Tanh()
        # layers
        self.fc1 = HorseshoeLayer_out_mask(In_Nodes, Pathway_Nodes, hs_parameters, mask=mask)
        self.fc2 = LinearReparam(in_features=Pathway_Nodes,
                                out_features=Hidden_Nodes,
                                prior_means=np.full((Hidden_Nodes, Pathway_Nodes), 0),
                                prior_variances=np.full((Hidden_Nodes, Pathway_Nodes), 0.2),
                                posterior_mu_init=np.full((Hidden_Nodes, Pathway_Nodes), 0.5),
                                posterior_rho_init=np.full((Hidden_Nodes, Pathway_Nodes), -3.),
                                bias=False,
                                )

        # layers including kl_divergence
        self.kl_list = [self.fc1]

    def forward(self, x):
        x = self.tanh(self.fc1(x))
        x = self.fc2(x.squeeze(0),return_kl = False)
       

        return x

    def kl_divergence(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.kl_divergence()
        return KLD

In [43]:
pathway_mask.shape

torch.Size([860, 5567])

In [11]:

model = hs_nn(5567, 860, 1, mask=pathway_mask)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for name, param in model.named_parameters():
    print (name)

fc1.beta_mean
fc1.beta_rho
fc1.log_tau_mean
fc1.log_tau_rho
fc1.bias_mean
fc1.bias_rho
fc1.log_v_mean
fc1.log_v_rho
fc2.mu_weight
fc2.rho_weight


In [12]:


for epoch in range(1, 3):
    for i, (input, target) in enumerate(cpath_train_loader):
                # measure data loading time


                tb = target["tb"].cpu()
                e = target["e"].cpu()
                input_var = input["X"].cpu()
                clinical_var = input["clinical_vars"].cpu()

                output_ = []
                for mc_run in range(100):
                    output = model(input_var)
                    output_.append(output)
                output = torch.mean(torch.stack(output_), dim=0)
                loss_crit_metric = partial_ll_loss(output.reshape(-1).cpu(), tb.reshape(-1).cpu(), e.reshape(-1).cpu())
                scaled_loss_crit_metric = loss_crit_metric * (len(cpath_train_loader.dataset) / cpath_train_loader.batch_size)  # this might be the other way round
                scaled_kl = model.kl_divergence() / cpath_train_loader.batch_size  # should these things be batchsize / dataset?
                #loss = scaled_loss_crit_metric + scaled_kl
                loss = scaled_kl
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                model.fc1.analytic_update()

                conc_metric = concordance_index_censored(e.detach().cpu().numpy().astype(bool).reshape(-1),
                                                         tb.detach().cpu().numpy().reshape(-1),
                                                         output.reshape(-1).detach().cpu().numpy())[0]
                print(conc_metric)

0.4754662282030202
0.4870624378001817


In [32]:
output[1]

tensor(4.2190, dtype=torch.float64, grad_fn=<MeanBackward0>)

### Testing Group Dropout

In [21]:
class LinearGroupNJ_Masked(Module):
    """Fully Connected Group Normal-Jeffrey's layer (aka Group Variational Dropout).
    References:
    [1] Kingma, Diederik P., Tim Salimans, and Max Welling. "Variational dropout and the local reparameterization trick." NIPS (2015).
    [2] Molchanov, Dmitry, Arsenii Ashukha, and Dmitry Vetrov. "Variational Dropout Sparsifies Deep Neural Networks." ICML (2017).
    [3] Louizos, Christos, Karen Ullrich, and Max Welling. "Bayesian Compression for Deep Learning." NIPS (2017).
    """

    def __init__(self, in_features, out_features, cuda=False, init_weight=None, init_bias=None, clip_var=None):

        super(LinearGroupNJ, self).__init__()
        self.cuda = cuda
        self.in_features = in_features
        self.out_features = out_features
        self.clip_var = clip_var
        self.deterministic = False  # flag is used for compressed inference
        # trainable params according to Eq.(6)
        # dropout params
        self.z_mu = Parameter(torch.Tensor(in_features))
        self.z_logvar = Parameter(torch.Tensor(in_features))  # = z_mu^2 * alpha
        # weight params
        self.weight_mu = Parameter(torch.Tensor(out_features, in_features))
        self.weight_logvar = Parameter(torch.Tensor(out_features, in_features))

        self.bias_mu = Parameter(torch.Tensor(out_features))
        self.bias_logvar = Parameter(torch.Tensor(out_features))

        # init params either random or with pretrained net
        self.reset_parameters(init_weight, init_bias)

        # activations for kl
        self.sigmoid = nn.Sigmoid()
        self.softplus = nn.Softplus()

        # numerical stability param
        self.epsilon = 1e-8

    def reset_parameters(self, init_weight, init_bias):
        # init means
        stdv = 1. / math.sqrt(self.weight_mu.size(1))

        self.z_mu.data.normal_(1, 1e-2)

        if init_weight is not None:
            self.weight_mu.data = self.mask * torch.Tensor(init_weight)
        else:
            self.weight_mu.data.normal_(0, stdv)
            self.weight_mu.data = self.mask * self.weight_mu 

        if init_bias is not None:
            self.bias_mu.data = torch.Tensor(init_bias)
        else:
            self.bias_mu.data.fill_(0)

        # init logvars
        self.z_logvar.data.normal_(-9, 1e-2)
        self.weight_logvar.data.normal_(-9, 1e-2)
        self.bias_logvar.data.normal_(-9, 1e-2)

    def clip_variances(self):
        if self.clip_var:
            self.weight_logvar.data.clamp_(max=math.log(self.clip_var))
            self.bias_logvar.data.clamp_(max=math.log(self.clip_var))

    def get_log_dropout_rates(self):
        log_alpha = self.z_logvar - torch.log(self.z_mu.pow(2) + self.epsilon)
        return log_alpha

    def compute_posterior_params(self):
        weight_var, z_var = self.weight_logvar.exp(), self.z_logvar.exp()
        self.post_weight_var = self.z_mu.pow(2) * weight_var + z_var * self.weight_mu.pow(2) + z_var * weight_var
        self.post_weight_mu = self.weight_mu * self.z_mu
        return self.post_weight_mu, self.post_weight_var

    def forward(self, x):
        if self.deterministic:
            assert self.training == False, "Flag deterministic is True. This should not be used in training."
            return F.linear(x, self.post_weight_mu, self.bias_mu)

        batch_size = x.size()[0]
        # compute z  
        # note that we reparametrise according to [2] Eq. (11) (not [1])
        z = reparametrize(self.z_mu.repeat(batch_size, 1), self.z_logvar.repeat(batch_size, 1), sampling=self.training,
                          cuda=self.cuda)

        # apply local reparametrisation trick see [1] Eq. (6)
        # to the parametrisation given in [3] Eq. (6)
        xz = x * z
        mu_activations = F.linear(xz, self.weight_mu, self.bias_mu)
        var_activations = F.linear(xz.pow(2), self.weight_logvar.exp(), self.bias_logvar.exp())

        return reparametrize(mu_activations, var_activations.log(), sampling=self.training, cuda=self.cuda)

    def kl_divergence(self):
        # KL(q(z)||p(z))
        # we use the kl divergence approximation given by [2] Eq.(14)
        k1, k2, k3 = 0.63576, 1.87320, 1.48695
        log_alpha = self.get_log_dropout_rates()
        KLD = -torch.sum(k1 * self.sigmoid(k2 + k3 * log_alpha) - 0.5 * self.softplus(-log_alpha) - k1)

        # KL(q(w|z)||p(w|z))
        # we use the kl divergence given by [3] Eq.(8)
        KLD_element = -0.5 * self.weight_logvar + 0.5 * (self.weight_logvar.exp() + self.weight_mu.pow(2)) - 0.5
        KLD += torch.sum(KLD_element)

        # KL bias
        KLD_element = -0.5 * self.bias_logvar + 0.5 * (self.bias_logvar.exp() + self.bias_mu.pow(2)) - 0.5
        KLD += torch.sum(KLD_element)

        return KLD

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [53]:
test_tens = Parameter(torch.Tensor(860, 5567))
test_tens.data.normal_(0, 0.2)

tensor([[-0.3019,  0.3189, -0.2703,  ..., -0.0904, -0.2369, -0.0382],
        [-0.1145,  0.3655,  0.1703,  ..., -0.1354,  0.2717, -0.0121],
        [-0.1308,  0.1574,  0.1233,  ...,  0.0948, -0.1653,  0.0116],
        ...,
        [ 0.2184,  0.3030,  0.0014,  ..., -0.0902,  0.0674,  0.0728],
        [-0.2955,  0.2499, -0.2631,  ...,  0.2403,  0.0836, -0.0193],
        [-0.1089, -0.2964,  0.0898,  ..., -0.1847,  0.1355,  0.1329]])

In [55]:
test_tens.shape

torch.Size([860, 5567])

In [43]:
)

RuntimeError: The size of tensor a (860) must match the size of tensor b (5567) at non-singleton dimension 1

In [45]:
l = pathway_mask.transpose(0,1)

In [49]:
l*test_tens

RuntimeError: The size of tensor a (860) must match the size of tensor b (5567) at non-singleton dimension 1

In [56]:
w = pathway_mask *test_tens

In [50]:
l.shape

torch.Size([5567, 860])

In [51]:
pathway_mask.shape

torch.Size([860, 5567])