In [2]:
import os
if "x_perceiver" not in os.listdir():
    os.chdir("/home/kh701/pycharm/healnet/")
import torch
from torch import nn
import multiprocessing
import torchvision
import numpy as np
import torchvision.transforms as transforms
import einops
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from healnet.models.explainer import Explainer
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

from healnet.utils import Config, flatten_config
from healnet.etl import TCGADataset
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

    
%reload_ext autoreload
%autoreload 2

## Import data

In [3]:
# get dataloaders
config = Config("config/main_gpu.yml").read()
config = flatten_config(config) # TODO - refactor to other 

blca = TCGADataset(
    dataset="blca", 
    config=config, 
    level=2, 
    sources=["omic"]
)

brca = TCGADataset(
    dataset="brca", 
    config=config, 
    level=2, 
    sources=["omic"]
)



Filled 0 missing values with mean
Missing values per feature: 
 Series([], dtype: int64)
Slides available: 436
Omic available: 437
Overlap: 436
Filtering out 1 samples for which there are no omic data available
Dataloader initialised for blca dataset
Dataset: BLCA
Molecular data shape: (436, 2191)
Molecular/Slide match: 436/436
Slide level count: 4
Slide level dimensions: ((79968, 79653), (19992, 19913), (4998, 4978), (2499, 2489))
Slide resize dimensions: w: 1024, h: 1024
Sources selected: ['omic']
Censored share: 0.539
Survival_bin_sizes: {0: 72, 1: 83, 2: 109, 3: 172}
Filled 0 missing values with mean
Missing values per feature: 
 Series([], dtype: int64)
Slides available: 1019
Omic available: 1022
Overlap: 1019
Filtering out 3 samples for which there are no omic data available
Dataloader initialised for brca dataset
Dataset: BRCA
Molecular data shape: (1019, 2922)
Molecular/Slide match: 1019/1019
Slide level count: 3
Slide level dimensions: ((35855, 34985), (8963, 8746), (2240, 218

In [4]:
# get tabular data
blca_loader = DataLoader(
    blca, 
    batch_size=1, 
    shuffle=True, 
    num_workers=multiprocessing.cpu_count()-1
)
[sample], censorship, event_time, y_disc = next(iter(blca_loader))

In [5]:
sample.shape

torch.Size([1, 1, 2183])

## Tabular self-supervised pre-training

To start with, we want to build and encoder-decoder model which trains a cross-attention unit as the encoder, which can later on be deployed in the iterative model. We then want to benchmark the performance with pan-cancer pre-training vs. without pre-training. 

In [6]:
from healnet.models.healnet import Attention, PreNorm

class AttentionEncoder(nn.Module): 
    """
    Simple encoder that uses fourier encoding, pre-norm and cross-attention to encode the input features into a latent array 
    of size (num_latents x latent_dim). Takes in both the input tensors as well as a randomly initialised latent 
    array as the input. 
    """
    def __init__(self, 
                 input_channels: int,
                 latent: torch.Tensor, 
                 input_axis: int = 1, 
                 attn_dropout: float = 0.1,
                 num_heads: int = 4, 
                 num_freq_bands: int=8, 
                 ):    
        super().__init__()
        
        self.input_channels = input_channels
        self.input_axis = input_axis
        self.attn_dropout = attn_dropout
        self.num_heads = num_heads
        
        
        # fourier_channels = (input_axis * ((num_freq_bands * 2) + 1))
        # input_dim = fourier_channels + input_channels
        input_dim = input_channels
                
        latent_dim = latent.shape[-1] # required for PreNorm layer
        # simple single attention unit
        enc = PreNorm(latent_dim, Attention(latent_dim, input_dim, heads=num_heads, dim_head=num_heads, dropout=attn_dropout), context_dim=input_dim)
        
        self.layers = nn.ModuleList([enc])
        
    def forward(self, latent: torch.Tensor, context: torch.Tensor):
        """
        Note: context is the data, x is the latent
        Args:
            latent: 
            context: 

        Returns:

        """
        for layer in self.layers:
            latent = layer(x=latent, context=context)
        return latent


The decoder often needs to be different depending on the modality, so let's implement modality-specific decoders while trying to have a relatively general-purpose encoder that we can plug into the pipeline.

Note that we may change this later down the line. 

In [40]:
class TabularDecoder(nn.Module):
    """
    Decoder suited for tabular data. We use the following: 
    - Skip connections: faster and more stable training
    - Batch normalisation: stabilises the activations and speeds up training
    - Activation: Output layer to map back to output dimensions, corresponding to the original data dims
    Tries to reconstruct the original input given the latent
    """
    def __init__(self, latent_dim: int, num_latents: int, output_dim: int, method: str = "dense"):
        super(TabularDecoder, self).__init__()
        assert method in ["dense", "conv"], "Decoder type not recognised"
        layers = []
        
        if method == "dense": 
            
            # flatten latent array (batch, num_latents, latent_dim) -> (batch, num_latents * latent_dim)
            layers.extend([nn.Flatten()]) 
            out_dims = [256, 512, 1024] # may refactor as hyperparameter later
            
            in_dim = latent_dim * num_latents
            for idx, out_dim in enumerate(out_dims):
                
                layers.extend([
                    nn.Linear(in_features=in_dim, out_features=out_dim), 
                    nn.LeakyReLU(), 
                    # nn.BatchNorm1d(hidden_dim, track_running_stats=True), 
                    nn.Dropout(0.5)
                ])
                
                in_dim = out_dim # update for next layer
            
            # final layer to reconstruct output
            layers.append(nn.Linear(in_dim, output_dim))
        
        self.decode = nn.Sequential(*layers)
        print(self.decode)
        
    def forward(self, latent: torch.Tensor):
        return self.decode(latent)
    
    
        
    

Finally, putting it all together in the encoder-decoder model


In [42]:
from typing import *

class TabPretrainer(nn.Module): 
    """
    Encoder-decoder model for pre-training tabular data.
    # TODO - refactor abstract base class for initialisations 
    """
    def __init__(self,
                 sample: torch.Tensor,
                 # input_channels: int,
                 latent_shape: List[int],
                 input_axis: int = 1,
                 attn_dropout: float = 0.1,
                 num_heads: int = 4,
                 num_freq_bands: int=8,
                 ):
        super().__init__()
        self.input_channels = sample.shape[-1]
        self.input_axis = input_axis
        self.num_latents, self.latent_dim = latent_shape
        self.attn_dropout = attn_dropout
        self.num_heads = num_heads
        self.num_freq_bands = num_freq_bands
        
        
        # randomly initialise latent
        self.latent = nn.Parameter(torch.randn(self.num_latents, self.latent_dim))
        
        # encoder
        self.encoder = AttentionEncoder(
            input_channels=self.input_channels, 
            latent=self.latent, 
            input_axis=self.input_axis, 
            attn_dropout=attn_dropout, 
            num_heads=num_heads, 
            num_freq_bands=num_freq_bands
        )
        
        # decoder
        self.decoder = TabularDecoder(
            latent_dim=self.latent_dim, 
            num_latents=self.num_latents,
            output_dim=self.input_channels, 
            method="dense"
        )
        
    def forward(self, x: torch.Tensor):
        # get batch dim
        b = x.shape[0]
        
        # expand latent to batch size
        if len(self.latent.shape) == 2:
            self.latent = nn.Parameter(einops.repeat(self.latent, "n d -> b n d", b=b))
        
        # encode
        self.latent.data = self.encoder(latent=self.latent, context=x).data + self.latent.data
        # print(self.latent.shape)
        # decode, reconstructed x
        rec_x = self.decoder(self.latent)
        return rec_x
    
    def get_latent(self):
        return self.latent

Next, we need to think about tabular loss functions. Here, we can explore both reconstruction losses and contrastive losses. 

In [19]:
class TabularLoss(nn.Module):
    """
    Reconstruction loss functions for tabular data. We use two types which are commonly used with continuous data: 
    - Mean squared error
    - Constrastive loss, measured as cosine distance between the original and reconstructed data
    We seek to minimise both objectives.
    """
    def __init__(self,
                 method: str = "mse",
                 reduction: str = "mean",
                 ):
        super().__init__()
        assert method in ["mse", "contrastive"], "Loss type not recognised"
        self.loss_type = method
        self.reduction = reduction
        
        if method == "mse":
            self.loss = nn.MSELoss(reduction=reduction)
        elif method == "contrastive":
            self.loss = nn.CosineEmbeddingLoss(reduction=reduction)
            
    def __call__(self, **kwargs):
        return self.loss(**kwargs)
    

Finally, we write a pre-training loop that we can use for pre-training across cancer sites. 

In [10]:
# get overlap between omic columns
col1 = blca.omic_df.columns
col2 = brca.omic_df.columns
print(len(col1), len(col2), len(set(col1).intersection(col2)))

2191 2922 1758


In [11]:
blca.features

# types of features
# continuous: *_rnaseq, age
# categorical: *_mut, *_cnv 

Unnamed: 0,age,is_female,AAK1_rnaseq,AATK_rnaseq,ABCB1_rnaseq,ABCG2_rnaseq,ABI1_rnaseq,ABL1_rnaseq,ABL2_rnaseq,ACE_rnaseq,ACKR1_rnaseq,ACKR3_rnaseq,ACSL3_rnaseq,ACSL6_rnaseq,ACVR1B_rnaseq,ACVR1C_rnaseq,ACVR1_rnaseq,ACVR2A_rnaseq,ACVR2B_rnaseq,ACVRL1_rnaseq,ADAM10_rnaseq,ADAM17_rnaseq,ADCK1_rnaseq,ADCK2_rnaseq,ADCK5_rnaseq,...,UTRN_mut,VCAN_mut,VPS13B_mut,VPS13C_mut,VPS13D_mut,WDFY3_mut,WNK1_mut,XIRP2_mut,XIST_mut,ZDBF2_mut,ZFHX3_mut,ZFHX4_mut,ZFP36L1_mut,ZFYVE26_mut,ZFYVE9_mut,ZNF236_mut,ZNF292_mut,ZNF423_mut,ZNF521_mut,ZNF536_mut,ZNF626_mut,ZNF804A_mut,ZNF91_mut,ZZEF1_mut,RAS_mut
0,63,0,-0.6734,-0.4660,0.8401,-0.2222,2.2318,-0.8171,0.8051,-0.1250,-0.2976,1.2538,-0.3237,-0.1429,0.5258,-0.0748,-0.2048,-0.3004,0.2998,-0.6414,1.2149,1.1643,0.3720,-0.2883,-0.1974,...,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1
1,66,0,2.4277,-0.3853,0.1104,-0.2183,-0.0952,-0.6255,0.0970,-0.4911,-0.1779,-0.4134,-0.1501,-0.1576,-0.3597,0.4555,-1.0758,0.3252,1.7109,-0.5763,2.5860,1.5608,-0.6966,0.1801,-0.3164,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,66,0,2.4277,-0.3853,0.1104,-0.2183,-0.0952,-0.6255,0.0970,-0.4911,-0.1779,-0.4134,-0.1501,-0.1576,-0.3597,0.4555,-1.0758,0.3252,1.7109,-0.5763,2.5860,1.5608,-0.6966,0.1801,-0.3164,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,69,0,1.1340,-0.4110,0.1572,0.0752,0.0566,-1.3448,-0.3876,1.0335,-0.3683,-0.3736,-0.3294,-0.1807,0.8215,-0.7729,-0.7901,-0.9142,-0.2716,-0.2526,1.2477,0.8202,-0.1294,0.7846,-0.2564,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,59,1,-0.5311,0.1418,-0.0998,-0.2493,-0.6956,-0.3696,-0.1672,-0.7257,-0.3450,-0.1465,0.2727,0.3077,1.3352,2.3315,0.2386,1.6382,-0.2124,-0.7441,0.9661,0.6493,-1.2289,-0.0261,-0.1046,...,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,71,0,2.8284,0.9219,-0.4711,-0.1561,-0.7569,-0.0186,2.2843,1.9383,-0.3507,-0.4448,-0.4321,-0.1794,-1.0555,-0.6594,1.4145,-1.0607,-0.2349,0.9236,-0.8023,1.6165,-1.0423,-1.2719,-0.7105,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
433,61,1,0.9422,0.2662,-0.5276,-0.3078,-0.5164,-0.3653,1.6644,0.4936,-0.3722,-0.5500,-0.3539,-0.1324,-1.1019,-0.7735,0.0458,-1.1459,-0.6342,0.4675,-1.0416,-0.9597,-1.3496,0.4849,1.6007,...,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
434,60,1,-0.3000,-0.5301,-0.5559,0.4720,0.0597,-0.1817,3.6724,0.1842,-0.3892,-0.6284,0.2315,-0.1468,-1.1972,-0.5964,-0.2927,-0.9101,-0.6431,-0.3869,-0.5002,-0.0913,-0.6892,-0.6854,1.1884,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
435,62,1,3.2208,-0.2592,-0.8130,-0.1423,-1.2421,-1.4423,-0.6631,-0.9650,-0.3868,1.9972,0.0008,-0.1583,-1.4766,-0.7790,-1.2992,-1.0639,0.5145,-0.8358,-0.1595,-0.0378,-0.3054,0.7819,1.9629,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1


In [None]:
from tqdm import tqdm

torch.set_printoptions(sci_mode=False)


def pretrain_loop(
        data: TCGADataset,
        batch_size: int, 
    ):
    
    loader = DataLoader(
        data, 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=multiprocessing.cpu_count()-1
    )
    [omic_sample], _, _, _ = next(iter(loader))   
    
    
    model = TabPretrainer(
        sample = omic_sample,
        # input_channels=omic_sample.shape[-1], 
        input_axis=1, 
        latent_shape=[256, 32], 
        attn_dropout=0.1, 
        num_heads=4,
        num_freq_bands=8
    )
    
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    
    loss_method = "mse"
    loss_fn = TabularLoss(method=loss_method)
    
    for epoch in tqdm(range(10)):
        for idx, batch in enumerate(loader):
            [omic], censorship, event_time, y_disc = batch
            rec_omic = model(omic)
            # print(omic.shape)
            if loss_method == "contrastive":
                # need to pass in larges for contrastive loss
                # using torch.ones to ensure that omic and rec_omic are learned as similar representations
                # note that this is a slight repurposing of the contrastive loss function
                # with this, the loss is just 1-cos(omic, rec_omic)
                loss = loss_fn(input1=omic, input2=rec_omic, target=torch.ones(omic.shape[0]))
            elif loss_method == "mse": 
                loss = loss_fn(input=omic, target=rec_omic)
            # loss = loss_fn(omic, rec_omic)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            # print every 10th batch
            if idx % 100 == 0:
                print(loss)
                # print(omic)
                # print(rec_omic)
            
    
pretrain_loop(
    data=blca, 
    batch_size=1)
    

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=8192, out_features=256, bias=True)
  (2): LeakyReLU(negative_slope=0.01)
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=256, out_features=512, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Dropout(p=0.5, inplace=False)
  (7): Linear(in_features=512, out_features=1024, bias=True)
  (8): LeakyReLU(negative_slope=0.01)
  (9): Dropout(p=0.5, inplace=False)
  (10): Linear(in_features=1024, out_features=2183, bias=True)
)


  0%|          | 0/10 [00:00<?, ?it/s]

tensor(1.8431, grad_fn=<MseLossBackward0>)
tensor(2.5584, grad_fn=<MseLossBackward0>)
tensor(2.7047, grad_fn=<MseLossBackward0>)
tensor(2.8956, grad_fn=<MseLossBackward0>)
tensor(5.7764, grad_fn=<MseLossBackward0>)


 10%|█         | 1/10 [00:07<01:09,  7.69s/it]

tensor(7.2884, grad_fn=<MseLossBackward0>)
tensor(20.0209, grad_fn=<MseLossBackward0>)
tensor(2.6867, grad_fn=<MseLossBackward0>)
tensor(2.2701, grad_fn=<MseLossBackward0>)
tensor(1364.9028, grad_fn=<MseLossBackward0>)


 20%|██        | 2/10 [00:15<01:00,  7.57s/it]

tensor(1064.7476, grad_fn=<MseLossBackward0>)
tensor(84.7511, grad_fn=<MseLossBackward0>)
tensor(36.5887, grad_fn=<MseLossBackward0>)
tensor(35.9273, grad_fn=<MseLossBackward0>)
tensor(40.3912, grad_fn=<MseLossBackward0>)


 30%|███       | 3/10 [00:22<00:53,  7.68s/it]

tensor(38.5193, grad_fn=<MseLossBackward0>)
tensor(34.1691, grad_fn=<MseLossBackward0>)
tensor(46.3456, grad_fn=<MseLossBackward0>)
tensor(27.7616, grad_fn=<MseLossBackward0>)
tensor(42.9793, grad_fn=<MseLossBackward0>)


 40%|████      | 4/10 [00:30<00:46,  7.68s/it]

tensor(23.9020, grad_fn=<MseLossBackward0>)
tensor(955563.5625, grad_fn=<MseLossBackward0>)
tensor(5759.6733, grad_fn=<MseLossBackward0>)
tensor(8378.5557, grad_fn=<MseLossBackward0>)
tensor(2826.3430, grad_fn=<MseLossBackward0>)


 50%|█████     | 5/10 [00:38<00:38,  7.67s/it]

tensor(3253.9719, grad_fn=<MseLossBackward0>)
