In [1]:
%%capture
!pip install scprep
!pip install anndata
!pip install scanpy

In [2]:
import numpy as np
import pandas as pd
import anndata
import scprep
import scanpy as sc
import sklearn
from sklearn.model_selection import train_test_split
import tempfile
import os
import sys
import scipy
from scipy import sparse

import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

import load_raw
import normalize_tools as nm
import metrics

In [201]:
#set up all hyper-parameters
hyper = {
    "nEpochs":60,
    "dimRNA":3633,
    "dimATAC":4403,
    "n_hidden":1024,
    "layer_sizes":[1024, 1024, 1024, 256, 256],
    "nz":128,
    "batchSize":128,
    "lr":1e-3,
    "lamb_kl":1e-9,
    "lamb_anc":1e-9,
    "clip_grad":0.1,
    "weightDirName": './checkpoint/',
}

In [189]:
torch.cuda.is_available()

True

# **try out with scicar cell lines dataset**

**1. URLs for raw data**

In [5]:
rna_data, atac_data, rna_cells, atac_cells, rna_genes, atac_genes = load_raw.load_raw_cell_lines()

In [41]:
scicar_data, joint_index, keep_cells_idx = load_raw.merge_data(rna_data, atac_data, rna_cells, atac_cells, rna_genes, atac_genes)
#rna_df, atac_df = ann2df(scica|r_data)|

In [42]:
#tryout log cpm scicar_data
nm.log_cpm(scicar_data)
nm.log_cpm(scicar_data, obsm = "mode2", obs = "mode2_obs", var = "mode2_var")
nm.hvg_by_sc(scicar_data, proportion = 0.06)
nm.hvg_by_sc(scicar_data, obsm = "mode2", obs = "mode2_obs", 
             var = "mode2_var", proportion = 0.03)

In [43]:
scicar_data.uns["mode2_obs"] = np.array(scicar_data.uns["mode2_obs"][0])

In [44]:
scicar_data.uns["mode2_var"] = np.array(scicar_data.uns["mode2_var"][0])

In [45]:
scicar_data.uns = {"mode2_obs": scicar_data.uns["mode2_obs"], "mode2_var": scicar_data.uns["mode2_var"]}

In [46]:
train_data, test_data = load_raw.train_test_split(scicar_data)

In [47]:
test_data.X

<1422x3633 sparse matrix of type '<class 'numpy.float32'>'
	with 68399 stored elements in Compressed Sparse Row format>

In [48]:
train_data.obsm["mode2"]

<3317x4403 sparse matrix of type '<class 'numpy.float32'>'
	with 15934 stored elements in Compressed Sparse Row format>

# **define pytorch datasets for RNA and ATAC**

In [52]:
class Merge_Dataset(Dataset):
    def __init__(self, adata):
        self.rna_data, self.atac_data = self._load_merge_data(adata)

    def __len__(self):
        #assert(len(self.rna_data) == len(self.atac_data))
        return len(self.atac_data)
  
    def __getitem__(self, idx):
        rna_sample = self.rna_data.values[idx]
        atac_sample = self.atac_data.values[idx]
        #return a tensor that for a single observation
        return {"rna_tensor": torch.from_numpy(rna_sample).float(), "atac_tensor": torch.from_numpy(atac_sample).float()}
  
    def _load_merge_data(self, adata):
        rna_df = pd.DataFrame(data = adata.X.toarray(), index = np.array(adata.obs.index), columns = np.array(adata.var.index))
        atac_df = pd.DataFrame(data = adata.obsm["mode2"].toarray(), index = np.array(adata.uns["mode2_obs"]), columns = np.array(adata.uns["mode2_var"]))
        return rna_df, atac_df

# **define basic models(autoencoders) for learning latent space**

In [202]:
class FC_VAE(nn.Module):
    def __init__(self, n_input, nz, n_hidden=hyper["n_hidden"], layer_sizes=hyper["layer_sizes"]):
        super(FC_VAE, self).__init__()
        self.n_input = n_input
        self.nz = nz
        self.n_hidden = n_hidden
        self.layer_sizes = layer_sizes

        self.encoder_layers = []

        self.encoder_layers.append(nn.Linear(n_input, self.layer_sizes[0]))
        self.encoder_layers.append(nn.LeakyReLU(inplace=True))
        self.encoder_layers.append(nn.BatchNorm1d(self.layer_sizes[0]))

        for layer_idx in range(len(layer_sizes)-1):
            if layer_idx == len(layer_sizes) - 2:
                self.encoder_layers.append(nn.Linear(self.layer_sizes[layer_idx], self.layer_sizes[layer_idx+1]))
            else:
                self.encoder_layers.append(nn.Linear(self.layer_sizes[layer_idx], self.layer_sizes[layer_idx+1]))
                self.encoder_layers.append(nn.BatchNorm1d(self.layer_sizes[layer_idx+1]))
                self.encoder_layers.append(nn.LeakyReLU(inplace=True))

        self.encoder = nn.Sequential(
            *self.encoder_layers
        )
        self.fc1 = nn.Linear(self.layer_sizes[-1], nz)
        self.fc2 = nn.Linear(self.layer_sizes[-1], nz)

        self.decoder_layers = []
        self.decoder_layers.append(nn.Linear(nz, self.layer_sizes[-1]))
        self.decoder_layers.append(nn.LeakyReLU(inplace=True))
        self.decoder_layers.append(nn.BatchNorm1d(self.layer_sizes[-1]))

        for layer_idx in range(len(self.layer_sizes)-1, 0, -1):
            self.decoder_layers.append(nn.Linear(self.layer_sizes[layer_idx], self.layer_sizes[layer_idx-1]))
            self.decoder_layers.append(nn.LeakyReLU(inplace=True))
            self.decoder_layers.append(nn.BatchNorm1d(self.layer_sizes[layer_idx-1]))

        self.decoder_layers.append(nn.Linear(self.layer_sizes[0], self.n_input))

        self.decoder = nn.Sequential(
            *self.decoder_layers
        )
    def encode(self, x):
        h = self.encoder(x)
        return self.fc1(h), self.fc2(h)

    def reparametrize(self, mu, logvar):
        #calculate std from log(var)
        std = logvar.mul(0.5).exp_()
        if torch.cuda.is_available():
            eps = torch.cuda.FloatTensor(std.size()).normal_()
        else:
            eps = torch.FloatTensor(std.size()).normal_()
        eps = Variable(eps)
        return eps.mul(std).add_(mu)
    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        res = self.decode(z)
        return res, z, mu, logvar

    def get_latent_var(self, x):
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        return z
    
    def generate(self, z):
        return self.decode(z)

# **train VAE model based on reconstruction, KL divergence, and anchor loss**

In [203]:
#load dataset and split train and test data
def get_data_loaders(train_data, test_data):
    train_set = Merge_Dataset(train_data)
    test_set = Merge_Dataset(test_data)
    #load data loader
    train_loader = DataLoader(train_set, batch_size=hyper["batchSize"], drop_last=False, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=test_data.shape[0], drop_last=False, shuffle=False)
    return train_loader, test_loader

train_loader, test_loader = get_data_loaders(train_data=train_data, test_data=sub_test_data)

In [204]:
#load basic models
netRNA = FC_VAE(n_input=hyper["dimRNA"], nz=hyper["nz"], layer_sizes=hyper["layer_sizes"])
netATAC = FC_VAE(n_input=hyper["dimATAC"], nz=hyper["nz"], layer_sizes=hyper["layer_sizes"])

In [205]:
#use GPU
if torch.cuda.is_available():
    print("using GPU")
    netRNA.cuda()
    netATAC.cuda()
    device = "cuda"
else:
    device = "cpu"
#setup optimizers for two nets
opt_netRNA = optim.Adam(list(netRNA.parameters()), lr=hyper["lr"])
opt_netATAC = optim.Adam(list(netATAC.parameters()), lr=hyper["lr"])

using GPU


In [206]:
#set up loss function
def basic_loss(recon_x, x, mu, logvar, lamb1):
    MSE = nn.MSELoss()
    lloss = MSE(recon_x, x)
    #KL divergence
    KL_loss = -0.5*torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    lloss = lloss + lamb1*KL_loss
    return lloss

#anchor loss for minimizing distance between paired observation
def anchor_loss(embed_rna, embed_atac):
    L1 = nn.L1Loss()
    anc_loss = L1(embed_rna, embed_atac)
    return anc_loss

In [207]:
def knn_criteria(rna_inputs, atac_inputs, rna_outputs, atac_outputs, proportion_neighbors=0.1, n_svd=100):
    n_svd = min([n_svd, min(rna_inputs.shape)-1])
    n_neighbors = int(np.ceil(proportion_neighbors*rna_inputs.shape[0]))
    X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(rna_inputs)
    _, indices_true = (
        sklearn.neighbors.NearestNeighbors(n_neighbors = n_neighbors).fit(rna_inputs).kneighbors(rna_inputs)
    )
    _, indices_pred = (
        sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors).fit(rna_outputs).kneighbors(atac_outputs)
    )
    neighbors_match = np.zeros(n_neighbors, dtype=int)
    for i in range(rna_inputs.shape[0]):
        _, pred_matches, true_matches = np.intersect1d(
            indices_pred[i], indices_true[i], return_indices=True
        )
        neighbors_match_idx = np.maximum(pred_matches, true_matches)
        neighbors_match += np.sum(np.arange(n_neighbors) >= neighbors_match_idx[:, None], axis = 0,)
    neighbors_match_curve = neighbors_match/(np.arange(1, n_neighbors + 1) * rna_inputs.shape[0])
    area_under_curve = np.mean(neighbors_match_curve)
    return area_under_curve

In [208]:
#set up train functions
def train(epoch):
    netRNA.train()
    netATAC.train()
    train_losses = []
    for idx, samples in enumerate(train_loader):
        rna_inputs, atac_inputs = Variable(samples["rna_tensor"]), Variable(samples["atac_tensor"])
        if torch.cuda.is_available():
            rna_inputs = rna_inputs.cuda()
            atac_inputs = atac_inputs.cuda()
            
        opt_netATAC.zero_grad()
        opt_netRNA.zero_grad()
        recon_rna, z_rna, mu_rna, logvar_rna = netRNA(rna_inputs)
        recon_atac, z_atac, mu_atac, logvar_atac = netATAC(atac_inputs)
        rna_loss = basic_loss(recon_rna, rna_inputs, mu_rna, logvar_rna, lamb1=hyper["lamb_kl"])
        atac_loss = basic_loss(recon_atac, atac_inputs, mu_atac, logvar_atac, lamb1=hyper["lamb_kl"])
        anc_loss = anchor_loss(z_rna, z_atac)
        
        #loss functions for each modalities
        train_loss = rna_loss + atac_loss + hyper["lamb_anc"] * anc_loss
        train_loss.backward()
        nn.utils.clip_grad_norm_(netRNA.parameters(), max_norm=hyper["clip_grad"])
        nn.utils.clip_grad_norm_(netATAC.parameters(), max_norm=hyper["clip_grad"])
        opt_netRNA.step()
        opt_netATAC.step()
        train_losses.append(train_loss.item())
    if epoch % 20 == 0:
        print("Epoch: " + str(epoch) + ", train loss: " + str(np.mean(train_losses)))

In [209]:
def evaluate(epoch):
    #evaluating step
    with torch.no_grad():
        netRNA.eval()
        netATAC.eval()
        knn_acc = []
        #mse_acc = []
        for idx, samples in enumerate(test_loader):
            rna_inputs = samples["rna_tensor"].float()
            atac_inputs = samples["atac_tensor"].float()
            rna_inputs = rna_inputs.to(device)
            atac_inputs = atac_inputs.to(device)

            _, output_rna, _, _ = netRNA(rna_inputs)
            _, output_atac, _, _ = netATAC(atac_inputs)
            knn_acc.append(knn_criteria(rna_inputs.cpu().detach(), atac_inputs.cpu().detach(), 
                                        output_rna.cpu().detach(), output_atac.cpu().detach()))
        avg_knn_auc = np.mean(knn_acc)
    if epoch % 20 == 0:
        print("Epoch: " + str(epoch) + ", acc: " + str(avg_knn_auc))

In [210]:
#train a toy model and see the scores
max_iter = hyper["nEpochs"]
for epoch in range(max_iter):
    train(epoch)
    evaluate(epoch)
  #set up log
  #if epoch % 50 == 0:
    #print("***saving checkpoints***")
    #path = "{}Max_iter_{}lamb_anc_{}Epoch_{}params.pth".format(hyper["weightDirName"], str(hyper["nEpochs"]), str(hyper["lamb_anc"]), str(epoch))
    
    #torch.save({
    #    "epoch": epoch,
    #    'netRNA_state_dict': netRNA.state_dict(),
    #    'netATAC_state_dict': netATAC.state_dict(),
    # }, path)


Epoch: 0, train loss: 0.7157778785778925
Epoch: 0, acc: 0.1630443084895937
Epoch: 20, train loss: 0.47401968103188735
Epoch: 20, acc: 0.12240874636229951
Epoch: 40, train loss: 0.46023509479486024
Epoch: 40, acc: 0.16551815352795224


In [211]:
def model_eval(test_adata):
    netRNA.eval()
    netATAC.eval()
    rna_inputs = Variable(torch.from_numpy(test_adata.X.toarray()).float())
    atac_inputs = Variable(torch.from_numpy(test_adata.obsm["mode2"].toarray()).float())
    if torch.cuda.is_available():
        rna_inputs = rna_inputs.cuda()
        atac_inputs = atac_inputs.cuda()
    _, z_rna, _, _ = netRNA(rna_inputs)
    _, z_atac, _, _ = netATAC(atac_inputs)
    test_adata.obsm["aligned"] = sparse.csr_matrix(z_rna.cpu().detach())
    test_adata.obsm["mode2_aligned"] = sparse.csr_matrix(z_atac.cpu().detach())
    knn_score, mse_score = metrics.knn_auc(test_adata), metrics.mse(test_adata)
    return knn_score, mse_score

In [212]:
#test knn_auc plateau at around 0.09, seems that training starts to overfit
test_knn_score, test_mse_score = model_eval(test_data)
print(test_knn_score)
print(test_mse_score)
train_knn_score, train_mse_score = model_eval(train_data)
print(train_knn_score)
print(train_mse_score)

0.0958825348422088
0.9997422
0.09558056510033025
1.0048813


In [None]:
#log the metrics
path = "{}Max_iter_{}lamb_anc_{}metrics.txt".format(hyper["weightDirName"], str(hyper["nEpochs"]), str(hyper["lamb_anc"]))
'''torch.save({
    "num_iter": hyper["nEpochs"],
    "lamb_anc": hyper["lamb_anc"],
    'knn_auc': knn_score,
    'mse': mse_score,
}, path)'''

In [None]:
with open(path, 'a') as f:
        print('nEpoch: ', hyper["nEpochs"], 'lamb_anc:%.8f'%float(hyper["lamb_anc"]) , ',knn_auc: %.8f' % float(knn_score), ', mse_score: %.8f' % float(mse_score), file=f)