In [89]:
import tangram as tg
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
import torch
from scipy import stats
from sklearn.metrics import r2_score

from time import time

In [2]:
# Extra head-room for the scaling helpers: normalised values are divided by (1 + pct_offset).
pct_offset = 0

def MinMaxNorm(Y):
    """Column-wise min-max scaling of ``Y``.

    Every column is shifted by its minimum, divided by its range
    (max - min) and then divided by ``1 + pct_offset``. ``pct_offset`` is
    the module-level variable and is read at call time.

    Note: a column whose maximum equals its minimum has a zero range, so
    the division is by zero for that column.
    """
    col_min = Y.min(axis=0)
    col_range = Y.max(axis=0) - col_min
    return (Y - col_min) / col_range / (1 + pct_offset)

def ReMMNorm(Y, Y_pred):
    """Map min-max-normalised values back onto the original scale of ``Y``.

    This is the inverse of ``MinMaxNorm``: the normalised values are
    multiplied by the column range of ``Y`` and by ``1 + pct_offset``
    (module-level variable, read at call time), then shifted by the column
    minimum of ``Y``.

    Args:
        Y: reference data whose column-wise min/max define the scale.
        Y_pred: values in normalised space, column-compatible with ``Y``.

    Returns:
        ``Y_pred`` rescaled to the units of ``Y``.
    """
    y_min = Y.min(axis=0)
    # The (1 + pct_offset) factor must scale the whole range, not just the
    # minimum; otherwise the round trip through MinMaxNorm is only exact
    # when pct_offset == 0.
    scale = (Y.max(axis=0) - y_min) * (1 + pct_offset)
    return Y_pred * scale + y_min

def PolarTrans(Y):
    """Convert Cartesian coordinates to polar coordinates.

    Args:
        Y: 2-D array; column 0 is read as x and column 1 as y.

    Returns:
        Array with one row per input row and two columns:
        radius ``sqrt(x**2 + y**2)`` and angle in radians.
    """
    x, y = Y[:, 0], Y[:, 1]
    R = np.sqrt(np.square(x) + np.square(y))
    # arctan2 keeps the quadrant (angles in (-pi, pi]) and needs no division,
    # so points with x <= 0 survive a round trip through RePolarTrans.
    # arctan(y / x) folds x < 0 onto the opposite quadrant and divides by
    # zero at x == 0.
    Theta = np.arctan2(y, x)
    RTheta = np.concatenate([R.reshape(-1, 1), Theta.reshape(-1, 1)], axis=1)

    return RTheta

def RePolarTrans(RTheta):
    """Convert polar coordinates back to Cartesian coordinates.

    Args:
        RTheta: 2-D array; column 0 is the radius, column 1 the angle
            in radians.

    Returns:
        Array with one row per input row and two columns:
        ``r * cos(theta)`` and ``r * sin(theta)``.
    """
    radius, angle = RTheta[:, 0], RTheta[:, 1]
    cart_x = radius * np.cos(angle)
    cart_y = radius * np.sin(angle)
    # Add a trailing axis to each 1-D result so they can be joined as columns.
    return np.concatenate([cart_x[:, None], cart_y[:, None]], axis=1)

In [84]:
# Read the AnnData file and make its variable (gene) names unique.
adata = ad.read_h5ad('../Dataset/AdataEmbryo1.h5ad')
adata.var_names_make_unique()

# Coordinate matrix: one row per observation, columns taken from
# obs['xcoord'] and obs['ycoord']; YNorm is its min-max-scaled version.
Y = adata.obs.loc[:, ['xcoord', 'ycoord']].to_numpy()
YNorm = MinMaxNorm(Y)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [53]:
# Load the pre-computed cross-validation splits: one flat array of row
# indices per fold for the train side and one for the test side.
train_indices, test_indices = [], []
for i in range(5):
    # The split files on disk are numbered from 1, hence i + 1.
    train_index = pd.read_csv(f'CV_groups/index_train_{i + 1}.csv',
                              header=None, index_col=0).to_numpy().flatten()
    test_index = pd.read_csv(f'CV_groups/index_test_{i + 1}.csv',
                             header=None, index_col=0).to_numpy().flatten()

    train_indices += [train_index]
    test_indices += [test_index]

In [61]:
# Sanity check: show the training row indices loaded for the first fold.
train_indices[0]

array([ 4680, 27793, 26831, ...,  7017, 32234, 33364])

In [60]:
# Sanity check: subsetting `adata` with the first fold's training indices (the result is a view, not a copy).
adata[train_indices[0]]

View of AnnData object with n_obs × n_vars = 28279 × 20527
    obs: 'xcoord', 'ycoord', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ERCC', 'pct_counts_ERCC', 'leiden'
    var: 'n_cells', 'mt', 'ERCC', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'hvg'
    obsm: 'X_pca', 'X_umap'

# Running the Tangram mapping (5-fold cross-validation)

In [None]:
# Tangram mapping for each of the 5 cross-validation folds.
# In every fold the train rows play the "spatial" role (ad_sp) and the test
# rows the "single-cell" role (ad_sc); the resulting mapping objects are
# collected in `ad_maps`, and the wall-clock time is printed at the end.
ad_maps = []
sta = time()

# NOTE(review): `adata` is re-bound here to a different file than the one the
# coordinate matrix `Y` was read from ('../Dataset/AdataEmbryo1.h5ad').
# `train_indices` / `test_indices` are applied to both, so confirm the two
# files hold the same observations in the same order.
adata = ad.read_h5ad('../Dataset/AdataMH1.h5ad')
adata.var_names_make_unique()

sc.pp.normalize_total(adata, target_sum = 1e4)
sc.pp.log1p(adata)
# NOTE(review): the highly-variable-gene selection computed here is never
# passed on; `tg.pp_adatas(..., genes=None)` below uses all genes. Confirm
# that this is intended.
sc.pp.highly_variable_genes(adata, n_top_genes = 8000)

# Use the GPU when one is available; otherwise fall back to the CPU instead
# of failing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for i in range(5):

    print(i)
    # Slicing an AnnData returns a view, and pp_adatas updates its inputs in
    # place, so work on explicit copies of the two subsets.
    ad_sp = adata[train_indices[i]].copy()
    ad_sc = adata[test_indices[i]].copy()
    tg.pp_adatas(ad_sc, ad_sp, genes=None)

    ad_map = tg.map_cells_to_space(ad_sc, ad_sp, device = device)
    ad_maps.append(ad_map)
    # Release cached GPU memory between folds.
    torch.cuda.empty_cache()

end = time()
print((end - sta)/60.0, 'mins consumed')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


0


INFO:root:19435 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:19435 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 19435 genes and rna_count_based density_prior in cells mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.120, KL reg: 0.137
Score: 0.375, KL reg: 0.001
Score: 0.400, KL reg: 0.000
Score: 0.409, KL reg: 0.000
Score: 0.413, KL reg: 0.000
Score: 0.417, KL reg: 0.000
Score: 0.419, KL reg: 0.000
Score: 0.421, KL reg: 0.000
Score: 0.422, KL reg: 0.000
Score: 0.423, KL reg: 0.000


In [None]:
# Number of training genes that tg.pp_adatas stored on `ad_sc` (the subset from the most recently processed fold).
len(ad_sc.uns['training_genes'])

In [None]:
# Predict coordinates for the test cells of every completed fold and save them.
# Each prediction is the mapping matrix times the coordinates of that fold's
# training rows. Iterating over `ad_maps` (rather than a fixed count) avoids an
# IndexError when fewer than 5 folds have been mapped.
# Note: output files are numbered from 0 (pred_0 ...), unlike the 1-based
# CV_groups split files.
for i, ad_map in enumerate(ad_maps):
    Y_pred = np.matmul(ad_map.X, Y[train_indices[i]])
    # index=False: do not write the DataFrame's row index to the CSV.
    pd.DataFrame(Y_pred).to_csv('SI_Benchmarking/Tangram/pred_' + str(i) + '.csv', index = False)

In [64]:
# Display the most recently computed predicted coordinates.
Y_pred

array([[3107.59604494, 3229.44047571],
       [3558.652212  , 3088.45137994],
       [3199.65489801, 3185.65921776],
       ...,
       [2871.82632982, 3491.64458997],
       [3146.28042943, 3242.12208494],
       [3191.7016737 , 2892.28896944]])

In [36]:
# NOTE(review): manual save of whatever `Y_pred` currently holds as pred_4.csv. The prediction loop
# already writes pred_4.csv for fold index 4, so this cell looks redundant — confirm before removing.
pd.DataFrame(Y_pred).to_csv('SI_Benchmarking/Tangram/pred_' + str(4) + '.csv', index = None)

In [27]:
# Repeat of the earlier check: count of training genes stored on `ad_sc` by tg.pp_adatas.
len(ad_sc.uns['training_genes'])

19435

In [15]:
# Interactive lookup of the tg.pp_adatas docstring; per the help text, `genes=None` means all genes are used.
help(tg.pp_adatas)

Help on function pp_adatas in module tangram.mapping_utils:

pp_adatas(adata_sc, adata_sp, genes=None)
    Pre-process AnnDatas so that they can be mapped. Specifically:
    - Remove genes that all entries are zero
    - Find the intersection between adata_sc, adata_sp and given marker gene list, save the intersected markers in two adatas
    - Calculate density priors and save it with adata_sp
    
    Args:
        adata_sc (AnnData): single cell data
        adata_sp (AnnData): spatial expression data
        genes (List): Optional. List of genes to use. If `None`, all genes are used.
    
    Returns:
        update adata_sc by creating `uns` `training_genes` `overlap_genes` fields 
        update adata_sp by creating `uns` `training_genes` `overlap_genes` fields and creating `obs` `rna_count_based_density` & `uniform_density` field



In [None]:
# Display a summary of the AnnData object currently bound to `adata`.
adata