# Env

In [None]:
%%bash
# Create the `st` conda environment and install the notebook's dependencies.
# `-y` answers conda's confirmation prompts: a %%bash cell has no interactive
# stdin, so without it the cell stalls or aborts at the first prompt.
conda create -y -n st python==3.7
# `conda activate` is a shell function that only exists once conda's shell hook
# has been loaded; a non-interactive subshell does not load it automatically.
eval "$(conda shell.bash hook)"
conda activate st
conda install -y pandas
conda install -y numpy
conda install -y anndata
pip install tangram-sc
# NOTE: the activation only lasts for this subshell. The notebook kernel itself
# must be started from the `st` environment for the cells below to see these packages.

# Preprocess data

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import os
import anndata

# --- Spatial (seqFISH+) inputs ---------------------------------------------
# The location table is expected to have five columns. The two middle ones are
# given an empty name so that a single drop('') removes both, leaving
# barcode / x / y.
st_coords = pd.read_csv('../datasets/seqFISH+/Out_rect_locations.csv')
st_coords.columns = ['barcode', '', '', 'x', 'y']
st_coords = st_coords.drop('', axis=1)

# Expression table: the leading unnamed column becomes 'barcode'; every other
# column is treated as a gene when the spatial AnnData is built.
st_counts_10000genes = pd.read_csv('../datasets/seqFISH+/Out_gene_expressions_10000genes.csv')
st_counts_10000genes = st_counts_10000genes.rename(columns={'Unnamed: 0': 'barcode'})

# --- Single-cell reference ---------------------------------------------------
# After the transpose, the former column headers form the row index (used as
# cell barcodes below) and the 'cell_id' entries become the columns.
sc_counts = (
    pd.read_csv('../datasets/seqFISH+/raw_somatosensory_sc_exp.txt', sep='\t')
    .set_index('cell_id')
    .T
)

# One label per line, no header row.
sc_labels = pd.read_csv('../datasets/seqFISH+/somatosensory_sc_labels.txt', header=None)
sc_labels.columns = ['celltype']

# Sort the distinct labels so that the cluster numbering is identical on every
# run. Plain set iteration order for strings changes between interpreter
# sessions, which made the ids stored in sc.h5ad differ from run to run.
# (Labels are expected to be strings; mixed types would not be sortable.)
celltype = sorted(set(sc_labels.celltype))
celltype_dict = dict(enumerate(celltype, start=1))                    # 1-based id -> label
metacell_dict = {str(idx): name for idx, name in celltype_dict.items()}  # same, string keys

# label -> 1-based id, looked up once per row instead of scanning the list.
cluster_of = {name: idx for idx, name in celltype_dict.items()}
sc_labels['cluster'] = sc_labels.celltype.map(cluster_of)

# Assumes the label file lists cells in the same order as the rows of
# sc_counts -- TODO confirm. A length mismatch raises here.
sc_labels['barcode'] = sc_counts.index

# Folder that receives the st.h5ad / sc.h5ad files written by this cell.
save_dir = '../datasets/data4tangram'
# exist_ok=True keeps re-runs harmless; makedirs also creates any missing
# parent folder, which a bare os.mkdir would fail on.
os.makedirs(save_dir, exist_ok=True)
    
# Spatial AnnData: one row per barcode, one column per gene.
# Per-spot metadata is the x/y position, indexed by barcode.
st_obs = st_coords.set_index('barcode')[['x', 'y']]

# Everything after the leading 'barcode' column is a gene.
st_gene_names = st_counts_10000genes.columns[1:]
st_var = pd.DataFrame(index=st_gene_names)
st_matrix = st_counts_10000genes.iloc[:, 1:].values

# NOTE(review): rows of the count table are paired with rows of the location
# table purely by position -- confirm both files share the same row order.
st_adata = anndata.AnnData(st_matrix, obs=st_obs, var=st_var, dtype='int32')
st_adata.write(f'{save_dir}/st.h5ad')

# Single-cell AnnData: one row per cell, annotated with its cell type and the
# numeric cluster id derived from it; the cell barcode is the index.
obs = sc_labels.set_index('barcode')[['celltype', 'cluster']]

# Columns of the transposed count table provide the variable names.
var_names = sc_counts.columns
var = pd.DataFrame(index=var_names)
X = sc_counts.values

sc_adata = anndata.AnnData(X, obs=obs, var=var, dtype='int32')
sc_adata.write(f'{save_dir}/sc.h5ad')

... storing 'celltype' as categorical


# Main

In [2]:
import pandas as pd
import sys
import numpy as np
import pandas as pd
import scanpy as sc
import tangram as tg
import time 

# Wall-clock start; the elapsed seconds are printed at the end of the cell.
start = time.time()

# Run label (part of the result file name) and the folder holding the
# preprocessed .h5ad files.
sample = 'seqFISH+_10000genes'
root_dir = '../datasets/data4tangram'

# obs column of the single-cell data that carries the cell-type annotation.
celltype_key = 'celltype'

# Single-cell reference and spatial data.
ad_sc = sc.read_h5ad(root_dir + '/sc.h5ad')
ad_sp = sc.read_h5ad(root_dir + '/st.h5ad')

# Both inputs are loaded as raw counts; only the single-cell reference is
# total-count normalized here, ahead of marker detection.
sc.pp.normalize_total(ad_sc)

# Cell types with fewer than two cells are removed before genes are ranked
# per group.
celltype_counts = ad_sc.obs[celltype_key].value_counts()
celltype_drop = celltype_counts[celltype_counts < 2].index
print(f'Drop celltype {list(celltype_drop)} contain less 2 sample')
keep_cells = ~ad_sc.obs[celltype_key].isin(celltype_drop)
ad_sc = ad_sc[keep_cells].copy()

# Rank genes for every cell type and keep the first 200 names of each group.
sc.tl.rank_genes_groups(ad_sc, groupby=celltype_key, use_raw=False)
ranked_names = pd.DataFrame(ad_sc.uns["rank_genes_groups"]["names"])
markers_df = ranked_names.head(200)
print(markers_df)

# Distinct marker names across all groups, restricted to genes that are also
# present in the spatial data.
genes_sc = np.unique(markers_df.values.ravel())
print(genes_sc)
genes_st = ad_sp.var_names.values
genes = list(set(genes_sc) & set(genes_st))

import os  # local import: this cell's own import list does not bring `os` in

# Create the result folder before the mapping starts. Previously a missing
# folder only surfaced as an error from to_csv after all the work was done.
result_dir = '../seqFISH_10000_Result'
os.makedirs(result_dir, exist_ok=True)

# Register the selected marker genes as training genes on both AnnData objects
# (the recorded log also reports the density priors computed in this step).
tg.pp_adatas(ad_sc, ad_sp, genes=genes)

# Map at cell-type level: mode='clusters' groups the single cells by
# `celltype_key` instead of mapping each cell individually.
ad_map = tg.map_cells_to_space(
    ad_sc,
    ad_sp,
    mode='clusters',
    cluster_label=celltype_key)

# Stores the per-spot cell-type prediction in ad_sp.obsm['tangram_ct_pred'].
tg.project_cell_annotations(ad_map, ad_sp, annotation=celltype_key)

# Rescale each spot (row) so that its cell-type values sum to 1.
# A row whose values sum to 0 yields NaN, as before.
celltype_density = ad_sp.obsm['tangram_ct_pred']
celltype_density = celltype_density.div(celltype_density.sum(axis=1), axis=0)

celltype_density.to_csv(f'{result_dir}/{sample}_tangram.tsv', sep='\t')

# Elapsed wall-clock seconds for the whole cell.
end = time.time()
print(end - start)

  from .autonotebook import tqdm as notebook_tqdm


Drop celltype [] contain less 2 sample
       Olig astrocytes eNeuron endo_mural      iNeuron microglia
0      Plp1     Atp1a2   Calm1      Sparc        Ndrg4      C1qb
1       Mog        Clu   Ywhah       Esam       Tspyl4       B2m
2    Taldo1     Slc1a3  Snap25      Cldn5  Atp6v0c-ps2     Csf1r
3      Scd2        Mt2   Calm2        Bsg      Zcchc18    Tyrobp
4       Mbp     Slc1a2    Chn1      Csrp2         Gad1     Rps29
..      ...        ...     ...        ...          ...       ...
195  Srd5a1      Kcnk1     Gls       Lrp8    Adcyap1r1    Ifngr1
196  Nkain1      Hadhb   Map1b     Rpl36a         Dpp6      H2-L
197   Enpp6        Oaf  Stxbp1      Nampt       Rab27b      Igf1
198   Magt1       Nfia    Pfkm     Isyna1         Pja1    Hnrnpf
199  Elavl3    Smpdl3a  Lrrc4c    Serinc3        Zmat4     Hspa5

[200 rows x 6 columns]
['1110008P14Rik' '1500004A13Rik' '1700047M11Rik' ... 'Zfr' 'Zmat2' 'Zmat4']


INFO:root:482 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:9807 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 482 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.608, KL reg: 0.324
Score: 0.891, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001
Score: 0.892, KL reg: 0.001


INFO:root:Saving results..
INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.


5.506561994552612
