In [1]:
#notebook for integrating spatial and scrna seq data
#uses scanpy_env, need to start jupyter notebook in shell with scanpy_env activated, then port forward from local machine

In [183]:
import scanpy as sc
import squidpy as sq
import numpy as np
import pandas as pd
from anndata import AnnData
import pathlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import skimage
import seaborn as sns
import tangram as tg
import json

sc.logging.print_header()
print(f"squidpy=={sq.__version__}")

%load_ext autoreload
%autoreload 2
%matplotlib inline

scanpy==1.10.2 anndata==0.10.8 umap==0.5.6 numpy==1.26.4 scipy==1.14.0 pandas==2.2.2 scikit-learn==1.5.1 statsmodels==0.14.2 igraph==0.11.6 pynndescent==0.5.13
squidpy==1.5.0
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Datasets

In [17]:
#example spatial for squidpy
adata_st_ex = sq.datasets.visium_fluo_adata_crop()

In [18]:
adata_st_ex

AnnData object with n_obs × n_vars = 704 × 16562
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_MT', 'log1p_total_counts_MT', 'pct_counts_MT', 'n_counts', 'leiden', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'cluster_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'spatial', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [19]:
adata_st_ex.obsm['spatial'].shape

(704, 2)

In [20]:
adata_st_ex.obs['n_genes_by_counts']

AAACGAGACGGTTGAT-1    3808
AAAGGGATGTAGCAAG-1    5288
AAATGGCATGTCTTGT-1    5191
AAATGGTCAATGTGCC-1    4923
AAATTAACGGGTAGCT-1    4045
                      ... 
TTGTCGTTCAGTTACC-1    3443
TTGTGGCCCTGACAGT-1    4971
TTGTTAGCAAATTCGA-1    4820
TTGTTCAGTGTGCTAC-1    3372
TTGTTGTGTGTCAAGA-1    3768
Name: n_genes_by_counts, Length: 704, dtype: int32

In [None]:

adata_st_test = sc.read_visium()

In [7]:
adata_sc = sq.datasets.sc_mouse_cortex()

100%|████████████████████████████████████| 3.03G/3.03G [29:28<00:00, 1.84MB/s]


In [24]:
adata_sc

AnnData object with n_obs × n_vars = 21697 × 36826
    obs: 'sample_name', 'organism', 'donor_sex', 'cell_class', 'cell_subclass', 'cell_cluster', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'cell_class_colors', 'cell_subclass_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [103]:
adata_sc.var

Unnamed: 0,mt,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm
0610005C13Rik,False,872,0.524247,0.421501,96.237812,12151.0,9.405249,867,False,,0.534452,29.752143,1.090297
0610006L08Rik,False,29,0.049918,0.048712,99.874881,1157.0,7.054450,29,False,,0.050422,5.631491,2.697598
0610007P14Rik,False,21519,161.839111,5.092762,7.157649,3751107.0,15.137562,21344,False,,164.495598,14562.080191,0.629189
0610009B22Rik,False,21203,150.453445,5.020278,8.521011,3487210.0,15.064612,21044,False,,153.827165,13897.739024,0.652549
0610009E02Rik,False,4185,1.559280,0.939726,81.944085,36141.0,10.495211,4166,False,,1.591464,51.485746,0.484592
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zzef1,False,16165,38.780094,3.683367,30.257140,898845.0,13.708867,16034,False,,38.959856,4371.811764,0.873219
Zzz3,False,16354,50.861378,3.948574,29.441712,1178865.0,13.980063,16227,False,,51.304835,6362.498978,0.978110
a,False,188,0.151092,0.140711,99.188886,3502.0,8.161375,185,True,3379.0,0.152878,16.621585,3.142282
l7Rn6,False,21904,208.119507,5.342906,5.496592,4823794.0,15.389071,21728,False,,213.394617,18848.619805,0.581846


In [25]:
adata_sc.obs['cell_subclass']

F2S4_151217_005_B01    Pvalb
F2S4_151217_005_C01       L4
F2S4_151217_005_E01       L4
F2S4_151217_005_F01       L4
F2S4_151217_005_G01       L4
                       ...  
F1S4_180124_317_D01    Lamp5
F1S4_180124_317_E01     Sncg
F1S4_180124_317_F01      Sst
F1S4_180124_317_G01     Sncg
F1S4_180124_317_H01     Sncg
Name: cell_subclass, Length: 21697, dtype: category
Categories (23, object): ['Astro', 'CR', 'Endo', 'L2/3 IT', ..., 'Sncg', 'Sst', 'VLMC', 'Vip']

# Load and Format SD Data

In [280]:
gaba_folder = '/bigdata/isaac/gaba_files/'
#load in all filtered metadata with subclass labels
with open(gaba_folder + 'GABAmeta_data_df_plis_filtered_markers_2024-09-02.json') as json_data:
    gaba_metadata = json.load(json_data)
gaba_metadata = pd.DataFrame.from_dict(gaba_metadata, orient='columns')

In [30]:
gaba_metadata.shape

(30, 10022)

In [281]:
vglut1_folder = '/bigdata/isaac/Vglut1_files/'
#load in all filtered metadata with subclass labels
with open(vglut1_folder + 'VGLUT1meta_data_df_plis_filtered_markers_2024-09-01.json') as json_data:
    vglut1_metadata = json.load(json_data)
vglut1_metadata = pd.DataFrame.from_dict(vglut1_metadata, orient='columns')

In [32]:
vglut1_metadata.shape

(30, 9550)

In [282]:
vglut2_folder = '/bigdata/isaac/Vglut2_files/'
#load in all filtered metadata with subclass labels
with open(vglut2_folder + 'VGLUT2meta_data_df_plis_filtered_markers_2024-09-01.json') as json_data:
    vglut2_metadata = json.load(json_data)
vglut2_metadata = pd.DataFrame.from_dict(vglut2_metadata, orient='columns')

In [36]:
vglut2_metadata.shape

(30, 5276)

In [283]:
nn_folder = '/bigdata/isaac/Nonneuronal_files/'
#load in all filtered metadata with subclass labels
with open(nn_folder + 'NNmeta_data_df_plis_filtered_markers_2024-09-18.json') as json_data:
    nn_metadata = json.load(json_data)
nn_metadata = pd.DataFrame.from_dict(nn_metadata, orient='columns')

In [43]:
nn_metadata.shape

(30, 1976)

In [None]:
#add dash at end of single cluster labels for each class to fix obs.columns matching var names issue downstream

In [241]:
np.unique(gaba_metadata.loc['cluster_label'])

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
      dtype=object)

In [251]:
gaba_metadata.loc['markers' ,gaba_metadata.loc['markers'] == 'Tmem100'] = 'Tmem100-'

In [253]:
gaba_metadata.loc['markers' ,gaba_metadata.loc['markers'] == 'Lpl'] = 'Lpl-'

In [254]:
gaba_metadata.loc['markers' ,gaba_metadata.loc['markers'] == 'Npy2r'] = 'Npy2r-'

In [255]:
gaba_metadata.loc['markers' ,gaba_metadata.loc['markers'] == 'Nwd2'] = 'Nwd2-'

In [260]:
gaba_metadata.loc['markers' ,gaba_metadata.loc['markers'] == 'Myh7'] = 'Myh7-'

In [266]:
vglut1_metadata.loc['markers' ,vglut1_metadata.loc['markers'] == 'Lypd6'] = 'Lypd6-'

In [267]:
vglut1_metadata.loc['markers' ,vglut1_metadata.loc['markers'] == 'Moxd1'] = 'Moxd1-'

In [268]:
vglut1_metadata.loc['markers' ,vglut1_metadata.loc['markers'] == 'Gfod2'] = 'Gfod2-'

In [269]:
vglut1_metadata.loc['markers' ,vglut1_metadata.loc['markers'] == 'Rab3b'] = 'Rab3b-'

In [None]:
vglut2_metadata.loc['markers' ,vglut2_metadata.loc['markers'] == 'Lmo1'] = 'Lmo1-'

In [270]:
vglut2_metadata.loc['markers' ,vglut2_metadata.loc['markers'] == 'Ebf1'] = 'Ebf1-'

In [271]:
vglut2_metadata.loc['markers' ,vglut2_metadata.loc['markers'] == 'Pgr15l'] = 'Pgr15l-'

In [None]:
vglut2_metadata.loc['markers' ,vglut2_metadata.loc['markers'] == 'Pgr15l'] = 'Pgr15l-'

In [272]:
nn_metadata.loc['markers' ,nn_metadata.loc['markers'] == 'Ccl2'] = 'Ccl2-'

In [None]:
#concat into one 

In [284]:
combined_metadata = pd.concat([gaba_metadata,vglut1_metadata,vglut2_metadata,nn_metadata],axis=1)
combined_metadata

Unnamed: 0,CAACAACAGACATATG-1_10X35_2,ATGACCATCGTGAGAG-1_10X51_1,AACCCAAAGAAATGGG-1_10X35_2,GAGGGATGTATGCTAC-1_10X35_1,AAGATAGGTAACACCT-1_10X37_2,CAGGGCTGTATCTTCT-1_10X51_1,CTGCTCATCCGATCGG-1_10X38_1,TGCTCCAAGAGGCGTT-1_10X51_1,TTGCCTGTCGTAGGGA-1_10X35_2,GCAACATAGATCGACG-1_10X51_2,...,CAAGAGGGTCAATCTG-1_10X36_2,TATACCTAGAGTAACT-1_10X35_1,GCACGTGGTTATTCCT-1_10X37_1,GATGATCTCAAGCCTA-1_10X36_1,GGTGTCGAGAAGCTGC-1_10X36_2,GAGAGGTCACATCATG-1_10X37_2,ATCAGGTTCCACTAGA-1_10X35_2,TCCACGTAGAGCATCG-1_10X37_1,CTGCGAGCACTACGGC-1_10X35_2,CTCTGGTTCGAAGCAG-1_10X37_1
Serial_Number,64.0,97.0,64.0,63.0,68.0,97.0,69.0,97.0,64.0,98.0,...,66.0,63.0,67.0,65.0,66.0,68.0,64.0,67.0,64.0,67.0
Date_Captured,2019-11-20,05/31/20,2019-11-20,2019-11-20,2019-11-25,05/31/20,2019-12-01,05/31/20,2019-11-20,05/31/20,...,2019-11-24,2019-11-20,2019-11-25,2019-11-24,2019-11-24,2019-11-25,2019-11-20,2019-11-25,2019-11-20,2019-11-25
Species,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,...,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,...,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,,C57Bl/6,,,,C57Bl/6,,C57Bl/6,,C57Bl/6,...,,,,,,,,,,
Project,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,...,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph
Group,Naïve-F,Breeder-F,Naïve-F,Naïve-F,Naïve-M,Breeder-F,Naïve-F,Breeder-F,Naïve-F,Breeder-F,...,Naïve-M,Naïve-F,Naïve-M,Naïve-M,Naïve-M,Naïve-M,Naïve-F,Naïve-M,Naïve-F,Naïve-M
ChipID,10X35,10X51,10X35,10X35,10X37,10X51,10X38,10X51,10X35,10X51,...,10X36,10X35,10X37,10X36,10X36,10X37,10X35,10X37,10X35,10X37
SampleID,10X35_2,10X51_1,10X35_2,10X35_1,10X37_2,10X51_1,10X38_1,10X51_1,10X35_2,10X51_2,...,10X36_2,10X35_1,10X37_1,10X36_1,10X36_2,10X37_2,10X35_2,10X37_1,10X35_2,10X37_1
DonorID,"DI1,DI2",DI-B1-F,"DI1,DI2","DI1,DI2","DI4,DI5",DI-B1-F,DI6,DI-B1-F,"DI1,DI2",DI-B1-F,...,DI3,"DI1,DI2","DI4,DI5",DI3,DI3,"DI4,DI5","DI1,DI2","DI4,DI5","DI1,DI2","DI4,DI5"


In [None]:
#load original expression data

In [39]:
df_orig = pd.read_feather('/bigdata/isaac/df_orig.feather')

In [98]:
df_orig

Unnamed: 0_level_0,AAACCCAAGAGCATAT-1_10X51_2,AAACCCAAGCCTCAGC-1_10X51_2,AAACCCAAGTACTGTC-1_10X51_2,AAACCCACACAACGTT-1_10X51_2,AAACCCAGTAACATAG-1_10X51_2,AAACCCATCACTAGCA-1_10X51_2,AAACGAACACTCCGAG-1_10X51_2,AAACGAACAGCGCGTT-1_10X51_2,AAACGAACAGGACTTT-1_10X51_2,AAACGAAGTGAATAAC-1_10X51_2,...,TTTGGTTTCCACGAAT-1_10X37_2,TTTGGTTTCTTTCTAG-1_10X37_2,TTTGTTGAGAAATTCG-1_10X37_2,TTTGTTGAGTGCTCGC-1_10X37_2,TTTGTTGCAAGGTCAG-1_10X37_2,TTTGTTGCACCGCTGA-1_10X37_2,TTTGTTGGTCCATAGT-1_10X37_2,TTTGTTGGTTGCGAAG-1_10X37_2,TTTGTTGTCGAAGGAC-1_10X37_2,TTTGTTGTCTCTGCTG-1_10X37_2
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik,0,2,0,2,0,0,1,3,2,0,...,1,0,1,0,2,1,1,0,0,0
0610009B22Rik,0,0,0,1,0,2,2,0,1,0,...,0,0,0,0,2,0,0,0,0,0
0610009L18Rik,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,1,0,0,0
0610009O20Rik,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610010F05Rik,0,0,1,0,0,0,2,1,1,0,...,1,0,2,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,31,24,7,13,7,11,12,30,29,14,...,10,6,34,15,33,12,2,4,4,1
mt-Nd4,64,90,12,50,16,58,71,87,88,81,...,90,40,170,69,130,67,33,31,38,31
mt-Nd4l,8,12,2,5,4,10,7,5,12,9,...,14,4,13,4,8,5,9,7,3,5
mt-Nd5,14,20,6,9,1,12,8,10,11,10,...,17,5,24,4,18,13,6,2,4,6


In [100]:
df = df_orig.reindex(columns=combined_metadata.columns)
df

Unnamed: 0_level_0,CAACAACAGACATATG-1_10X35_2,ATGACCATCGTGAGAG-1_10X51_1,AACCCAAAGAAATGGG-1_10X35_2,GAGGGATGTATGCTAC-1_10X35_1,AAGATAGGTAACACCT-1_10X37_2,CAGGGCTGTATCTTCT-1_10X51_1,CTGCTCATCCGATCGG-1_10X38_1,TGCTCCAAGAGGCGTT-1_10X51_1,TTGCCTGTCGTAGGGA-1_10X35_2,GCAACATAGATCGACG-1_10X51_2,...,CAAGAGGGTCAATCTG-1_10X36_2,TATACCTAGAGTAACT-1_10X35_1,GCACGTGGTTATTCCT-1_10X37_1,GATGATCTCAAGCCTA-1_10X36_1,GGTGTCGAGAAGCTGC-1_10X36_2,GAGAGGTCACATCATG-1_10X37_2,ATCAGGTTCCACTAGA-1_10X35_2,TCCACGTAGAGCATCG-1_10X37_1,CTGCGAGCACTACGGC-1_10X35_2,CTCTGGTTCGAAGCAG-1_10X37_1
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik,3,2,2,0,1,2,2,2,2,0,...,2,1,2,0,3,3,1,0,3,2
0610009B22Rik,1,1,0,2,1,0,0,3,0,1,...,0,1,0,0,0,0,0,0,1,2
0610009L18Rik,0,0,0,2,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
0610009O20Rik,2,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
0610010F05Rik,5,0,1,6,7,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,49,13,19,42,26,9,14,9,12,26,...,16,12,33,26,16,32,16,24,22,39
mt-Nd4,252,74,233,591,340,82,43,89,119,39,...,230,71,147,148,75,106,59,95,87,126
mt-Nd4l,22,2,7,21,16,4,3,2,9,8,...,8,6,9,4,6,14,5,5,4,6
mt-Nd5,34,9,27,74,26,10,5,15,9,7,...,20,3,14,9,6,13,7,11,7,8


In [110]:
adata_sc.obs

Unnamed: 0,sample_name,organism,donor_sex,cell_class,cell_subclass,cell_cluster,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,n_counts
F2S4_151217_005_B01,F2S4_151217_005_B01,Mus musculus,M,GABAergic,Pvalb,Pvalb Tpbg,8542,9.052868,1282648.0,14.064438,16.040254,22.379172,30.921110,45.899343,0.0,0.0,0.0,1282648.0
F2S4_151217_005_C01,F2S4_151217_005_C01,Mus musculus,M,Glutamatergic,L4,L4 IT VISp Rspo1,8111,9.001100,1129496.0,13.937283,14.022449,18.904626,25.959809,39.415102,0.0,0.0,0.0,1129496.0
F2S4_151217_005_E01,F2S4_151217_005_E01,Mus musculus,M,Glutamatergic,L4,L4 IT VISp Rspo1,8780,9.080346,1557742.0,14.258749,13.580490,19.555485,27.911361,42.299880,0.0,0.0,0.0,1557742.0
F2S4_151217_005_F01,F2S4_151217_005_F01,Mus musculus,M,Glutamatergic,L4,L4 IT VISp Rspo1,8498,9.047704,1306856.0,14.083136,13.945989,19.042113,26.350723,40.343236,0.0,0.0,0.0,1306856.0
F2S4_151217_005_G01,F2S4_151217_005_G01,Mus musculus,M,Glutamatergic,L4,L4 IT VISp Rspo1,7566,8.931552,1119824.0,13.928683,14.102752,19.419570,27.235976,41.983919,0.0,0.0,0.0,1119824.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F1S4_180124_317_D01,F1S4_180124_317_D01,Mus musculus,F,GABAergic,Lamp5,Lamp5 Lsp1,8601,9.059750,1022536.0,13.837797,18.500473,24.796584,32.826326,47.307968,0.0,0.0,0.0,1022536.0
F1S4_180124_317_E01,F1S4_180124_317_E01,Mus musculus,F,GABAergic,Sncg,Sncg Slc17a8,9823,9.192584,724679.0,13.493485,18.947148,25.948316,34.913665,50.023942,0.0,0.0,0.0,724679.0
F1S4_180124_317_F01,F1S4_180124_317_F01,Mus musculus,F,GABAergic,Sst,Sst Hpse Sema3c,8730,9.074635,980851.0,13.796177,18.740053,25.395294,33.757217,48.250346,0.0,0.0,0.0,980851.0
F1S4_180124_317_G01,F1S4_180124_317_G01,Mus musculus,F,GABAergic,Sncg,Sncg Slc17a8,10633,9.271812,1029766.0,13.844843,17.725775,24.532078,33.706881,48.938594,0.0,0.0,0.0,1029766.0


In [119]:
cells = df.columns.to_frame()
cells = cells.rename(columns = {0:'sample_name'})

In [121]:
adata_sc_sd = sc.AnnData(df.T, obs=cells, var=df.index.to_frame())

In [122]:
adata_sc_sd

AnnData object with n_obs × n_vars = 26824 × 27933
    obs: 'sample_name'
    var: 'gene'

In [132]:
combined_metadata.loc['cell_class']

CAACAACAGACATATG-1_10X35_2           GABA
ATGACCATCGTGAGAG-1_10X51_1           GABA
AACCCAAAGAAATGGG-1_10X35_2           GABA
GAGGGATGTATGCTAC-1_10X35_1           GABA
AAGATAGGTAACACCT-1_10X37_2           GABA
                                 ...     
GAGAGGTCACATCATG-1_10X37_2    Nonneuronal
ATCAGGTTCCACTAGA-1_10X35_2    Nonneuronal
TCCACGTAGAGCATCG-1_10X37_1    Nonneuronal
CTGCGAGCACTACGGC-1_10X35_2    Nonneuronal
CTCTGGTTCGAAGCAG-1_10X37_1    Nonneuronal
Name: cell_class, Length: 26824, dtype: object

In [201]:
combined_metadata.loc['markers']

CAACAACAGACATATG-1_10X35_2    Chat-Isl1
ATGACCATCGTGAGAG-1_10X51_1    Chat-Isl1
AACCCAAAGAAATGGG-1_10X35_2    Chat-Isl1
GAGGGATGTATGCTAC-1_10X35_1    Chat-Isl1
AAGATAGGTAACACCT-1_10X37_2    Chat-Isl1
                                ...    
GAGAGGTCACATCATG-1_10X37_2         OL_2
ATCAGGTTCCACTAGA-1_10X35_2         OL_2
TCCACGTAGAGCATCG-1_10X37_1         OL_2
CTGCGAGCACTACGGC-1_10X35_2         OL_2
CTCTGGTTCGAAGCAG-1_10X37_1         OL_2
Name: markers, Length: 26824, dtype: object

In [285]:
#make lowercase
combined_metadata.loc['markers'].str.lower()

CAACAACAGACATATG-1_10X35_2    chat-isl1
ATGACCATCGTGAGAG-1_10X51_1    chat-isl1
AACCCAAAGAAATGGG-1_10X35_2    chat-isl1
GAGGGATGTATGCTAC-1_10X35_1    chat-isl1
AAGATAGGTAACACCT-1_10X37_2    chat-isl1
                                ...    
GAGAGGTCACATCATG-1_10X37_2         ol_2
ATCAGGTTCCACTAGA-1_10X35_2         ol_2
TCCACGTAGAGCATCG-1_10X37_1         ol_2
CTGCGAGCACTACGGC-1_10X35_2         ol_2
CTCTGGTTCGAAGCAG-1_10X37_1         ol_2
Name: markers, Length: 26824, dtype: object

In [133]:
adata_sc_sd.obs['cell_class'] = combined_metadata.loc['cell_class']

In [286]:
adata_sc_sd.obs['cell_subclass'] = combined_metadata.loc['markers'].str.lower()

In [287]:
output_path = '/bigdata/isaac/tangram_integration/'
adata_sc_sd.write(output_path +'adata_sc_sd.h5ad', compression='gzip')

In [127]:
adata_sc.var

Unnamed: 0,mt,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm
0610005C13Rik,False,872,0.524247,0.421501,96.237812,12151.0,9.405249,867,False,,0.534452,29.752143,1.090297
0610006L08Rik,False,29,0.049918,0.048712,99.874881,1157.0,7.054450,29,False,,0.050422,5.631491,2.697598
0610007P14Rik,False,21519,161.839111,5.092762,7.157649,3751107.0,15.137562,21344,False,,164.495598,14562.080191,0.629189
0610009B22Rik,False,21203,150.453445,5.020278,8.521011,3487210.0,15.064612,21044,False,,153.827165,13897.739024,0.652549
0610009E02Rik,False,4185,1.559280,0.939726,81.944085,36141.0,10.495211,4166,False,,1.591464,51.485746,0.484592
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zzef1,False,16165,38.780094,3.683367,30.257140,898845.0,13.708867,16034,False,,38.959856,4371.811764,0.873219
Zzz3,False,16354,50.861378,3.948574,29.441712,1178865.0,13.980063,16227,False,,51.304835,6362.498978,0.978110
a,False,188,0.151092,0.140711,99.188886,3502.0,8.161375,185,True,3379.0,0.152878,16.621585,3.142282
l7Rn6,False,21904,208.119507,5.342906,5.496592,4823794.0,15.389071,21728,False,,213.394617,18848.619805,0.581846


In [137]:
adata_sc.obs['cell_class']

F2S4_151217_005_B01        GABAergic
F2S4_151217_005_C01    Glutamatergic
F2S4_151217_005_E01    Glutamatergic
F2S4_151217_005_F01    Glutamatergic
F2S4_151217_005_G01    Glutamatergic
                           ...      
F1S4_180124_317_D01        GABAergic
F1S4_180124_317_E01        GABAergic
F1S4_180124_317_F01        GABAergic
F1S4_180124_317_G01        GABAergic
F1S4_180124_317_H01        GABAergic
Name: cell_class, Length: 21697, dtype: category
Categories (4, object): ['Endothelial', 'GABAergic', 'Glutamatergic', 'Non-Neuronal']

In [139]:
adata_sc_sd

AnnData object with n_obs × n_vars = 26824 × 27933
    obs: 'sample_name', 'cell_class', 'cell_subclass'
    var: 'gene'

In [140]:
#from tutorial
sc.tl.rank_genes_groups(adata_sc, groupby="cell_subclass", use_raw=False)
markers_df = pd.DataFrame(adata_sc.uns["rank_genes_groups"]["names"]).iloc[0:100, :]
markers = list(np.unique(markers_df.melt().value.values))
len(markers)


1401

In [141]:
markers

['1110008P14Rik',
 '1700001L19Rik',
 '1700019D03Rik',
 '1700019L22Rik',
 '1700047M11Rik',
 '2010111I01Rik',
 '2010300C02Rik',
 '2610001J05Rik',
 '2610028E06Rik',
 '2810468N07Rik',
 '2900011O08Rik',
 '2900055J20Rik',
 '3110035E14Rik',
 '4632428N05Rik',
 '6330403A02Rik',
 '6330403K07Rik',
 '9130024F11Rik',
 '9430020K01Rik',
 'A330050F15Rik',
 'A530058N18Rik',
 'A830018L16Rik',
 'AF251705',
 'ATP6',
 'Abat',
 'Abcb1a',
 'Abcc9',
 'Abcg2',
 'Abracl',
 'Ace2',
 'Ache',
 'Ackr3',
 'Acot7',
 'Acsbg1',
 'Acsl3',
 'Acta2',
 'Actb',
 'Actr3b',
 'Acvrl1',
 'Adamts19',
 'Adap2',
 'Adarb2',
 'Adcy1',
 'Adcy2',
 'Adcyap1',
 'Adora1',
 'Adra1a',
 'Adra1b',
 'Afap1',
 'Ahi1',
 'Ahnak',
 'Aif1',
 'Ak4',
 'Ak5',
 'Alcam',
 'Aldh1a2',
 'Aldoc',
 'Alox5ap',
 'Amigo2',
 'Anapc11',
 'Ank1',
 'Ankrd29',
 'Ankrd33b',
 'Ankrd35',
 'Ano3',
 'Anxa2',
 'Anxa3',
 'Anxa5',
 'Ap1s2',
 'Apod',
 'Apoe',
 'Apold1',
 'Appl2',
 'Aqp4',
 'Araf',
 'Arhgap25',
 'Arhgap29',
 'Arhgap31',
 'Arhgap32',
 'Arhgdib',
 'Arhgdig',
 

In [148]:
#concat marker dfs for each class to get markers_sd
markers_df = pd.read_feather('/bigdata/isaac/df_marker_2024-05-19.feather')
markers = sorted(markers_df.index)

In [156]:
len(markers)

262

# Load subsetted visium spatial data

In [207]:
adata_st_test_subset = sc.read_h5ad('/bigdata/isaac/tangram_integration/adata_st_test_subset.h5ad')

In [208]:
adata_st_test_subset

AnnData object with n_obs × n_vars = 582 × 15091
    obs: 'in_tissue', 'array_row', 'array_col', 'thing', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts', 'clusters'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'spatial'
    obsm: 'spatial'

In [209]:
tg.pp_adatas(adata_sc_sd, adata_st_test_subset, genes=markers, gene_to_lowercase = True)

INFO:root:249 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:14928 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.


In [210]:
ad_map = tg.map_cells_to_space(adata_sc_sd, adata_st_test_subset,
    #mode="cells",
    mode="clusters",
    cluster_label='cell_subclass',  # .obs field w cell types
    density_prior='rna_count_based',
    num_epochs=500,
    # device="cuda:0",
    device='cpu',
)

INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 249 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.313, KL reg: 0.030
Score: 0.580, KL reg: 0.001
Score: 0.581, KL reg: 0.001
Score: 0.581, KL reg: 0.001
Score: 0.581, KL reg: 0.001


INFO:root:Saving results..


In [211]:
ad_map

AnnData object with n_obs × n_vars = 103 × 582
    obs: 'cell_subclass', 'cluster_density'
    var: 'in_tissue', 'array_row', 'array_col', 'thing', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts', 'clusters', 'uniform_density', 'rna_count_based_density'
    uns: 'train_genes_df', 'training_history'

In [288]:
tg.project_cell_annotations(ad_map, adata_st_test_subset, annotation="cell_subclass")
annotation_list = list(pd.unique(adata_sc_sd.obs['cell_subclass']))
tg.plot_cell_annotation_sc(adata_st_test_subset, annotation_list,perc=0.02)

INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.


ValueError: Key tmem100 could be found in both .var_names and .obs.columns

In [278]:
annotation_list

['chat-isl1',
 'sncg-htr3a',
 'vip-cbln2',
 'cpne4-pde3a',
 'reln-ndnf',
 'dab1-nfib',
 'kit-pdlim5',
 'hcrtr2-id2',
 'pthlh-pvalb',
 'rbp4-kitl',
 'moxd1-vwc2',
 'maf-mafb',
 'npy-sst',
 'spon1-tox',
 'gm28884-fam159b',
 'st18-gm17660',
 'calcr-peg10',
 'prlr-pde1c',
 'cbln1-grp',
 'tmem100-',
 'unc13c-crtac1',
 'igfbp6-rprm',
 'wfs1-prok2',
 'jsrp1-col18a1',
 'lpl-',
 'rmst-mgat4c',
 'oprk1-trhde',
 'lmo1-chn2',
 'gal-asb4',
 'igsf1-a230065h16rik',
 'foxp2-gpr88',
 'npy2r-',
 'nwd2-',
 'lypd1',
 'hs3st4-car10',
 'bc039966-ngb',
 'col23a1',
 'col6a1',
 'myh7-',
 'htr1f-tshz1',
 'col11a1-pde11a',
 'pax6-npnt',
 'calca',
 'dsc3-mfge8',
 'nrn1-slc17a7',
 'pou3f2-zfhx3',
 'cyp26b1-dach1',
 'ptn-ndst4',
 'megf11-gpc5',
 'bc048546-dlx1',
 'sim1-arhgdib',
 'gm11549-prox1',
 'il33-fbln1',
 'cd44-cplx3',
 'dcn-arhgap6',
 '2410022m11rik-cd24a',
 'pde11a-vgll3',
 'lypd6-',
 'bdnf-pcdh11x',
 'igfbp4-mgp',
 'cpne2-adgrg6',
 'tac1-angpt1',
 'sema5a-medag',
 'snca-foxp2',
 'spon1-tmem215',
 'rxfp1-f

In [228]:
np.where(adata_st_test_subset.obs.columns == 'tmem100')

(array([38]),)

In [231]:
adata_st_test_subset.obs.columns[38] = 'tmem100-'

TypeError: Index does not support mutable operations

In [219]:
list(adata_st_test_subset.obs.columns)

['in_tissue',
 'array_row',
 'array_col',
 'thing',
 'n_genes_by_counts',
 'log1p_n_genes_by_counts',
 'total_counts',
 'log1p_total_counts',
 'pct_counts_in_top_50_genes',
 'pct_counts_in_top_100_genes',
 'pct_counts_in_top_200_genes',
 'pct_counts_in_top_500_genes',
 'total_counts_mt',
 'log1p_total_counts_mt',
 'pct_counts_mt',
 'n_counts',
 'clusters',
 'uniform_density',
 'rna_count_based_density',
 'chat-isl1',
 'sncg-htr3a',
 'vip-cbln2',
 'cpne4-pde3a',
 'reln-ndnf',
 'dab1-nfib',
 'kit-pdlim5',
 'hcrtr2-id2',
 'pthlh-pvalb',
 'rbp4-kitl',
 'moxd1-vwc2',
 'maf-mafb',
 'npy-sst',
 'spon1-tox',
 'gm28884-fam159b',
 'st18-gm17660',
 'calcr-peg10',
 'prlr-pde1c',
 'cbln1-grp',
 'tmem100',
 'unc13c-crtac1',
 'igfbp6-rprm',
 'wfs1-prok2',
 'jsrp1-col18a1',
 'lpl',
 'rmst-mgat4c',
 'oprk1-trhde',
 'lmo1-chn2',
 'gal-asb4',
 'igsf1-a230065h16rik',
 'foxp2-gpr88',
 'npy2r',
 'nwd2',
 'lypd1',
 'hs3st4-car10',
 'bc039966-ngb',
 'col23a1',
 'col6a1',
 'myh7',
 'htr1f-tshz1',
 'col11a1-pde

In [214]:
tg.plot_training_scores(ad_map, bins=20, alpha=.5)

In [279]:
sc.pl.spatial(
        adata_st_test_subset, color=annotation_list, cmap="viridis", show=False, frameon=False, spot_size=50,
        scale_factor=.32, alpha_img=1.0, bw=False, ax=None
    )

KeyError: 'Could not find key chat-isl1 in .var_names or .obs.columns.'

In [216]:
list(adata_st_test_subset.obs.columns)

['in_tissue',
 'array_row',
 'array_col',
 'thing',
 'n_genes_by_counts',
 'log1p_n_genes_by_counts',
 'total_counts',
 'log1p_total_counts',
 'pct_counts_in_top_50_genes',
 'pct_counts_in_top_100_genes',
 'pct_counts_in_top_200_genes',
 'pct_counts_in_top_500_genes',
 'total_counts_mt',
 'log1p_total_counts_mt',
 'pct_counts_mt',
 'n_counts',
 'clusters',
 'uniform_density',
 'rna_count_based_density',
 'chat-isl1',
 'sncg-htr3a',
 'vip-cbln2',
 'cpne4-pde3a',
 'reln-ndnf',
 'dab1-nfib',
 'kit-pdlim5',
 'hcrtr2-id2',
 'pthlh-pvalb',
 'rbp4-kitl',
 'moxd1-vwc2',
 'maf-mafb',
 'npy-sst',
 'spon1-tox',
 'gm28884-fam159b',
 'st18-gm17660',
 'calcr-peg10',
 'prlr-pde1c',
 'cbln1-grp',
 'tmem100',
 'unc13c-crtac1',
 'igfbp6-rprm',
 'wfs1-prok2',
 'jsrp1-col18a1',
 'lpl',
 'rmst-mgat4c',
 'oprk1-trhde',
 'lmo1-chn2',
 'gal-asb4',
 'igsf1-a230065h16rik',
 'foxp2-gpr88',
 'npy2r',
 'nwd2',
 'lypd1',
 'hs3st4-car10',
 'bc039966-ngb',
 'col23a1',
 'col6a1',
 'myh7',
 'htr1f-tshz1',
 'col11a1-pde

In [182]:
#repeat but with full spatial data...
adata_st_test = sc.read_h5ad('/bigdata/isaac/tangram_integration/adata_st_test.h5ad')
tg.pp_adatas(adata_sc_sd, adata_st_test, genes=markers)
ad_map = tg.map_cells_to_space(adata_sc_sd, adata_st_test,
    #mode="cells",
    mode="clusters",
    cluster_label='cell_subclass',  # .obs field w cell types
    density_prior='rna_count_based',
    num_epochs=500,
    # device="cuda:0",
    device='cpu',
)
tg.project_cell_annotations(ad_map, adata_st_test, annotation="cell_subclass")
annotation_list = list(pd.unique(adata_sc_sd.obs['cell_subclass']))
tg.plot_cell_annotation_sc(adata_st_test, annotation_list,perc=0.02)

INFO:root:257 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:15080 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 257 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.300, KL reg: 0.026
Score: 0.576, KL reg: 0.001
Score: 0.577, KL reg: 0.001
Score: 0.578, KL reg: 0.001
Score: 0.578, KL reg: 0.001


INFO:root:Saving results..
INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.
