In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
from anndata import AnnData
from scipy.sparse import issparse
from scipy.sparse import csr_array
import anndata as ad

In [2]:
# Locations of the two input files: spatial counts and the single-cell reference.
spatial_h5ad_path = 'C:/Users/mdichgan/Documents/Helmholtz/send_to_Jakob/spatial/counts_CPc_exp0_BA28.h5ad'
reference_h5ad_path = 'C:/Users/mdichgan/Documents/Helmholtz/send_to_Jakob/sc/Yao_150kcells_subsample_with_annotations_sparse_subset.h5ad'

# Read both files into AnnData objects.
adata_exp0 = ad.read_h5ad(spatial_h5ad_path)
adata_Yao = ad.read_h5ad(reference_h5ad_path)

In [3]:
# The reference stores its annotation under "label"; expose it under the
# "celltype" column name that the metric code reads.
adata_Yao.obs["celltype"] = adata_Yao.obs["label"]

# Store independent copies: assigning .X itself would make the layer and .X
# share one matrix, so any later in-place change of .X would also change "raw".
adata_exp0.layers["raw"] = adata_exp0.X.copy()
adata_Yao.layers["raw"] = adata_Yao.X.copy()

# NOTE(review): normalisation is switched off (the two calls below are
# commented out), so "lognorm" currently holds exactly the values of .X as
# loaded from disk - confirm whether the files are already normalised.
# sc.pp.normalize_total(adata_exp0)
# sc.pp.normalize_total(adata_Yao)
adata_exp0.layers["lognorm"] = adata_exp0.X.copy()
adata_Yao.layers["lognorm"] = adata_Yao.X.copy()

In [4]:
# adata_exp0 has no x/y coordinates, so temporary random ones are assigned.
# One coordinate is drawn per observation (adata_exp0.n_obs) instead of relying
# on a hard-coded cell count, and the generator is seeded so that a fresh
# "Restart & Run All" reproduces the same layout.
rng = np.random.default_rng(0)
adata_exp0.obs["x"] = rng.uniform(0, 200, adata_exp0.n_obs)
adata_exp0.obs["y"] = rng.uniform(0, 100, adata_exp0.n_obs)

In [155]:
def relative_expression_similarity_across_genes_local(adata_sp: AnnData, adata_sc: AnnData, bins_x: list[float], bins_y: list[float], key:str='celltype', layer:str='lognorm'):
    """Per-bin similarity of relative expression across gene pairs (spatial vs. single-cell).

    Both datasets are restricted to their shared genes and shared cell types.
    For the single-cell data, and separately for the spatial cells that fall in
    each rectangular bin, the mean expression per cell type is divided by the
    summed expression of the respective cells and multiplied by
    ``n_shared_genes ** 2``. The differences between all gene pairs are then
    compared, and each bin scores
    ``1 - sum(|diff_sp - diff_sc|) / (2 * sum(|diff_sc|))``.

    Parameters
    ----------
    adata_sp
        Spatial data; ``adata_sp.obs`` must contain the columns ``"x"``, ``"y"``
        and ``key``. The object is not modified.
    adata_sc
        Single-cell reference; ``adata_sc.obs`` must contain ``key``.
        The object is not modified.
    bins_x, bins_y
        Bin edges along x and y. Bins are half-open: ``start <= value < end``.
    key
        Name of the ``obs`` column holding the cell type labels.
    layer
        Name of the layer (present in both objects) holding the expression values.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(bins_y) - 1, len(bins_x) - 1)``. Column ``j``
        belongs to the j-th x bin; row 0 belongs to the LAST y bin (rows are
        filled from the bottom up). An entry is NaN when the bin lacks one of
        the shared cell types, when its summed expression is zero, or when the
        reference normalisation is undefined (zero summed expression or no
        variation between genes).

    Notes
    -----
    The pairwise-difference arrays have shape
    ``(n_celltypes, n_genes, n_genes)``, so memory grows quadratically with the
    number of shared genes.

    NOTE(review): the means are per-cell averages, whereas the normaliser is the
    sum over all cells, so the scale of the normalised values depends on the
    number of cells entering each side - confirm this is intended.
    """
    min_number_cells = 1
    n_bins_x = len(bins_x) - 1
    n_bins_y = len(bins_y) - 1

    # Every bin starts out as "undefined"; only bins that can be scored are overwritten.
    gridfield_metric = np.full((n_bins_y, n_bins_x), np.nan)

    ### SET UP
    # shared genes, in the (deterministic) order in which they appear in adata_sp
    sc_gene_names = set(adata_sc.var_names)
    intersect_genes = [gene for gene in adata_sp.var_names if gene in sc_gene_names]
    n_intersect_genes = len(intersect_genes)

    # shared cell types, looked up via `key` rather than a hard-coded column name
    intersect_celltypes = list(set(adata_sp.obs[key]).intersection(set(adata_sc.obs[key])))

    # restrict both datasets to shared genes and shared cell types; indexing
    # creates new objects, so the caller's AnnData objects stay untouched
    adata_sc = adata_sc[:, intersect_genes]
    adata_sp = adata_sp[:, intersect_genes]
    adata_sc = adata_sc[adata_sc.obs[key].isin(intersect_celltypes).to_numpy(), :]
    adata_sp = adata_sp[adata_sp.obs[key].isin(intersect_celltypes).to_numpy(), :]

    # dense expression matrices, read straight from the requested layer
    # (works for sparse and dense storage alike)
    sc_X = adata_sc.layers[layer]
    sc_X = sc_X.toarray() if issparse(sc_X) else np.asarray(sc_X)
    sp_X = adata_sp.layers[layer]
    sp_X = sp_X.toarray() if issparse(sp_X) else np.asarray(sp_X)

    # labels as plain object arrays so that groupby only sees labels that are present
    labels_sc = adata_sc.obs[key].astype(object).to_numpy()
    labels_sp = adata_sp.obs[key].astype(object).to_numpy()

    # cell positions as arrays; bins are selected with positional boolean
    # masks, so no assumption about the format of the obs names is needed
    x_coord = adata_sp.obs["x"].to_numpy()
    y_coord = adata_sp.obs["y"].to_numpy()

    ### REFERENCE (single cell)
    sum_sc = sc_X.sum()
    if sum_sc == 0:
        # normalisation undefined -> no bin can be scored
        return gridfield_metric

    exp_sc = pd.DataFrame(sc_X, columns=intersect_genes)
    mean_celltype_sc_normalized = ((exp_sc.groupby(labels_sc).mean() / sum_sc) * (n_intersect_genes) ** 2).to_numpy()

    # differences between all gene pairs, separately for every cell type
    pairwise_diff_sc = mean_celltype_sc_normalized[:, :, np.newaxis] - mean_celltype_sc_normalized[:, np.newaxis, :]
    norm_sc = 2 * np.sum(np.abs(pairwise_diff_sc))
    if norm_sc == 0:
        # the score would require a division by zero
        return gridfield_metric

    sc_celltypes = set(labels_sc)

    ### PER-BIN COMPARISON
    for j, (x_start, x_end) in enumerate(zip(bins_x[:-1], bins_x[1:])):
        in_x = (x_coord >= x_start) & (x_coord < x_end)
        for i, (y_start, y_end) in enumerate(zip(bins_y[:-1], bins_y[1:])):
            in_bin = in_x & (y_coord >= y_start) & (y_coord < y_end)

            local_X = sp_X[in_bin]
            local_labels = labels_sp[in_bin]
            cells_per_celltype = pd.Series(local_labels).value_counts()
            sum_sp_local = local_X.sum()

            # skip bins that lack a cell type, hold too few cells of one, or
            # have no expression at all (their entry stays NaN)
            if (
                (cells_per_celltype < min_number_cells).any()
                or not sc_celltypes.issubset(set(cells_per_celltype.index))
                or sum_sp_local == 0
            ):
                continue

            exp_sp_local = pd.DataFrame(local_X, columns=intersect_genes)
            mean_celltype_sp_normalized = ((exp_sp_local.groupby(local_labels).mean() / sum_sp_local) * (n_intersect_genes) ** 2).to_numpy()
            pairwise_diff_sp = mean_celltype_sp_normalized[:, :, np.newaxis] - mean_celltype_sp_normalized[:, np.newaxis, :]

            delta = np.sum(np.abs(pairwise_diff_sp - pairwise_diff_sc))
            # rows are filled bottom-up: the first y bin ends up in the last row
            gridfield_metric[n_bins_y - 1 - i, j] = 1 - delta / norm_sc

    return gridfield_metric
        

In [None]:
# Score a 5 x 2 grid (x edges every 40 units up to 200, y edges every 50 up to 100).
relative_expression_similarity_across_genes_local(
    adata_exp0,
    adata_Yao,
    bins_x=list(range(0, 201, 40)),
    bins_y=list(range(0, 101, 50)),
)

In [5]:
# Module-level stand-ins for the function arguments, used by the scratch cells below.
adata_sp = adata_exp0
adata_sc = adata_Yao
bins_x = list(range(0, 201, 40))
bins_y = list(range(0, 101, 50))
layer = "lognorm"
key = "celltype"

In [None]:
# Work-in-progress variant of the per-bin loop with two thresholds: a bin needs
# at least `min_total_cells` cells, and only cell types with more than
# `min_n_cells_per_ct` cells in the bin count as eligible.
min_total_cells = 100
min_n_cells_per_ct = 10
n_bins_x = len(bins_x) - 1
n_bins_y = len(bins_y) - 1

### SET UP
# set the .X layer of each of the adatas to be log-normalized counts
adata_sp.X = adata_sp.layers[layer]
adata_sc.X = adata_sc.layers[layer]

# take the intersection of genes in adata_sp and adata_sc, as a list
intersect_genes = list(set(adata_sp.var_names).intersection(set(adata_sc.var_names)))
n_intersect_genes = len(intersect_genes)

# subset adata_sc and adata_sp to only include genes in the intersection of adata_sp and adata_sc
adata_sc = adata_sc[:, intersect_genes]
adata_sp = adata_sp[:, intersect_genes]

# find the intersection of unique celltypes in adata_sc and adata_sp
intersect_celltypes = list(set(adata_sp.obs[key]).intersection(set(adata_sc.obs[key])))

adata_sc = adata_sc[adata_sc.obs[key].isin(intersect_celltypes), :]
adata_sp = adata_sp[adata_sp.obs[key].isin(intersect_celltypes), :]

# Cells are selected per bin with positional boolean masks, so the obs names
# do not have to be rewritten into integers (and labels are never mistaken for
# row positions after the cell type filter above).
sp_x = adata_sp.obs["x"].to_numpy()
sp_y = adata_sp.obs["y"].to_numpy()
sp_labels = adata_sp.obs[key].astype(object).to_numpy()

gridfield_metric = np.zeros((n_bins_y, n_bins_x))

for j, (x_start, x_end) in enumerate(zip(bins_x[:-1], bins_x[1:])):
    for i, (y_start, y_end) in enumerate(zip(bins_y[:-1], bins_y[1:])):
        # Does adata_sp.obs always have x, y coordinates? Otherwise use the get_cells_location function.
        in_bin = (sp_x >= x_start) & (sp_x < x_end) & (sp_y >= y_start) & (sp_y < y_end)

        # too few cells in this bin -> mark it as undefined and move on
        if in_bin.sum() < min_total_cells:
            gridfield_metric[n_bins_y - 1 - i, j] = np.nan
            continue

        # expression of the cells in this bin: rows are cells, columns are genes
        sp_local = adata_sp.X[in_bin, :]
        sp_local = sp_local.toarray() if issparse(sp_local) else np.asarray(sp_local)
        sp_local = pd.DataFrame(sp_local, columns=intersect_genes)
        sp_local[key] = sp_labels[in_bin]

        # cell types that are represented by enough cells in this bin
        n_cells_per_ct = sp_local[key].value_counts()
        eligible_ct = n_cells_per_ct[n_cells_per_ct > min_n_cells_per_ct].index
        # boolean mask over the rows of sp_local: cell belongs to an eligible cell type
        enough_cells = sp_local[key].isin(eligible_ct)

        # TODO: the metric for bins that pass the thresholds is not implemented
        # yet; such bins keep their initial value of 0 in gridfield_metric.




In [149]:
def relative_expression_similarity_across_cell_type_clusters(adata_sp: AnnData, adata_sc: AnnData, bins_x: list[float], bins_y: list[float], key:str='celltype', layer:str='lognorm'):
    """Per-bin similarity of relative expression across cell type pairs (spatial vs. single-cell).

    Both datasets are restricted to their shared genes and shared cell types.
    For the single-cell data, and separately for the spatial cells that fall in
    each rectangular bin, the mean expression per cell type is divided by the
    summed expression of the respective cells and multiplied by
    ``n_shared_celltypes ** 2``. For every gene, the differences between all
    pairs of cell types are then compared, and each bin scores
    ``1 - sum(|diff_sp - diff_sc|) / (2 * sum(|diff_sc|))``.

    Parameters
    ----------
    adata_sp
        Spatial data; ``adata_sp.obs`` must contain the columns ``"x"``, ``"y"``
        and ``key``. The object is not modified.
    adata_sc
        Single-cell reference; ``adata_sc.obs`` must contain ``key``.
        The object is not modified.
    bins_x, bins_y
        Bin edges along x and y. Bins are half-open: ``start <= value < end``.
    key
        Name of the ``obs`` column holding the cell type labels.
    layer
        Name of the layer (present in both objects) holding the expression values.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(bins_y) - 1, len(bins_x) - 1)``. Column ``j``
        belongs to the j-th x bin; row 0 belongs to the LAST y bin (rows are
        filled from the bottom up). An entry is NaN when the bin lacks one of
        the shared cell types, when its summed expression is zero, or when the
        reference normalisation is undefined (zero summed expression or no
        variation between cell types).

    Notes
    -----
    NOTE(review): the score is not bounded below by 0. The means are per-cell
    averages, whereas the normaliser is the sum over all cells, so the scale of
    the normalised values depends on the number of cells entering each side -
    confirm this is intended.
    """
    min_number_cells = 1
    n_bins_x = len(bins_x) - 1
    n_bins_y = len(bins_y) - 1

    # Every bin starts out as "undefined"; only bins that can be scored are overwritten.
    gridfield_metric = np.full((n_bins_y, n_bins_x), np.nan)

    ### SET UP
    # shared genes, in the (deterministic) order in which they appear in adata_sp
    sc_gene_names = set(adata_sc.var_names)
    intersect_genes = [gene for gene in adata_sp.var_names if gene in sc_gene_names]

    # shared cell types, looked up via `key` rather than a hard-coded column name
    intersect_celltypes = list(set(adata_sp.obs[key]).intersection(set(adata_sc.obs[key])))
    n_intersect_celltypes = len(intersect_celltypes)

    # restrict both datasets to shared genes and shared cell types; indexing
    # creates new objects, so the caller's AnnData objects stay untouched
    adata_sc = adata_sc[:, intersect_genes]
    adata_sp = adata_sp[:, intersect_genes]
    adata_sc = adata_sc[adata_sc.obs[key].isin(intersect_celltypes).to_numpy(), :]
    adata_sp = adata_sp[adata_sp.obs[key].isin(intersect_celltypes).to_numpy(), :]

    # dense expression matrices, read straight from the requested layer
    # (works for sparse and dense storage alike)
    sc_X = adata_sc.layers[layer]
    sc_X = sc_X.toarray() if issparse(sc_X) else np.asarray(sc_X)
    sp_X = adata_sp.layers[layer]
    sp_X = sp_X.toarray() if issparse(sp_X) else np.asarray(sp_X)

    # labels as plain object arrays so that groupby only sees labels that are present
    labels_sc = adata_sc.obs[key].astype(object).to_numpy()
    labels_sp = adata_sp.obs[key].astype(object).to_numpy()

    # cell positions as arrays; bins are selected with positional boolean
    # masks, so no assumption about the format of the obs names is needed
    x_coord = adata_sp.obs["x"].to_numpy()
    y_coord = adata_sp.obs["y"].to_numpy()

    ### REFERENCE (single cell)
    sum_sc = sc_X.sum()
    if sum_sc == 0:
        # normalisation undefined -> no bin can be scored
        return gridfield_metric

    exp_sc = pd.DataFrame(sc_X, columns=intersect_genes)
    mean_celltype_sc_normalized = (exp_sc.groupby(labels_sc).mean() / sum_sc) * (n_intersect_celltypes) ** 2

    # transpose to genes x cell types, then take the differences between all
    # pairs of cell types separately for every gene
    mean_celltype_sc_normalized = mean_celltype_sc_normalized.T.to_numpy()
    pairwise_diff_sc = mean_celltype_sc_normalized[:, :, np.newaxis] - mean_celltype_sc_normalized[:, np.newaxis, :]
    norm_sc = 2 * np.sum(np.abs(pairwise_diff_sc))
    if norm_sc == 0:
        # the score would require a division by zero
        return gridfield_metric

    sc_celltypes = set(labels_sc)

    ### PER-BIN COMPARISON
    for j, (x_start, x_end) in enumerate(zip(bins_x[:-1], bins_x[1:])):
        in_x = (x_coord >= x_start) & (x_coord < x_end)
        for i, (y_start, y_end) in enumerate(zip(bins_y[:-1], bins_y[1:])):
            in_bin = in_x & (y_coord >= y_start) & (y_coord < y_end)

            local_X = sp_X[in_bin]
            local_labels = labels_sp[in_bin]
            cells_per_celltype = pd.Series(local_labels).value_counts()
            sum_sp_local = local_X.sum()

            # skip bins that lack a cell type, hold too few cells of one, or
            # have no expression at all (their entry stays NaN); the guard
            # tests the LOCAL spatial sum, which is the divisor used below
            if (
                (cells_per_celltype < min_number_cells).any()
                or not sc_celltypes.issubset(set(cells_per_celltype.index))
                or sum_sp_local == 0
            ):
                continue

            exp_sp_local = pd.DataFrame(local_X, columns=intersect_genes)
            mean_celltype_sp_normalized = (exp_sp_local.groupby(local_labels).mean() / sum_sp_local) * (n_intersect_celltypes) ** 2

            # groupby sorts the labels, and the bin holds exactly the reference's
            # cell types, so the cell type axes of sc and sp line up
            mean_celltype_sp_normalized = mean_celltype_sp_normalized.T.to_numpy()
            pairwise_diff_sp = mean_celltype_sp_normalized[:, :, np.newaxis] - mean_celltype_sp_normalized[:, np.newaxis, :]

            delta = np.sum(np.abs(pairwise_diff_sp - pairwise_diff_sc))
            # rows are filled bottom-up: the first y bin ends up in the last row
            gridfield_metric[n_bins_y - 1 - i, j] = 1 - delta / norm_sc

    return gridfield_metric

    

In [150]:
# Score a 5 x 2 grid (x edges every 40 units up to 200, y edges every 50 up to 100).
relative_expression_similarity_across_cell_type_clusters(
    adata_exp0,
    adata_Yao,
    bins_x=list(range(0, 201, 40)),
    bins_y=list(range(0, 101, 50)),
)

array([[-2.61672153, -2.39644769, -2.48909325, -2.3328498 , -2.21449466],
       [-2.41956707, -2.19773937, -2.43271305, -2.27578325, -2.6306601 ]])

In [None]:
# Step-by-step version of the metric: this cell prepares the shared state that
# the following cells consume.
layer = "lognorm"
key = "celltype"
adata_sp, adata_sc = adata_exp0, adata_Yao
# Both axes start at 0. With a start of 1 and a step of 20 the y edges were
# [1, 21, 41, 61, 81], which silently dropped every cell with y < 1 or y >= 81.
bins_x, bins_y = list(range(0,201,20)),list(range(0,101,20))
min_number_cells = 10
n_bins_x = len(bins_x) - 1
n_bins_y = len(bins_y) - 1

### SET UP
# set the .X layer of each of the adatas to be log-normalized counts
adata_sp.X = adata_sp.layers[layer]
adata_sc.X = adata_sc.layers[layer]

# take the intersection of genes in adata_sp and adata_sc, as a list
intersect_genes = list(set(adata_sp.var_names).intersection(set(adata_sc.var_names)))
n_intersect_genes = len(intersect_genes)

# subset adata_sc and adata_sp to only include genes in the intersection of adata_sp and adata_sc
adata_sc=adata_sc[:,intersect_genes]
adata_sp=adata_sp[:,intersect_genes]

# find the intersection of unique celltypes in adata_sc and adata_sp
# (looked up via `key`, like every other access to the label column)
intersect_celltypes = list(set(adata_sp.obs[key]).intersection(set(adata_sc.obs[key])))

adata_sc = adata_sc[adata_sc.obs[key].isin(intersect_celltypes), :]
adata_sp = adata_sp[adata_sp.obs[key].isin(intersect_celltypes), :]

# for easier subsetting later: drop the first five characters of every obs name
# and parse the rest as an integer
# NOTE(review): assumes obs names of the form "<5-character prefix><number>" - confirm
adata_sp.obs.index = adata_sp.obs.index.str[5:].astype(int)

# get dense expression arrays (sparse or dense storage) and the sum of expression for normalization
sc_X = adata_sc.X.toarray() if issparse(adata_sc.X) else np.asarray(adata_sc.X)
sp_X = adata_sp.X.toarray() if issparse(adata_sp.X) else np.asarray(adata_sp.X)
sum_sc = np.sum(sc_X)

# get the adata_sc cell x gene matrix as a pandas dataframe (w gene names as column names)
exp_sc=pd.DataFrame(sc_X,columns=adata_sc.var.index)

# get the adata_sp cell x gene matrix as a pandas dataframe (w gene names as column names)
exp_sp=pd.DataFrame(sp_X,columns=adata_sp.var.index)

# add the cell type label column to the exp_sc & exp_sp cell x gene matrices
exp_sc[key]=list(adata_sc.obs[key])
exp_sp[key]=list(adata_sp.obs[key])

# give exp_sp the same row labels as adata_sp.obs so that cells picked from obs
# can be looked up with .loc
exp_sp.index = adata_sp.obs.index

In [None]:
# get the mean expression per celltype and normalize by sum_sc and n_intersect_genes
mean_celltype_sc = exp_sc.groupby(key).mean()
if sum_sc != 0:
    mean_celltype_sc_normalized = (mean_celltype_sc / sum_sc) * n_intersect_genes
else:
    # Keep a DataFrame of zeros (same rows/columns) instead of the scalar 0,
    # so that the next cell can still call .to_numpy() on the result.
    mean_celltype_sc_normalized = mean_celltype_sc * 0.0

In [None]:
# get pairwise differences of normalized mean expression of sc
# np.asarray accepts both the DataFrame from the previous cell and the array
# left behind by an earlier run of this cell, so the cell can be re-run safely.
mean_celltype_sc_normalized = np.asarray(mean_celltype_sc_normalized)
# result[c, g, h] = value[c, g] - value[c, h] for every row c and column pair (g, h)
pairwise_diff_sc = mean_celltype_sc_normalized[:,:,np.newaxis] - mean_celltype_sc_normalized[:,np.newaxis,:]

In [None]:
# One entry per spatial bin: rows follow the y bins, columns the x bins.
gridfield_metric = np.zeros(shape=(n_bins_y, n_bins_x))
gridfield_metric.shape

In [None]:
# Score every spatial bin against the single-cell reference.
# Work that does not depend on the bin is done once, before the loops.
xy = adata_sp.obs[["x", "y"]]
sc_celltypes = set(exp_sc[key].unique())
norm_sc = 2 * np.sum(np.abs(pairwise_diff_sc))

for j, (x_start, x_end) in enumerate(zip(bins_x[:-1], bins_x[1:])):
    for i, (y_start, y_end) in enumerate(zip(bins_y[:-1], bins_y[1:])):
        # Does adata_sp.obs always have x, y coordinates? Otherwise use the get_cells_location function.
        # cells inside the half-open bin [x_start, x_end) x [y_start, y_end)
        df = xy[
            (xy["x"] >= x_start)
            & (xy["x"] < x_end)
            & (xy["y"] >= y_start)
            & (xy["y"] < y_end)
        ]

        sp_local = exp_sp.loc[df.index, :]
        # total expression in the bin; the last column holds the labels and is left out
        sum_sp_local = sp_local.iloc[:, :-1].to_numpy().sum()

        # A bin is not scored (NaN) when a present cell type has fewer than
        # `min_number_cells` cells, when a reference cell type is missing, or
        # when the bin has no expression at all (division by zero below).
        if (
            (sp_local[key].value_counts() < min_number_cells).any()
            or not sc_celltypes.issubset(set(sp_local[key].unique()))
            or sum_sp_local == 0
        ):
            # or better subset sc for existing celltypes and compare?
            gridfield_metric[n_bins_y - 1 - i, j] = np.nan
            continue

        # mean expression per celltype, scaled exactly like the single-cell
        # means (divided by the total expression, multiplied by n_intersect_genes)
        mean_celltype_sp_normalized = (sp_local.groupby(key).mean() / sum_sp_local) * n_intersect_genes
        mean_celltype_sp_normalized = mean_celltype_sp_normalized.to_numpy()
        pairwise_diff_sp = mean_celltype_sp_normalized[:, :, np.newaxis] - mean_celltype_sp_normalized[:, np.newaxis, :]

        delta = np.sum(np.abs(pairwise_diff_sp - pairwise_diff_sc))
        # rows are filled bottom-up: the first y bin ends up in the last row
        gridfield_metric[n_bins_y - 1 - i, j] = 1 - delta / norm_sc


In [None]:
# Display the per-bin metric grid filled by the loop cell.
gridfield_metric

In [None]:
# Cell types present in the most recently processed bin.
sp_local.loc[:, "celltype"].unique()

In [None]:
# Cell types present in the single-cell reference, as a set.
set(exp_sc["celltype"].unique())

In [None]:
# True when every reference cell type also occurs in the most recently processed bin.
set(exp_sc["celltype"].unique()).issubset(sp_local["celltype"].unique())

In [144]:
# Toy check of the broadcasting pattern used for the pairwise differences:
# for every row r the result holds a[r, p] - a[r, q] for all column pairs (p, q).
a = np.array([[1, 2], [0, 2], [0, 3]])
np.expand_dims(a, 2) - np.expand_dims(a, 1)


array([[[ 0, -1],
        [ 1,  0]],

       [[ 0, -2],
        [ 2,  0]],

       [[ 0, -3],
        [ 3,  0]]])

In [None]:
# Minimal AnnData built from a 4 x 2 matrix, used by the scratch test below.
toy_matrix = np.array([[0, 1], [1, 2], [5, 4], [0, 2]])
adata = ad.AnnData(toy_matrix)
adata

In [None]:
# Scratch test of the half-open binning on a four-row toy table: for every bin
# the edges and the row labels of the rows that fall inside are printed.

toy_rows = [[0, 0, 0], [1, 1, 1], [2, 0, 1], [3, 1, 3]]
t = pd.DataFrame(np.array(toy_rows), columns=["x", "y", "score"])
adata.obs = t
bins_x = [0, 2, 3, 4]
bins_y = [0, 2]
t["test"] = 0
k = 0

for x_start, x_end in zip(bins_x[:-1], bins_x[1:]):
    for y_start, y_end in zip(bins_y[:-1], bins_y[1:]):
        # lower edge inclusive, upper edge exclusive on both axes
        inside_x = t["x"].ge(x_start) & t["x"].lt(x_end)
        inside_y = t["y"].ge(y_start) & t["y"].lt(y_end)
        df = t[inside_x & inside_y]
        print(x_start, x_end)
        print(y_start, y_end)
        print(df.index)
