In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
from anndata import AnnData
from scipy.sparse import issparse
import anndata as ad

In [13]:
# Root folder of the shared data set. Change this single constant when the
# notebook is run on another machine instead of editing every path below.
DATA_DIR = 'C:/Users/mdichgan/Documents/Helmholtz/send_to_Jakob'

# Spatial count matrix (file under "spatial/") and the scRNAseq reference (file under "sc/").
adata_exp0 = ad.read_h5ad(f'{DATA_DIR}/spatial/counts_CPc_exp0_BA28.h5ad')
adata_Yao = ad.read_h5ad(
    f'{DATA_DIR}/sc/Yao_150kcells_subsample_with_annotations_sparse_subset.h5ad')

In [6]:
# Expose the reference annotation under "celltype", the default `key` of the metrics below.
adata_Yao.obs["celltype"] = adata_Yao.obs["label"]

# Store independent copies: assigning `.X` itself would make the layer and `.X`
# share one buffer, so any later in-place change to `.X` would silently alter "raw".
adata_exp0.layers["raw"] = adata_exp0.X.copy()
adata_Yao.layers["raw"] = adata_Yao.X.copy()

# NOTE(review): normalisation is disabled, so "lognorm" currently holds exactly the
# same values as "raw". Confirm whether the loaded matrices are already normalised,
# otherwise re-enable the calls below (and a log transform) before filling the layer.
# sc.pp.normalize_total(adata_exp0)
# sc.pp.normalize_total(adata_Yao)
adata_exp0.layers["lognorm"] = adata_exp0.X.copy()
adata_Yao.layers["lognorm"] = adata_Yao.X.copy()

In [None]:
def relative_pairwise_celltype_expression(adata_sp: AnnData, adata_sc: AnnData, key:str='celltype', layer:str='lognorm',  pipeline_output: bool=True):
    """Score how well relative expression between cell types agrees between spatial and scRNAseq data.

    For every gene shared by both data sets the mean expression per cell type is
    computed in each modality. All pairwise differences between cell types are
    formed per gene and divided by that gene's mean absolute pairwise difference
    (genes without any difference between cell types are left unscaled).
    The metric is ``1 - sum|d_sp - d_sc| / (2 * sum|d_sc|)``, evaluated over all
    entries (overall), per gene and per cell type.

    Neither ``adata_sp`` nor ``adata_sc`` is modified; the requested layer is read
    directly instead of being written to ``.X``.

    Parameters
    ----------
    adata_sp : AnnData
        annotated ``AnnData`` object with counts from spatial data
    adata_sc : AnnData
        annotated ``AnnData`` object with counts from scRNAseq data
    key: str (default: 'celltype')
        .obs column of ``AnnData`` that contains celltype information
    layer: str (default: 'lognorm')
        layer of ``AnnData`` to use to compute the metric (dense or scipy sparse)
    pipeline_output: bool (default: True)
        whether to return only the overall metric (if False, will return the overall metric, per-gene metric and per-celltype metric)

    Returns
    -------
    overall_metric: float
        similarity of relative gene expression across all genes and celltypes, b/t the scRNAseq and spatial data
    per_gene_metric: pd.DataFrame
        one row per shared gene (alphabetical order), column ``'score'``; only returned if ``pipeline_output`` is False
    per_celltype_metric: pd.DataFrame
        one row per shared celltype, column ``'score'``; only returned if ``pipeline_output`` is False

    Scores whose denominator is zero (e.g. a gene that does not vary between cell
    types in the scRNAseq data, or no shared cell type at all) are non-finite
    (NaN / -inf) instead of raising or emitting warnings.
    """
    ### SET UP
    # genes measured in both modalities, in a deterministic (alphabetical) order
    shared_genes = sorted(set(adata_sp.var_names).intersection(adata_sc.var_names))

    # cell type label of every cell, positionally aligned with the rows of the layer
    sp_labels = pd.Series(list(adata_sp.obs[key]))
    sc_labels = pd.Series(list(adata_sc.obs[key]))

    # cell types of the scRNAseq data that also occur in the spatial data
    shared_celltypes = sc_labels[sc_labels.isin(sp_labels)].unique()

    def _mean_expression_per_celltype(adata, labels):
        """Return a (celltype x gene) frame of mean expression for the shared genes/celltypes."""
        matrix = adata.layers[layer]
        # select the shared genes by position so both modalities end up with identical columns
        matrix = matrix[:, pd.Index(adata.var_names).get_indexer(shared_genes)]
        if issparse(matrix):
            matrix = matrix.toarray()
        expression = pd.DataFrame(np.asarray(matrix), columns=shared_genes)
        keep = labels.isin(shared_celltypes).to_numpy()
        # groupby sorts the cell types, so rows are ordered identically for both modalities
        means = expression[keep].groupby(labels[keep].to_numpy()).mean()
        return means.rename_axis(key)

    def _normalized_pairwise_differences(means):
        """Return an array (n_celltypes, n_celltypes, n_genes) of scaled pairwise differences."""
        values = means.to_numpy(dtype=float)
        n_celltypes = values.shape[0]
        differences = values[:, np.newaxis, :] - values[np.newaxis, :, :]
        if n_celltypes == 0:
            # nothing to scale; also avoids dividing by zero cell types
            return differences
        norm_factor = (1 / (n_celltypes ** 2)) * np.abs(differences).sum(axis=(0, 1))
        # genes without any difference between cell types have factor 0 and stay unscaled
        varying = norm_factor != 0
        differences[:, :, varying] = differences[:, :, varying] / norm_factor[varying]
        return differences

    #### FIND MEAN GENE EXPRESSION PER CELL TYPE FOR EACH MODALITY
    mean_celltype_sc = _mean_expression_per_celltype(adata_sc, sc_labels)
    mean_celltype_sp = _mean_expression_per_celltype(adata_sp, sp_labels)

    #### CALCULATE NORMALIZED PAIRWISE DISTANCES BETWEEN CELL TYPES
    norm_pairwise_distances_sc = _normalized_pairwise_differences(mean_celltype_sc)
    norm_pairwise_distances_sp = _normalized_pairwise_differences(mean_celltype_sp)

    ##### CALCULATE OVERALL SCORE, PER-GENE SCORES, PER-CELLTYPE SCORES
    abs_delta = np.abs(norm_pairwise_distances_sp - norm_pairwise_distances_sc)
    abs_reference = np.abs(norm_pairwise_distances_sc)

    # zero denominators are expected for non-varying genes; report them as non-finite without warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        overall_metric = 1 - (abs_delta.sum() / (2 * abs_reference.sum()))

        per_gene_metric = 1 - (abs_delta.sum(axis=(0, 1)) / (2 * abs_reference.sum(axis=(0, 1))))
        per_celltype_metric = 1 - (abs_delta.sum(axis=(1, 2)) / (2 * abs_reference.sum(axis=(1, 2))))

    if pipeline_output:
        return overall_metric

    # add back the gene / celltype labels
    per_gene_metric = pd.DataFrame(per_gene_metric, index=mean_celltype_sc.columns, columns=['score'])
    per_celltype_metric = pd.DataFrame(per_celltype_metric, index=mean_celltype_sc.index, columns=['score'])

    return overall_metric, per_gene_metric, per_celltype_metric


In [None]:
# Overall score only (pipeline_output defaults to True), using the default key and layer.
relative_pairwise_celltype_expression(adata_sp=adata_exp0, adata_sc=adata_Yao)

In [2]:
def relative_pairwise_celltype_expression_local(adata_sp: AnnData, adata_sc: AnnData, bins_x: list[float], bins_y: list[float], key:str='celltype', layer:str='lognorm', min_number_cells: int=10):
    """Evaluate the relative pairwise celltype expression metric on a spatial grid.

    The spatial cells are assigned to the rectangular bins spanned by ``bins_x`` and
    ``bins_y`` (half-open intervals ``[start, end)``). For every bin the mean expression
    per cell type is computed from the cells inside the bin, each gene is scaled so that
    its cell-type means sum to one, and all pairwise differences between cell types are
    compared with the same quantity computed from the whole scRNAseq reference:
    ``1 - sum|d_sp - d_sc| / (2 * sum|d_sc|)``.

    NOTE(review): the original normalisation (``sum(exp_sc)``) could not run; it is
    interpreted here as the per-gene sum over the cell-type means. Confirm that this is
    the intended normalisation.
    NOTE(review): the cell coordinates are read from ``adata_sp.obs["x"]`` and
    ``adata_sp.obs["y"]``; if these columns are not always present, the coordinates have
    to be obtained elsewhere first (the original comment mentions a ``get_cells_location``
    helper that is not part of this file).

    Neither ``adata_sp`` nor ``adata_sc`` is modified.

    Parameters
    ----------
    adata_sp : AnnData
        annotated ``AnnData`` object with counts from spatial data; ``.obs`` must contain
        the columns ``"x"`` and ``"y"``
    adata_sc : AnnData
        annotated ``AnnData`` object with counts from scRNAseq data
    bins_x : list[float]
        increasing bin edges along x (``len(bins_x) - 1`` bins)
    bins_y : list[float]
        increasing bin edges along y (``len(bins_y) - 1`` bins)
    key: str (default: 'celltype')
        .obs column of ``AnnData`` that contains celltype information
    layer: str (default: 'lognorm')
        layer of ``AnnData`` to use to compute the metric (dense or scipy sparse)
    min_number_cells: int (default: 10)
        a bin is only scored if every shared cell type has at least this many cells in it

    Returns
    -------
    gridfield_metric: np.ndarray
        array of shape ``(n_bins_y, n_bins_x)``; column ``j`` is the j-th x bin and the
        rows are flipped so that the first row is the last y bin. Bins that cannot be
        scored (too few cells of some cell type, or a reference without any difference
        between cell types) are NaN.
    """
    n_bins_x = max(len(bins_x) - 1, 0)
    n_bins_y = max(len(bins_y) - 1, 0)

    # NaN marks "not scored"; a value of 0 would be indistinguishable from a real score
    gridfield_metric = np.full((n_bins_y, n_bins_x), np.nan)

    ### SET UP
    # genes measured in both modalities, in a deterministic (alphabetical) order
    shared_genes = sorted(set(adata_sp.var_names).intersection(adata_sc.var_names))

    # cell type label of every cell, positionally aligned with the rows of the layer
    sp_labels = pd.Series(list(adata_sp.obs[key]))
    sc_labels = pd.Series(list(adata_sc.obs[key]))

    # cell types present in both modalities
    shared_celltypes = sorted(set(sp_labels.dropna()).intersection(set(sc_labels.dropna())))

    def _expression_frame(adata):
        """Return the dense (cell x shared gene) expression table of one modality."""
        matrix = adata.layers[layer]
        matrix = matrix[:, pd.Index(adata.var_names).get_indexer(shared_genes)]
        if issparse(matrix):
            matrix = matrix.toarray()
        return pd.DataFrame(np.asarray(matrix), columns=shared_genes)

    def _pairwise_differences(expression, labels):
        """Return (n_genes, n_celltypes, n_celltypes) differences of sum-normalised celltype means."""
        means = expression.groupby(labels.to_numpy()).mean().reindex(shared_celltypes)
        totals = means.sum(axis=0)
        # genes whose cell-type means sum to zero carry no signal and are set to zero
        normalized = means.div(totals.where(totals != 0), axis=1).fillna(0.0)
        values = normalized.to_numpy(dtype=float).T
        return values[:, :, np.newaxis] - values[:, np.newaxis, :]

    #### REFERENCE: PAIRWISE DIFFERENCES IN THE scRNAseq DATA (computed once)
    exp_sc = _expression_frame(adata_sc)
    sc_shared = sc_labels.isin(shared_celltypes).to_numpy()
    pairwise_diff_sc = _pairwise_differences(exp_sc[sc_shared], sc_labels[sc_shared])
    reference_total = np.sum(np.abs(pairwise_diff_sc))
    if reference_total == 0:
        # fewer than two shared cell types or no varying gene: the metric is undefined
        return gridfield_metric

    #### LOCAL SCORES: ONE VALUE PER GRID BIN
    exp_sp = _expression_frame(adata_sp)
    sp_shared = sp_labels.isin(shared_celltypes).to_numpy()
    x_coords = np.asarray(adata_sp.obs["x"], dtype=float)
    y_coords = np.asarray(adata_sp.obs["y"], dtype=float)

    for j, (x_start, x_end) in enumerate(zip(bins_x[:-1], bins_x[1:])):
        in_x = (x_coords >= x_start) & (x_coords < x_end)
        for i, (y_start, y_end) in enumerate(zip(bins_y[:-1], bins_y[1:])):
            # positional mask: independent of the obs index of adata_sp
            in_bin = sp_shared & in_x & (y_coords >= y_start) & (y_coords < y_end)
            local_labels = sp_labels[in_bin]

            # cell types missing from the bin count as 0 cells and therefore also skip the bin
            counts = local_labels.value_counts().reindex(shared_celltypes, fill_value=0)
            if (counts < min_number_cells).any():
                continue

            pairwise_diff_sp = _pairwise_differences(exp_sp[in_bin], local_labels)
            delta = np.sum(np.abs(pairwise_diff_sp - pairwise_diff_sc))
            gridfield_metric[n_bins_y - 1 - i, j] = 1 - delta / (2 * reference_total)

    return gridfield_metric
    
 

In [2]:
# Each row of `a` is expanded into its matrix of pairwise differences row[i] - row[j]
# (np.newaxis is an alias of None).
a = np.array([[1, 2], [0, 2], [0, 3]])
a[:, :, None] - a[:, None, :]


array([[[ 0, -1],
        [ 1,  0]],

       [[ 0, -2],
        [ 2,  0]],

       [[ 0, -3],
        [ 3,  0]]])

In [4]:
# Toy object with 4 observations and 2 variables, used by the scratch cells below.
adata = ad.AnnData(X=np.array([[0, 1], [1, 2], [5, 4], [0, 2]]))
adata

AnnData object with n_obs × n_vars = 4 × 2

In [5]:
# Scratch check of the half-open [start, end) grid binning on a four-row toy table.

t = pd.DataFrame(
    np.array([[0, 0, 0], [1, 1, 1], [2, 0, 1], [3, 1, 3]]),
    columns=["x", "y", "score"],
)
adata.obs = t
bins_x = [0, 2, 3, 4]
bins_y = [0, 2]
t["test"] = 0
k = 0

# consecutive bin edges are paired up as (start, end)
for x_start, x_end in zip(bins_x, bins_x[1:]):
    for y_start, y_end in zip(bins_y, bins_y[1:]):
        in_x = (t["x"] >= x_start) & (t["x"] < x_end)
        in_y = (t["y"] >= y_start) & (t["y"] < y_end)
        df = t[in_x & in_y]
        print(x_start, x_end)
        print(y_start, y_end)
        print(df.index)


0 2
0 2
Int64Index([0, 1], dtype='int64')
2 3
0 2
Int64Index([2], dtype='int64')
3 4
0 2
Int64Index([3], dtype='int64')


In [12]:
# Rows of `t` that fell into the last bin visited by the loop above.
t.loc[df.index]

Unnamed: 0,x,y,score,test
3,3,1,3,0


In [None]:
#test data:
# T = adata_exp0[adata_exp0.obs["cell_id"].isin([1855,21448,1464])].copy()

# Only the last expression of a cell is rendered, so the first lookup used to be
# computed and thrown away; display() shows both tables.
display(
    adata_Yao.obs.loc[adata_Yao.obs["celltype"] == "Pvalb"],
    adata_exp0.obs.loc[adata_exp0.obs["celltype"] == "Pvalb"],
)


In [None]:
# Two toy count matrices: 2 cells x 2 genes (spatial) and 4 cells x 3 genes (single cell).
A = np.array([[1, 1], [1, 0]])
B = np.array([[1, 1, 1], [1, 0, 1], [1, 1, 1], [1, 1, 1]])

# Single-cell toy object with readable cell / gene names.
test_sc = ad.AnnData(B)
test_sc.var_names = [f"Gene_{i:d}" for i in range(test_sc.n_vars)]
test_sc.obs_names = [f"Cell_sc_{i:d}" for i in range(test_sc.n_obs)]

# Spatial toy object, named the same way.
test_sp = ad.AnnData(A)
test_sp.var_names = [f"Gene_{i:d}" for i in range(test_sp.n_vars)]
test_sp.obs_names = [f"Cell_sp_{i:d}" for i in range(test_sp.n_obs)]

In [None]:
import mygrad as mg

In [None]:
# Scratch test of mygrad: differentiate f(x) = sum(x_i ** 2) with respect to x.
x = mg.tensor([1.,2.,3.])
# NOTE(review): relies on np.sum returning a mygrad tensor for a tensor input — confirm.
f = np.sum(x*x)
f.backward()
# Mathematically the gradient is 2 * x, i.e. [2., 4., 6.] for this input.
x.grad

In [None]:
def f(x,y):
    """Return the sum of the squares of ``x`` and ``y``."""
    x_squared = pow(x, 2)
    y_squared = pow(y, 2)
    return x_squared + y_squared

In [None]:
# Float tensors are used so that gradients can be recorded for x and y.
x = mg.tensor(1.)
y = mg.tensor(2.)
# backward() has to be called on the value f(x, y), not on the Python function `f` itself.
z = f(x, y)
z.backward()
# For f = x**2 + y**2 the gradient at (1, 2) is (2, 4).
x.grad, y.grad

In [None]:
# A float tensor is used so that a gradient can be recorded (d/dx x**2 = 2x = 10 at x = 5).
x = mg.tensor(5.)
fx = x**2
fx.backward()
x.grad

In [None]:
import autograd as ag
# from autograd.variable import Variable        # does not exist?

In [None]:
# The original cell could not run: `ad` is the anndata module (no sin/cos/exp),
# `Variable` was never imported and `ag.set_mode` is not provided by the autograd
# package imported above. The same composite function is differentiated here with
# autograd's `grad` and its numpy wrapper instead.
import autograd.numpy as anp


def composite(x):
    """Return exp(sin(x)**2) + sin(cos(x + 3)), built from the same intermediate steps."""
    b1 = x + 3
    b2 = anp.sin(x)
    b3 = anp.cos(b1)
    b4 = anp.sin(b3)
    b5 = b2*b2
    b6 = anp.exp(b5)
    b7 = b6 + b4
    return b7


# derivative of the composite function at x = 1 (float input is required for differentiation)
ag.grad(composite)(1.0)

In [21]:
import sympy as sym
from sympy.tensor.array import derive_by_array

In [25]:
# Symbolic gradient of f = x**2 + y**2 as a sympy array; the cell shows the array type.
symbols = sym.symbols('x y')
x, y = symbols
f = x**2 + y**2
g = derive_by_array(f, symbols)
type(g)

sympy.tensor.array.dense_ndim_array.ImmutableDenseNDimArray

In [29]:
# Symbols and the function to differentiate
x, y = sym.symbols('x y')
f = x**2 + y**2

# Partial derivatives of f, one per variable
gradient = [sym.diff(f, var) for var in (x, y)]

# Substitute the evaluation point (x, y) = (1, 2) into every partial derivative
point = {x: 1, y: 2}
gradient_at_point = [partial.subs(point) for partial in gradient]
gradient_at_point

[2, 4]