In [1]:
import scanpy as sc
import scvelo as scv
import numpy as np
from sklearn.neighbors import kneighbors_graph
from scipy.sparse.csgraph import dijkstra
import pickle
from utils import color_keys

# SEPARATE SMOOTHING

In [2]:
# Load the erythroid-gastrulation and pancreas datasets through scv.datasets.
# The first variable was previously misspelled ("gastrluation"), which left it
# unusable under the name the rest of the notebook refers to.
gastrulation = scv.datasets.gastrulation_erythroid()
pancreas = scv.datasets.pancreas()

In [3]:
# Merge the gastrulation and pancreas datasets into a single AnnData (`adata`)
# restricted to the genes they share, then harmonise the `stage` and cell-type
# annotations. `scv` comes from the import cell at the top of the notebook.

# Load the datasets
gastrulation = scv.datasets.gastrulation_erythroid()
pancreas = scv.datasets.pancreas()

# Only genes present in both datasets can be stacked cell-wise.
common_genes = gastrulation.var_names.intersection(pancreas.var_names)

# Filter both datasets to include only the common genes
gastrulation_common = gastrulation[:, common_genes]
pancreas_common = pancreas[:, common_genes]

# Concatenate along cells. `batch` records the dataset of origin, and the batch
# name is appended to every cell barcode (visible in the output below).
# Note: ensure `obs` keys are consistent between datasets before concatenating.
adata = gastrulation_common.concatenate(
    pancreas_common,
    batch_categories=["gastrulation", "pancreas"],
)

# Computed once and reused for every pancreas-specific assignment below.
is_pancreas = adata.obs["batch"] == "pancreas"

# `stage` is categorical, so the new label has to be registered as a category
# before it can be assigned. All pancreas cells receive the label "P_Day15".
adata.obs["stage"] = adata.obs["stage"].cat.add_categories("P_Day15")
adata.obs.loc[is_pancreas, "stage"] = "P_Day15"

# Unified annotation: start from `celltype` (as strings) and overwrite the
# pancreas rows with their `clusters` labels.
# NOTE(review): presumably `celltype` is only populated for gastrulation cells
# and `clusters` only for pancreas cells -- confirm against the datasets.
adata.obs["combined_celltype"] = adata.obs["celltype"].astype(str)
adata.obs.loc[is_pancreas, "combined_celltype"] = adata.obs.loc[is_pancreas, "clusters"]

# Verify the changes
adata.obs["combined_celltype"]


AAAGATCTCTCGAA-gastrulation    Blood progenitors 2
AATCTCACTGCTTT-gastrulation    Blood progenitors 2
AATGGCTGAAGATG-gastrulation    Blood progenitors 2
ACACATCTGTCAAC-gastrulation    Blood progenitors 2
ACGACAACTGGAGG-gastrulation    Blood progenitors 2
                                      ...         
TTTGTCAAGTGACATA-pancreas            Pre-endocrine
TTTGTCAAGTGTGGCA-pancreas             Ngn3 high EP
TTTGTCAGTTGTTTGG-pancreas                   Ductal
TTTGTCATCGAATGCT-pancreas                    Alpha
TTTGTCATCTGTTTGT-pancreas                  Epsilon
Name: combined_celltype, Length: 13511, dtype: object

In [4]:
# Per-stage ("separate") smoothing.
#
# For every stage in `stage_list` this cell
#   1. computes scVelo first-order moments (Ms / Mu) using only cells of that
#      stage and writes them into the matching rows of the full `adata`, and
#   2. builds a kNN graph on the concatenated unspliced+spliced counts, turns
#      it into graph (geodesic) distances and stores, for each cell, the
#      `neighbors + 1` closest cells as row positions of the full `adata`.
# The neighbour table ends up in adata.uns["indices"] and the object is saved
# to "erythroid_pancreas.h5ad".

# --- Parameters ---------------------------------------------------------------
num_genes = 2000          # n_top_genes passed to filter_and_normalize
stage_list = ["E7.5", "E7.75", "E8.0", "E8.25", "E8.5", "P_Day15"]
neighbors = 40            # k of the per-stage kNN graph and of the neighbour table
moments_neighbors = 200   # n_neighbors passed to scv.pp.moments

# --- Subset and preprocess ----------------------------------------------------
idxs = adata.obs["stage"].isin(stage_list)
adata = adata[idxs].copy()
# `color_keys` comes from the local `utils` module (not shown here).
# NOTE(review): presumably it attaches colours for the given obs key -- confirm.
adata = color_keys(adata, "combined_celltype")
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=num_genes)

# Row position of every cell in `adata`, stored in obs so that a per-stage
# subset can be mapped back to rows of the full object.
index_mapping = {index: i for i, index in enumerate(adata.obs.index)}
adata.obs['numerical_index'] = adata.obs.index.map(index_mapping)

# Dense output buffers, filled stage by stage below. Shape and dtype are taken
# from the layer directly instead of densifying it just to call zeros_like.
adata.layers["Mu"] = np.zeros(adata.shape, dtype=adata.layers["unspliced"].dtype)
adata.layers["Ms"] = np.zeros(adata.shape, dtype=adata.layers["spliced"].dtype)
# Integer from the start: the entries are row positions.
indices = np.zeros((adata.shape[0], neighbors + 1), dtype=int)

# Not populated anywhere in this cell; kept in case later cells reference it.
neighbors_indices_dic = {}

# --- Per-stage processing -----------------------------------------------------
for stage in stage_list:
    print(f"Processing stage {stage}")

    # One mask per stage, reused for the subset and for the row positions.
    stage_mask = (adata.obs["stage"] == stage).values
    num_idx = adata.obs["numerical_index"].values[stage_mask]

    # The kNN graph (include_self=False) and the fixed-width neighbour table
    # both need strictly more than `neighbors` cells. Fail before the
    # expensive moments computation, with a message that names the stage.
    if len(num_idx) <= neighbors:
        raise ValueError(
            f"Stage {stage!r} has {len(num_idx)} cells; more than {neighbors} "
            f"are required to build a {neighbors}-nearest-neighbour graph."
        )

    adata_tmp = adata[stage_mask].copy()

    # Smooth within the stage only, then copy the result into the full object.
    # `adata_tmp` has the same cell order as `num_idx`, so no re-indexing is needed.
    scv.pp.moments(adata_tmp, n_neighbors=moments_neighbors)
    adata.layers["Mu"][num_idx] = adata_tmp.layers["Mu"]
    adata.layers["Ms"][num_idx] = adata_tmp.layers["Ms"]

    # Feature matrix for the graph: unspliced and spliced counts side by side
    # (cells x 2*genes).
    u_s = np.concatenate(
        [
            adata_tmp.layers["unspliced"].toarray(),
            adata_tmp.layers["spliced"].toarray(),
        ],
        axis=1,
    )

    # Step 1: kNN graph whose edge weights are Euclidean distances.
    knn_graph = kneighbors_graph(u_s, n_neighbors=neighbors, mode='distance', include_self=False)

    # Step 2: all-pairs shortest-path (geodesic) distances along that graph,
    # treating edges as undirected.
    geodesic_distances = dijkstra(csgraph=knn_graph, directed=False, return_predecessors=False)

    # Step 3: for each cell keep the `neighbors + 1` cells with the smallest
    # geodesic distance. The extra slot is meant for the cell itself (distance 0).
    # NOTE(review): with the default sort, a duplicate cell at distance 0 could
    # occupy column 0 instead -- confirm whether downstream code assumes
    # column 0 is always the cell itself.
    local_nearest_neighbor_matrix = np.argsort(geodesic_distances, axis=1)[:, :neighbors + 1]

    # Step 4: translate stage-local positions into row positions of `adata`.
    nearest_neighbors_matrix = num_idx[local_nearest_neighbor_matrix]
    indices[num_idx, :] = nearest_neighbors_matrix

adata.uns["indices"] = np.array(indices, dtype=int)

adata.write_h5ad("erythroid_pancreas.h5ad")

Filtered out 18360 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)


Processing stage E7.5
computing neighbors
    finished (0:00:03) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processing stage E7.75
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processing stage E8.0
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processing stage E8.25
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connect