In [1]:
import scanpy as sc
import scvelo as scv
import numpy as np
from sklearn.neighbors import kneighbors_graph
from scipy.sparse.csgraph import dijkstra
import pickle
from utils import color_keys

# SEPARATE SMOOTHING

In [2]:
# Load the unprocessed dataset from an .h5ad file located in the working directory.
adata = sc.read_h5ad("gastrulation_unprocessed.h5ad")

In [None]:
# Inspect the observation annotation table (one row per observation in `adata`).
adata.obs

In [None]:
# Sanity-check how the "sample", "stage" and "sequencing.batch" annotations
# relate to each other.
#
# observed=True restricts every groupby to groups that actually occur. If a
# grouping column is categorical with unused levels, those levels would
# otherwise show up with a count of 0 and make the `== 1` checks fail for the
# wrong reason. For non-categorical columns the flag has no effect.

# Does each sample correspond to exactly one stage, and vice versa?
sample_stage_mapping = adata.obs.groupby("sample", observed=True)["stage"].nunique()
stage_sample_mapping = adata.obs.groupby("stage", observed=True)["sample"].nunique()

all_samples_one_stage = (sample_stage_mapping == 1).all()
all_stages_one_sample = (stage_sample_mapping == 1).all()

# Does each stage correspond to exactly one sequencing batch, and vice versa?
stage_batch_mapping = adata.obs.groupby("stage", observed=True)["sequencing.batch"].nunique()
batch_stage_mapping = adata.obs.groupby("sequencing.batch", observed=True)["stage"].nunique()

all_stages_one_batch = (stage_batch_mapping == 1).all()
all_batches_one_stage = (batch_stage_mapping == 1).all()

# A notebook cell only renders its *last* expression, so the flags are printed
# explicitly instead of being left as a bare tuple in the middle of the cell.
print(f"every sample has exactly one stage: {all_samples_one_stage}")
print(f"every stage has exactly one sample: {all_stages_one_sample}")
print(f"every stage has exactly one batch:  {all_stages_one_batch}")
print(f"every batch has exactly one stage:  {all_batches_one_stage}")

# Number of observations per sequencing batch (rendered as the cell output).
batch_counts = adata.obs["sequencing.batch"].value_counts()
batch_counts

In [None]:
# Number of observations per developmental-stage label, most frequent first.
stage_counts = adata.obs["stage"].value_counts()
stage_counts

In [None]:
import pandas as pd

# The check below is run on `adata.obs`, which is a pandas DataFrame.
# It relies on the columns: sample, stage, sequencing.batch.

# Function to check if each sample corresponds to one sequencing.batch and one stage
def check_sample_consistency(dataframe):
    """Return the samples that span more than one batch or more than one stage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'sample', 'sequencing.batch' and 'stage'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'sample', with the columns 'unique_batches' and
        'unique_stages' (distinct-value counts per sample). Only samples for
        which at least one of the two counts exceeds 1 are included; the
        result is empty when every sample is consistent.
    """
    # Distinct batch / stage count for every sample.
    per_sample = dataframe.groupby('sample').agg(
        unique_batches=('sequencing.batch', 'nunique'),
        unique_stages=('stage', 'nunique'),
    )

    spans_several_batches = per_sample['unique_batches'] > 1
    spans_several_stages = per_sample['unique_stages'] > 1

    # Keep only the rows that violate the one-to-one expectation.
    return per_sample[spans_several_batches | spans_several_stages]

# Run the check on the observation annotations and display any offending samples
# (an empty table means every sample maps to a single batch and a single stage).
inconsistent_samples = check_sample_consistency(adata.obs)
inconsistent_samples

In [None]:
# Number of observations per sample, most frequent first.
sample_counts = adata.obs["sample"].value_counts()
sample_counts

In [21]:
# Store the batch label as a pandas categorical column (overwrites the column in place).
# Presumably done so that plots colour it as discrete groups — TODO confirm.
adata.obs["sequencing.batch"] = adata.obs["sequencing.batch"].astype('category')

In [None]:
# UMAP of the full dataset, one panel coloured by stage and one by sequencing batch.
# NOTE(review): no `sc.tl.umap` is run on `adata` in the visible cells, so this assumes
# the loaded file already carries a UMAP embedding — TODO confirm.
sc.pl.umap(adata, color=["stage", "sequencing.batch"])


In [None]:
# Subset to every observation that has a non-missing `haem_subclust` value
# (all sequencing batches), then rebuild the neighbour graph and the UMAP on
# that subset only.
obs_tmp = adata.obs["haem_subclust"].notna()
adata_tmp = adata[obs_tmp, :].copy()
sc.pp.neighbors(adata_tmp)
sc.tl.umap(adata_tmp)
# `keys` is not used by the plot below; it is kept because notebook variables
# are global and other cells read a variable of this name.
keys = ["celltype","stage", "sample", "sequencing.batch"]
sc.pl.umap(adata_tmp, color=["stage", "sequencing.batch"])


In [None]:
# Subset to observations from sequencing batch 1 that also have a non-missing
# `haem_subclust` value, then rebuild the neighbour graph and the UMAP on that
# subset only.
in_first_batch = adata.obs["sequencing.batch"] == 1
has_subcluster = adata.obs["haem_subclust"].notna()
obs_tmp = in_first_batch & has_subcluster
adata_tmp = adata[obs_tmp, :].copy()
sc.pp.neighbors(adata_tmp)
sc.tl.umap(adata_tmp)
keys = ["celltype","stage", "sample", "sequencing.batch"]
sc.pl.umap(adata_tmp, color=["stage", "sequencing.batch"])


In [None]:
# UMAP of the full dataset coloured by sequencing batch; `save=` additionally asks
# scanpy to write the figure to disk, using the given string as the filename suffix.
sc.pl.umap(adata, color="sequencing.batch", title="Sequencing batch", save="_sequencing_batch.pdf")

In [None]:
# Colour the subset UMAP by each of these annotation columns (one panel per key).
# NOTE(review): `adata_tmp` is whichever subset the most recently executed cell built —
# confirm which subset this figure is meant to show.
keys = ["celltype","stage", "sample", "sequencing.batch"]
sc.pl.umap(adata_tmp, color=keys)

In [None]:
# UMAP of the full dataset coloured by the "celltype" annotation.
sc.pl.umap(adata, color="celltype")

In [None]:
# Echo the current value of `sample`.
# NOTE(review): in the visible notebook `sample` is only bound as the loop variable of the
# `for sample in sample_list` loop in a later cell, so on a fresh top-to-bottom run this
# cell would raise NameError — confirm whether it should be removed or moved.
sample

In [None]:
# Subset `adata` to the observations whose "stage" equals the string form of `sample`.
# NOTE(review): this compares a stage label against a sample identifier — presumably the
# "sample" column (or a stage value) was intended; confirm before relying on the result.
adata[adata.obs["stage"] == str(sample),]

In [None]:
# Show the summary repr of the most recently built subset object.
adata_tmp

In [2]:
# Per-sample ("separate") smoothing.
#
# The dataset is normalised once, then for every sample independently:
#   * scVelo moments (Mu / Ms) are computed on that sample alone, and
#   * a geodesic nearest-neighbour table is built on that sample alone.
# Both results are written back into the full object at the rows of the sample.
# Output: "gastrulation_processed.h5ad" with layers "Mu"/"Ms", the obs column
# "numerical_index" and the integer table uns["indices"].

num_genes = 2000            # number of highly variable genes kept by filter_and_normalize
cell_type_key = "celltype"
neighbors = 15              # geodesic neighbours stored per cell (the cell itself is added on top)

adata = sc.read_h5ad("gastrulation_unprocessed.h5ad")
sample_list = adata.obs["sample"].unique()
idxs = adata.obs["sample"].isin(sample_list)
adata = adata[idxs].copy()
adata = color_keys(adata, cell_type_key)
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=num_genes)

# Row position of every observation in `adata`. Built positionally rather than
# through a {obs_name: position} dict so that duplicated observation names
# cannot collapse several rows onto the same number.
adata.obs['numerical_index'] = np.arange(adata.n_obs)

# Dense buffers for the per-sample moments; allocated directly instead of
# densifying the sparse count layers just to learn their shape and dtype.
adata.layers["Mu"] = np.zeros(adata.shape, dtype=adata.layers["unspliced"].dtype)
adata.layers["Ms"] = np.zeros(adata.shape, dtype=adata.layers["spliced"].dtype)

# One row per observation: [nearest, 2nd nearest, ...] as global row numbers.
indices = np.zeros((adata.n_obs, neighbors + 1), dtype=int)

for sample in sample_list:
    print(f"Processing sample {sample}")

    # Select the sample once; `num_idx` holds the global row numbers of its
    # observations, in the same order as the rows of `adata_tmp`.
    sample_mask = (adata.obs["sample"] == sample).to_numpy()
    num_idx = adata.obs["numerical_index"].to_numpy()[sample_mask]
    adata_tmp = adata[sample_mask].copy()

    # Moments computed within the sample only, copied back into the full object.
    scv.pp.moments(adata_tmp, n_neighbors=200)
    adata.layers["Mu"][num_idx] = adata_tmp.layers["Mu"]
    adata.layers["Ms"][num_idx] = adata_tmp.layers["Ms"]

    # Feature space for the graph: unspliced and spliced counts side by side.
    u = adata_tmp.layers["unspliced"].toarray()
    s = adata_tmp.layers["spliced"].toarray()
    u_s = np.concatenate([u, s], axis=1)

    # Step 1: k-nearest-neighbour graph weighted by Euclidean distance.
    knn_graph = kneighbors_graph(u_s, n_neighbors=neighbors, mode='distance', include_self=False)

    # Step 2: all-pairs shortest-path (geodesic) distances on the undirected graph.
    geodesic_distances = dijkstra(csgraph=knn_graph, directed=False, return_predecessors=False)

    # Step 3: for every observation keep the `neighbors + 1` smallest geodesic
    # distances; its own distance is 0, so the observation itself is included.
    local_nearest_neighbor_matrix = np.array(
        [np.argsort(row)[:neighbors + 1] for row in geodesic_distances]
    )

    # Step 4: translate within-sample positions into global row numbers.
    indices[num_idx, :] = num_idx[local_nearest_neighbor_matrix]

adata.uns["indices"] = indices

adata.write_h5ad("gastrulation_processed.h5ad")

Filtered out 43035 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.


  log1p(adata)


Logarithmized X.
Processing sample 1
computing neighbors
    finished (0:00:05) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processing sample 2
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processing sample 3
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processing sample 4
computing neighbors
    finished (0:00:00) --> added 
    'distances' and '

# COMMON SMOOTHING

In [None]:
# "Common" smoothing.
#
# scVelo moments are computed once on all selected stages together, while the
# geodesic nearest-neighbour table is still built separately inside each stage.
# Output: "gastrulation_erythroid_common_smoothing.h5ad" with the obs column
# "numerical_index" and the integer table uns["indices"].

num_genes = 2000            # number of highly variable genes kept by filter_and_normalize
neighbors = 40              # geodesic neighbours stored per cell (the cell itself is added on top)

adata = scv.datasets.gastrulation_erythroid()
stage_list = ["E7.5", "E7.75", "E8.0", "E8.25", "E8.5"]
idxs = adata.obs["stage"].isin(stage_list)
adata = adata[idxs].copy()
# NOTE(review): the other preprocessing cells call color_keys(adata, "celltype");
# here the helper's default is used — confirm that this is intended.
adata = color_keys(adata)
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=num_genes)
scv.pp.moments(adata, n_neighbors=200)

# Row position of every observation in `adata`. Built positionally rather than
# through a {obs_name: position} dict so that duplicated observation names
# cannot collapse several rows onto the same number.
adata.obs['numerical_index'] = np.arange(adata.n_obs)

# One row per observation: [nearest, 2nd nearest, ...] as global row numbers.
indices = np.zeros((adata.n_obs, neighbors + 1), dtype=int)

for stage in stage_list:
    print(f"Processing stage {stage}")

    # Select the stage once; `num_idx` holds the global row numbers of its
    # observations, in the same order as the rows of `adata_tmp`.
    stage_mask = (adata.obs["stage"] == stage).to_numpy()
    num_idx = adata.obs["numerical_index"].to_numpy()[stage_mask]
    adata_tmp = adata[stage_mask].copy()

    # Feature space for the graph: unspliced and spliced counts side by side.
    u = adata_tmp.layers["unspliced"].toarray()
    s = adata_tmp.layers["spliced"].toarray()
    u_s = np.concatenate([u, s], axis=1)

    # Step 1: k-nearest-neighbour graph weighted by Euclidean distance.
    knn_graph = kneighbors_graph(u_s, n_neighbors=neighbors, mode='distance', include_self=False)

    # Step 2: all-pairs shortest-path (geodesic) distances on the undirected graph.
    geodesic_distances = dijkstra(csgraph=knn_graph, directed=False, return_predecessors=False)

    # Step 3: for every observation keep the `neighbors + 1` smallest geodesic
    # distances; its own distance is 0, so the observation itself is included.
    local_nearest_neighbor_matrix = np.array(
        [np.argsort(row)[:neighbors + 1] for row in geodesic_distances]
    )

    # Step 4: translate within-stage positions into global row numbers.
    indices[num_idx, :] = num_idx[local_nearest_neighbor_matrix]

adata.uns["indices"] = indices

adata.write_h5ad("gastrulation_erythroid_common_smoothing.h5ad")

# COMMON GEODESIC

In [None]:
# "Common" geodesic neighbours.
#
# Both the scVelo moments and the geodesic nearest-neighbour table are computed
# on all selected stages together (no per-stage split).
# Output: "gastrulation_erythroid_common_geodesic.h5ad" with the obs column
# "numerical_index" and the integer table uns["indices"].

num_genes = 2000            # number of highly variable genes kept by filter_and_normalize
cell_type_key = "celltype"
neighbors = 40              # geodesic neighbours stored per cell (the cell itself is added on top)

adata = scv.datasets.gastrulation_erythroid()
stage_list = ["E7.5", "E7.75", "E8.0", "E8.25", "E8.5"]
idxs = adata.obs["stage"].isin(stage_list)
adata = adata[idxs].copy()
adata = color_keys(adata, cell_type_key)
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=num_genes)
scv.pp.moments(adata, n_neighbors=200)

# Row position of every observation in `adata`. Built positionally rather than
# through a {obs_name: position} dict so that duplicated observation names
# cannot collapse several rows onto the same number.
adata.obs['numerical_index'] = np.arange(adata.n_obs)

# Feature space for the graph: unspliced and spliced counts side by side.
u = adata.layers["unspliced"].toarray()
s = adata.layers["spliced"].toarray()
u_s = np.concatenate([u, s], axis=1)

# Step 1: k-nearest-neighbour graph weighted by Euclidean distance.
knn_graph = kneighbors_graph(u_s, n_neighbors=neighbors, mode='distance', include_self=False)

# Step 2: all-pairs shortest-path (geodesic) distances on the undirected graph.
geodesic_distances = dijkstra(csgraph=knn_graph, directed=False, return_predecessors=False)

# Step 3: for every observation keep the `neighbors + 1` smallest geodesic
# distances; its own distance is 0, so the observation itself is included.
local_nearest_neighbor_matrix = np.array(
    [np.argsort(row)[:neighbors + 1] for row in geodesic_distances]
)

# Step 4: the graph was built on the full object, so the positions returned by
# argsort already are the global row numbers stored in "numerical_index"; no
# further translation is needed.
adata.uns["indices"] = local_nearest_neighbor_matrix.astype(int)

adata.write_h5ad("gastrulation_erythroid_common_geodesic.h5ad")