In [26]:
import scanpy as sc
import scvelo as scv
import numpy as np
from sklearn.neighbors import kneighbors_graph
from scipy.sparse.csgraph import dijkstra
from utils import color_keys
import pandas as pd

In [27]:
# Processed gastrulation reference, read from disk
GASTRULATION_H5AD = "gastr_full_separate_smoothing/gastrulation_processed.h5ad"
gastrulation = sc.read_h5ad(GASTRULATION_H5AD)

# scVelo's bundled pancreas dataset; pp.moments builds the neighbor graph and
# adds the 'Ms' / 'Mu' moment layers (see the log printed below this cell)
pancreas = scv.datasets.pancreas()
scv.pp.moments(pancreas)


Normalized count data: X, spliced, unspliced.
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:03) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


In [18]:
# Re-express the pancreas data on the gastrulation gene axis.
# Genes present in both datasets keep their pancreas values, genes that exist
# only in gastrulation are zero-padded, and pancreas-only genes are dropped.
# Requires 'pancreas' (with 'Mu'/'Ms' layers) and 'gastrulation' to be loaded.
gastrulation_genes = gastrulation.var_names  # All genes in gastrulation

# Genes shared by both datasets. Index.intersection does NOT return them in
# gastrulation order, so the order is re-derived from gastrulation_genes below;
# otherwise values would be written under the wrong gene names.
common_genes = pancreas.var_names.intersection(gastrulation_genes)
shared_mask = gastrulation_genes.isin(common_genes)
shared_genes_in_target_order = gastrulation_genes[shared_mask]


def _pad_to_gastrulation_genes(expression_df):
    """Return a (cells x gastrulation genes) array built from `expression_df`.

    Columns of shared genes are copied from `expression_df`; every other
    column stays zero. Source columns are picked in gastrulation order so
    they line up with the boolean mask used for the target columns.
    """
    padded = np.zeros((expression_df.shape[0], len(gastrulation_genes)))
    padded[:, shared_mask] = expression_df[shared_genes_in_target_order].values
    return padded


# Use AnnData.to_df for converting the data into DataFrames
pancreas_X_df = pancreas.to_df()
pancreas_Mu_df = pancreas.to_df(layer="Mu")
pancreas_Ms_df = pancreas.to_df(layer="Ms")

# Aligned, zero-padded matrices for the overall, unspliced and spliced data
new_pancreas_X = _pad_to_gastrulation_genes(pancreas_X_df)
new_pancreas_Mu = _pad_to_gastrulation_genes(pancreas_Mu_df)
new_pancreas_Ms = _pad_to_gastrulation_genes(pancreas_Ms_df)

# Create a new AnnData object with the aligned and padded data
new_pancreas = sc.AnnData(
    X=new_pancreas_X,
    obs=pancreas.obs,
    var=pd.DataFrame(index=gastrulation_genes)
)

# Add the aligned unspliced and spliced data as layers
new_pancreas.layers["Mu"] = new_pancreas_Mu
new_pancreas.layers["Ms"] = new_pancreas_Ms
# NOTE(review): 'unspliced' / 'spliced' are filled with the Mu / Ms moments,
# not with the raw count layers -- confirm this is what downstream code expects.
new_pancreas.layers["unspliced"] = new_pancreas_Mu  # Add unspliced layer
new_pancreas.layers["spliced"] = new_pancreas_Ms  # Add spliced layer

# Copy other necessary metadata from the original pancreas dataset if needed
new_pancreas.uns = pancreas.uns  # Copy uns if needed
new_pancreas.obsm = pancreas.obsm  # Copy obsm if needed
# NOTE(review): varm is indexed by the ORIGINAL pancreas genes; this copy is
# only meaningful while that mapping is empty -- verify before relying on it.
new_pancreas.varm = pancreas.varm  # Copy varm if needed

# Update 'pancreas' variable if you want to replace the original with the new aligned version
pancreas = new_pancreas.copy()

# Verify alignment
print("Pancreas shape:", pancreas.shape)
print("Mu shape:", pancreas.layers['Mu'].shape)
print("Ms shape:", pancreas.layers['Ms'].shape)
print("Unspliced shape:", pancreas.layers['unspliced'].shape)
print("Spliced shape:", pancreas.layers['spliced'].shape)


Pancreas shape: (3696, 2000)
Mu shape: (3696, 2000)
Ms shape: (3696, 2000)
Unspliced shape: (3696, 2000)
Spliced shape: (3696, 2000)


In [19]:
# True only when every gene name agrees, position by position
np.all(gastrulation.var_names == pancreas.var_names)

True

In [8]:
# Keep the gastrulation gene order next to the pancreas data.
# NOTE(review): this cell's execution count is older than the alignment cell
# that rebuilds `pancreas`, and the AnnData summary printed further down shows
# no 'gastrulation_var_names' key -- re-run this cell after the alignment.
reference_gene_names = np.array(gastrulation.var_names)
pancreas.uns["gastrulation_var_names"] = reference_gene_names

In [20]:
def compute_bin_ranges(data, num_bins):
    """
    Compute the width of one bin for every column of `data`.

    The width of a column is its maximum value divided by `num_bins`; the
    result holds bin widths, not bin edges.

    Parameters
    ----------
    data : array-like, 2-D
        Expression values; the maximum is taken along axis 0 (over rows).
    num_bins : int
        Number of bins per column. Must be positive.

    Returns
    -------
    numpy.ndarray
        One bin width per column, shape (n_columns,).

    Raises
    ------
    ValueError
        If `num_bins` is not positive (the division would otherwise yield
        inf/nan or negative widths without any error).
    """
    if num_bins <= 0:
        raise ValueError(f"num_bins must be positive, got {num_bins}")
    max_values = np.max(data, axis=0)
    return max_values / num_bins  # Shape: (n_columns,)

# Bin widths are derived from the gastrulation reference: its "Mu" and "Ms"
# layers are placed side by side, so each Mu column and each Ms column gets
# its own width.
combined_expression = np.concatenate(
    (gastrulation.layers["Mu"], gastrulation.layers["Ms"]),
    axis=1,
)
bin_ranges = compute_bin_ranges(combined_expression, 50)

# Keep the widths on the pancreas object so they travel with it
pancreas.uns["bin_ranges"] = bin_ranges

In [21]:
neighbors = 15

# Joint unspliced/spliced feature matrix: one row per cell
u = pancreas.layers["unspliced"]
s = pancreas.layers["spliced"]
u_s = np.concatenate([u, s], axis=1)

# Step 1: Create a nearest-neighbor graph (Euclidean distances for the graph construction)
knn_graph = kneighbors_graph(u_s, n_neighbors=neighbors, mode='distance', include_self=False)

# Step 2: Compute the geodesic distances using Dijkstra's algorithm
# This returns the shortest paths between all pairs of points
geodesic_distances = dijkstra(csgraph=knn_graph, directed=False, return_predecessors=False)

# Step 3: For each cell, keep the `neighbors + 1` cells with the smallest
# geodesic distance (the cell itself, at distance 0, is expected among them).
# One row-wise argsort replaces the per-cell Python loop; .copy() detaches the
# small result from the full n_cells x n_cells index matrix.
local_nearest_neighbor_matrix = np.argsort(geodesic_distances, axis=1)[:, :neighbors + 1].copy()

indices = local_nearest_neighbor_matrix

pancreas.uns["indices"] = np.array(indices, dtype=int)

In [22]:
# Peek at the "Mu" values of the last gene on the aligned gene axis
last_gene_view = pancreas[:, -1]
last_gene_view.layers["Mu"]

ArrayView([[0.],
           [0.],
           [0.],
           ...,
           [0.],
           [0.],
           [0.]])

In [23]:
# Show the AnnData summary (obs / uns / obsm / layers keys) before saving
pancreas

AnnData object with n_obs × n_vars = 3696 × 2000
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score', 'n_counts'
    uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca', 'bin_ranges', 'indices'
    obsm: 'X_pca', 'X_umap'
    layers: 'Mu', 'Ms', 'unspliced', 'spliced'

In [24]:
# Persist the aligned pancreas object.
# NOTE(review): the file name contains '>', which some file systems
# (e.g. Windows) reject -- left unchanged in case other code reads this path.
output_path = "pancreas-gastr->pancr_transfer.h5ad"
pancreas.write_h5ad(output_path)