In [6]:
import numpy as np 
import scanpy as sc
from anndata import AnnData
from scipy import sparse

In [13]:
panglao = sc.read_h5ad('./data/panglao_10000.h5ad')
data = sc.read_h5ad('./data/NB.bone.Met_preprocessed.h5ad')
data.X = data.layers['RNA']
data

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 36763 × 25139
    obs: 'cell1', 'cell2', 'fraction', 'sample', 'cell_ID', 'n_genes', 'leiden'
    var: 'hgnc_symbol', 'ensembl_gene_id', 'gene_biotype', 'n_cells', 'mean', 'std'
    uns: 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'RNA', 'log1p_norm'
    obsp: 'connectivities', 'distances'

In [5]:
counts = sparse.lil_matrix((data.X.shape[0],panglao.X.shape[1]),dtype=np.float32)
ref = panglao.var_names.tolist()
obj = data.var_names.tolist()

for i in range(len(ref)):
    if ref[i] in obj:
        loc = obj.index(ref[i])
        counts[:,i] = data.X[:,loc]

counts = counts.tocsr()
new = AnnData(X=counts)
new.var_names = ref
new.obs_names = data.obs_names
new.obs = data.obs
new.uns = panglao.uns

sc.pp.filter_cells(new, min_genes=200)
sc.pp.normalize_total(new, target_sum=1e4)
sc.pp.log1p(new, base=2)



In [None]:
# Get gene names as numpy arrays (faster than lists)
ref_genes = panglao.var_names.values
obj_genes = data.var_names.values

# Create a mapping from gene name to column index for the query data
gene_to_idx = {gene: i for i, gene in enumerate(obj_genes)}

# Prepare indices and data for COO matrix construction
rows = []
cols = []
data_vals = []

# Build indices for common genes
for i, gene in enumerate(ref_genes):
    if i % 500 == 0:
        print(i)
    if gene in gene_to_idx:
        # Get all non-zero entries for this gene in data.X
        col_in_data = gene_to_idx[gene]
        sparse_col = data.X[:, col_in_data]
        
        if sparse.issparse(sparse_col):
            # If it's sparse, get the non-zero entries
            sparse_col = sparse_col.tocoo()
            rows.extend(sparse_col.row)
            cols.extend([i] * len(sparse_col.row))  # i is the column in output matrix
            data_vals.extend(sparse_col.data)
        else:
            # If dense (unlikely), handle differently
            non_zero = np.nonzero(sparse_col)[0]
            rows.extend(non_zero)
            cols.extend([i] * len(non_zero))
            data_vals.extend(sparse_col[non_zero])

# Create the matrix in COO format (most efficient for construction)
counts = sparse.coo_matrix(
    (data_vals, (rows, cols)),
    shape=(data.X.shape[0], panglao.X.shape[1]),
    dtype=np.float32
).tocsr()  # Convert to CSR for efficient operations
new = AnnData(X=counts)
new.var_names = ref
new.obs_names = data.obs_names
new.obs = data.obs
new.uns = panglao.uns

sc.pp.filter_cells(new, min_genes=200)
sc.pp.normalize_total(new, target_sum=1e4)
sc.pp.log1p(new, base=2)
#new.write('./data/preprocessed_data.h5ad')

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500


In [11]:
preprocessed_data = sc.read_h5ad('./data/preprocessed_data.h5ad')
preprocessed_data

AnnData object with n_obs × n_vars = 36712 × 16906
    obs: 'cell1', 'cell2', 'fraction', 'sample', 'cell_ID', 'n_genes', 'leiden'
    uns: 'log1p'