### Import necessary libraries

In [1]:
import anndata as ad
import joblib
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse
from scipy.sparse import csr_matrix

### Load reference gene features and raw data

In [2]:
# Reference gene list that the dataset's columns are compared against below.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load pickles from a trusted source.
features_path = "/scratch/ukhan/tabula-sapiens/baseline-otf/gene-features.pkl"
features = joblib.load(features_path)

# Dataset to be extended with the genes it lacks.
raw_path = "/scratch/ukhan/ndcn/raw.h5ad"
raw = sc.read_h5ad(raw_path)

# Show the AnnData summary as the cell output.
raw

AnnData object with n_obs × n_vars = 23197 × 33178
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'SORT', 'Amyloid', 'Age', 'RIN', 'nCount_SCT', 'nFeature_SCT', 'nCount_Exon', 'nFeature_Exon', 'PMI', 'Braak', 'Sample.ID', 'Cell.Types', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_pca', 'X_umap'

### Identify missing features

In [3]:
# Gene names already present in the dataset; a set gives O(1) membership tests.
present_genes = set(raw.var["feature_name"])

# Keep the reference genes that the dataset lacks, in the order they appear in
# `features`. dict.fromkeys() drops duplicate entries while preserving that order.
# The previous `list(set(a) - set(b))` yielded an order that changes between
# interpreter sessions (string hashing is randomized), which made the layout of
# the appended columns differ from run to run.
missing_genes = [gene for gene in dict.fromkeys(features) if gene not in present_genes]
print(f"{len(missing_genes)} features missing.")

398 features missing.


### Create new data matrix

In [4]:
# Append one all-zero column per missing gene without leaving sparse format.
# The earlier approach called raw.X.toarray() and np.hstack with a float64
# np.zeros block, which materialised the whole cells x genes matrix densely
# (and upcast it to float64) just to convert it straight back to CSR.
zero_block = csr_matrix((raw.X.shape[0], len(missing_genes)), dtype=np.float32)
new_X_sparse = sparse.hstack(
    [csr_matrix(raw.X, dtype=np.float32), zero_block],
    format="csr",
    dtype=np.float32,
)

# Building CSR from a dense array never stores zeros; drop any explicitly stored
# zeros here so the result matches that behaviour.
new_X_sparse.eliminate_zeros()
new_X_sparse

<23197x33576 sparse matrix of type '<class 'numpy.float32'>'
	with 48643563 stored elements in Compressed Sparse Row format>

### Create new var DataFrame

In [5]:
# Column labels for the widened matrix: the dataset's own gene names first,
# followed by the genes that were appended as zero columns.
feature_list = [*raw.var["feature_name"], *missing_genes]

# One row per gene, indexed by gene name (index called "name"), with the same
# name repeated in a "gene_name" column.
new_var = pd.DataFrame(
    {"gene_name": feature_list},
    index=pd.Index(feature_list, name="name"),
)

### Create new AnnData

In [6]:
# Assemble the widened dataset: the padded matrix and rebuilt var table, with
# obs, uns and obsm taken from the original object as-is.
new = ad.AnnData(
    X=new_X_sparse,
    obs=raw.obs,
    var=new_var,
    uns=raw.uns,
    obsm=raw.obsm,
)

# Show the AnnData summary as the cell output.
new

AnnData object with n_obs × n_vars = 23197 × 33576
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'SORT', 'Amyloid', 'Age', 'RIN', 'nCount_SCT', 'nFeature_SCT', 'nCount_Exon', 'nFeature_Exon', 'PMI', 'Braak', 'Sample.ID', 'Cell.Types', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'gene_name'
    uns: 'schema_version', 'title'
    obsm: 'X_pca', 'X_umap'

In [7]:
# Persist the widened dataset to disk.
output_path = "/scratch/ukhan/ndcn/new.h5ad"
new.write(output_path)