In [2]:
import numpy as np 
import scanpy as sc
from anndata import AnnData
from scipy import sparse
from tqdm.notebook import tqdm

In [2]:
panglao = sc.read_h5ad('./data/panglao_10000.h5ad')
adata = sc.read_h5ad('./data/NB.bone.Met_preprocessed.h5ad')
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 36763 × 25139
    obs: 'cell1', 'cell2', 'fraction', 'sample', 'cell_ID', 'n_genes', 'leiden'
    var: 'hgnc_symbol', 'ensembl_gene_id', 'gene_biotype', 'n_cells', 'mean', 'std'
    uns: 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'RNA', 'log1p_norm'
    obsp: 'connectivities', 'distances'

In [4]:
#filter genes
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=1000, layer="log1p_norm")
adata_filterd = adata[:, adata.var['highly_variable']]
adata_filterd.X = adata_filterd.layers['RNA'].copy()
adata_filterd

  adata_filterd.X = adata_filterd.layers['RNA'].copy()


View of AnnData object with n_obs × n_vars = 36763 × 1000
    obs: 'cell1', 'cell2', 'fraction', 'sample', 'cell_ID', 'n_genes', 'leiden'
    var: 'hgnc_symbol', 'ensembl_gene_id', 'gene_biotype', 'n_cells', 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors', 'pca', 'umap', 'hvg'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'RNA', 'log1p_norm'
    obsp: 'connectivities', 'distances'

In [5]:
#preprocessing step from Tencent
def scbert_preprocessing(data, panglao):
    counts = sparse.lil_matrix((data.X.shape[0],panglao.X.shape[1]),dtype=np.float32)
    ref = panglao.var_names.tolist()
    obj = data.var_names.tolist()

    for i in range(len(ref)):
        if ref[i] in obj:
            loc = obj.index(ref[i])
            counts[:,i] = data.X[:,loc]

    counts = counts.tocsr()
    new = AnnData(X=counts)
    new.var_names = ref
    new.obs_names = data.obs_names
    new.obs = data.obs
    new.uns = panglao.uns

    # sc.pp.filter_cells(new, min_genes=200)
    sc.pp.normalize_total(new, target_sum=1e4)
    sc.pp.log1p(new, base=2)
    return new

In [6]:
adata_preprocessed = scbert_preprocessing(adata_filterd, panglao)
adata_preprocessed

  getattr(self, attr).index = value




AnnData object with n_obs × n_vars = 36763 × 16906
    obs: 'cell1', 'cell2', 'fraction', 'sample', 'cell_ID', 'n_genes', 'leiden'
    uns: 'log1p'

In [None]:
# remove genes with 0 correspondance to our data to fasten computation
# genes must also be removed from the Tencent gene2vec dictionnary
gene2vec = np.load('./data/gene2vec_16906.npy')

non_zero_columns = (adata_preprocessed.X.toarray() != 0).any(axis=0)
non_zero_adata_preprocessed = adata_preprocessed[:, non_zero_columns]
non_zero_gene2vec = gene2vec[non_zero_columns,:]

print(non_zero_adata_preprocessed)
print(non_zero_gene2vec.shape)

View of AnnData object with n_obs × n_vars = 36763 × 678
    obs: 'cell1', 'cell2', 'fraction', 'sample', 'cell_ID', 'n_genes', 'leiden'
    uns: 'log1p'
(678, 200)


In [32]:
non_zero_adata_preprocessed.write_h5ad('./data/non_zero_preprocessed_data_1000.h5ad')
np.save('./data/non_zero_gene2vec_1000.npy', non_zero_gene2vec)