In [1]:
import numpy as np 
import scanpy as sc
from anndata import AnnData
from scipy import sparse
from tqdm.notebook import tqdm
import requests

In [2]:
adata = sc.read_h5ad('./data/sn_tumor_cells_NB.h5ad')
adata

AnnData object with n_obs × n_vars = 205153 × 20542
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Stage_Code', 'Tissue', 'Risk_Category', 'First_Avail_TP', 'MYCN_Status', 'ALK_Status', 'TP53_Status', 'Response', 'Vital_Status', 'Age_at_IDX_in_months', 'Treatment', 'First_Avail_Time_Point', 'sample_name', 'biospecimen_id', 'percent.mt', 'seurat_clusters', 'sample_label_wo_prefix', 'S.Score', 'G2M.Score', 'Phase', 'malignancy', 'cell_state', 'RNA_snn_res.0.2', 'MES_Score', 'ADRN_Score', 'MES_ADRN_diff', 'Event', 'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'disease_ontology_term_id', 'tissue_type', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_name', 'vf_vst_coun

In [15]:
def translate_ensembl_to_symbol(ensembl_ids):
    """
    Translate Ensembl gene IDs to gene symbols using Ensembl REST API
    
    Args:
        ensembl_ids: List of Ensembl gene IDs (e.g., ['ENSG00000121410', 'ENSG00000148584'])
    
    Returns:
        List of gene symbols in the same order as input IDs (None for failed translations)
    """
    server = "https://rest.ensembl.org"
    ext = "/lookup/id"
    headers = {"Content-Type": "application/json"}
    
    # Batch query (more efficient than individual requests)
    response = requests.post(
        f"{server}{ext}",
        headers=headers,
        json={"ids": ensembl_ids}
    )
    
    symbol_map = {}
    if response.ok:
        data = response.json()
        for ensembl_id in ensembl_ids:
            if ensembl_id in data:
                symbol_map[ensembl_id] = data[ensembl_id].get("display_name", None)
    
    # Return symbols in original order, with None for failed translations
    return [symbol_map.get(ensembl_id, None) for ensembl_id in ensembl_ids]

In [None]:
translated_var_names = []
for name in tqdm(adata.var_names.tolist()):
    try:
        translation = translate_ensembl_to_symbol([name])[0]
        if translation is None:
            translated_var_names.append(name)
        else:
            translated_var_names.append(translation)
    
    except Exception as e:
        translated_var_names.append(name)
translated_var_names

In [56]:
#save the translated variable names to a file
with open("output/all_translated_var_names.txt", "a") as f:
    for item in translated_var_names:
        f.write(item + "\n")

In [30]:
# Read the translated variable names from the file
with open("output/all_translated_var_names.txt", "r") as f:
    translated_var_names = [line.strip() for line in f]

In [10]:
sc.pp.highly_variable_genes(adata, n_top_genes=4000, flavor="seurat")
adata_hvg = adata[:, adata.var["highly_variable"]].copy()
adata_hvg

AnnData object with n_obs × n_vars = 205153 × 4000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Stage_Code', 'Tissue', 'Risk_Category', 'First_Avail_TP', 'MYCN_Status', 'ALK_Status', 'TP53_Status', 'Response', 'Vital_Status', 'Age_at_IDX_in_months', 'Treatment', 'First_Avail_Time_Point', 'sample_name', 'biospecimen_id', 'percent.mt', 'seurat_clusters', 'sample_label_wo_prefix', 'S.Score', 'G2M.Score', 'Phase', 'malignancy', 'cell_state', 'RNA_snn_res.0.2', 'MES_Score', 'ADRN_Score', 'MES_ADRN_diff', 'Event', 'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'disease_ontology_term_id', 'tissue_type', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_name', 'vf_vst_count

In [38]:
translated_var_names_4000 = [v for i, v in enumerate(translated_var_names) if adata.var["highly_variable"][i] == True]

  translated_var_names_4000 = [v for i, v in enumerate(translated_var_names) if adata.var["highly_variable"][i] == True]


In [39]:
panglao = sc.read_h5ad('./data/panglao_10000.h5ad')
data = adata_hvg
counts = sparse.lil_matrix((data.X.shape[0], panglao.X.shape[1]), dtype=np.float32)

# Convert var_names to lists
ref = panglao.var_names.tolist()
obj = translated_var_names_4000

# Create a dictionary for fast lookup of index
obj_index_dict = {gene: idx for idx, gene in enumerate(obj)}

# Fill the matrix using the dictionary
for i, gene in tqdm(enumerate(ref)):
    loc = obj_index_dict.get(gene)
    if loc is not None:
        counts[:, i] = data.X[:, loc]

counts = counts.tocsr()
new = AnnData(X=counts)
new.var_names = ref
new.obs_names = data.obs_names
new.obs = data.obs
new.uns = panglao.uns

  utils.warn_names_duplicates("obs")


0it [00:00, ?it/s]

In [50]:
new.write_h5ad('./data/preprocessed_data.h5ad', compression='gzip')