In [1]:
import scanpy as sc
import pandas as pd

In [2]:
orig_adata = sc.read_h5ad("/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/LabelTransfer/adatas/MyeloidAdata.h5ad")

In [3]:
ensembl_version_dict = {'105': 'http://www.ensembl.org',

                        '104': 'http://may2021.archive.ensembl.org/',

                        '103': 'http://feb2021.archive.ensembl.org/',

                        '102': 'http://nov2020.archive.ensembl.org/',

                        '101': 'http://aug2020.archive.ensembl.org/',

                        '100': 'http://apr2020.archive.ensembl.org/',

                        '99': 'http://jan2020.archive.ensembl.org/',

                        '98': 'http://sep2019.archive.ensembl.org/',

                        '97': 'http://jul2019.archive.ensembl.org/',

                        '96': 'http://apr2019.archive.ensembl.org/',

                        '95': 'http://jan2019.archive.ensembl.org/',

                        '94': 'http://oct2018.archive.ensembl.org/',

                        '93': 'http://jul2018.archive.ensembl.org/',

                        '92': 'http://apr2018.archive.ensembl.org/',

                        '91': 'http://dec2017.archive.ensembl.org/',

                        '90': 'http://aug2017.archive.ensembl.org/',

                        '89': 'http://may2017.archive.ensembl.org/',

                        '88': 'http://mar2017.archive.ensembl.org/',

                        '87': 'http://dec2016.archive.ensembl.org/',

                        '86': 'http://oct2016.archive.ensembl.org/',

                        '80': 'http://may2015.archive.ensembl.org/',

                        '77': 'http://oct2014.archive.ensembl.org/',

                        '75': 'http://feb2014.archive.ensembl.org/',

                        '54': 'http://may2009.archive.ensembl.org/'}



In [4]:
import pybiomart as pbm

In [5]:
def test_ensembl_host(adata, host):

    dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host=host)
    
    qp = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    print(f'Nº of genes in db: {len(qp)}')
    
    df_var = pd.DataFrame(adata.var_names.values, columns=["Gene stable ID"])
    print(f'Nº of genes in dataset: {len(df_var)}')
    
    df_mapped = df_var.merge(qp, how="inner", on="Gene stable ID")
    print(f'Nº of mapped genes to db: {len(df_mapped)}')
    
    df_valid = df_mapped[~df_mapped["Gene name"].isna()]
    ov = len(df_valid)
    print(f'Nº of valid mapped genes to db: {ov}')
    
    return(ov)


In [6]:
n_overlap = {}

for version in ensembl_version_dict.keys():

    print(f'host: {version}')

    try:

        n_overlap[version] =  test_ensembl_host(orig_adata, ensembl_version_dict[version])

    except:

        print('Host not reachable')

v = sorted(n_overlap.items(), key=lambda item: item[1], reverse=True)[0][0]

host_to_use = ensembl_version_dict[v]
print(f"version: {v} has the largest overlap, use {host_to_use} as biomart host")



host: 105
Nº of genes in db: 86402
Nº of genes in dataset: 45453
Nº of mapped genes to db: 44653
Nº of valid mapped genes to db: 35967
host: 104
Nº of genes in db: 67128
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45453
Nº of valid mapped genes to db: 35225
host: 103
Nº of genes in db: 67130
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45453
Nº of valid mapped genes to db: 45453
host: 102
Nº of genes in db: 67139
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45451
Nº of valid mapped genes to db: 45451
host: 101
Nº of genes in db: 67130
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45451
Nº of valid mapped genes to db: 45451
host: 100
Nº of genes in db: 67149
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45451
Nº of valid mapped genes to db: 45451
host: 99
Nº of genes in db: 67140
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45448
Nº of valid mapped genes to db: 45448
host: 98
Nº of genes in db: 67087
Nº of genes in dataset

In [7]:
def map_genes(adata, host):

    dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host=host)
    
    qp = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    print(f'Nº of genes in db: {len(qp)}')
    
    df_var = pd.DataFrame(adata.var_names.values, columns=["Gene stable ID"])
    print(f'Nº of genes in dataset: {len(df_var)}')
    
    df_mapped = df_var.merge(qp, how="inner", on="Gene stable ID")
    print(f'Nº of mapped genes to db: {len(df_mapped)}')
    
    df_valid = df_mapped[~df_mapped["Gene name"].isna()]

    
    gene_order = df_valid["Gene stable ID"].tolist()

    adata = adata[:, gene_order].copy() 

    adata.var_names = df_valid['Gene name'].values

    return(adata)   
    

In [8]:
mod_adata = map_genes(orig_adata, host_to_use)

Nº of genes in db: 67130
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45453


In [9]:
mod_adata.var_names

Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112', 'FGR', 'CFH', 'FUCA2',
       'GCLC', 'NFYA',
       ...
       'AP002991.2', 'AL355338.1', 'AC005618.4', 'AC126768.3', 'C3orf36',
       'C8orf44', 'C8orf44-SGK3', 'SNORA74C-2', 'XGY2', 'FLJ43315'],
      dtype='object', length=45453)

In [10]:
query_adata = sc.read_h5ad("/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/Integration/adata/adata_scanvi_predictions.h5ad")

In [11]:
len(set(query_adata.var_names).intersection(set(mod_adata.var_names)))

17262

In [12]:
common_genes = [gene for gene in mod_adata.var_names if gene in query_adata.var_names]

In [13]:
query_matched = query_adata[:,common_genes].copy()

  utils.warn_names_duplicates("var")


In [15]:
len(query_adata.var_names)

18088

In [16]:
len(query_matched.var_names)

17269

In [21]:
query_matched.obs.GennAnno_ScAnvi.cat.categories

Index(['B cells', 'Cycling cells', 'Endothelial', 'Epithelial', 'Mast cells',
       'Mesenchymal', 'Myeloid', 'Plasma cells', 'T cells'],
      dtype='object')

In [22]:
myeloid_query = query_matched[query_matched.obs.GennAnno_ScAnvi.isin(['Myeloid','Mast cells']),:]

In [23]:
myeloid_query

View of AnnData object with n_obs × n_vars = 14380 × 17269
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'complexity', 'doublet_scores', 'predicted_doublets', 'n_genes', 'n_counts', 'celltypist_labels_Immune_All_High', 'celltypist_scores_Immune_All_High', 'celltypist_labels_Cells_Adult_Breast', 'celltypist_scores_Cells_Adult_Breast', 'batch', 'subtype', 'leiden', 'celltypist_labels_Immune_All_High_highconf', 'GenAnno', 'GenAnnoV1', 'GenAnnoV2', 'celltypist_mapped', 'GenAnnoHighConf', 'cell_type', 'predicted_labels', 'GennAnno_ScAnvi'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'GenAnnoHighConf_colors', 'GenAnnoV1_colors', 'GenAnnoV2_colors', 'GennAnno_ScAnvi_colors', 'celltypist_labels_Immune_All_High_highconf_colors', 'hvg', 'leiden', '