In [1]:
import sys
print(f'This notebook was last run with this kernel {sys.executable}')

This notebook was last run with this kernel /home/igarzonalva/.conda/envs/scvi-env/bin/python


# BEFORE TRANSFERING

## Environment setup

In [2]:
import scanpy as sc 
import pandas as pd
import pybiomart as pbm
import os 

## Helper Functions 

In [3]:
ensembl_version_dict = {'105': 'http://www.ensembl.org',

                        '104': 'http://may2021.archive.ensembl.org/',

                        '103': 'http://feb2021.archive.ensembl.org/',

                        '102': 'http://nov2020.archive.ensembl.org/',

                        '101': 'http://aug2020.archive.ensembl.org/',

                        '100': 'http://apr2020.archive.ensembl.org/',

                        '99': 'http://jan2020.archive.ensembl.org/',

                        '98': 'http://sep2019.archive.ensembl.org/',

                        '97': 'http://jul2019.archive.ensembl.org/',

                        '96': 'http://apr2019.archive.ensembl.org/',

                        '95': 'http://jan2019.archive.ensembl.org/',

                        '94': 'http://oct2018.archive.ensembl.org/',

                        '93': 'http://jul2018.archive.ensembl.org/',

                        '92': 'http://apr2018.archive.ensembl.org/',

                        '91': 'http://dec2017.archive.ensembl.org/',

                        '90': 'http://aug2017.archive.ensembl.org/',

                        '89': 'http://may2017.archive.ensembl.org/',

                        '88': 'http://mar2017.archive.ensembl.org/',

                        '87': 'http://dec2016.archive.ensembl.org/',

                        '86': 'http://oct2016.archive.ensembl.org/',

                        '80': 'http://may2015.archive.ensembl.org/',

                        '77': 'http://oct2014.archive.ensembl.org/',

                        '75': 'http://feb2014.archive.ensembl.org/',

                        '54': 'http://may2009.archive.ensembl.org/'}

def test_ensembl_host(adata, host):

    dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host=host)
    
    qp = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    print(f'Nº of genes in db: {len(qp)}')
    
    df_var = pd.DataFrame(adata.var_names.values, columns=["Gene stable ID"])
    print(f'Nº of genes in dataset: {len(df_var)}')
    
    df_mapped = df_var.merge(qp, how="inner", on="Gene stable ID")
    print(f'Nº of mapped genes to db: {len(df_mapped)}')
    
    df_valid = df_mapped[~df_mapped["Gene name"].isna()]
    ov = len(df_valid)
    print(f'Nº of valid mapped genes to db: {ov}')
    
    return(ov)

def map_genes(adata, host):

    dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host=host)
    
    qp = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    print(f'Nº of genes in db: {len(qp)}')
    
    df_var = pd.DataFrame(adata.var_names.values, columns=["Gene stable ID"])
    print(f'Nº of genes in dataset: {len(df_var)}')
    
    df_mapped = df_var.merge(qp, how="inner", on="Gene stable ID")
    print(f'Nº of mapped genes to db: {len(df_mapped)}')
    
    df_valid = df_mapped[~df_mapped["Gene name"].isna()]

    
    gene_order = df_valid["Gene stable ID"].tolist()

    adata = adata[:, gene_order].copy() 

    adata.var_names = df_valid['Gene name'].values
    
    return(adata)   

In [4]:
REF_ADATA_DIR = "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/LabelTransfer/adatas/Myeloid/common_resources" 
TARGET_ADATA_DIR = '/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/02_Integration/adata'
SAVING_ADATA_DIR = '/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/LabelTransfer/adatas/Myeloid'

In [5]:
CONDITION_KEY = "harm_study"
CELL_TYPE_KEY = "author_cell_type"
UNLABELED_CATEGORY = "unknown"


TARGET_ANNOTATION_COLUMN = "IGA_PostScAnvi_GenAnno_colors"
TARGET_CELL_TYPES = ['Myeloid']

## Data Loading

### Reference 

In [6]:
ref_adata_fname = os.path.join(REF_ADATA_DIR, 'MyeloidAdata.h5ad')
ref_adata = sc.read_h5ad(ref_adata_fname)
ref_adata = ref_adata.raw.to_adata() 

### Target

In [7]:
target_adata_fname = os.path.join(TARGET_ADATA_DIR, 'adata_scanvi_cuda_refinement.h5ad')
target_adata = sc.read_h5ad(target_adata_fname)
target_adata.X = target_adata.layers["counts"].copy()
target_adata = target_adata[target_adata.obs[TARGET_ANNOTATION_COLUMN].isin(TARGET_CELL_TYPES),:].copy()
target_adata.obs[TARGET_ANNOTATION_COLUMN].value_counts()

IGA_PostScAnvi_GenAnno_colors
Myeloid    19409
Name: count, dtype: int64

### Reference Ensembl to Symbol

In [8]:
n_overlap = {}
for version in ensembl_version_dict.keys():

    print(f'host: {version}')

    try:

        n_overlap[version] =  test_ensembl_host(ref_adata, ensembl_version_dict[version])

    except:

        print('Host not reachable')
v = sorted(n_overlap.items(), key=lambda item: item[1], reverse=True)[0][0]
host_to_use = ensembl_version_dict[v]
print(f"version: {v} has the largest overlap, use {host_to_use} as biomart host")

# Use ensemble biomart to map genes in the reference dataset  
ref_adata_renamed = map_genes(ref_adata, host_to_use)

host: 105
Nº of genes in db: 86402
Nº of genes in dataset: 45453
Nº of mapped genes to db: 44653
Nº of valid mapped genes to db: 35967
host: 104
Nº of genes in db: 67128
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45453
Nº of valid mapped genes to db: 35225
host: 103
Nº of genes in db: 67130
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45453
Nº of valid mapped genes to db: 45453
host: 102
Nº of genes in db: 67139
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45451
Nº of valid mapped genes to db: 45451
host: 101
Nº of genes in db: 67130
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45451
Nº of valid mapped genes to db: 45451
host: 100
Nº of genes in db: 67149
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45451
Nº of valid mapped genes to db: 45451
host: 99
Nº of genes in db: 67140
Nº of genes in dataset: 45453
Nº of mapped genes to db: 45448
Nº of valid mapped genes to db: 45448
host: 98
Nº of genes in db: 67087
Nº of genes in dataset

## Dataset harmonization

In [9]:
common_genes = [gene for gene in ref_adata_renamed.var_names if gene in target_adata.var_names]

ref_adata_renamed.var_names_make_unique()
ref_adata_renamed = ref_adata_renamed[:,common_genes].copy()

target_adata = target_adata[:,common_genes].copy()

target_adata.var_names_make_unique()
ref_adata_renamed.var_names_make_unique()


all(ref_adata_renamed.var_names == target_adata.var_names)
nmatch = len(set(target_adata.var_names).intersection(set(ref_adata_renamed.var_names)))
print(f'Number of genes in common after filtering finishes: {nmatch}')

  utils.warn_names_duplicates("var")


Number of genes in common after filtering finishes: 21537


  utils.warn_names_duplicates("var")


In [10]:
# Compute HVGs on raw data of reference dataset
sc.pp.highly_variable_genes(ref_adata_renamed, flavor='seurat_v3', n_top_genes=3000, subset = False, inplace = True)
all(ref_adata_renamed.var_names == target_adata.var_names)

True

In [11]:
# Slice ref and target adatas to keep ref HVGs
ref_adata_hvg = ref_adata_renamed[:, ref_adata_renamed.var['highly_variable']].copy()
target_adata_hvg = target_adata[:, ref_adata_renamed.var['highly_variable']].copy()

all(ref_adata_hvg.var_names == target_adata_hvg.var_names)

True

In [12]:
ref_adata_hvg.obs[CONDITION_KEY] = "study1"
target_adata_hvg.obs[CONDITION_KEY] = "study2"

In [13]:
target_adata_hvg.obs[CELL_TYPE_KEY] = UNLABELED_CATEGORY

In [14]:
ref_adata_hvg.obs[CELL_TYPE_KEY].value_counts()

author_cell_type
Fibroblasts              48013
Endothelial              28726
uveal melanoma_normal    23628
B_cells                  18491
TCD4_naive               18364
uveal melanoma_tumor     18137
lung_normal              15566
TCD4_em                  13032
TCD8_ex                  12923
ovary_tumor              12277
lung_tumor                9595
TCD8_em                   9380
TCD4_reg                  9233
Mac_Alv-like              8759
NK_cyto                   8019
Neutrophil_TAGLN2         6420
ovary_normal              6361
TCD8_naive                6255
TCD4_ex                   4240
Mast_cells                4108
Neutrophil_MMP9           3947
NKT                       3869
breast_tumor              3829
NK_rest                   3739
Mac_Rec                   3717
colorectal_normal         3694
Mac_AgPres                3329
RTM_LA                    3223
RTM_IM                    3198
Mac_Angio                 2969
breast_normal             2897
skin_normal           

In [15]:
target_adata_hvg.obs[CELL_TYPE_KEY].value_counts()

author_cell_type
unknown    19409
Name: count, dtype: int64

## Data saving

In [16]:
target_adata_hvg.write_h5ad(os.path.join(SAVING_ADATA_DIR,'./target_adata.h5ad'))
ref_adata_hvg.write_h5ad(os.path.join(SAVING_ADATA_DIR,'./ref_adata.h5ad'))

# AFTER TRANSFERING

In [None]:

# Get the latent representation of the query dataset
query_latent = sc.AnnData(model.get_latent_representation())
query_latent.obs['cell_type'] = target_adata.obs[cell_type_key].tolist()
query_latent.obs['batch'] = target_adata.obs[condition_key].tolist()
query_latent.obs['predictions'] = model.predict()
query_latent.write_h5ad(os.path.join(adata_dir,'query_latent_myeloid.h5ad'))

####################### Embedding in common latent space ###########
adata_full = source_adata.concatenate(target_adata)
full_latent = sc.AnnData(model.get_latent_representation(adata=adata_full))
full_latent.obs['cell_type'] = adata_full.obs[cell_type_key].tolist()
full_latent.obs['batch'] = adata_full.obs[condition_key].tolist()
full_latent.write_h5ad(os.path.join(adata_dir,'full_latent_myeloid.h5ad'))