In [1]:
import sys

# Record which interpreter backs this kernel so the run environment is traceable.
kernel_banner = f'This notebook was last run with this kernel {sys.executable}'
print(kernel_banner)

This notebook was last run with this kernel /home/igarzonalva/.conda/envs/scvi_cpu_env/bin/python


In [2]:
import scanpy as sc
import os
import numpy as np
import pandas as pd
import scvi

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# BEFORE TRANSFERRING

## Environment setup

In [3]:
# Every directory used by this notebook lives under one project root, so the
# root is written once and the individual locations are derived from it.
PROJECT_DIR = '/home/igarzonalva/Proyecto_SC_TNBC/GSE161529'

# Where the reference object is read from, and where the target object is read from.
REF_ADATA_DIR = f'{PROJECT_DIR}/LabelTransfer/adatas/Fibroblast/common_resources'
TARGET_ADATA_DIR = f'{PROJECT_DIR}/02_Integration/adata'

# Where the harmonized reference/target objects are written.
SAVING_ADATA_DIR = f'{PROJECT_DIR}/LabelTransfer/adatas/Fibroblast'

In [4]:
# obs column names shared by the reference and the query during label transfer
CONDITION_KEY = 'condition'      # which dataset ("study1"/"study2") a cell belongs to
CELL_TYPE_KEY = 'CAF'            # reference labels that will be transferred
UNLABELED_CATEGORY = 'unknown'   # placeholder label assigned to every query cell

# Query-side annotation column and the populations retained from it
TARGET_ANNOTATION_COLUMN = 'IGA_PostScAnvi_GenAnno_colors'
TARGET_CELL_TYPES = [
    'Fibroblast FAP+',
    'Fibroblast FAP-',
]

## Data Loading

### Reference

The reference dataset already contains raw counts, so no further preprocessing is applied before use.

In [5]:
# Build the path to the reference object inside REF_ADATA_DIR and read it.
reference_adata_fname = os.path.join(REF_ADATA_DIR, 'BREAST_fibro_tumor.h5ad')
ref_adata = sc.read_h5ad(filename=reference_adata_fname)

### Target

In [6]:
# Read the target object from TARGET_ADATA_DIR.
target_adata_fname = os.path.join(TARGET_ADATA_DIR, 'adata_scanvi_cuda_refinement.h5ad')
target_adata = sc.read_h5ad(target_adata_fname)

# Keep only the cells whose annotation is one of the requested populations.
is_target_cell = target_adata.obs[TARGET_ANNOTATION_COLUMN].isin(TARGET_CELL_TYPES)
target_adata = target_adata[is_target_cell, :].copy()

# Show how many cells remain per retained population.
target_adata.obs[TARGET_ANNOTATION_COLUMN].value_counts()

IGA_PostScAnvi_GenAnno_colors
Fibroblast FAP+    11062
Fibroblast FAP-     2278
Name: count, dtype: int64

## Dataset harmonization

In [7]:
# Genes present in both objects, listed in the reference's gene order.
shared_gene_mask = ref_adata.var_names.isin(target_adata.var_names)
common_genes = ref_adata.var_names[shared_gene_mask].tolist()

# Restrict both objects to the shared genes (same genes, same order).
ref_adata = ref_adata[:, common_genes].copy()
target_adata = target_adata[:, common_genes].copy()

# Displayed check: gene names must match position by position.
all(ref_adata.var_names == target_adata.var_names)

True

In [8]:
# Flag the top 3000 highly variable genes on the reference (seurat_v3 flavor);
# subset=False keeps every gene and only annotates ref_adata.var.
sc.pp.highly_variable_genes(
    ref_adata,
    n_top_genes=3000,
    flavor='seurat_v3',
    subset=False,
    inplace=True,
)

# Displayed check: gene names still match between reference and target.
all(ref_adata.var_names == target_adata.var_names)

True

In [9]:
# Slice ref and target adatas to keep the reference HVGs.
ref_adata_hvg = ref_adata[:, ref_adata.var['highly_variable']].copy()

# Select the target genes by NAME instead of re-using the reference's boolean
# mask: a mask is applied positionally, so it would silently pick the wrong
# genes if the two objects ever stopped sharing the exact same gene order.
target_adata_hvg = target_adata[:, ref_adata_hvg.var_names].copy()

# Displayed check: both HVG-restricted objects expose identical gene names.
all(ref_adata_hvg.var_names == target_adata_hvg.var_names)

True

In [10]:
# Tag every cell with the dataset it comes from: reference -> "study1", target -> "study2".
for adata_subset, study_label in ((ref_adata_hvg, "study1"), (target_adata_hvg, "study2")):
    adata_subset.obs[CONDITION_KEY] = study_label

In [11]:
# The target has no CAF labels yet: fill the label column with the placeholder category.
target_adata_hvg.obs[CELL_TYPE_KEY] = UNLABELED_CATEGORY

In [12]:
# Number of reference cells per label in the CELL_TYPE_KEY column.
ref_adata_hvg.obs[CELL_TYPE_KEY].value_counts()

CAF
mCAF         4525
iCAF         3439
vCAF         2886
Pericyte     2389
apCAF         793
tpCAF         786
hsp_tpCAF     722
IDO_CAF       665
rCAF          373
dCAF          126
Name: count, dtype: int64

In [13]:
# Check that every target cell carries the placeholder label assigned above.
target_adata_hvg.obs[CELL_TYPE_KEY].value_counts()

CAF
unknown    13340
Name: count, dtype: int64

## Data saving

In [16]:
# Write the harmonized target and reference objects (in that order) to SAVING_ADATA_DIR.
for adata_to_save, out_name in (
    (target_adata_hvg, './target_adata.h5ad'),
    (ref_adata_hvg, './ref_adata.h5ad'),
):
    adata_to_save.write_h5ad(os.path.join(SAVING_ADATA_DIR, out_name))

# AFTER TRANSFERRING

In [None]:

# NOTE(review): `model` is not created anywhere in this notebook. This cell
# presumably expects the trained label-transfer model (fitted on the saved
# HVG-restricted objects) to be loaded into the session first — TODO confirm
# and add the loading step so the notebook survives Restart & Run All.

# Get the latent representation of the query dataset.
# The label and condition columns were added to `target_adata_hvg`, not to
# `target_adata`, so the metadata is read from the HVG-restricted object and
# through the notebook's own constants (CELL_TYPE_KEY / CONDITION_KEY).
query_latent = sc.AnnData(model.get_latent_representation())
query_latent.obs['cell_type'] = target_adata_hvg.obs[CELL_TYPE_KEY].tolist()
query_latent.obs['batch'] = target_adata_hvg.obs[CONDITION_KEY].tolist()
query_latent.obs['predictions'] = model.predict()
# NOTE(review): output directory assumed to be SAVING_ADATA_DIR; the "_myeloid"
# suffix is kept unchanged although this notebook handles fibroblasts — verify.
query_latent.write_h5ad(os.path.join(SAVING_ADATA_DIR, 'query_latent_myeloid.h5ad'))

####### EMBED BOTH THE REFERENCE AND SURGERY MODEL IN THE SAME SPACE #########

# Reference and query are concatenated from the HVG-restricted objects, which
# share identical gene names (checked when they were sliced).
adata_full = ref_adata_hvg.concatenate(target_adata_hvg)
full_latent = sc.AnnData(model.get_latent_representation(adata=adata_full))
full_latent.obs['cell_type'] = adata_full.obs[CELL_TYPE_KEY].tolist()
full_latent.obs['batch'] = adata_full.obs[CONDITION_KEY].tolist()
full_latent.write_h5ad(os.path.join(SAVING_ADATA_DIR, 'full_latent_myeloid.h5ad'))