### Notebook for the manual annotation of cell states of the healthy - CTRL samples

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221211

### Import required modules

In [1]:
import scvi
import scib
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

from SCCAF import SCCAF_assessment, plot_roc

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


### Set up working environment

In [2]:
# Set scanpy's logging verbosity to 3 and print the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults used by every plot in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                 9.3.0
SCCAF               NA
absl                NA
asttokens           NA
attr                22.1.0
backcall            0.2.0
chex                0.1.5
comm                0.1.2
contextlib2         NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.4
decorator           5.1.1
deprecate           0.3.2
deprecated          1.2.13
docrep              0.3.2
entrypoints         0.4
etils               0.9.0
executing           1.2.0
flax                0.6.3
fsspec              2022.11.0
google              NA
h5py                3.7.0
igraph              0.10.2
ipykernel           6.19.2
ipywidgets          8.0.3
jax                 0.3.25
jaxlib              0.3.25
jedi                0.18.2
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.9.0
llvmlite            0.39.1
louvain             0.8.0
matplotlib          3.6.2
ml_collections      NA


  IPython.display.set_matplotlib_formats(*ipython_format)


### Read in Healthy-CTRL dataset

In [3]:
# Location of the scANVI-annotated Healthy-CTRL object on disk.
healthy_ctrl_h5ad = '/home/cartalop/data/carlos/single_cell/COPD_IAV/scanvi_annotated/BrEpit_Healthy_CTRL_ctl221129_scANVI_annot.raw.h5ad'
H_ctrl = sc.read_h5ad(healthy_ctrl_h5ad)

# Bare last expression so the notebook displays the AnnData summary.
H_ctrl

AnnData object with n_obs × n_vars = 32767 × 18482
    obs: 'sex', 'age', 'condition', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'sample_group', 'seed_labels', 'C_scANVI'
    var: 'mt', 'ribo', 'n_cells_by_counts-V1', 'mean_counts-V1', 'pct_dropout_by_counts-V1', 'total_counts-V1', 'n_cells_by_counts-V2', 'mean_counts-V2', 'pct_dropout_by_counts-V2', 'total_counts-V2', 'n_cells_by_counts-V3', 'mean_counts-V3', 'pct_dropout_by_counts-V3', 'total_counts-V3', 'n_cells_by_counts-V4', 'mean_counts-V4', 'pct_dropout_by_counts-V4', 'total_counts-V4', 'n_cells_by_counts-V5', 'mean_counts-V5', 'pct_dropout_by_counts-V5', 'total_counts-V5', 'n_cells_by_counts-V6', 'mean_counts-V6', 'pct_drop

### Define cell types from `scANVI` label transfer

In [4]:
# Number of cells assigned to each transferred `C_scANVI` label.
H_ctrl.obs.loc[:, 'C_scANVI'].value_counts()

Goblet                     8769
Club                       8364
Suprabasal                 6835
Basal resting              3305
Multiciliated              2503
Adventitial fibroblasts    2331
Transitional Club-AT2       533
Ionocyte                    111
Alveolar fibroblasts         10
Pericytes                     6
Name: C_scANVI, dtype: int64

In [5]:
# Seed the `cell_type` column with an independent copy of the `C_scANVI` labels,
# so later edits to `cell_type` leave the transferred labels untouched.
scanvi_labels = H_ctrl.obs['C_scANVI'].copy()
H_ctrl.obs['cell_type'] = scanvi_labels

### Format for `scVI`

In [6]:
# Keep a full copy of the object before it is reduced to the selected genes below.
H_ctrl_raw = H_ctrl.copy()

# Duplicate X into the 'counts' layer; this is the layer read by the
# gene selection below.
H_ctrl.layers['counts'] = H_ctrl.X.copy()

# Highly variable gene selection; `subset = True` keeps only the selected
# genes in `H_ctrl`.
hvg_settings = dict(
    flavor = "seurat_v3",
    n_top_genes = 7000,
    layer = "counts",
    batch_key = "batch",
    subset = True,
)
sc.pp.highly_variable_genes(H_ctrl, **hvg_settings)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes




--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)


### Run `scVI`

In [7]:
# Register `H_ctrl` with scVI, pointing it at the 'counts' layer and the 'batch' column.
scvi.model.SCVI.setup_anndata(
    H_ctrl,
    layer = 'counts',
    batch_key = 'batch',
)



In [8]:
# Build the scVI model on the registered object: 3 layers, a 50-dimensional
# latent space, 'nb' gene likelihood and 'gene-batch' dispersion.
vae = scvi.model.SCVI(
    H_ctrl,
    n_layers = 3,
    n_latent = 50,
    gene_likelihood = "nb",
    dispersion = 'gene-batch',
)

In [10]:
# Fit the scVI model; no arguments are passed, so the library's default training settings apply.
vae.train()

: 

: 

In [None]:
# Pull the latent representation out of the model and store it
# on the object under `obsm["X_scVI"]`.
scvi_latent = vae.get_latent_representation()
H_ctrl.obsm["X_scVI"] = scvi_latent

In [None]:
# Build the neighbour graph and UMAP on the scVI latent space, then plot the
# embedding coloured by sample metadata and the transferred labels.
#
# Fixes relative to the previous version of this cell:
#   * it operated on `adata`, a name never defined in this notebook; the
#     object loaded and modelled here is `H_ctrl`.
#   * it used `use_rep = "X_scANVI"`, whereas the embedding stored by the
#     preceding cell is `obsm["X_scVI"]`.
#   * it coloured by 'group', which is not among the `obs` columns listed when
#     the object was loaded; the column there is 'sample_group'.
sc.pp.neighbors(H_ctrl, use_rep = "X_scVI", n_neighbors = 50, metric = 'minkowski')

# Fixed `random_state` so the layout is repeatable between runs.
sc.tl.umap(H_ctrl, min_dist = 0.3, spread = 1, random_state = 1712)

sc.pl.umap(
    H_ctrl,
    frameon = False,
    color = ['sample_group', 'disease', 'infection', 'C_scANVI', 'seed_labels'],
    size = 1,
    legend_fontsize = 5,
    ncols = 3,
)