# Integration overview

In [None]:
import sys
import os
import anndata as ad
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

## Batch

Samples from this study were sequenced in three separate batches.

In [5]:
# Preview the first two rows of the donor-to-batch lookup table
batch_id_csv = '../resources/Foster_2024_batch_id.csv'
pd.read_csv(batch_id_csv).head(2)

Unnamed: 0,donor_id,batch_id
0,Foster_2024.HD1,batch2
1,Foster_2024.HD2,batch2


## Quality control

In [None]:
%%capture

adata = sc.read('data/raw_data/raw_data.h5ad')

# Gene group calling
adata.var['mt'] = adata.var_names.str.startswith(("MT"))
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))
adata.var['hb'] = adata.var_names.str.contains(("^HB[^(P)]"))
adata.var['ig'] = adata.var_names.str.startswith(("IGK", "IGH", "IGL",))
adata.var['tcr'] = adata.var_names.str.startswith(("TRAV", "TRBV","TRD", "TRG"))

# Gene %
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','hb','ig'], percent_top=None, log1p=False, inplace=True)
adata.obs['log10_counts'] = np.log10(adata.obs['total_counts'])

# Conventional QC thresholding
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_cells(adata, min_counts=500)
adata = adata[adata.obs.pct_counts_mt < 10, :]
adata = adata[adata.obs.pct_counts_hb < 20, :]

# Filter >100 cells
cells_per_sample = adata.obs['sample_id'].value_counts()
adata = adata[adata.obs.sample_id.isin(cells_per_sample[cells_per_sample>100].index),:]

# Doublet annotation
import scrublet as scr
adata.obs['scr_doublet'] = float('NaN')
adata.obs['scr_doublet_score'] = float('NaN')
for sample_i_name in adata.obs['sample_id'].unique():
    print(sample_i_name)
    sample_i = adata[adata.obs['sample_id'] == sample_i_name, :]
    # Given a raw (unnormalized) UMI counts matrix counts_matrix with cells as rows and genes as columns,
    # calculate a doublet score for each cell
    scrub = scr.Scrublet(sample_i.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()
    # Save
    adata.obs.loc[sample_i.obs_names.tolist(), 'scr_doublet'] = predicted_doublets
    adata.obs.loc[sample_i.obs_names.tolist(), 'scr_doublet_score'] = doublet_scores
adata.obs['scr_doublet'] = adata.obs['scr_doublet'].astype(str)
# Filter for non-doublets
adata[adata.obs['scr_doublet'] == 'False', :]

# N cells per sample >100, re-filter
cells_per_sample = adata.obs['sample_id'].value_counts()
adata = adata[adata.obs.sample_id.isin(cells_per_sample[cells_per_sample>100].index),:]

# Attach batch
adata.obs = adata.obs.reset_index().merge(
    pd.read_csv('data/raw_data/sample_batch.csv'), 
    how="left", on='sample_id').set_index('index')

adata.write('data/qc/raw_data_qc.h5ad', compression='gzip')

In [None]:
# Reload the QC-filtered object written by the quality-control cell
qc_h5ad = 'data/qc/raw_data_qc.h5ad'
adata = sc.read(qc_h5ad)

## All immune cells, `panImmune.h5ad`

Job script used to run `scvi_model.py` on the [computer cluster](https://www.rc.ucl.ac.uk/docs/Clusters/Myriad/).

In [None]:
%%writefile scr/pan_immune.txt

# Job script for the pan-immune scVI integration run (run name: panImmune).
# Lines starting with "#$" are directives for the cluster scheduler
# (presumably Grid Engine resource requests — confirm against the cluster docs).
#$ -l h_rt=24:00:0 
#$ -l mem=10G 
#$ -l tmpfs=10G 
#$ -N integration
#$ -pe smp 8
#$ -l gpu=1

# Environment setup for the job
conda activate environment
module load python3 

#torch setup
module unload compilers mpi
module load compilers/gnu/4.9.2 
module load beta-modules
module load cuda/11.3.1/gnu-10.2.0

# Run scvi_model.py on the QC'd data with `batch` as the scVI batch key,
# `sample_covar` and `chem` as categorical covariates, 7000 HVGs, and the
# mt/ribo/ig/tcr gene groups removed. Uses the `nb` gene likelihood.
~/miniconda3/envs/environment/bin/python3.9 scvi_model.py \
--adata "raw_data_qc.h5ad" --run_name panImmune --scvi_batch_key batch \
--scvi_categorical_covariate_keys sample_covar chem \
--n_HVG 7000 --gene_groups_remove mt ribo ig tcr --n_latent 30 \
--n_hidden 128 --n_layers 2 --dropout_rate 0.2 --gene_likelihood nb

In [None]:
# Submit the pan-immune job script (scr/pan_immune.txt) with qsub
!qsub scr/pan_immune.txt

In [None]:
# Cluster labels: annotate the integrated pan-immune object from its leiden_0.6 clusters

adata = sc.read('data/panImmune.h5ad')

# Cluster ids are cast to str before joining on the cluster column
labels = pd.read_csv('data/cluster_labels/panImmune_labels.csv').astype({'leiden_0.6': str})

# Left-join on the cluster id, then restore the cell index that reset_index() moved to a column
obs_labelled = adata.obs.reset_index().merge(labels, how="left", on='leiden_0.6')
adata.obs = obs_labelled.set_index('index')

adata.write('panImmune.h5ad', compression='gzip')

## T cells, `Tcell.h5ad`

In [59]:
# Subset output for integration

adata = sc.read('panImmune.h5ad')

# Names of the cells whose `lineage` annotation is 'T cell'
tcell_names = adata[adata.obs.lineage=='T cell',].obs_names

# Select the same cells from the QC'd object and save them as input for the T cell run
qc_data = sc.read('raw_data_qc.h5ad')
qc_data[tcell_names,].write('raw_data_qc_Tcell.h5ad',compression='gzip')

In [None]:
%%writefile scr/Tcell.txt

# Job script for the T cell scVI integration run (run name: Tcell).
# Lines starting with "#$" are directives for the cluster scheduler
# (presumably Grid Engine resource requests — confirm against the cluster docs).
#$ -l h_rt=24:00:0 
#$ -l mem=10G 
#$ -l tmpfs=10G 
#$ -N integration
#$ -pe smp 8
#$ -l gpu=1

# Environment setup for the job
conda activate environment
module load python3 

#torch setup
module unload compilers mpi
module load compilers/gnu/4.9.2 
module load beta-modules
module load cuda/11.3.1/gnu-10.2.0

# Run scvi_model.py on the T cell subset with `batch` as the scVI batch key,
# `sample_covar` and `chem` as categorical covariates, 5000 HVGs, and the
# mt/ribo/ig/tcr gene groups removed.
# NOTE(review): this run uses the `zinb` gene likelihood while the panImmune run uses `nb` — confirm this is intentional.
~/miniconda3/envs/environment/bin/python3.9 scvi_model.py \
--adata "raw_data_qc_Tcell.h5ad" --run_name Tcell --scvi_batch_key batch \
--scvi_categorical_covariate_keys sample_covar chem \
--n_HVG 5000 --gene_groups_remove mt ribo ig tcr --n_latent 30 \
--n_hidden 128 --n_layers 2 --dropout_rate 0.2 --gene_likelihood zinb

In [3]:
# Submit the T cell job script (scr/Tcell.txt) with qsub
!qsub scr/Tcell.txt

In [None]:
# Import integrated

adata = sc.read('Tcell.h5ad')

# Copy the first two columns of the X_mde embedding into .obs as separate columns
mde_coords = adata.obsm['X_mde']
for axis, obs_column in enumerate(('X_mde1', 'X_mde2')):
    adata.obs[obs_column] = mde_coords[:, axis]

In [15]:
# Manual sub-clustering

# Maps a leiden_2.0 cluster id to the Leiden resolution used to split it further
subcl = {'2':0.3,'10':0.2,'27':0.1}
for cluster_id, resolution in subcl.items():
    # Re-cluster only this cluster; the result is written back into 'leiden_2.0'
    sc.tl.leiden(
        adata,
        restrict_to=['leiden_2.0', [cluster_id]],
        resolution=resolution,
        key_added='leiden_2.0',
    )

In [None]:
# Cluster labels: annotate the T cell object from its (sub-clustered) leiden_2.0 clusters

labels = pd.read_csv('data/cluster_labels/Tcell_labels.csv')
# Cluster ids are cast to str before joining on the cluster column
labels['leiden_2.0'] = labels['leiden_2.0'].astype(str)

# Join on 'leiden_2.0', the key prepared above and the clustering used for the T cell
# object. The previous 'leiden_0.6' key was carried over from the panImmune cell.
# reset_index()/set_index('index') keeps the cell index across the merge.
adata.obs = adata.obs.reset_index().merge(labels, how="left", on='leiden_2.0').set_index('index')

adata.write('Tcell.h5ad', compression='gzip')