In [None]:
import os
import numpy as np
import pandas as pd
import scipy.io

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# single cell
import scanpy.api as sc
from anndata import AnnData

# etc
%load_ext rpy2.ipython

### Regress out read depth per experiment

In [None]:
wd = 'T1D_snATAC/'


def _read_lines(path):
    """Return the lines of a text file as a list, closing the file afterwards."""
    with open(path) as handle:
        return handle.read().splitlines()


adatas = {}
samples = ['nPOD-6282','nPOD-6251-1','nPOD-6251-2','nPOD-6004-1','nPOD-6004-2','nPOD-6007',
           'IIDP-AFC2208-1','IIDP-AFC2208-2','IIDP-AFEA331','IIDP-AFEP022','IIDP-AEHU156',
           'HemaCare-D205220','HemaCare-D105','HemaCare-D147558','HemaCare-D270271','HemaCare-D182364','HemaCare-D273097',
           '10x-5k-1','10x-5k-2','10x-5k-3','10x-10k-1','10x-10k-2','10x-10k-3']

# dictionary naming 5kb windows genome-wide based on overlap with gencode v19 gene TSS
promoters = pd.read_csv(os.path.join('references', 'gencode.v19.5kb_windows.promoter_names.txt.gz'), sep='\t', header=None, index_col=0, names=['prom'])
promoter_names = promoters['prom'].to_dict()

# cells from low quality and doublet clusters were identified through iterative clustering
low_frip = _read_lines(os.path.join(wd, 'T1D_snATAC.lowfrip'))
low_reads = _read_lines(os.path.join(wd, 'T1D_snATAC.lowreads'))
doublets = _read_lines(os.path.join(wd, 'T1D_snATAC.doublets'))
# The exclusion list is the same for every sample, so build it once outside the loop.
excluded_barcodes = low_frip + low_reads + doublets

qc_metrics = pd.read_csv(os.path.join(wd, 'T1D_snATAC.qc_metrics.txt'), sep='\t', header=0, index_col=0)
# Window names used to subset the features below
# (presumably the highly variable windows -- confirm against how the .hvw file is produced).
hvw = _read_lines(os.path.join(wd, 'T1D_snATAC.hvw'))

for sample in samples:
    # Load the sparse count matrix with its row (barcode) and column (region) labels.
    sample_dir = os.path.join(wd, sample)
    sp = scipy.io.mmread(os.path.join(sample_dir, '{}.mtx.gz'.format(sample))).tocsr()
    regions = _read_lines(os.path.join(sample_dir, '{}.regions'.format(sample)))
    barcodes = _read_lines(os.path.join(sample_dir, '{}.barcodes'.format(sample)))
    adata = AnnData(sp, {'obs_names':barcodes}, {'var_names':regions})

    # Rename windows that have a promoter name; keep the original label otherwise.
    adata.var.index = [promoter_names.get(b, b) for b in adata.var.index]
    adata.var_names_make_unique(join='.')

    # Drop the flagged barcodes and attach the per-cell QC metrics.
    adata = adata[~adata.obs.index.isin(excluded_barcodes), :].copy()
    adata.obs = adata.obs.join(qc_metrics, how='inner')
    adata.obs['experiment'] = [i.split('_')[0] for i in adata.obs.index]
    # Keep an untouched copy of the counts: it is used for the depth covariate and saved as the raw layer.
    raw = adata.copy()

    # Normalize, restrict to the selected windows, log-transform and regress out read depth.
    # The original referenced an undefined `hvgs`; the list loaded above is `hvw`.
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    adata = adata[:, adata.var.index.isin(hvw)].copy()  # copy so later in-place steps do not act on a view
    sc.pp.log1p(adata)
    # Depth covariate: log of the un-normalized counts summed over the selected windows.
    adata.obs['log_usable_counts'] = np.log(raw[:, raw.var.index.isin(hvw)].X.sum(axis=1).A1)
    sc.pp.regress_out(adata, ['log_usable_counts'])
    adata.write(os.path.join(wd, '{}.norm.h5ad'.format(sample)))
    adatas[sample] = adata

    # Save a normalized, log-transformed version of all windows (no regression, no subsetting).
    sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e4)
    sc.pp.log1p(raw)
    raw.write(os.path.join(wd, '{}.raw.h5ad'.format(sample)))


### Merge files from all samples

In [None]:
adatas = {}
adatas_raw = {}
samples = ['nPOD-6282','nPOD-6251-1','nPOD-6251-2','nPOD-6004-1','nPOD-6004-2','nPOD-6007',
           'IIDP-AFC2208-1','IIDP-AFC2208-2','IIDP-AFEA331','IIDP-AFEP022','IIDP-AEHU156',
           'HemaCare-D205220','HemaCare-D105','HemaCare-D147558','HemaCare-D270271','HemaCare-D182364','HemaCare-D273097',
           '10x-5k-1','10x-5k-2','10x-5k-3','10x-10k-1','10x-10k-2','10x-10k-3']
# Reload the per-sample files written by the regression step.
for sample in samples:
    adatas[sample] = sc.read_h5ad(os.path.join(wd, '{}.norm.h5ad'.format(sample)))
    adatas_raw[sample] = sc.read_h5ad(os.path.join(wd, '{}.raw.h5ad'.format(sample)))

# Concatenate in the order given by `samples`; driving both calls from the same list
# keeps the regressed and raw objects aligned and avoids a hand-maintained copy of the names.
adata_norm = AnnData.concatenate(*[adatas[sample] for sample in samples],
                                 batch_key='norm', index_unique=None)
adata_norm_raw = AnnData.concatenate(*[adatas_raw[sample] for sample in samples],
                                     batch_key='norm', index_unique=None)
adata_norm.raw = adata_norm_raw.copy()

sc.pp.scale(adata_norm)
sc.tl.pca(adata_norm, zero_center=True, svd_solver='arpack', random_state=0)
# Label the components from the actual PCA output rather than assuming 50 columns.
n_components = adata_norm.obsm['X_pca'].shape[1]
pc = pd.DataFrame(adata_norm.obsm['X_pca'], columns=['PC{}'.format(i) for i in range(1, n_components + 1)], index=adata_norm.obs.index)
metadata = pd.read_csv(os.path.join(wd, 'T1D_snATAC.metadata.txt'), sep='\t', header=0, index_col=0)
# Put the metadata rows in the same order as the PC rows.
metadata = metadata.loc[pc.index]

### Run Harmony (rpy2) to correct for batch effects

In [None]:
%%R -i pc -i metadata -o harmonized
library(harmony)
library(magrittr)

# Correct the precomputed PCs with Harmony (do_pca=FALSE: the input is already PCs),
# using the listed metadata columns as covariates, and return the result as a data frame.
batch_covariates <- c('donor','sex','technology')
harmonized <- data.frame(HarmonyMatrix(pc, metadata, batch_covariates, do_pca=FALSE))

### Plot clusters based on corrected components

In [None]:
# Replace the PCA embedding with the Harmony output, then build the neighbor graph,
# cluster with Leiden and compute the UMAP layout.
adata_norm.obsm['X_pca'] = harmonized.values
sc.pp.neighbors(adata_norm, n_neighbors=30, method='umap', metric='cosine', random_state=0, n_pcs=50)
sc.tl.leiden(adata_norm, resolution=1.5, random_state=0)
sc.tl.umap(adata_norm, min_dist=0.3, random_state=0)

sc.settings.set_figure_params(dpi=100)
sc.pl.umap(adata_norm, color=['leiden'], size=9, legend_loc='on data')
sc.pl.umap(adata_norm, color=['experiment'], size=1, alpha=.5)

# plot quality metrics
sc.pl.umap(adata_norm, color=['log_usable_counts'], size=9, color_map='Blues')
qc_columns = ['frac_reads_in_peaks','frac_reads_in_promoters','frac_promoters_used']
qc_titles = ['Frac. reads in peaks', 'Frac. reads in promoters', 'Frac. promoters used']
sc.pl.umap(adata_norm, color=qc_columns, cmap='Reds', size=9, legend_loc='on data', title=qc_titles)

# 5kb windows overlapping marker promoters (one figure per row of three markers)
marker_panels = [
    ['INS-IGF2','GCG','SST'],
    ['PPY','REG1A','CFTR'],
    ['PDGFRB','CLDN5','C1QB'],
    ['CD3D','CD4','FOXP3'],
    ['CD8B','IFNG','NCR1'],
    ['MS4A1','TCL1A','FOXP3'],
    ['GATA1','PTCRA','IL1B'],
]
for panel in marker_panels:
    sc.pl.umap(adata_norm, color=panel, size=9, color_map='Blues', frameon=True, use_raw=True)

In [None]:
# More 5kb windows overlapping marker promoters
marker_plot_kwargs = dict(size=9, color_map='Blues', frameon=True, use_raw=True)

# Panels drawn with the default color scale
default_scale_panels = [
    ['INS-IGF2','IAPP','SIX2'],
    ['GCG','FEV','GATA6'],
    ['SST','HHEX','SALL1'],
    ['PPY','SUCNR1','CDH20'],
    ['CFTR','6:51940000-51945000','4:143470000-143475000'],
    ['REG1A','PTF1A','PRSS1'],
    ['TCEA2','CLEC14A','ROBO4'],
    ['C1QA','C1QB','C1QC'],
    ['CD69','BCL11B','STK17B'],
    ['PDGFRB','SPARC','COL6A3'],
    ['CD14','IL1B','MAFB'],  # Monocyte
]
for panel in default_scale_panels:
    sc.pl.umap(adata_norm, color=panel, **marker_plot_kwargs)

# Panels drawn with the color scale capped at vmax=2
capped_scale_panels = [
    ['CD4','CD3D','IL2RA'],  # CD4 T-cell
    ['CD8A','NFIC','FASLG'],  # CD8 T-cell
    ['CD19','MS4A1','CD79A'],  # B-cell
    ['NCAM1','PRF1','SH2D1B'],  # NK cell
    ['ITGA2B','PTGER3','CD9'],  # Megakaryocyte
    ['PTCRA','LAMP5','RGS7'],  # pDC, FLT3
    ['CXCR3','B3GNT6','SDPR'],  # monocyte-derived DC, LAMP2
]
for panel in capped_scale_panels:
    sc.pl.umap(adata_norm, color=panel, vmax=2, **marker_plot_kwargs)


### Subclustering at high resolution to identify potential doublet subclusters

In [None]:
# Prefix for the output file written at the end of this cell. The original cell used
# `sample_name` without ever assigning it; 'T1D_snATAC' matches the prefix of the other
# QC lists read from `wd` -- NOTE(review): confirm this is the intended name.
sample_name = 'T1D_snATAC'

# Re-cluster only the listed Leiden cluster(s) at high resolution; labels go to obs['subset'].
subset_cluster = ['0']
sc.tl.louvain(adata_norm, restrict_to=('leiden',subset_cluster), resolution=3, random_state=0, key_added='subset')
sc.pl.umap(adata_norm, color=['subset'], size=9)

# Scatter the UMAP coordinates of the re-clustered cells, one color per subcluster.
fig, ax1 = plt.subplots(1,1,figsize=(5,5))
umap_coords = pd.DataFrame(adata_norm.obsm['X_umap'], index=adata_norm.obs.index, columns=['UMAP1','UMAP2'])
subset = adata_norm.obs.join(umap_coords, how='inner')
subset = subset.loc[subset['leiden'].isin(subset_cluster)]
for s in sorted(set(subset['subset'])):
    in_subcluster = subset['subset'] == s
    ax1.scatter(subset.loc[in_subcluster, 'UMAP1'], subset.loc[in_subcluster, 'UMAP2'], alpha=1, s=4, label=s)
ax1.legend(markerscale=3)
plt.show()

# plot qc metrics including subclusters
# NOTE(review): 'log10_usable_counts' is not created in this notebook (the regression step adds
# 'log_usable_counts'); presumably it is a column of the QC metrics table -- confirm.
for qc_metric in ['log10_usable_counts', 'frac_reads_in_peaks', 'frac_promoters_used']:
    fig, ax1 = plt.subplots(1,1,figsize=(7,5))
    sns.boxplot(x='subset', y=qc_metric, data=adata_norm.obs, ax=ax1)
    # Dotted reference line at the median over all cells.
    ax1.axhline(adata_norm.obs[qc_metric].median(), color='black', ls='dotted')
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)
    plt.show()

# check marker promoters for potential doublet subclusters
sc.pl.dotplot(adata_norm, ['INS-IGF2','GCG','SST','PPY','REG1A','CFTR','PDGFRB','CLDN5','C1QB',
                           'CD3D','CD4','FOXP3','CD8B','IFNG','NCR1','MS4A1','TCL1A','FOXP3','GATA1','PTCRA','IL1B'],
              standard_scale='var', groupby='subset', dendrogram=False, use_raw=True)

# Write the barcodes (index only: no header, no columns) of the flagged subcluster to disk.
flagged_cells = adata_norm.obs.loc[adata_norm.obs['subset'].isin(['0,28'])]
flagged_cells.to_csv(os.path.join(wd, '{}.acinar.doublets'.format(sample_name)), header=False, columns=[])