In [220]:
import os
import gzip
import numpy as np
import pandas as pd
import scanpy.api as sc
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn.preprocessing
import scipy.sparse
from anndata import AnnData

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


# Analysis for a single sample

In [None]:
%%time

# Single-sample workflow: load the sparse count matrix, relabel features that
# appear in the promoter table, pick highly variable features, then cluster,
# embed and plot.
wd = '/home/ndeforest/islet_snATAC/raw_data/biobank_2/'
sp = scipy.sparse.load_npz(os.path.join(wd, 'A0016_T2D.int.csr.npz'))
# Context managers close the file handles deterministically instead of
# leaving them for the garbage collector.
with open(os.path.join(wd, 'A0016_T2D.peaks')) as peaks_file:
    peaks = peaks_file.read().splitlines()
with open(os.path.join(wd, 'A0016_T2D.barcodes')) as barcodes_file:
    barcodes = barcodes_file.read().splitlines()
adata = AnnData(sp, {'obs_names':barcodes}, {'var_names':peaks})

# Features listed in the promoter table are renamed to the table's 'prom'
# label; every other feature keeps its original name. The marker plots at the
# bottom of this cell look features up by these labels.
promoters = pd.read_table(os.path.join('/home/joshchiou/joshchiou-data2/islet_snATAC/fresh_only/', 'hg19.5kb.promoter.txt'), sep='\t', header=None, index_col=0, names=['prom'])
promoter_names = promoters['prom'].to_dict()
adata.var.index = [promoter_names.get(feature, feature) for feature in adata.var.index]
# Several features can map to the same label; disambiguate with a '.N' suffix.
adata.var_names_make_unique(join='.')

# Per-cell read depth, computed on the untouched counts so it can be used as
# the regression covariate later.
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log10_n_counts'] = np.log10(adata.obs['n_counts'])
# Keep a binarized (count > 0) matrix as .raw; the use_raw=True plots read it.
adata.raw = AnnData(adata.X > 0, {'obs_names':adata.obs.index}, {'var_names':adata.var.index})

# Select highly variable features on depth-normalized data.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
adata_filter = sc.pp.filter_genes_dispersion(adata.X, flavor='seurat', n_bins=50)
hvgs = adata.var.loc[adata_filter.gene_subset].index.tolist()

# Subsetting returns a view of the parent object. Copy it so the in-place
# preprocessing below operates on an independent AnnData rather than a view.
adata = adata[:,adata.var.index.isin(hvgs)].copy()
# Re-normalize so each cell sums to 1e4 over the retained features only.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['log10_n_counts'])
sc.pp.scale(adata)
sc.tl.pca(adata, zero_center=False, random_state=0)
sc.pp.neighbors(adata, n_neighbors=30, method='umap', metric='cosine', random_state=0, n_pcs=50)
sc.tl.louvain(adata, resolution=1.5, random_state=0)
sc.tl.umap(adata, min_dist=0.3, random_state=0)
# Clusters, then read depth, projected onto the UMAP.
sc.pl.umap(adata, color=['louvain'], size=49, legend_loc='on data')
sc.pl.umap(adata, color=['log10_n_counts'], size=49, color_map='Blues')

# Marker features projected onto the UMAP, three per figure, read from .raw.
marker_panels = [
    ['INS-IGF2','GCG','SST'],
    ['PPY','NKX2-3','REG1A'],
    ['CFTR','PTPN22','PDGFRB'],
    ['ARX','PDX1','HOXA5'],
]
for markers in marker_panels:
    sc.pl.umap(adata, color=markers, size=49, color_map='Blues', use_raw=True)

# Analysis with MNN correction for multiple samples

In [2]:
%%time

# merged dataset from 3 islet samples
# Loads the merged count matrix, records per-cell covariates, and selects the
# highly variable features that are detected in every one of the three samples.
wd = '/home/joshchiou/joshchiou-data2/islet_snATAC/fresh_only/'
sp = scipy.sparse.load_npz(os.path.join(wd, 'Islet_1234.5kb.int.csr.npz'))
peaks = pd.read_table(os.path.join(wd, 'Islet_1234.5kb.int.peaks'), header=None, names=['peaks'])
barcodes = pd.read_table(os.path.join(wd, 'Islet_1234.5kb.int.barcodes'), header=None, names=['barcodes'])
# NOTE(review): `remove` is loaded but never applied anywhere in the visible
# cells -- confirm whether these barcodes were meant to be filtered out.
remove = pd.read_table(os.path.join(wd, 'Islet_123.remove'), header=None, names=['remove'])

samples = ['Islet1', 'Islet2', 'Islet3']

adata = AnnData(sp, {'obs_names':barcodes['barcodes']}, {'var_names':peaks['peaks']})
# Binarized (count > 0) matrix kept as .raw.
adata.raw = AnnData(sp > 0, {'obs_names':barcodes['barcodes']}, {'var_names':adata.var.index})
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log10_n_counts'] = np.log10(adata.obs['n_counts'])
# Number of non-zero features per cell (row sums of the binarized matrix).
# The PC-correlation heatmap later in the notebook reads this column, which
# was otherwise never created.
adata.obs['log10_n_peaks'] = np.log10(adata.raw.X.sum(axis=1).A1)

# 0/1 membership indicator per sample, derived from the barcode name.
for sample in samples:
    adata.obs[sample] = adata.obs.index.str.contains(sample).astype(int)
# Number of cells in which each feature is detected.
adata.var['n_cells'] = adata.raw.X.sum(axis=0).A1

sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
adata_filter = sc.pp.filter_genes_dispersion(adata.X, flavor='seurat', n_bins=50)
dispersed_features = adata.var.loc[adata_filter.gene_subset].index.tolist()

# Per-sample detection counts for every feature. .raw.X is already boolean,
# so it is summed directly. The masks are computed from the current obs index
# so they always match the current set of cells.
for sample in samples:
    in_sample = adata.obs.index.str.contains(sample)
    adata.var[sample] = adata.raw.X[in_sample, :].sum(axis=0).A1

# Keep only the variable features detected in at least one cell of every sample.
hvg_table = adata.var.loc[adata.var.index.isin(dispersed_features)]
hvgs = hvg_table.loc[(hvg_table[samples] > 0).all(axis=1)].index
# The per-sample counts were only needed for the filter above.
adata.var = adata.var.drop(samples, axis=1)

# Drop the reference to the input matrix so its memory can be reclaimed.
sp = sp_filt = None

CPU times: user 21.4 s, sys: 7.34 s, total: 28.8 s
Wall time: 28.8 s


In [None]:
%%time

# regress out read depth
# Each sample is normalized, log-transformed and depth-regressed on its own,
# restricted to the shared highly variable features in `hvgs`.
adatas = {}
for sample in ['Islet1','Islet2','Islet3']:
    in_sample = adata.obs.index.str.contains(sample)
    sample_adata = adata[in_sample, :]
    # The double subset is a view of a view of `adata`. Copy it so the
    # in-place preprocessing below works on an independent object.
    sample_adata = sample_adata[:, sample_adata.var.index.isin(hvgs)].copy()
    sc.pp.normalize_per_cell(sample_adata, counts_per_cell_after=1e4)
    sc.pp.log1p(sample_adata)
    sc.pp.regress_out(sample_adata, ['log10_n_counts'])
    adatas[sample] = sample_adata

# perform MNN correction
# Samples are passed in the order Islet3, Islet2, Islet1. The later donor
# relabelling ('0'->'3', '1'->'2', '2'->'1') relies on this order, so keep the
# two in sync.
adata_mnn = sc.pp.mnn_correct(adatas['Islet3'], adatas['Islet2'], adatas['Islet1'], k=10, batch_key='donor', index_unique=None)[0]
# Saved before scaling/PCA so the corrected matrix can be reloaded as-is.
adata_mnn.write(os.path.join(wd, 'Islet_123.MNN_corrected.h5ad'))

In [None]:
# Cluster the MNN-corrected data, embed it with UMAP, and run diagnostic plots.
sc.pp.scale(adata_mnn)
sc.tl.pca(adata_mnn, zero_center=True, svd_solver='arpack', random_state=0)
sc.pp.neighbors(adata_mnn, n_neighbors=30, method='umap', metric='cosine', random_state=0, n_pcs=50)
sc.tl.louvain(adata_mnn, resolution=1.5, random_state=0)
sc.tl.umap(adata_mnn, n_components=2, min_dist=0.3, random_state=0)

# clusters projected onto UMAP
sc.pl.umap(adata_mnn, color=['louvain'], size=16, legend_loc='on data')

# Translate the batch index into the islet number. The original index is kept
# in 'batch_index' and the labels are always derived from it, so re-running
# this cell gives the same result instead of remapping already-mapped labels.
donor_map = {'0':'3', '1':'2', '2':'1'}
if 'batch_index' not in adata_mnn.obs:
    adata_mnn.obs['batch_index'] = adata_mnn.obs['donor']
adata_mnn.obs['donor'] = adata_mnn.obs['batch_index'].map(donor_map)
# donor projected onto UMAP
sc.pl.umap(adata_mnn, color=['donor'], size=16, alpha=.5)
# read depth projected onto UMAP
sc.pl.umap(adata_mnn, color=['log10_n_counts'], size=16, color_map='Blues')

# read depth boxplot
fig, ax1 = plt.subplots(1,1,figsize=(7,5))
sns.boxplot(x='louvain', y='log10_n_counts', data=adata_mnn.obs, ax=ax1)
plt.show()

# correlation with PCs
# Column labels follow the number of components actually stored in X_pca.
n_components = adata_mnn.obsm['X_pca'].shape[1]
pc = pd.DataFrame(adata_mnn.obsm['X_pca'], index=adata_mnn.obs.index, columns=['PC{}'.format(i) for i in range(1, n_components + 1)])
# Only join covariates that exist in .obs; a missing column would otherwise
# abort the cell with a KeyError. Skipped columns are reported, not hidden.
covariates = ['log10_n_counts', 'log10_n_peaks', 'Islet1', 'Islet2', 'Islet3']
missing_covariates = [name for name in covariates if name not in adata_mnn.obs.columns]
if missing_covariates:
    print('Skipping covariates absent from adata_mnn.obs: {}'.format(missing_covariates))
available_covariates = [name for name in covariates if name not in missing_covariates]
pc = pc.join(adata_mnn.obs[available_covariates], how='inner')
fig, ax1 = plt.subplots(1,1,figsize=(10,10))
sns.heatmap(pc.corr(), ax=ax1)
plt.show()

# marker genes projected onto UMAP
marker_panels = [
    ['INS-IGF2','GCG','SST'],
    ['PPY','NKX2-3','REG1A'],
    ['CFTR','PTPN22','PDGFRB'],
]
for markers in marker_panels:
    sc.pl.umap(adata_mnn, color=markers, size=16, color_map='Blues', use_raw=True)