***Import libraries and data, set up the environment***

In [None]:
import scanpy as sc
import pandas as pd
import scvelo as scv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import anndata as ad


In [None]:
import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [None]:
# Ignore R warning messages: only errors from the rpy2 callback logger are shown.
# Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# NOTE(review): presumably this adds the AnnData-aware conversion on top of the
# pandas one; keep it after pandas2ri.activate() unless confirmed otherwise.
anndata2ri.activate()
# Load the rpy2 IPython extension; the %%R cells further down depend on it.
%load_ext rpy2.ipython

In [None]:
# Default size for every figure drawn in this notebook
plt.rc('figure', figsize=(6, 6))

In [None]:
# Load the three filtered samples, one AnnData object per file
sample_1, sample_2, sample_3 = (
    sc.read(h5ad_path)
    for h5ad_path in (
        '../../../Data/notebooks_data/crypto_1.filt.h5ad',
        '../../../Data/notebooks_data/crypto_2.filt.h5ad',
        '../../../Data/notebooks_data/crypto_3.filt.h5ad',
    )
)

In [None]:
# Human-readable labels for the categories of the 'batch' column in .obs
batch_names = ['SAM_1', 'SAM_2', 'SAM_3']

# Stack the three samples into a single AnnData object, then relabel the
# batch categories with the names above.
sample = sample_1.concatenate(sample_2, sample_3)
sample.rename_categories('batch', batch_names)

# scvelo clean-up of the .var annotations
scv.utils.cleanup(sample, clean='var')

In [None]:
# The per-sample objects are not used again after the concatenation
del sample_1
del sample_2
del sample_3

In [None]:
# Display the concatenated object (before gene filtering)
sample

In [None]:
# Filter genes in place with a threshold of 30 cells
# (the same min_cells value is passed to vst() further down)
sc.pp.filter_genes(sample, min_cells=30)

In [None]:
# Display the object again to check its size after gene filtering
sample

In [None]:
# Objects handed over to the R cells below.
# Per-cell batch labels, as a one-column data frame
cells_info = sample.obs.loc[:, ['batch']].copy()
# Gene names, used as row names of the matrix on the R side
genes_name = sample.var_names
# Raw UMI counts, transposed so that rows are genes and columns are cells.
# NOTE(review): np.array() only yields a numeric 2-D array if the 'umi_raw'
# layer is dense - confirm the layer is not stored as a sparse matrix.
rawMatrix = np.array(sample.layers['umi_raw'].transpose().copy())

In [None]:
# Display the matrix that will be passed to R, as a visual sanity check
rawMatrix

In [None]:
%%R -i cells_info -i rawMatrix -i genes_name
library(scater)

# Per-cell metadata; the batch column is referred to as 'data.batch' in the vst() call
cell_df <- DataFrame(data = cells_info)

# Label the count matrix: rows are genes, columns are cells
dimnames(rawMatrix) <- list(genes_name, rownames(cell_df))

In [None]:
%%R
library(sctransform)
library(future)

# Raise the size limit for globals exported by 'future' to 50 * 1024^3 bytes
options(future.globals.maxSize = 50 * 1024 ^ 3)
# Run futures on two forked workers
future::plan('multicore', workers = 2)

In [None]:
%%R
# Print the column names of cell_df; the batch column name is needed for the
# batch_var argument of the vst() call in the next cell.
names(cell_df)

In [None]:
%%R
# sctransform variance-stabilising transformation of the raw counts,
# using the batch column of cell_df as batch variable.
vst_out <- vst(
    as.matrix(rawMatrix),
    cell_attr = cell_df,
    batch_var = 'data.batch',
    method = 'qpoisson',
    n_genes = 3000,
    min_cells = 30,
    return_corrected_umi = TRUE,   # needed for vst_out$umi_corrected below
    show_progress = TRUE
)

In [None]:
%%R -o new_matrix -o sct_genes -o all_genes -o umi_matrix
# Transformed matrix (genes x cells) and the gene order it comes in
new_matrix <- vst_out$y
all_genes <- rownames(new_matrix)

# Genes to flag as highly variable: the 3000 genes with the largest residual
# variance after the transformation.
# rownames(vst_out$model_pars) must not be used for this: it only lists the
# genes that vst() sub-sampled (n_genes) to fit the regression model, which is
# not a ranking by variability.
gene_attr <- vst_out$gene_attr
hvg_order <- order(gene_attr$residual_variance, decreasing = TRUE)
sct_genes <- head(rownames(gene_attr)[hvg_order], 3000)

# Corrected UMI counts (available because return_corrected_umi=TRUE was set)
umi_matrix <- vst_out$umi_corrected

In [None]:
# Gene names returned by the R cell, as a plain Python list
sct_genes = list(sct_genes)
# Boolean flag per gene: True when the gene is among sct_genes.
# Index.isin does the membership test in one vectorised call instead of
# scanning the whole list once per gene.
sample.var['highly_variable'] = sample.var_names.isin(sct_genes)

In [None]:
# Keep only the genes returned by vst(), in the row order of its output matrix,
# so that the transposed R matrices assigned as layers below line up with .var
sample = sample[:,list(all_genes)].copy()

In [None]:
# vst() output is genes x cells; layers are stored cells x genes
sample.layers['norm_sct'] = new_matrix.T

In [None]:
# Corrected UMI counts from vst(), transposed to cells x genes
sample.layers['umi_sct'] = umi_matrix.T

In [None]:
# Use the sctransform-normalised values as the working matrix
sample.X = sample.layers['norm_sct'].copy()

# Scale, then PCA with a fixed seed
sc.pp.scale(sample)
sc.pp.pca(sample, svd_solver='arpack', random_state=12345)
sc.pl.pca(sample, color=['batch', 'total_counts'])

# Neighbour graph computed with BBKNN, then UMAP with a fixed seed
import bbknn
bbknn.bbknn(sample)
sc.tl.umap(sample, random_state=54321)
sc.pl.umap(sample, color=['batch', 'total_counts'])

In [None]:
# Inputs for the kBET cell below:
# batch label of every cell ...
batch = np.array(sample.obs['batch'])
# ... and the UMAP coordinates of every cell
data = np.array(sample.obsm['X_umap'])

In [None]:
%%R -i batch -i data

library(kBET)
library(ggplot2)

# Run kBET on the UMAP coordinates against the batch labels
batch.estimate <- kBET(data, batch, k0 = 10, plot = TRUE)

# Observed and expected kBET statistics in long format (one 'class' label per
# value); the table is built here but not plotted in this cell
plot.data <- with(
    batch.estimate$stats,
    data.frame(
        class = rep(c('observed', 'expected'), each = length(kBET.observed)),
        data = c(kBET.observed, kBET.expected)
    )
)

In [None]:
# Checkpoint: save the normalised data to disk
sample.write_h5ad('../../../Data/notebooks_data/crypto_123.filt.norm.h5ad')

In [None]:
# Reload the file written in the previous cell
# (presumably so the notebook can be resumed from this point - confirm)
sample = sc.read('../../../Data/notebooks_data/crypto_123.filt.norm.h5ad')

In [None]:
import sklearn.preprocessing
import numpy as np

# Binarise the batch labels; the result is passed to glmpca as covariates (X)
label_binarizer = sklearn.preprocessing.LabelBinarizer()
batch_onehot = label_binarizer.fit_transform(sample.obs['batch'])

In [None]:
from glmpca import glmpca

# Optimiser settings handed to glmpca
ctl = {"maxIter": 30, "eps": 1e-3, "optimizeTheta": True}

# Work on the genes flagged as highly variable only
sample_glmpca = sample[:, sample.var['highly_variable']].copy()

# Corrected UMI counts as a dense genes x cells array
Y = np.asarray(sample_glmpca.layers['umi_sct'].T.todense().copy())

print("calculating")
# 15 latent dimensions, with the binarised batch labels as covariates.
# NOTE(review): a full one-hot encoding is collinear with an intercept term -
# confirm how glmpca treats such covariates.
res = glmpca.glmpca(Y, 15, penalty=1, X=batch_onehot, verbose=True, ctl=ctl)
factors = res["factors"]

# Store the factors on the subset (also under 'X_pca') and on the full object
sample_glmpca.obsm['X_glmpca'] = factors
sample_glmpca.obsm['X_pca'] = factors
sample.obsm['X_glmpca'] = sample_glmpca.obsm['X_glmpca'].copy()



In [None]:
# Overwrite the PCA slot with the GLM-PCA factors.
# NOTE(review): presumably because bbknn in the next cell reads 'X_pca' by
# default - confirm against the bbknn version in use.
sample.obsm['X_pca'] = sample.obsm['X_glmpca'].copy()

In [None]:
import bbknn

# Neighbour graph computed with BBKNN, then UMAP with a fixed seed
bbknn.bbknn(sample)
sc.tl.umap(sample, random_state=54321)
sc.pl.umap(sample, color=['batch', 'total_counts'])

In [None]:
# Checkpoint: save the data together with the GLM-PCA / UMAP results
sample.write_h5ad('../../../Data/notebooks_data/crypto_123.filt.norm.red.h5ad')

In [None]:
# Inputs for the kBET cell below:
# batch label of every cell ...
batch = np.array(sample.obs['batch'])
# ... and the UMAP coordinates computed in the BBKNN/UMAP cell above
data = np.array(sample.obsm['X_umap'])

In [None]:
%%R -i batch -i data

library(kBET)
library(ggplot2)

# Run kBET on the UMAP coordinates against the batch labels
batch.estimate <- kBET(data, batch, k0 = 10, plot = TRUE)

# Observed and expected kBET statistics in long format (one 'class' label per
# value); the table is built here but not plotted in this cell
plot.data <- with(
    batch.estimate$stats,
    data.frame(
        class = rep(c('observed', 'expected'), each = length(kBET.observed)),
        data = c(kBET.observed, kBET.expected)
    )
)

In [None]:
# Non-integrated baseline: standard preprocessing of the raw UMI counts with a
# plain neighbour graph (sc.pp.neighbors instead of bbknn).
sample.X = sample.layers['umi_raw'].copy()

# Normalise the per-cell totals first and log-transform afterwards.
# Doing log1p before normalize_total would rescale log values rather than
# counts, so the library-size correction would not act on the counts.
sc.pp.normalize_total(sample)
sc.pp.log1p(sample)

sc.pp.scale(sample)
sc.pp.pca(sample, svd_solver='arpack', random_state=12345)
sc.pp.neighbors(sample)
sc.tl.umap(sample, random_state=54321, n_components=2)

In [None]:
# UMAP of the non-integrated data, coloured by batch
sc.pl.umap(sample, color=['batch'])

In [None]:
# Inputs for the kBET cell below:
# batch label of every cell ...
batch = np.array(sample.obs['batch'])
# ... and the UMAP coordinates of the non-integrated data
non_integrated_data = np.array(sample.obsm['X_umap'])

In [None]:
%%R -i batch -i non_integrated_data

library(kBET)
library(ggplot2)

# Run kBET on the non-integrated UMAP coordinates against the batch labels
batch.estimate <- kBET(non_integrated_data, batch, k0 = 10, plot = TRUE)

# Observed and expected kBET statistics in long format (one 'class' label per
# value); the table is built here but not plotted in this cell
plot.data <- with(
    batch.estimate$stats,
    data.frame(
        class = rep(c('observed', 'expected'), each = length(kBET.observed)),
        data = c(kBET.observed, kBET.expected)
    )
)