## Preprocess data
Read adata of ATAC-seq and RNA-seq and save them as sparse data and annotations.

In [8]:
from scipy.io import mmwrite
import anndata 
import os
out_dir = '../../output'
os.makedirs(f'{out_dir}/figr/scRNA/', exist_ok=True)
os.makedirs(f'{out_dir}/figr/scATAC/', exist_ok=True)
os.makedirs(f'{out_dir}/figr/grn/', exist_ok=True)
os.makedirs(f'{out_dir}/figr/grn/', exist_ok=True)

### ATAC-seq

In [12]:
# python
adata_atac = anndata.read(f'{out_dir}/scATAC/adata_atac.h5ad')
# save sparse matrix
mmwrite(f"{out_dir}/scATAC/X_matrix.mtx", adata_atac.X)
# save annotation
annotation_peak = adata_atac.var.reset_index().location.str.split(':', expand=True)
annotation_peak.columns = ['seqname', 'ranges']
annotation_peak['strand'] = '+' 
annotation_peak.to_csv(f"{out_dir}/scATAC/annotation_peak.csv")

annotation_cells = adata_atac.obs.reset_index()
annotation_cells.to_csv(f"{out_dir}/scATAC/annotation_cells.csv")



### RNA-seq

In [7]:
adata_rna = anndata.read(f'{out_dir}/scRNA/adata_rna.h5ad')
# save sparse matrix
mmwrite(f"{out_dir}/scRNA/X_matrix.mtx", adata_rna.X)
# save annotation
annotation_gene = adata_rna.var.reset_index()
annotation_gene.to_csv(f"{out_dir}/scRNA/annotation_gene.csv")

annotation_cells = adata_rna.obs.reset_index()[['obs_id','cell_type']]
annotation_cells.to_csv(f"{out_dir}/scRNA/annotation_cells.csv")



## cell-topic probability scores 

In [32]:
import pickle
cistopic_obj = pickle.load(
            open(f'{out_dir}/scenicplus/scATAC/cistopic_obj.pkl', 'rb'))
# get cell topic association 
cell_topic = cistopic_obj.selected_model.cell_topic.T
cell_names = cistopic_obj.cell_data.obs_id.values

cell_topic.index = cell_names
# remove donor name from the index
# cell_topic.index = cell_topic.reset_index()['index'].apply(lambda name: name.split('__')[0])

In [33]:
# from pycisTopic.cistopic_class import CistopicObject
# cistopic_obj.cell_names
# cistopic_obj.cell_data.obs_id
# cell_topic

In [34]:
cell_topic.to_csv(f'{out_dir}/figr/scATAC/cell_topic.csv')

# Pipeline of FigR

In [9]:
%load_ext rpy2.ipython

In [10]:
%%R 
options(digits=5, max.print=100)  # Adjust numbers as needed
set.seed(123)

library(dplyr)
library(FNN)
library(chromVAR)
library(doParallel)
library(BuenColors)
library(FigR)
library(BSgenome.Hsapiens.UCSC.hg38)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
Loading required package: MASS

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select

Loading required package: ggplot2
Loading required package: Matrix
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count


Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps,

## Load atac-seq and create summarizedexperiment

In [11]:
%%R 
X <- readMM(f"{out_dir}/scATAC/X_matrix.mtx")
X <- t(X)
annotation_peak <- read.csv(f"{out_dir}/scATAC/annotation_peak.csv", row.names = 1)
annotation_cells <- read.csv(f"{out_dir}/scATAC/annotation_cells.csv", row.names = 1)

# Filter out entries where seqname is 'chr10'
filter_indices <- grepl("^chr", annotation_peak$seqname)
annotation_peak_filtered <- annotation_peak[filter_indices, ]

# Filter the rows in X
X_filtered <- X[filter_indices, ]

# Create the SummarizedExperiment object with the filtered data
atac <- SummarizedExperiment(assays = list(counts = X_filtered), 
                             rowRanges = GRanges(annotation_peak_filtered$seqname,
                             IRanges(annotation_peak_filtered$ranges)), 
                             colData = DataFrame(annotation_cells))
colnames(atac) <- annotation_cells$obs_id    

dim(atac) #peaks*cells

RParsingError: Parsing status not OK - PARSING_STATUS.PARSE_ERROR

### Load RNA-seq and create sparsematrix

In [4]:
%%R 
XX <- readMM(f"{out_dir}/scRNA/X_matrix.mtx")
XX <- t(XX)
annotation_gene <- read.csv(f"{out_dir}/scRNA/annotation_gene.csv", row.names = 1)
annotation_cells <- read.csv(f"{out_dir}/scRNA/annotation_cells.csv", row.names = 1)

rna <- as(XX, "CsparseMatrix")
rownames(rna) <- annotation_gene$location
colnames(rna) <- annotation_cells$obs_id

# Remove genes with zero expression across all cells
rna <- rna[Matrix::rowSums(rna)!=0,]

dim(rna) # genes*cells

[1] 22787 25551


## load cell topic probabilities and create cell-cluster matrix


In [5]:
%%R 
n_topics = 48
cell_topic <- read.csv(f'{out_dir}/figr/scATAC/cell_topic.csv', row.names = 1)
print(dim(cell_topic))
# Derive cell kNN using this
cellkNN <- get.knn(cell_topic, k=n_topics)$nn.index
rownames(cellkNN) <- rownames(cell_topic)
print(dim(cellkNN))

[1] 25551    48
[1] 25551    48


## Save data

In [7]:
%%R 
n_cells_to_keep = dim(rna)[2]
cellsToKeep <- sample(colnames(atac), size = n_cells_to_keep, replace = FALSE)
atac_short = atac[, cellsToKeep]
rna_short = rna[, cellsToKeep]
cellkNN_short = cellkNN[cellsToKeep,]

saveRDS(atac_short, f'{out_dir}/scATAC/atac.rds')
saveRDS(rna_short, f'{out_dir}/scRNA/rna.rds')
saveRDS(cellkNN_short, f'{out_dir}/figr/cellkNN.rds')


## Peak-gene association testing

In [8]:
%%R 
options(digits=5, max.print=100)  # Adjust numbers as needed
set.seed(123)

library(dplyr)
library(FNN)
library(chromVAR)
library(doParallel)
library(BuenColors)
library(FigR)
library(BSgenome.Hsapiens.UCSC.hg38)


if (!dir.exists(f"{out_dir}/figr/grn/")) {
  dir.create(f"{out_dir}/figr/grn/", recursive = TRUE)
}

atac_short = readRDS(f'{out_dir}/scATAC/atac.rds')
rna_short  = readRDS(f'{out_dir}/scRNA/rna.rds')
cisCorr <- FigR::runGenePeakcorr(ATAC.se = atac_short,
                           RNAmat = rna_short,
                           genome = "hg38", # One of hg19, mm10 or hg38 
                           nCores = 40,
                           p.cut = NULL, # Set this to NULL and we can filter later
                           n_bg = 100)
write.csv(cisCorr, f"{out_dir}/figr/grn/cisCorr.csv", row.names = TRUE)

Matrix object input detectedCentering counts for cells sequentially in groups of size  1000  ..

Computing centered counts for cells:  1  to  1000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  1001  to  2000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  2001  to  3000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  3001  to  4000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  4001  to  5000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  5001  to  6000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  6001  to  7000 ..
Computing centered counts per cell using mean reads in features ..

Computing centered counts for cells:  

Exception ignored from cffi callback <function _consolewrite_ex at 0x7f01713f40d0>:
Traceback (most recent call last):
  File "/root/anaconda3/envs/scenicplus/lib/python3.8/site-packages/rpy2/rinterface_lib/callbacks.py", line 130, in _consolewrite_ex
    @ffi_proxy.callback(ffi_proxy._consolewrite_ex_def,
KeyboardInterrupt: 


Finished!

Time Elapsed:  7.81739735603333 secs 

Computing background correlations ..

In [None]:
%%R 
cisCorr = read.csv(f"{out_dir}/figr/grn/cisCorr.csv")
cisCorr.filt <- cisCorr %>% filter(pvalZ <= 0.05)

dorcGenes <- dorcJPlot(dorcTab = cisCorr.filt,
                         cutoff = 10, # No. sig peaks needed to be called a DORC
                         labelTop = 20,
                         returnGeneList = TRUE, # Set this to FALSE for just the plot
                         force=2)
# Unfiltered
numDorcs <- cisCorr.filt %>% group_by(Gene) %>% tally() %>% arrange(desc(n))
# numDorcs

## Create DORCs and smooth them 

In [None]:
%%R 
allGenes = unique(cisCorr.filt$Gene) 
dorcMat <- getDORCScores(ATAC.se = atac_short, # Has to be same SE as used in previous step
                         dorcTab = cisCorr.filt,
                         geneList = allGenes,
                         nCores = 4)

In [None]:
%%R 
cellkNN = readRDS(f'{out_dir}/figr/cellkNN.rds')
# Smooth dorc scores using cell KNNs (k=n_topics)
dorcMat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics], mat = dorcMat,nCores = 4)

# Smooth RNA using cell KNNs
# This takes longer since it's all genes
RNAmat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics],mat = rna_short,nCores = 4)

In [None]:
%%R # save it as we will need it for TF gene associations 
write.csv(cisCorr.filt, f'{out_dir}/figr/grn/cisCorr.filt.csv')
saveRDS(RNAmat.s, f'{out_dir}/figr/grn/RNAmat.s.RDS')
saveRDS(dorcMat.s, f'{out_dir}/figr/grn/dorcMat.s.RDS')

## TF-gene associations
Do not run this locally.

In [None]:
%%R 
cisCorr.filt = read.csv(f'{out_dir}/figr/grn/cisCorr.filt.csv')
RNAmat.s = readRDS(f'{out_dir}/figr/grn/RNAmat.s.RDS')
dorcMat.s = readRDS(f'{out_dir}/figr/grn/dorcMat.s.RDS')
atac_short = readRDS(f'{out_dir}/figr/grn/atac_short.rds')
figR.d <- runFigRGRN(ATAC.se = atac_short, # Must be the same input as used in runGenePeakcorr()
                     dorcTab = cisCorr.filt, # Filtered peak-gene associations
                     genome = "hg38",
                     dorcMat = dorcMat.s,
                     rnaMat = RNAmat.s, 
                     nCores = 30)
write.csv(figR.d, 'figR.d.csv')

## Evaluation of results

In [None]:
%%R 
figR.d = read.csv(f'{out_dir}/figr/grn/figR.d.csv')

figR.d %>% 
  ggplot(aes(Corr.log10P,Enrichment.log10P,color=Score)) + 
  ggrastr::geom_point_rast(size=0.01,shape=n_topics) + 
  theme_classic() + 
  scale_color_gradientn(colours = jdb_palette("solar_extra"),limits=c(-3,3),oob = scales::squish,breaks=scales::breaks_pretty(n=3))

In [None]:
%%R 
rankDrivers(figR.d,rankBy = "meanScore",interactive = FALSE)

## Process the results and output final GRN

In [None]:
import pandas as pd
import numpy as np

### GRN 

In [None]:
figr_grn = pd.read_csv(f'{out_dir}/figr/grn/figR.d_all.csv', index_col=0)
# filter based on enrichment 
figr_grn = figr_grn[figr_grn['Enrichment.P']<0.05]
# filter bsaed on correlatoon
figr_grn = figr_grn[figr_grn['Corr.P']<0.05]
# filter thoes that are 0 score 
figr_grn = figr_grn[figr_grn.Score!=0]
# subset columns
figr_grn = figr_grn[['Motif', 'DORC', 'Score']]
figr_grn = figr_grn.reset_index(drop=True)
figr_grn.columns = ['source', 'target','weight']
figr_grn.to_csv(f'{out_dir}/FigR/grn.csv')

### Peak-gene

In [None]:
# load peak-gene association after filtering for significant
peak_gene_figr = pd.read_csv(f'{out_dir}/figr/grn/cisCorr.filt.csv', index_col=0)
peak_gene_figr_n = peak_gene_figr.groupby('Gene').apply(lambda df:df['PeakRanges'].shape[0])
np.max(peak_gene_figr_n.values), np.median(peak_gene_figr_n.values)
peak_gene_figr.to_csv(f'{out_dir}/figr/grn/peak_gene.csv')
print('In the peak-gene associations: number of  CIS ', peak_gene_figr.PeakRanges.unique().shape[0], ', gene ', peak_gene_figr.Gene.unique().shape[0])
print('number of DORC genes ', (peak_gene_figr_n.values >= 5).sum())