In [21]:
suppressPackageStartupMessages(library(cicero))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(parallel))

In [5]:
clus <- makeCluster(32)
#clusterExport(clus)

In [6]:
wd = '/nfs/lab/projects/pbmc_snATAC/pipeline/snATAC/sample/PBMC1/lab_pipeline/cicero'
setwd(wd)

In [56]:
zz   = gzfile('pbmc1.merged_peaks.long_fmt.mtx.gz','rt')   ### long for matrix peak x barcode
data = read.table(zz,header=T,  sep="\t")

In [57]:
### convert to sparse matrix peak x barcode
sc.data <- with(data, sparseMatrix(i=as.numeric(peak), j=as.numeric(barcode), 
                                   x=value, dimnames=list(levels(peak), levels(barcode))))
rownames(sc.data) <- paste0('chr', gsub('-','_', gsub(':','_',rownames(sc.data))))


In [58]:
sc.umap <- read.table( 'pbmc1.cluster_labels.txt', sep='\t', header=T, row.names=1) ### map barcode to cluster 

In [59]:
cluster_peaks = read.table("pbmc1.sorted.merged.bed") ### peaks called in each cluster to convert as wide format

mat   = str_split_fixed( cluster_peaks[,4], "\\,", length(unique(sc.umap$cluster)))
mat2  = t( apply( mat, 1, function(x) as.numeric(unique(sc.umap$cluster) %in% x)))

peaks = cbind(paste(cluster_peaks[,1], cluster_peaks[,2],  cluster_peaks[,3], sep="_"), mat2)
colnames(peaks) = c("peak", unique(as.character(sc.umap$cluster)))

In [60]:
clusters <- unique(sc.umap$cluster)

In [99]:
for (cluster in clusters) {
    sc.umap.subset <- sc.umap[sc.umap[['cluster']]==cluster,]
    sc.data.subset <- sc.data[ rownames(sc.data) %in% peaks[peaks[, cluster]>0,'peak'], ]
    sc.data.subset <- sc.data.subset[ ,colnames(sc.data.subset) %in% rownames(sc.umap.subset)] > 0
    
    cellinfo <-data.frame(cells=colnames(sc.data.subset))
    row.names(cellinfo) <- cellinfo$cells
    dhsinfo <- data.frame(site_name=rownames(sc.data.subset))
    row.names(dhsinfo) <- dhsinfo$site_name
    dhsinfo <- cbind(dhsinfo, stringr::str_split_fixed(dhsinfo$site_name, "_", 3))
    names(dhsinfo) <- c('site_name','chr','bp1','bp2')
    dhsinfo$chr <- gsub('chr','', dhsinfo$chr)
    dhsinfo$bp1 <- as.numeric(as.character(dhsinfo$bp1))
    dhsinfo$bp2 <- as.numeric(as.character(dhsinfo$bp2))
    
    input_cds <- suppressWarnings(newCellDataSet(as(sc.data.subset, 'dgCMatrix'),
                                phenoData = methods::new('AnnotatedDataFrame', data = cellinfo),
                                featureData = methods::new('AnnotatedDataFrame', data = dhsinfo),
                                expressionFamily=negbinomial.size(),
                                lowerDetectionLimit=0))
    input_cds@expressionFamily <- binomialff()
    input_cds@expressionFamily@vfamily <- 'binomialff'
    input_cds <- detectGenes(input_cds)
    input_cds <- estimateSizeFactors(input_cds)
    
    input_cds <- input_cds[fData(input_cds)$num_cells_expressed > 0,]
    umap_coords <- sc.umap[colnames(sc.data.subset), c('UMAP1','UMAP2')]
    colnames(umap_coords) <- NULL
    
    cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords, k=30)
    window <- 1e6
    data('human.hg19.genome')
    distance_parameters <- estimate_distance_parameter(cicero_cds, window=window, maxit=100, sample_num=100, distance_constraint=500000, genomic_coords=human.hg19.genome)
    mean_distance_parameter <- mean(unlist(distance_parameters))
    cicero_out <- generate_cicero_models(cicero_cds, distance_parameter=mean_distance_parameter, window=window, genomic_coords=human.hg19.genome)
    conns <- assemble_connections(cicero_out, silent=FALSE)
    saveRDS(conns, file.path(wd, paste0('pbmc1.', cluster, '.1MB_cicero_conns.rds')))
    write.table(conns, file.path(wd, paste0('pbmc1.', cluster, '.cicero_conns.txt')), sep='\t', quote=FALSE, row.names=FALSE)

## this step is to remove duplicated connections
conns = conns[order(-conns$coaccess),]
bed = cbind(str_split_fixed(conns[,1], "\\_", 3 ), str_split_fixed(conns[,2], "\\_", 3 ))
ord = matrix(parRapply(clus, bed, function(x) x[order(x)] ), ncol=6, byrow=T)
ord = cbind(ord[, c(5,1:2,5,3:4)], conns$coaccess)
dedup = ord[!duplicated(ord),]
dedup = data.frame( Peak1 = paste(dedup[,1], dedup[,2], dedup[,3], sep="_")  , 
                    Peak2 = paste(dedup[,4], dedup[,5], dedup[,6], sep="_") , coaccess = dedup[,7]  )
              
              
  write.table(dedup, file.path(wd, paste0('pbmc1.', cluster, '.cicero_conns_dedup.txt')), sep='\t', quote=FALSE, row.names=FALSE)
             
              
}

Overlap QC metrics:
Cells per bin: 30
Maximum shared cells bin-bin: 26
Mean shared cells bin-bin: 1.23855504903992
Median shared cells bin-bin: 0


[1] "Successful cicero models:  4554"
[1] "Other models: "

Zero or one element in range 
                        1789 
[1] "Models with errors:  0"


In [17]:
getwd()

In [18]:
lu = read.table("/nfs/lab/projects/lung_snATAC/analysis/cicero/lung.all_peaks.anno.bed")

In [20]:
table(lu$V4)


      Alveolar_type_1       Alveolar_type_2  Arterial_endothelial 
                90652                103999                 68040 
               B_cell Capillary_endothelial       CD34_fibroblast 
                41335                 62210                 72467 
             Ciliated Lymphatic_endothelial          Macrophage_1 
                77439                 42404                133635 
         Macrophage_2     Matrix_fibroblast         Myofibroblast 
                86296                 89876                 57882 
              NK_cell              Pericyte                T_cell 
                35904                 56726                 44441 

In [22]:
wd = '/nfs/lab/projects/lung_snATAC/analysis/cicero/'
setwd(wd)

In [23]:
zz   = gzfile('lung.merged_peaks.long_fmt.mtx.gz','rt')   ### long for matrix peak x barcode
data = read.table(zz,header=T,  sep="\t")