# Write Peak Matrix

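Loads the per-timepoint Snap files, merges them, keeps only the barcodes listed in the session metadata, removes peaks that overlap blacklisted regions, and then writes the cell-by-peak count matrix to a sparse Matrix Market (MM) file together with a BED file of the peaks.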

In [42]:
library(SnapATAC)
library(GenomicRanges)
library(ggplot2) 
library(scales)
library(RColorBrewer)

# https://github.com/r3fang/SnapATAC/tree/master/examples/10X_snATAC
# https://github.com/r3fang/SnapATAC/tree/master/examples/10X_brain_5k

In [93]:
# Sample labels; one Snap file named "<label>.snap" is expected per label.
DAYS = c("D0", "D2", "D4", "D6", "D8", "D10", "D12", "D14")
# Directory that holds the Snap files.
FILE_PREFIX = "/srv/scratch/surag/scATAC-reprog/snap_smallpeaks_idr/"

# sprintf() is vectorised over DAYS, so all paths are built in a single call
# instead of growing a vector inside a loop. The format string is unchanged,
# so the resulting paths are identical to before (including the doubled "/"
# that comes from FILE_PREFIX already ending in a slash).
file.list = sprintf("%s/%s.snap", FILE_PREFIX, DAYS)

# Echo both vectors so the label/path pairing can be checked by eye.
DAYS
file.list

In [94]:
# Open every Snap file as a snap object, labelling each one with the entry of
# DAYS found at the same position as its path in file.list.
x.sp.ls = lapply(seq_along(file.list), function(idx) {
    createSnap(file = file.list[idx], sample = DAYS[idx])
})

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...

Epoch: reading the barcode session ...



In [95]:
# Fold the per-sample snap objects into a single object, combining them from
# left to right with snapRbind().
x.sp = Reduce(function(merged, nxt) snapRbind(merged, nxt), x.sp.ls)

In [96]:
# Summary of the merged object: all samples, before any barcode filtering.
x.sp

number of barcodes: 90765
number of bins: 0
number of genes: 0
number of peaks: 0
number of motifs: 0

In [97]:
# Load the per-cell metadata table of this session; the first line of the file
# holds the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
metadata = read.table("./sessions/20200520_n68916/metadata.tsv", header=TRUE)
# Key every row by "<sample>_<barcode>", the same key that is built from x.sp
# whenever cells are matched against this table.
rownames(metadata) = paste(metadata$sample, metadata$barcode, sep='_')
# Quick look at the first rows and at the table dimensions.
head(metadata)
dim(metadata)

Unnamed: 0_level_0,barcode,sample,umap1,umap2
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>
D0_AAACAACGACGATTAC,AAACAACGACGATTAC,D0,6.565758,-8.649868
D0_AAACAACGAGCGCACT,AAACAACGAGCGCACT,D0,5.604522,-7.835411
D0_AAACACGCTACGCCAC,AAACACGCTACGCCAC,D0,5.130516,-7.593246
D0_AAACACGCTAGTACGG,AAACACGCTAGTACGG,D0,4.926843,-8.365463
D0_AAACACGCTCCTCAAA,AAACACGCTCCTCAAA,D0,6.056337,-5.647577
D0_AAACACGCTCGAGTGA,AAACACGCTCGAGTGA,D0,3.408866,-9.058377


In [98]:
# Number of merged cells whose "<sample>_<barcode>" key has a metadata row.
sum(is.element(
    paste(x.sp@sample, x.sp@barcode, sep='_'),
    rownames(metadata)
))

In [99]:
# Keep only the cells whose "<sample>_<barcode>" key has a metadata row.
in_metadata = paste(x.sp@sample, x.sp@barcode, sep='_') %in% rownames(metadata)
x.sp = x.sp[in_metadata, ]

In [100]:
# Summary after filtering; the barcode count should now equal nrow(metadata).
x.sp

number of barcodes: 68916
number of bins: 0
number of genes: 0
number of peaks: 0
number of motifs: 0

In [101]:
# Check whether the cells are already in metadata row order; if this is not
# TRUE, x.sp has to be reordered to match the metadata.
all(rownames(metadata) == paste(x.sp@sample, x.sp@barcode, sep='_'))

In [102]:
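# How to compute the permutation that rearranges a vector A into the order of
# a vector B, when both contain the same set of unique elements:
#   A[order(A)] is A sorted, and order(order(B)) is the rank of each element
#   of B, so A[order(A)[order(order(B))]] == B.
# Worked example, writing o() for order():
# o(A)   o(B) o(o(B))   o(A)[o(o(B))] <---- that's what you need to do
# 3   ->  4     3           2
# 4   ->  2     2           4 
# 2   ->  1     4           1
# 1   ->  3     1           3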

In [103]:
# Build the index vector that puts the cells of x.sp into metadata row order.
# match() returns, for every metadata row name, the position of the identical
# key in x.sp. Provided the keys are unique and both sides hold the same set
# of keys, this is the same permutation as order(A)[order(order(B))], obtained
# with a direct lookup instead of three sorts.
xsp_sample_barcode = paste(x.sp@sample, x.sp@barcode, sep='_')
reorder = match(rownames(metadata), xsp_sample_barcode)
# Sanity check: this must print TRUE before the permutation is applied.
all(xsp_sample_barcode[reorder]==rownames(metadata))

In [104]:
# Apply the permutation so that row i of x.sp corresponds to row i of metadata.
x.sp = x.sp[reorder, ]
# Show the summary again; reordering should leave the barcode count unchanged.
x.sp

number of barcodes: 68916
number of bins: 0
number of genes: 0
number of peaks: 0
number of motifs: 0

In [105]:
# Confirm that, after reordering, every cell sits at the position of its
# metadata row; this is expected to print TRUE.
all(rownames(metadata) == paste(x.sp@sample, x.sp@barcode, sep='_'))

In [106]:
# Attach the cell-by-peak count matrix to the snap object. This is done after
# filtering and reordering, so only the retained cells are involved.
# num.cores=10 is passed through to addPmatToSnap() (presumably the number of
# parallel workers; confirm against the SnapATAC documentation).
x.sp = addPmatToSnap(x.sp, num.cores=10)

Epoch: reading cell-peak count matrix session ...



In [107]:
# Summary after loading the peak matrix; "number of peaks" is now non-zero.
x.sp

number of barcodes: 68916
number of bins: 0
number of genes: 0
number of peaks: 524135
number of motifs: 0

## Remove Blacklisted Regions

In [108]:
# Read the blacklist BED file and turn its first three columns (chromosome,
# start, end) into a GRanges object.
# NOTE(review): BED starts are 0-based while GRanges coordinates are 1-based;
# the start column is used unshifted here, so each region is one base wider on
# its left side. Confirm that this is acceptable.
black_list = read.table("../../../resources/blacklist/GRch38_unified_blacklist.bed")
black_list.gr = GRanges(
    seqnames = black_list[[1]],
    ranges = IRanges(start = black_list[[2]], end = black_list[[3]])
)

In [109]:
# Indices of the peaks (the query side of the overlap) that hit a blacklisted
# region.
idy = queryHits(findOverlaps(query = x.sp@peak, subject = black_list.gr))

In [110]:
# Drop the blacklisted peaks from the peak matrix. The guard skips the subset
# when nothing overlapped: in R, negative indexing with an empty vector selects
# nothing rather than everything.
if (length(idy) > 0) {
    x.sp = x.sp[, -idy, mat="pmat"]
}

In [111]:
# Summary after blacklist removal; the peak count drops by the peaks removed.
x.sp

number of barcodes: 68916
number of bins: 0
number of genes: 0
number of peaks: 523984
number of motifs: 0

In [112]:
# Sanity check: total count, over all cells, in the peaks that overlap the OCT
# promoter interval (an initial version of the data had very few or none).
sum(x.sp@pmat[, subjectHits(findOverlaps(
    query = GRanges("chr6:31170549-31170836"),
    subject = x.sp@peak
))])

## Write Matrix and Peaks

In [113]:
# Label of the peak set; embedded in the names of the output files.
PEAK_NAME = "idr.smallpeaks"
# Name of the directory under ./sessions/ that receives the output files.
# NOTE(review): the metadata path earlier in the notebook hard-codes the same
# session name; keep the two in sync.
SESSION = "20200520_n68916"

In [114]:
# Export the peak count matrix in Matrix Market format into the session
# directory; the file name carries the peak-set label.
writeMM(
    obj = x.sp@pmat,
    file = sprintf("./sessions/%s/pmat.%s.sparse.mm",SESSION, PEAK_NAME)
)

NULL

In [115]:
# Write the peak coordinates (chromosome, start, end) of x.sp@peak as a
# headerless, tab-separated BED file next to the exported matrix, so that the
# columns of the matrix can be mapped back to genomic ranges.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
write.table(data.frame(seqnames=seqnames(x.sp@peak),
                       starts=start(x.sp@peak),
                       ends=end(x.sp@peak)),
            file=sprintf("./sessions/%s/peaks.%s.bed", SESSION, PEAK_NAME),
            quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)