In [9]:
# Given a set of per-sample coverage tables from bedtools coverage
# (tab-separated, despite the .csv suffix), find tiles throughout the genome
# (possibly intergenic) with at least 5 reads in at least 20% of the BAMs.

library(Rsamtools)
library(GenomicRanges)
library(parallel) 

“package ‘Rsamtools’ was built under R version 4.2.3”
Loading required package: GenomeInfoDb

“package ‘GenomeInfoDb’ was built under R version 4.2.3”
Loading required package: BiocGenerics

“package ‘BiocGenerics’ was built under R version 4.2.1”

Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

“package ‘S4Vectors’ was built under R version 4.2.3”
Loading required package: stats4


Attaching package: ‘S4Vectors’


The following

In [1]:
# Step up one directory so the relative data/ and annotation/ paths used by
# the following cells resolve (presumably from the project root - confirm).
setwd(dir = "..")

In [7]:
# Inputs: directory with the coverage tables, list of sample IDs, and the
# suffix that turns a sample ID into its coverage file name.
coverageDir <- "data/coverage/"
sampleIDfile <- "annotation/sample-ids-63.txt"
covgFileSuffix <- "-bedcov-uniqMap25M10percIDS.csv"
# Output: RData file that receives the filtered counts.
featureCountFile <- "data/expressedTiles-featureCounts-25M10percIDS-min5reads20percent.RData"

# A region counts as "expressed" in a sample when it has at least this many reads ...
leastNumReads <- 5
# ... and is kept when at least this percentage of samples reach that level.
leastPercSamples <- 20
# Number of parallel worker processes.
nbCPUs <- 6

In [4]:
# Load the sample IDs (one per line) and build the coverage-file path for each.
neededSamples <- readLines(sampleIDfile)
head(neededSamples)
length(neededSamples)
csvFiles <- paste0(coverageDir, neededSamples, covgFileSuffix)
head(csvFiles)
# Report whether every expected coverage file is present on disk.
cat("all needed files exist? ", all(file.exists(csvFiles)), "\n")

all needed files exist?  TRUE 


In [5]:
# read in 1st sample to get the rownames (coordinates of genome slices)
startt = Sys.time()
tab = read.csv(file = csvFiles[1],header = F,stringsAsFactors = F,sep="\t")
endt = Sys.time()
difftime(endt, startt) # 1.1min for 1 sample
head(tab)
coo = tab[,1:3]
head(coo)
resultMat = matrix(tab$V4,ncol = 1)
base = neededSamples[1]
colnames(resultMat) = base
rownames(resultMat) = paste0(coo[,1],":",coo[,2],"-",coo[,3])
dim(resultMat)
head(resultMat)
colSums(resultMat)
length(which(is.na(resultMat[,1])))
# 0 NAs

Time difference of 56.09442 secs

Unnamed: 0_level_0,V1,V2,V3,V4
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>
1,chr1,1,100,0
2,chr1,101,200,0
3,chr1,201,300,0
4,chr1,301,400,0
5,chr1,401,500,0
6,chr1,501,600,0


Unnamed: 0_level_0,V1,V2,V3
Unnamed: 0_level_1,<chr>,<int>,<int>
1,chr1,1,100
2,chr1,101,200
3,chr1,201,300
4,chr1,301,400
5,chr1,401,500
6,chr1,501,600


Unnamed: 0,SXR0002
chr1:1-100,0
chr1:101-200,0
chr1:201-300,0
chr1:301-400,0
chr1:401-500,0
chr1:501-600,0


In [6]:
# Read one coverage table and return its count column as a one-column matrix.
#
# Args:
#   fileName: path to a tab-separated table without header; the 4th column
#             (V4) is taken as the per-row count.
# Returns:
#   A matrix with a single column whose name is `fileName`.
# Side effect:
#   Appends "<fileName> .. done" to log-findHotsp.txt in the working directory
#   once the table has been read.
readIn1file = function(fileName) {
    tab = read.csv(file = fileName, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
    if (is.null(tab$V4)) {
        stop("no 4th column (V4) found in ", fileName, call. = FALSE)
    }
    f = file(description = "log-findHotsp.txt", open = "a")
    # close the log connection even if writing fails, so it cannot leak
    on.exit(close(f), add = TRUE)
    writeLines(text = paste0(fileName, " .. done"), con = f)
    v = matrix(data = tab$V4, ncol = 1)
    colnames(v) = fileName
    v
}

In [10]:
# Start a fresh log file; readIn1file appends one line per finished file.
writeLines(text = "", con = "log-findHotsp.txt")
startt = Sys.time()
# Read the remaining samples in parallel (sample 1 is already in resultMat).
resultList = mclapply(X = csvFiles[-1], FUN = readIn1file, mc.cores = nbCPUs)
# mclapply does not stop when a worker fails: the affected elements come back
# as "try-error" objects (or NULL), which cbind would silently bind in or drop.
# Check explicitly before combining.
failedReads = vapply(
    resultList,
    function(x) !is.matrix(x) || nrow(x) != nrow(resultMat),
    logical(1)
)
if (any(failedReads)) {
    stop("reading failed or row count differs from the first sample for: ",
         paste(csvFiles[-1][failedReads], collapse = ", "))
}
resultMatAll = cbind(resultMat, do.call(cbind, resultList))
endt = Sys.time()

round(difftime(endt, startt, units="min"))
# about 10 min with 6 cpus

Time difference of 10 mins

In [11]:
# Continue under the name resultMat and remove the temporary binding.
resultMat <- resultMatAll
rm(list = "resultMatAll")

In [12]:
# Peek at the first rows of the combined count matrix.
head(resultMat, n = 6L)

Unnamed: 0,SXR0002,data/coverage/SXR0004-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0006-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0010-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0014-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0016-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0018-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0028-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0029-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0030-bedcov-uniqMap25M10percIDS.csv,⋯,data/coverage/SXR0105-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0111-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0112-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0113-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0114-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0115-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0117-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0118-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0121-bedcov-uniqMap25M10percIDS.csv,data/coverage/SXR0122-bedcov-uniqMap25M10percIDS.csv
chr1:1-100,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:101-200,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:201-300,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:301-400,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:401-500,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1:501-600,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [13]:
# Reduce every column name to its sample ID ("SXR" followed by digits);
# the columns read in parallel still carry the full file path.
colnames(resultMat) <- gsub(
  pattern = ".*(SXR\\d+)\\-?.*",
  replacement = "\\1",
  x = colnames(resultMat)
)

In [14]:
# Sneak peek: read sums over the first 10000 rows for the first three samples.
colSums(resultMat[seq_len(10000), seq_len(3)])

In [15]:
# Distribution of the per-row read totals (summed over all samples).
quantile(rowSums(resultMat), probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1))

In [16]:
# save whole matrix as feature counts:
fcountGenes = list()
fcountGenes$counts <- resultMat
rownames(fcountGenes$counts) =  gsub(":|-","_",rownames(fcountGenes$counts))
fcountGenes$counts[1:2,1:3]
fcountGenes$annotation = 
  data.frame(GeneID = rownames(fcountGenes$counts), stringsAsFactors = F)
#fcountGenes$annotation$GeneID = gsub(":|-","_",fcountGenes$annotation$GeneID)
head(fcountGenes$annotation$GeneID)
fcountGenes$annotation$Chr = gsub("(chr\\w+)_\\d+_\\d+","\\1",fcountGenes$annotation$GeneID)
fcountGenes$annotation$Start =
  as.numeric(gsub("chr\\w+_(\\d+)_.+","\\1",fcountGenes$annotation$GeneID))
quantile(fcountGenes$annotation$Start)
fcountGenes$annotation$Length = 100
head(fcountGenes$annotation,3)
totalReads = colSums(fcountGenes$counts)
head(totalReads)

Unnamed: 0,SXR0002,SXR0004,SXR0006
chr1_1_100,0,0,0
chr1_101_200,0,0,0


Unnamed: 0_level_0,GeneID,Chr,Start,Length
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,chr1_1_100,chr1,1,100
2,chr1_101_200,chr1,101,100
3,chr1_201_300,chr1,201,100


In [17]:
# Distribution of the per-sample read totals.
quantile(totalReads, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1))

In [18]:
# screen: for each region, how many samples have at least leastNumReads reads
#   caution: needs a lot of RAM, maybe only split into 4-10 parallel processes
# nbCPUs = 6
idx = seq_len(nrow(resultMat))
# contiguous chunks, so concatenating the per-chunk results keeps row order
idxList = split(idx, sort(idx %% nbCPUs))

# Iterate over the chunks that actually exist (fewer than nbCPUs when the
# matrix has fewer rows than workers).
nbHiCovSamplesPerSlice = mclapply(mc.cores = nbCPUs, X = seq_along(idxList),
        function(i) {
            # drop = FALSE keeps a one-row chunk as a matrix
            chunk = resultMat[idxList[[i]], , drop = FALSE]
            # vectorized count per row; NA entries are not counted
            hits = rowSums(chunk >= leastNumReads, na.rm = TRUE)
            storage.mode(hits) = "integer"
            hits
        }
    )
# mclapply reports worker failures as list elements instead of stopping
if (any(!vapply(nbHiCovSamplesPerSlice, is.numeric, logical(1)))) {
    stop("at least one screening worker failed; rerun with fewer parallel processes")
}

In [19]:
# Concatenate the per-chunk results into one vector (one entry per row).
nbHiCovSamplesPerSlice <- unlist(nbHiCovSamplesPerSlice, use.names = TRUE)

In [20]:
# Sanity checks on the per-row sample counts: size, first values, quartiles.
length(nbHiCovSamplesPerSlice)
head(nbHiCovSamplesPerSlice, n = 6L)
quantile(nbHiCovSamplesPerSlice, probs = seq(0, 1, 0.25))

In [21]:
# One fifth of the number of coverage files, rounded down.
length(csvFiles) %/% 5

In [22]:
# reduce expression matrix to expressed regions

# Smallest number of samples that is at least leastPercSamples percent of the
# cohort. ceiling() is used because floor() would accept regions below the
# stated percentage and would give a threshold of 0 (every region passes)
# for small cohorts.
leastNumSamples = ceiling(leastPercSamples*length(neededSamples)/100)
leastNumSamples
wh <- which(nbHiCovSamplesPerSlice >= leastNumSamples)
length(wh); round(100*length(wh)/nrow(resultMat),5)
# recorded with the former floor()-based cutoff:
# min 5 reads in min 20% cohort: 6402, 0.02%

In [None]:
# Keep only the rows that passed the screen; drop = FALSE keeps the result a
# matrix (with row names) even when exactly one row is selected.
expressedTilesCounts5_20 = resultMat[wh, , drop = FALSE]
dim(expressedTilesCounts5_20)

In [24]:
# Build the list (counts + annotation) for the expressed regions and save it.
fcountGenes = list()
fcountGenes$counts <- expressedTilesCounts5_20
# preview at most 5 x 5 entries, so a small result does not raise an error
fcountGenes$counts[seq_len(min(5, nrow(fcountGenes$counts))),
                   seq_len(min(5, ncol(fcountGenes$counts))), drop = FALSE]
# row IDs become chr_start_end (":" and "-" replaced by "_")
rownames(fcountGenes$counts) = gsub(":|-","_",rownames(fcountGenes$counts))
fcountGenes$annotation =
  data.frame(GeneID = rownames(fcountGenes$counts), stringsAsFactors = FALSE)
fcountGenes$annotation$Chr = gsub("(chr\\w+)_\\d+_\\d+","\\1",fcountGenes$annotation$GeneID)
fcountGenes$annotation$Start =
  as.numeric(gsub("chr\\w+_(\\d+)_.+","\\1",fcountGenes$annotation$GeneID))
quantile(fcountGenes$annotation$Start)
# Length is derived from the coordinates (end - start + 1, inclusive) rather
# than fixed at 100, so a truncated region gets its real length.
fcountGenes$annotation$Length =
  as.numeric(sub(".*_(\\d+)$","\\1",fcountGenes$annotation$GeneID)) -
  fcountGenes$annotation$Start + 1
head(fcountGenes$annotation)
# NOTE(review): these are column sums over the filtered rows only, not over
# the full matrix - confirm this is the total that should be saved.
totalReads = colSums(fcountGenes$counts)
save(fcountGenes,totalReads,file = featureCountFile)

Unnamed: 0,SXR0002,SXR0004,SXR0006,SXR0010,SXR0014
chr1:1337301-1337400,0,0,3,3,10
chr1:1477001-1477100,0,1,3,2,7
chr1:1477101-1477200,0,0,0,0,2
chr1:1716701-1716800,0,0,0,4,6
chr1:1716801-1716900,0,0,0,3,5


Unnamed: 0_level_0,GeneID,Chr,Start,Length
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,chr1_1337301_1337400,chr1,1337301,100
2,chr1_1477001_1477100,chr1,1477001,100
3,chr1_1477101_1477200,chr1,1477101,100
4,chr1_1716701_1716800,chr1,1716701,100
5,chr1_1716801_1716900,chr1,1716801,100
6,chr1_1717201_1717300,chr1,1717201,100
