In [None]:
# Calculate the coverage variance of each sample
#  to decide which samples to keep in the discovery cohort.

In [None]:
library(GenomicRanges)
library(parallel)

In [None]:
# Move the working directory up one level; the relative paths used further
# down ("data/star/", "annotation/hg19.chrom.sizes") are resolved from there.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
setwd("..")

### bam files and sample names

In [None]:
# Collect the filtered alignment files (BAM) found anywhere below data/star/.
# `pattern` is a regular expression matched against the file name: the dots are
# escaped so they only match a literal ".", and the trailing "\\.bam$" keeps
# BAM files only (e.g. not "*.bam.bai" index files).
bamFiles = list.files(path = "data/star/",
                      pattern = "Aligned\\.sortedByCoord\\.out-uniqMap25M10percIDS.*\\.bam$",
                      full.names = TRUE, recursive = TRUE, include.dirs = TRUE)
# Fail early rather than running the rest of the analysis on an empty input.
if (length(bamFiles) == 0) stop("no BAM files found below data/star/")
length(bamFiles)
bamFiles

In [None]:
# Derive the sample identifier ("SXR" followed by digits) from every BAM path.
# The regular expression spans the whole path, so each path is replaced by the
# captured identifier.
sampleNames = sub(pattern = ".*(SXR\\d+).+", replacement = "\\1", x = bamFiles)
length(sampleNames)
sampleNames

In [None]:
# Dissect chromosomes into equally sized (1 Mb) tiles:
chrTileSize = 1e+6 # how big the slices of chr should be
# Chromosomes to analyse, listed in the order they should be processed
# (autosomes 1-22, then X).
chrNames = paste0("chr", c(1:22, "X"))
chrSizes = read.csv(file = "annotation/hg19.chrom.sizes", header = FALSE, sep = "\t",
                    stringsAsFactors = FALSE, col.names = c("chr","size"))
chrSizes = chrSizes[chrSizes$chr %in% chrNames,]
# Sort by position in chrNames. This avoids converting "X" to a number, which
# yields NA with a coercion warning and only sorts last by accident.
chrSizes = chrSizes[order(match(chrSizes$chr, chrNames)),]
# Turn the table into a named vector: chromosome name -> chromosome size.
chrSizes = setNames(chrSizes$size, chrSizes$chr)

In [None]:
# Cut every chromosome into tiles of width chrTileSize; with
# cut.last.tile.in.chrom enabled the last tile of a chromosome is truncated
# at the chromosome end.
chrTilesGR = tileGenome(seqlengths = chrSizes,
                        tilewidth = chrTileSize,
                        cut.last.tile.in.chrom = TRUE)
chrTilesGR
# Plain data.frame copy of the tiles, used by the following steps.
chrTilesDF = as.data.frame(x = chrTilesGR)

In [None]:
# Export the tiles as a BED file for samtools bedcov.
# BED intervals are 0-based and half-open, whereas the starts in chrTilesDF are
# 1-based and inclusive. The start is therefore shifted down by one; written
# unchanged, the first base of every tile would be left out of the coverage sum.
# Coordinates are kept as integers so they are never written in scientific
# notation (e.g. "1e+06").
chrTilesBED = data.frame(chr   = chrTilesDF$seqnames,
                         start = as.integer(chrTilesDF$start - 1),
                         end   = as.integer(chrTilesDF$end))
write.table(x = chrTilesBED, file = "coverage-variance-chromosome-tiles.bed",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

In [None]:
# Run `samtools bedcov` for one BAM file over the tile BED file.
#
# Arguments:
#   bamFile  path of the BAM file to process
#   bedFile  path of the BED file with the regions; defaults to the tile file
#            written earlier in this notebook
# Returns:
#   character vector holding the lines printed by samtools (one per element).
# Stops with an error if samtools exits with a non-zero status.
countOnefile = function(bamFile, bedFile = "coverage-variance-chromosome-tiles.bed") {
  # Quote both paths so that spaces or shell metacharacters cannot break the command.
  cmd = paste("samtools bedcov", shQuote(bedFile), shQuote(bamFile))
  out = system(command = cmd, intern = TRUE)
  # With intern = TRUE a non-zero exit code is reported via the "status" attribute.
  status = attr(out, "status")
  if (!is.null(status) && status != 0) {
    stop("samtools bedcov failed (exit status ", status, ") for ", bamFile)
  }
  out
}

In [None]:
# Process all BAM files in parallel, using up to nbCPUs worker processes;
# `results` holds one element per BAM file, in the order of bamFiles.
nbCPUs = 20
results = mclapply(X = bamFiles, FUN = countOnefile, mc.cores = nbCPUs)

In [None]:
# Sanity check: number of output lines returned for the first BAM file.
length(results[[1L]])

In [None]:
# Preview the first three output lines of every BAM file.
sapply(results, function(res) head(res, n = 3))

In [None]:
# Parse the raw output lines: each line is split on tabs and laid out as a
# 4-column table (assumes exactly four fields per line); the 4th field is kept
# as the numeric coverage. sapply() then binds the per-file vectors column-wise.
resultsMat = sapply(results, function(res) {
  fields = unlist(strsplit(res, split = "\t", fixed = TRUE))
  perLine = matrix(fields, ncol = 4, byrow = TRUE)
  as.numeric(perLine[, 4])
})
head(resultsMat)
dim(resultsMat)

In [None]:
# Reshape the coverage matrix into long format: one row per (sample, tile).
# The matrix is flattened column by column, so the sample labels are repeated
# in blocks and the tile coordinates are recycled once per sample.
nTiles = nrow(resultsMat)
nSamples = ncol(resultsMat)
resultsDF = data.frame(sample = rep(sampleNames, each = nTiles),
                       coverage = as.vector(resultsMat),
                       chr = rep(chrTilesDF$seqnames, nSamples),
                       pos = rep(chrTilesDF$start, nSamples))
tail(resultsDF)

In [None]:
# For each sample, calculate the variance, standard deviation, inter-quartile
# range and mean of the tile coverage within each chromosome, then summarise
# every statistic by its median across chromosomes.
statNames = c("var","sd","iqr","mean")
variances = data.frame(matrix(data = 0, nrow = length(sampleNames), ncol = length(statNames),
                               dimnames = list(sampleNames, statNames)))
#head(variances)
for (sample in sampleNames) {
  inSample = resultsDF$sample == sample
  # Coverage values of this sample, grouped by chromosome. as.character() drops
  # unused factor levels so that no empty groups are created.
  covByChr = split(resultsDF$coverage[inSample], as.character(resultsDF$chr[inSample]))
  # The medians are taken over the per-chromosome values only. Seeding the
  # accumulators with 0 (as done previously) adds a spurious extra value that
  # biases every median downwards.
  variances[sample,"var"]  = median(vapply(covByChr, var,  numeric(1)))
  variances[sample,"sd"]   = median(vapply(covByChr, sd,   numeric(1)))
  variances[sample,"iqr"]  = median(vapply(covByChr, IQR,  numeric(1)))
  variances[sample,"mean"] = median(vapply(covByChr, mean, numeric(1)))
}
# Relative dispersion: median sd divided by median mean coverage.
variances$sd_per_mean <- variances$sd / variances$mean
head(variances)

In [None]:
# Export the sample IDs together with the 5th column of `variances` as a
# tab-separated table with a header line.
write.table(x = data.frame(IDs = rownames(variances), sd_mean = variances[[5]]),
            file = "coverage-variance.csv",
            quote = FALSE,
            sep = "\t",
            row.names = FALSE)