The goal here is to joint-call variants across the genome using GATK. Even though we don't need variants per se, we want to use GATK's repositioning (INDELS) to have as comparable a data set as possible, then extract the allele frequencies for TRD calling.

What I need to do here to make sure that there is no confusion about the positions of opposite homozygotes is to call SNPs using the joint calling with the 2K matrix.

In [1]:
source("../../BrusselSprouts/scripts/functions.R")


scripts_dir="/home/jnrunge/data/trd/mapped_reads/scripts/"

initial_timedate=Sys.time()

# what needs to be done is individually, and parallely genotype all crosses, then merge them into a DB, and once call the joint genotyping (once per batch of sequencing, so twice in total)

bam_files <- list.files(
  path = "~/data/trd/mapped_reads",
  pattern = "YJNRC[0-9]*.bam$",
  full.names = TRUE
) %>% 
  grep(pattern = "dontuse", invert = TRUE, value = TRUE)


bam_files=c(bam_files, list.files(path="~/data/trd/mapped_reads",pattern="ChrisC[0-9].bam$",full.names = TRUE))
bam_files=naturalsort(bam_files)
crosses=unlist(lapply(basename(bam_files), getFirst_v2, split="."))

for(cross in crosses){
    print(cross)

    if(file.exists(paste0("/home/jnrunge/data/trd/mapped_reads/",cross,".g.vcf.gz"))){
        if(file.mtime(paste0("/home/jnrunge/data/trd/mapped_reads/",cross,".g.vcf.gz"))>file.mtime(file.path("/home/jnrunge/data/trd/mapped_reads/",paste0(cross,".bam")))){
            print("GVCF exists")
            next
        }
    }
    stop()
    file.create(running_file<-paste0("~/data/TRD/",cross,".runningGATK"))
    cmds=paste0("cd /home/jnrunge/data/trd/mapped_reads/ && gatk AddOrReplaceReadGroups -I ",cross,".bam -O ",cross,".ReadGroups.bam --RGID ",cross," --RGLB ",cross," --RGPL ILLUMINA --RGPU ",cross," --RGSM ",cross,"")
    cmds=c(cmds,paste0("mv -f ",cross,".ReadGroups.bam ",cross,".bam"))
    cmds=c(cmds,paste0("samtools index ",cross,".bam"))
    cmds=c(cmds,paste0("gatk HaplotypeCaller -R ../../TRD/R64_nucl.fasta -I ",cross,".bam -O ",cross,".g.vcf.gz --emit-ref-confidence GVCF && rm -f ","~/data/TRD/",cross,".runningGATK"))

    sbatch_list=execute_complex_sbatch(cmds, jobname = jobname<-"GATK", scripts_dir = scripts_dir, uniqueRunID = cross, cores="1", mem="32gb", time="long", env="bwaetc", initial_timedate = initial_timedate, jobs_simul = 10, jobs_total = 20)


}

if(exists("sbatch_list")){
    print("Batching up...")
    start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
    }

while(length(list.files(path = "~/data/TRD", pattern = ".runningGATK$"))>0){
    Sys.sleep(60)
}


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




[1] "ChrisC1"
[1] "GVCF exists"
[1] "ChrisC2"
[1] "GVCF exists"
[1] "ChrisC3"
[1] "GVCF exists"
[1] "ChrisC4"
[1] "GVCF exists"
[1] "ChrisC5"
[1] "GVCF exists"
[1] "ChrisC6"
[1] "GVCF exists"
[1] "ChrisC7"
[1] "GVCF exists"
[1] "ChrisC8"
[1] "GVCF exists"
[1] "YJNRC1"
[1] "GVCF exists"
[1] "YJNRC2"
[1] "GVCF exists"
[1] "YJNRC3"
[1] "GVCF exists"
[1] "YJNRC4"
[1] "GVCF exists"
[1] "YJNRC5"
[1] "GVCF exists"
[1] "YJNRC6"
[1] "GVCF exists"
[1] "YJNRC7"
[1] "GVCF exists"
[1] "YJNRC8"
[1] "GVCF exists"
[1] "YJNRC9"
[1] "GVCF exists"
[1] "YJNRC11"
[1] "GVCF exists"
[1] "YJNRC12"
[1] "GVCF exists"
[1] "YJNRC14"
[1] "GVCF exists"
[1] "YJNRC15"
[1] "GVCF exists"
[1] "YJNRC16"
[1] "GVCF exists"
[1] "YJNRC17"
[1] "GVCF exists"
[1] "YJNRC18"
[1] "GVCF exists"
[1] "YJNRC19"
[1] "GVCF exists"
[1] "YJNRC20"
[1] "GVCF exists"
[1] "YJNRC21"
[1] "GVCF exists"
[1] "YJNRC22"
[1] "GVCF exists"
[1] "YJNRC23"
[1] "GVCF exists"
[1] "YJNRC24"
[1] "GVCF exists"
[1] "YJNRC25"
[1] "GVCF exists"
[1] "YJNRC26"
[1]

In [2]:
library(readxl)

crosses=list.files("~/data/trd/mapped_reads",pattern = "g.vcf.gz$",full.names = TRUE)

my_crosses=read_xlsx("~/data/trd/Crosses.xlsx", sheet=2)[,c("Short name 1", "Short name 2", "Status", "Colonies in pool")]
chris_crosses=data.frame(sample1=c("ACP","BAP","CCD","ATE","ACK","AKE","BAH","ANG"),
                        sample2=c("BFP","CMP","CPG","YCR","ACV","BAH","CGD","CEI"))
all_used_samples=unique(c(my_crosses$`Short name 1`, my_crosses$`Short name 2`, chris_crosses$sample1, chris_crosses$sample2))
all_used_samples[all_used_samples=="YCR"]=paste0("SACE_",all_used_samples[all_used_samples=="YCR"])
summary(file.exists(paste0("/home/jnrunge/data/trd/GVCF_2489Strains/",all_used_samples,".g.vcf.gz")))

all_used_samples[!file.exists(paste0("/home/jnrunge/data/trd/GVCF_2489Strains/",all_used_samples,".g.vcf.gz"))]


samples=c(crosses,paste0("/home/jnrunge/data/trd/GVCF_2489Strains/",all_used_samples,".g.vcf.gz"))
samples=data.frame(id=unlist(lapply(basename(samples), getFirst_v2, split=".")),file=samples)
fwrite(samples, "~/data/trd/mapped_reads/all_samples.tsv", col.names = FALSE, sep="\t")
samples

[1m[22mNew names:
[36m*[39m `Is in Stock` -> `Is in Stock...4`
[36m*[39m `Is in Stock` -> `Is in Stock...6`


   Mode    TRUE 
logical      57 

id,file
<chr>,<chr>
ChrisC1,/home/jnrunge/data/trd/mapped_reads/ChrisC1.g.vcf.gz
ChrisC2,/home/jnrunge/data/trd/mapped_reads/ChrisC2.g.vcf.gz
ChrisC3,/home/jnrunge/data/trd/mapped_reads/ChrisC3.g.vcf.gz
ChrisC4,/home/jnrunge/data/trd/mapped_reads/ChrisC4.g.vcf.gz
ChrisC5,/home/jnrunge/data/trd/mapped_reads/ChrisC5.g.vcf.gz
ChrisC6,/home/jnrunge/data/trd/mapped_reads/ChrisC6.g.vcf.gz
ChrisC7,/home/jnrunge/data/trd/mapped_reads/ChrisC7.g.vcf.gz
ChrisC8,/home/jnrunge/data/trd/mapped_reads/ChrisC8.g.vcf.gz
YJNRC1,/home/jnrunge/data/trd/mapped_reads/YJNRC1.g.vcf.gz
YJNRC11,/home/jnrunge/data/trd/mapped_reads/YJNRC11.g.vcf.gz


In [3]:
# make BED file (all positions because GATK is retarded)
fai=fread("~/data/TRD/R64_nucl.fasta.fai")
bed=data.frame(chr=fai$V1, from=1, to=fai$V2)
head(bed)
fwrite(bed, "~/data/TRD/R64_nucl.fasta.fai.bed", col.names = FALSE, sep="\t")

Unnamed: 0_level_0,chr,from,to
Unnamed: 0_level_1,<chr>,<dbl>,<int>
1,chromosome1,1,230218
2,chromosome2,1,813184
3,chromosome3,1,316620
4,chromosome4,1,1531933
5,chromosome5,1,576874
6,chromosome6,1,270161


In [5]:
if (file.mtime("~/data/trd/mapped_reads/TRD.vcf.gz") < max(file.mtime(samples$file))) {
  # if new samples can be added to the big vcf, then merge again

  file.create(running_file <- paste0("~/data/TRD/runningGATKmerge"))
  cmd <- "cd ~/data/trd/mapped_reads && rm -rf TRD_DB"
  cmd <- paste0(cmd, " && gatk GenomicsDBImport --batch-size 200 --genomicsdb-workspace-path TRD_DB --sample-name-map ~/data/trd/mapped_reads/all_samples.tsv -L ~/data/TRD/R64_nucl.fasta.fai.bed")
  cmd <- paste0(cmd, " && gatk GenotypeGVCFs -R ../../TRD/R64_nucl.fasta -V gendb://TRD_DB -O TRD.vcf.gz")
  cmd <- paste0(cmd, " && bcftools query -l TRD.vcf.gz > TRD.vcf.gz.samples && rm -f ~/data/TRD/runningGATKmerge")
  execute_cmd_sbatch(cmd, mem = "32G", cpu = "1", time = "long", env = "bwaetc", jobname = "GATK_merge")

  while (file.exists(running_file)) {
    Sys.sleep(60)
  }
}

In [6]:
Crosses=readxl::read_xlsx("~/data/trd/Crosses.xlsx", sheet=2)
head(Crosses<-Crosses[,c("Cross ID","Short name 1","Short name 2")])
tail(Crosses)

[1m[22mNew names:
[36m*[39m `Is in Stock` -> `Is in Stock...4`
[36m*[39m `Is in Stock` -> `Is in Stock...6`


Cross ID,Short name 1,Short name 2
<chr>,<chr>,<chr>
YJNRC1,BAK,BET
YJNRC2,BAN,BTI
YJNRC3,AKH,BQC
YJNRC4,ABS,BEF
YJNRC5,BAK,BMK
YJNRC6,BAN,CRB


Cross ID,Short name 1,Short name 2
<chr>,<chr>,<chr>
YJNRC29,AAR,CPG
YJNRC30,ABG,CPG
YJNRC31,AKI,CPG
YJNRC32,ATE,CPG
YJNRC33,AAR,AKH
YJNRC34,AKH,AKQ


In [13]:
gvcf="/home/jnrunge/data/trd/mapped_reads/TRD.vcf.gz"

In [14]:
crosses=unlist(lapply(basename(bam_files), getFirst_v2, split="."))
initial_timedate=Sys.time()
jobname="OHLoci"
scripts_dir="/home/jnrunge/data/trd/mapped_reads/scripts/"
for(cross in crosses){
    run=FALSE
    print(cross)
    #gvcf<-paste0("/home/jnrunge/data/trd/mapped_reads/",cross,".g.vcf.gz")
    if(file.exists(ohfile<-paste0("~/data/trd/mapped_reads/",cross,".hetLoci.gz"))){
        if(file.mtime(gvcf)>file.mtime(ohfile)){
            run=TRUE
            
        }
    }else{
        run=TRUE
    }
    if(run == TRUE){
            file.create(running_file<-paste0("~/data/TRD/Oppo-Homo-Pos/",cross,".runningOHLOCI"))
            cmd=paste0("cd /home/jnrunge/data/trd/mapped_reads/ && sh ~/TRD/01_Mapping/02_OHLoci2.sh ",cross)
            cmd=paste0(cmd," && rm -f ",running_file)
            sbatch_list=execute_complex_sbatch(cmd, jobname = jobname, scripts_dir = scripts_dir, uniqueRunID = cross, cores="1", mem="8G", time="short", env="bwaetc", initial_timedate = initial_timedate, jobs_simul = 10, jobs_total = 30)
    }
    }
if(exists("sbatch_list") &  jobname == "OHLoci"){
    print(sbatch_list)
    start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
}
while(length(list.files(path = "~/data/TRD/Oppo-Homo-Pos", pattern = ".runningOHLOCI$"))>0){
    Sys.sleep(60)
}

[1] "ChrisC1"
[1] "ChrisC2"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC2.sbatch"
[1] "ChrisC3"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC3.sbatch"
[1] "ChrisC4"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC4.sbatch"
[1] "ChrisC5"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC5.sbatch"
[1] "ChrisC6"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC6.sbatch"
[1] "ChrisC7"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC7.sbatch"
[1] "ChrisC8"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/OHLoci-ChrisC8.sbatch"
[1] "YJNRC1"
[1] "YJNRC2"
[1] "YJNRC3"
[1] "YJNRC4"
[1] "YJNRC5"
[1] "YJNRC6"
[1] "YJNRC7"
[1] "YJNRC8"
[1] "YJNRC9"
[1] "YJNRC11"
[1] "YJNRC12"
[1] "YJNRC14"
[1] "YJNRC15"
[1] "YJNRC16"
[1] "YJNRC17"
[1] "YJNRC18"
[1] "YJNRC19"
[1] "YJNRC20"
[1] "YJNRC21"
[1] "YJNRC22"
[1] "YJNRC23"
[1] "YJNRC24"
[1] "YJNRC25"
[1] "YJNRC26"
[1] "YJNRC27"
[1] "YJNRC29"
[1] "YJNRC30"
[1] "YJNRC31"
[

In [15]:
jobname="OHLoci2"
remove("sbatch_list")
for(s in samples$id){
    if(grepl("YJNRC",s,fixed = TRUE)|grepl("Chris",s,fixed = TRUE)){
        next
    }
    run=FALSE
    print(s)
    if(file.exists(ohfile<-paste0("~/data/trd/mapped_reads/",s,".homLoci.gz"))){
        if(file.mtime(gvcf)>file.mtime(ohfile)){
            run=TRUE
            
        }
    }else{
        run=TRUE
    }
    if(run == TRUE){
            file.create(running_file<-paste0("~/data/TRD/Oppo-Homo-Pos/",s,".runningOHLOCI"))
            cmd=paste0("cd /home/jnrunge/data/trd/mapped_reads/ && sh ~/TRD/01_Mapping/02_OHLoci3.sh ",s)
            cmd=paste0(cmd," && rm -f ",running_file)
            sbatch_list=execute_complex_sbatch(cmd, jobname = jobname, scripts_dir = scripts_dir, uniqueRunID = s, cores="1", mem="8G", time="short", env="bwaetc", initial_timedate = initial_timedate, jobs_simul = 10, jobs_total = 20)
        Sys.sleep(1)
        }
    }
if(exists("sbatch_list") &  jobname == "OHLoci2"){
    print(sbatch_list)
    start_sbatch_list(sbatch_list, 5, jobname, initial_timedate)
}
while(length(list.files(path = "~/data/TRD/Oppo-Homo-Pos", pattern = ".runningOHLOCI$"))>0){
    Sys.sleep(60)
}

[1] "BAK"
[1] "BAN"
[1] "AKH"
[1] "ABS"
[1] "AMM"
[1] "ABE"
[1] "BFC"
[1] "BID"
[1] "ACT"
[1] "AKV"
[1] "AMD"
[1] "ANI"
[1] "AND"
[1] "ANH"
[1] "AVI"
[1] "ACI"
[1] "ABC"
[1] "ACN"
[1] "AAM"
[1] "ABG"
[1] "AAR"
[1] "ABA"
[1] "BEF"
[1] "AKI"
[1] "ATE"
[1] "BET"
[1] "BTI"
[1] "BQC"
[1] "BMK"
[1] "CRB"
[1] "BKL"
[1] "ABL"
[1] "CCF"
[1] "CCC"
[1] "CCG"
[1] "CMQ"
[1] "BFQ"
[1] "BFR"
[1] "BHN"
[1] "BFD"
[1] "ABP"
[1] "AKS"
[1] "AKQ"
[1] "CPG"
[1] "ACP"
[1] "BAP"
[1] "CCD"
[1] "ACK"
[1] "AKE"
[1] "BAH"
[1] "ANG"
[1] "BFP"
[1] "CMP"
[1] "SACE_YCR"
[1] "ACV"
[1] "CGD"
[1] "CEI"
