To filter:
* 80% middle percentile coverage
* GQ 20
* DP 5
* no Mendelian errors in F0F1 trios

In [1]:
GQ_filter=20
DP_filter=5 # 10 produces tons of missingness and we need loci with 100% genotpying rate in the founders

In [2]:
# set important data in the config file first!
source("config.R")
source("../../extra-R-functions.R")

Auto-refreshing stale OAuth token.

[32mv[39m Reading from [36mStatus of mouse sequencing[39m.

[32mv[39m Range '[33m'All founder files'[39m'.

[32mv[39m Reading from [36mStatus of mouse sequencing[39m.

[32mv[39m Range '[33m'All Fx files'[39m'.


Attaching package: 'tidytable'


The following objects are masked from 'package:stats':

    dt, filter, lag


The following object is masked from 'package:base':

    %in%




In [3]:
library(naturalsort)
library(ggplot2)

In [4]:
files=naturalsort(list.files(path = bam_dir, pattern="chr[0-9].*.phased.vcf.gz$", full.names = TRUE))
files

In [5]:
# first, lets get the DP info of the unfiltered file
jobname="getDP"
for(f in files){
    cmd=paste0("sh -xe ",Barn_Mice_dir,"01_Genotyping/01_Founders/05_Filtering-getDP.sh ",
          f)
    
    if(file.exists(paste0(f,".DP.txt.gz"))){
        next
    }
    
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname=jobname,
                          activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
        Sys.sleep(1)
}

In [6]:
jobname
while(slurm_check_jobs_still_running(columbia_username,jobname)){
    Sys.sleep(60)
}

In [7]:
DP_data=fread_and_bind_files(paste0(files,".DP.txt.gz"))
head(DP_data)

V1,V2,V3,file
<chr>,<int>,<int>,<chr>
NC_000067.6,3000023,15,/moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.DP.txt.gz
NC_000067.6,3000126,24,/moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.DP.txt.gz
NC_000067.6,3000181,74,/moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.DP.txt.gz
NC_000067.6,3000185,74,/moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.DP.txt.gz
NC_000067.6,3000191,80,/moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.DP.txt.gz
NC_000067.6,3000201,83,/moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.DP.txt.gz


In [8]:
min_DP=quantile(DP_data$V3, c(0.1))
max_DP=quantile(DP_data$V3, c(0.9))

In [9]:
DP_data$V2=as.numeric(as.character(DP_data$V2))

In [10]:
# outside the red lines: filtered out
# DP here is summed across samples; so that we filter out bad sites on average
ggplot(DP_data, aes(V2,V3))+
geom_point(shape=1,alpha=0.1)+
geom_hline(yintercept = c(min_DP,max_DP), color="red")+
facet_wrap(~V1)+theme_bw(18)+scale_y_log10()

In [11]:
# filter by DP percentile; GQ and DP; setting genotypes to ./. if not passing for later filtering of sites that are not missing *in founders*
jobname="filterDPGQDP"
for(f in files){
    cmd=paste0("sh -xe ",Barn_Mice_dir,"01_Genotyping/01_Founders/05_FilterDPGQ.sh ",
          paste(min_DP,max_DP,GQ_filter,DP_filter,f))
    if(file.exists(paste0(f,".Cov",min_DP,"to",max_DP,".GQ",GQ_filter,".DP",DP_filter,".vcf.gz"))){
        next
    }
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname=jobname,
                          activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
        Sys.sleep(1)
}


In [12]:
jobname
while(slurm_check_jobs_still_running(columbia_username,jobname)){
    Sys.sleep(60)
}

In [13]:
files=naturalsort(list.files(path=bam_dir, pattern=paste0("phased.vcf.gz.Cov",min_DP,"to",max_DP,".GQ",GQ_filter,".DP",DP_filter,".vcf.gz$"), full.names=TRUE))
files

In [14]:
# get Mendelian trio errors
# convert PED file to the new format that the author had to come up with
# need mother,father,child

jobname="FilterF0F1Mendelian"

PED=fread(paste0(Barn_Mice_dir, "XX_Data/FoundersF1.no1140.ped"))
trios=select(PED, Mother, Father, Individual)
fwrite(trios, paste0(bam_dir,"/trios.txt"), sep=",", col.names = FALSE)

for(f in files){
    
    if(file.exists(paste0(f,".MendelianAnnotated.vcf.gz"))){
        next
    }
    
    cmd=paste0("sh -xe ",Barn_Mice_dir,"01_Genotyping/01_Founders/05_Filtering-Mendelian.sh ", f, " ", paste0(bam_dir,"/trios.txt"))
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname=jobname,
                          activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
        Sys.sleep(1)
}

In [15]:
jobname
while(slurm_check_jobs_still_running(columbia_username,jobname)){
    Sys.sleep(60)
}

In [16]:
# show MERR data from f.MendelianAnnotated.vcf.gz.MERR.txt.gz

MendelianErrors=as.numeric(system(command=paste0("zcat ",paste0(files,".MendelianAnnotated.vcf.gz.MERR.txt.gz")), intern=TRUE))
summary(MendelianErrors)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.04354 0.00000 4.00000 

In [17]:
files=naturalsort(list.files(path=bam_dir, pattern=paste0("phased.vcf.gz.Cov",min_DP,"to",max_DP,".GQ",GQ_filter,".DP",DP_filter,".vcf.gz.MendelianAnnotated.vcf.gz$"), full.names=TRUE))
files

In [18]:
# select founders only

# no mendelian errors

# only biallelic and zero missing genotypes

jobname="FilterF0"

for(f in files){
    
    if(file.exists(paste0(f,".F0.0MERR.0Missing.m2M2.vcf.gz"))){
        next
    }
    
    cmd=paste0("sh -xe ",Barn_Mice_dir,"01_Genotyping/01_Founders/05_Filter-FinalSteps.sh ", f)
    print(cmd)
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname=jobname,
                          activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
        Sys.sleep(1)
}



In [None]:
jobname
while(slurm_check_jobs_still_running(columbia_username,jobname)){
    Sys.sleep(60)
}

In [37]:
all_vcf_files=c(naturalsort(list.files(path = bam_dir, pattern="chr[0-9].*.phased.vcf.gz$", full.names = TRUE)),
               naturalsort(list.files(path=bam_dir, pattern=paste0("phased.vcf.gz.Cov",min_DP,"to",max_DP,".GQ",GQ_filter,".DP",DP_filter,".vcf.gz$"), full.names=TRUE)),
               naturalsort(list.files(path=bam_dir, pattern=paste0("phased.vcf.gz.Cov",min_DP,"to",max_DP,".GQ",GQ_filter,".DP",DP_filter,".vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz$"), full.names=TRUE)))
all_vcf_files=naturalsort(all_vcf_files)

In [41]:
# get a number of loci removed for each filtering step

list_vcf_path<-file.path(bam_dir,"listvcf")
if(max(file.mtime(all_vcf_files))>file.mtime(paste0(list_vcf_path,".counts.txt"))){
    writeLines(all_vcf_files, list_vcf_path)
    cmd=paste0("sh -xe ~/ColumbiaProjects/Barn_Mice/01_Genotyping/01_Founders/05_CountLoci.sh ", list_vcf_path)
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname="countLoci",
                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
    Sys.sleep(5)
    while(slurm_check_jobs_still_running(columbia_username,"countLoci")){
        Sys.sleep(60)
    }
}
countLoci_df<-fread(paste0(list_vcf_path,".counts.txt"), sep="\t", header=FALSE)
countLoci_df %>%
  mutate(filtering_step = str_extract(V1, "vcf.gz.*")) %>%
  group_by(filtering_step) %>%
  summarise(number_of_loci = sum(V2))


filtering_step,number_of_loci
<chr>,<int>
vcf.gz.phased.vcf.gz,20926458
vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz,16780994
vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz.MendelianAnnotated.vcf.gz,16780994
vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz,3142133


Note that in the "vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz" (or similar if you change the code / data) step, only the summed coverage is filtered (i.e. poorly covered or perhaps repeat regions are removed), while in the final step, the GQ and DP filter is applied such that all F0 genotypes at a given site need to pass those thresholds.

In [48]:
# merge vcf

# double-check that sample names are in the same order across vcf.
files=naturalsort(list.files(path=bam_dir, pattern=paste0("phased.vcf.gz.Cov",min_DP,"to",max_DP,".GQ",GQ_filter,".DP",DP_filter,".vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz$"), full.names=TRUE))
files

for(f in files){
    cmd=paste0("bcftools query -l ", f, " > ",f,".samples")
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname="getSamples",
                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
    Sys.sleep(1)
    
}
Sys.sleep(5)
while(slurm_check_jobs_still_running(columbia_username,"getSamples")){
        Sys.sleep(60)
    }


samples=list()
for(f in files){
    samples[[f]]=readLines(paste0(f,".samples"))
    print(samples[[f]])
}

are_all_vectors_identical <- all(sapply(samples[-1], identical, samples[[1]]))

print(are_all_vectors_identical)

if(!are_all_vectors_identical){
    stop("Assumption of same samples / order in each vcf not met. Please fix.")
}


 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "SM_SW_1"  "SM_SW_2"  "SM_SW_3"  "SM_SW_4"  "SM_SW_5"  "SM_SW_6" 
 [7] "SM_SW_7"  "SM_SW_8"  "SM_SW_9"  "SM_SW_10" "SM_SW_11" "SM_SW_12"
 [1] "

In [51]:
cmd=paste0("bcftools concat -Oz -o ",folder_for_sequences,"Founders.filtered.vcf.gz --threads 4 ",paste(files,collapse=" "))
execute_cmd_sbatch(cmd, mem="8gb", cpu="4", time="short", env=env_mapping_etc, jobname="MergeVCF",
                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
    Sys.sleep(5)
    

while(slurm_check_jobs_still_running(columbia_username,"MergeVCF")){
        Sys.sleep(60)
    }


[1] "sbatch -c 4 --mem=8gb --job-name=MergeVCF -A ziab -t 11:59:00 --wrap '. ~/ColumbiaProjects/Barn_Mice/activateEnv.sh samtools-116; bcftools concat -Oz -o /moto/ziab/users/jr3950/data/genomes/tmp_founders/Founders.filtered.vcf.gz --threads 4 /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr2.vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr3.vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr4.vcf.gz.phased.vcf.gz.Cov105to361.GQ20.DP5.vcf.gz.MendelianAnnotated.vcf.gz.F0.0MERR.0Missing.m2M2.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.

In [53]:
# merge vcf unfiltered and F0 and F1 in case one wants it

# double-check that sample names are in the same order across vcf.
files=naturalsort(list.files(path=bam_dir, pattern=paste0("phased.vcf.gz$"), full.names=TRUE))
files

for(f in files){
    cmd=paste0("bcftools query -l ", f, " > ",f,".samples")
    execute_cmd_sbatch(cmd, mem="4gb", cpu="1", time="short", env=env_mapping_etc, jobname="getSamples",
                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
    Sys.sleep(1)
    
}
Sys.sleep(5)
while(slurm_check_jobs_still_running(columbia_username,"getSamples")){
        Sys.sleep(60)
    }


samples=list()
for(f in files){
    samples[[f]]=readLines(paste0(f,".samples"))
    print(samples[[f]])
}

are_all_vectors_identical <- all(sapply(samples[-1], identical, samples[[1]]))

print(are_all_vectors_identical)

if(!are_all_vectors_identical){
    stop("Assumption of same samples / order in each vcf not met. Please fix.")
}





[1] "sbatch -c 1 --mem=4gb --job-name=getSamples -A ziab -t 11:59:00 --wrap '. ~/ColumbiaProjects/Barn_Mice/activateEnv.sh samtools-116; bcftools query -l /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz > /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz.samples'"
[1] "Submitted batch job 13535379"
[1] "sbatch -c 1 --mem=4gb --job-name=getSamples -A ziab -t 11:59:00 --wrap '. ~/ColumbiaProjects/Barn_Mice/activateEnv.sh samtools-116; bcftools query -l /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr2.vcf.gz.phased.vcf.gz > /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr2.vcf.gz.phased.vcf.gz.samples'"
[1] "Submitted batch job 13535380"
[1] "sbatch -c 1 --mem=4gb --job-name=getSamples -A ziab -t 11:59:00 --wrap '. ~/ColumbiaProjects/Barn_Mice/activateEnv.sh samtools-116; bcftools query -l /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr3.vcf.gz.phased.vcf

In [55]:
cmd=paste0("bcftools concat -Ob -o ",folder_for_sequences,"FoundersAndF1.unfiltered.bcf --threads 4 ",paste(files,collapse=" "))
execute_cmd_sbatch(cmd, mem="8gb", cpu="4", time="short", env=env_mapping_etc, jobname="MergeVCF2",
                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
    Sys.sleep(5)
    

while(slurm_check_jobs_still_running(columbia_username,"MergeVCF2")){
        Sys.sleep(60)
    }


[1] "sbatch -c 4 --mem=8gb --job-name=MergeVCF2 -A ziab -t 11:59:00 --wrap '. ~/ColumbiaProjects/Barn_Mice/activateEnv.sh samtools-116; bcftools concat -Ob -o /moto/ziab/users/jr3950/data/genomes/tmp_founders/FoundersAndF1.unfiltered.bcf --threads 4 /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr1.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr2.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr3.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr4.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr5.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr6.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr7.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/tmp_founders/bam/Founders.chr8.vcf.gz.phased.vcf.gz /moto/ziab/users/jr3950/data/genomes/t

In [4]:
cmd=paste0("cd ",folder_for_sequences," && sh -xe ", Barn_Mice_dir, "01_Genotyping/01_Founders/05_FinalACFilter.sh")
execute_cmd_sbatch(cmd, mem="8gb", cpu="1", time="short", env=env_mapping_etc, jobname="FinalFilter",
                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh")) 
    Sys.sleep(5)
    

while(slurm_check_jobs_still_running(columbia_username,"FinalFilter")){
        Sys.sleep(60)
    }


[1] "sbatch -c 1 --mem=8gb --job-name=FinalFilter -A ziab -t 11:59:00 --wrap '. ~/ColumbiaProjects/Barn_Mice/activateEnv.sh samtools-116; cd /moto/ziab/users/jr3950/data/genomes/tmp_founders/ && sh -xe ~/ColumbiaProjects/Barn_Mice/01_Genotyping/01_Founders/05_FinalACFilter.sh'"
[1] "Submitted batch job 13584214"
