I should be able to generate an all-sites VCF by following <https://pixy.readthedocs.io/en/latest/generating_invar/generating_invar.html#generating-allsites-vcfs-using-gatk>, using the per-strain GVCFs in `~/data/trd/GVCF_2489Strains/`.

In [4]:
# Shared helper functions; presumably where execute_cmd_sbatch() and getFirst_v2() used below come from — TODO confirm
source("../../BrusselSprouts/scripts/functions.R")
# Directory of the mapping scripts (not referenced by the cells visible in this part of the notebook)
scripts_dir="/home/jnrunge/data/trd/mapped_reads/scripts/"
# Timestamp taken when this cell is run
initial_timedate=Sys.time()
# Attached for the data-manipulation verbs used below (filter, mutate, left_join, summarise, ...);
# presumably also the source of fread()/fwrite() unless functions.R attaches data.table — TODO confirm
library(tidytable)

In [5]:
# make sample name map  samplename\tgvcf
#
# Lists every GVCF in the strain directory and writes a headerless, tab-separated
# two-column table (sample id, GVCF path) that the GenomicsDBImport step below
# passes to --sample-name-map.

# The dots are escaped so that only names ending in the literal ".g.vcf.gz" match
# (the unescaped pattern would also accept e.g. "bag.vcf.gz").
gvcf_files=list.files("/home/jnrunge/data/trd/GVCF_2489Strains", "\\.g\\.vcf\\.gz$", full.names = TRUE)

# Sample id = first "."-separated field of the file name, as returned by getFirst_v2().
sample_ids=unlist(lapply(basename(gvcf_files), getFirst_v2, split="."))

samples=data.frame(id=sample_ids, file=gvcf_files)
fwrite(samples, "~/data/trd/mapped_reads/truly_all_samples.tsv", col.names = FALSE, sep="\t")

In [6]:
# Joint-genotype all GVCFs into one all-sites VCF (GenomicsDBImport -> GenotypeGVCFs -all-sites),
# then export the sample list and a BCF copy. The whole step is skipped when the filtered
# chromosome-1 output already exists. The notebook blocks until the cluster job has ended.
if(!file.exists("~/data/trd/mapped_reads/ALL.DP5-95.chromosome1.DP10.GQ20RGQ20.SNPsRef.vcf.gz")){ # just an example, so that keeping the filtered files on the server is enough
    # unfiltered files are on viseg

    # Marker file: created here, deleted by the job itself when it ends.
    file.create(running_file<-paste0("~/data/TRD/runningGATKFullmerge"))
    cmd="cd ~/data/trd/mapped_reads"# && rm -rf ALL_DB"
    cmd=paste0(cmd, " && gatk GenomicsDBImport --batch-size 200 --genomicsdb-workspace-path ALL_DB --sample-name-map ~/data/trd/mapped_reads/truly_all_samples.tsv -L ~/data/TRD/R64_nucl.fasta.fai.bed")
    cmd=paste0(cmd, " && gatk GenotypeGVCFs --java-options '-Xmx180G' -R ../../TRD/R64_nucl.fasta -all-sites -V gendb://ALL_DB -O ALL.vcf.gz")
    cmd=paste0(cmd, " && bcftools query -l ALL.vcf.gz > ALL.vcf.gz.samples && bcftools view -Ob -o ALL.bcf ALL.vcf.gz")
    # Remove the marker whether or not the chain above succeeded; previously it was the last
    # link of the && chain, so any failing step left it in place and the loop below never ended.
    # The chain's exit status is preserved so SLURM still reports a failed job as failed.
    cmd=paste0(cmd, "; rc=$?; rm -f ~/data/TRD/runningGATKFullmerge; exit $rc")
    execute_cmd_sbatch(cmd, mem="200G", cpu="1", time="long", env="bwaetc", jobname="GATK_merge")

    # Poll once a minute until the job has removed the marker.
    # NOTE: a job that is killed outright (cancelled / timed out) cannot clean up; in that case
    # the marker has to be deleted by hand.
    while(file.exists(running_file)){
        Sys.sleep(60)
    }

    # ALL.bcf is the last file the chain writes, so its absence means a step failed.
    # (A stale ALL.bcf from an earlier run would pass this check.)
    if(!file.exists("~/data/trd/mapped_reads/ALL.bcf")){
        stop("GATK merge job ended without producing ~/data/trd/mapped_reads/ALL.bcf; check the log of SLURM job 'GATK_merge'")
    }
}

Now I need to filter, and do this carefully. We want to remove indels and low-GQ calls. Invariant sites carry "RGQ" instead of GQ; like GQ, it is a Phred-scaled confidence derived from the chance of a wrong call, so here we also want a high number (unlike PL, where the called genotype has the lowest value).

So we want GQ >= 20 or RGQ >= 20 I would think. 

First, we get an overview of the total depth (INFO/DP) per site, and determine which positions should be removed because their depth falls outside the 5%–95% quantile range.

````bash
# Write chromosome, position and total site depth (INFO/DP) of every record, tab-separated and gzip-compressed
bcftools query -f "%CHROM\t%POS\t%INFO/DP\n" ALL.vcf.gz | gzip > ALL.vcf.gz.DP.gz
````


````r
# Per-site depth table produced by the bcftools query above; it has no header,
# so the columns are V1 = CHROM, V2 = POS, V3 = INFO/DP.
DP <- fread("~/data/trd/mapped_reads/ALL.vcf.gz.DP.gz")

# 5% and 95% depth quantiles (sites without a numeric depth are ignored)
site_depth <- as.numeric(DP$V3)
print(quantile(site_depth, probs = c(0.05, 0.95), na.rm = TRUE))
````

````bash

# sites filtering depth
# Keep only sites with 149240 <= INFO/DP <= 408636 and write them as BCF.
# NOTE(review): the two bounds are presumably the 5% / 95% quantiles printed by the R snippet above — confirm.
bcftools view -i "INFO/DP >= 149240 & INFO/DP <= 408636" -Ob -o ALL.DP5-95.bcf ALL.vcf.gz

# Index the depth-filtered BCF
bcftools index ALL.DP5-95.bcf

# List the chromosomes that still have records: drop header lines, keep column 1, collapse consecutive repeats
bcftools view ALL.DP5-95.bcf | grep -v ^# | cut -f 1 | uniq > chrs.txt


````

In [7]:
# Chromosome names, one per line (written by the bash block above)
chrs <- readLines("~/data/trd/mapped_reads/chrs.txt")

# Submit one filter/split job per chromosome whose filtered VCF is not there yet.
for (c in chrs) {
    filtered_vcf <- paste0("~/data/trd/mapped_reads/ALL.DP5-95.", c, ".DP10.GQ20RGQ20.SNPsRef.vcf.gz")
    if (!file.exists(filtered_vcf)) {
        # filtering VCF and splitting into chromosomes
        cmd <- paste0('sh -xe ~/TRD/03_GenomicSignals/01_pixy_filter-vcf.sh ', c)
        execute_cmd_sbatch(cmd, mem = "8G", cpu = "1", time = "long", env = "bwaetc", jobname = "bcftools_filter")
        # short pause between submissions
        Sys.sleep(1)
    }
}

In [8]:
# Sample names of the merged VCF, one per line (file written by `bcftools query -l` in the merge job).
# This replaces the id/file data.frame that `samples` held in the sample-name-map cell.
samples=readLines("~/data/trd/mapped_reads/ALL.vcf.gz.samples")

In [9]:
# what groups should be run?
# pop_files maps a group name to the path of its populations file.
pop_files <- list()

# each pop
pops <- fread("../Shiny/data/Victor/operationalTable_Full2543Sace_Clades.csv")

# How many VCF samples have an entry in the clade table?
summary(samples %in% pops$StandardizedName)

# Attach the clade to every VCF sample, drop samples without one, and make the
# clade labels file-name safe (spaces and dots become underscores).
popList <- data.table(Strain = samples) %>%
    left_join(select(pops, StandardizedName, Clade), by = c("Strain" = "StandardizedName")) %>%
    filter(!is.na(Clade)) %>%
    mutate(Clade = gsub("[ .]", "_", Clade))
head(popList)

# Headerless, tab-separated: strain <TAB> clade
pop_files[["Clades"]] <- "~/data/trd/mapped_reads/ALL.vcf.gz-Clades.popList"
fwrite(popList, pop_files[["Clades"]], sep = "\t", col.names = FALSE)

   Mode    TRUE 
logical    2489 

Strain,Clade
<chr>,<chr>
AAA,1__Wine
AAB,8__Belgium_Beer
AAC,10__UK_Beer
AAD,18__Asian_Fermentation
AAE,1__Wine
AAG,16__USA_Clinical_1


In [12]:
# Minimum share of sites in a TRD region at which a strain must match an allele
# to count as similar to it (used by the loop in the next cell).
selectSimilarity <- 0.7
df_Strains <- fread("../Shiny/data/Victor/operationalTable_Full2543Sace_Clades.csv")

# Keep only the cross samples, i.e. names starting with "YJNRC" or "Chris".
crosses <- readLines("~/data/trd/mapped_reads/TRD.vcf.gz.samples")
is_cross <- startsWith(crosses, "YJNRC") | startsWith(crosses, "Chris")
crosses <- crosses[is_cross]
crosses

In [13]:

# add TRD-similar strains vs rest
#
# For every cross that has a TRD-regions table, and for every region in it:
#   1. take the cross's allele-frequency rows (TRD) that fall inside the region,
#   2. per strain, compute the share of those sites carrying each allele-sharing type,
#   3. keep the strains that are "A1_hom" / "A2_hom" at >= selectSimilarity of the sites,
#   4. call A2 the distorter when mean(AD_A1 / sumCount) < 0.5 in the region, else A1,
#   5. write a headerless two-column file (sample, pop) in which strains resembling the
#      distorter are "distorter-like" and every other sample is "other".
# Each file's path is stored in pop_files under "TRD_<cross>_<region index>_<selectSimilarity>".
# A region whose start and end chromosome differ aborts the run with an error.

for(c in crosses){
    regions_file=paste0("/home/jnrunge/data/TRD/results/shiny/",c,"-TRD_regions.csv.gz")

    # Checked first so that crosses without a regions table are skipped before
    # their allele-sharing and allele-frequency tables are read.
    if(!file.exists(regions_file)){
        next
    }

    AS=fread(paste0("/home/jnrunge/data/TRD/results/shiny/",c,"-AF.csv.gz.allelesharing.csv.gz"))

    TRD=fread(paste0("/home/jnrunge/data/TRD/results/shiny/",c,"-AF.csv.gz"))

    TRD_loci=fread(regions_file)

    # seq_len() so that an empty regions table yields no iteration (1:0 would run twice)
    for(i in seq_len(nrow(TRD_loci))){
        if(TRD_loci$chr_start[i]!=TRD_loci$chr_end[i]){
            # Name the offending cross and region so it can be found without re-inspecting the tables.
            stop("chr overlapping TRD: cross ", c, ", region row ", i,
                 " (ID ", TRD_loci$ID[i], ") spans ",
                 TRD_loci$chr_start[i], " to ", TRD_loci$chr_end[i])
        }

        # Sites of the cross inside the region
        TRD_subset=filter(TRD, chr== TRD_loci$chr_start[i] & global_pos >= TRD_loci$global_start[i] & global_pos <= TRD_loci$global_end[i])

        # Allele-sharing calls at those sites, reshaped to one row per (site, strain)
        df_AS_filtered=filter(AS, `#CHROM` == TRD_loci$chr_start[i], POS %in% TRD_subset$pos)
        melted=reshape2::melt(df_AS_filtered, id.vars = c("#CHROM","POS"))
        melted=filter(melted, variable != "chrpos")

        # Per strain: share p of sites with each allele-sharing type
        tmp=summarise(group_by(melted, variable), nAll=n())
        vcf_translated_summary=left_join(tmp,summarise(group_by(melted, variable, value), n=n()), by=c("variable"))%>%
            mutate(p=n/nAll)%>%
            select(variable,value,p)%>%
            rename(Strain=variable, Type=value)
        A1s=vcf_translated_summary$Strain[vcf_translated_summary$Type=="A1_hom" & vcf_translated_summary$p>=selectSimilarity]
        A2s=vcf_translated_summary$Strain[vcf_translated_summary$Type=="A2_hom" & vcf_translated_summary$p>=selectSimilarity]

        # Restrict to strains present in the strain table and tag them with the allele they resemble
        strain_summary=bind_rows(summarise(group_by(filter(df_Strains, StandardizedName %in% A1s),
                       StandardizedName), n=n()) %>% arrange(-n)%>%mutate(Type="A1_hom"),
              summarise(group_by(filter(df_Strains, StandardizedName %in% A2s),
                                 StandardizedName), n=n()) %>% arrange(-n)%>%mutate(Type="A2_hom"))%>% arrange(-n)

        # The allele with the higher mean read share in the region is taken as the distorter
        if(mean(TRD_subset$AD_A1/TRD_subset$sumCount)<0.5){
            distorter="A2"
            nondistorter="A1"
        }else{
            distorter="A1"
            nondistorter="A2"
        }

        # Every sample starts as "other"; strains resembling the distorter allele are relabelled
        pop_list_trd=data.frame(sample=samples,pop="other",stringsAsFactors = FALSE)
        pop_list_trd$pop[pop_list_trd$sample%in%strain_summary$StandardizedName[strain_summary$Type==paste0(distorter,"_hom")]]="distorter-like"

        pop_name=paste0("TRD_",c,"_",i,"_",selectSimilarity)
        pop_files[[pop_name]]=paste0("~/data/trd/mapped_reads/ALL.vcf.gz-",pop_name,".popList")
        fwrite(pop_list_trd, pop_files[[pop_name]], sep="\t", col.names = FALSE)
    }
}






ERROR: Error in eval(expr, envir, enclos): chr overlapping TRD


In [16]:
# Debugging aid after the "chr overlapping TRD" error: show the cross the loop stopped on
# and its region table
c
TRD_loci

ID,lengthSNPs,chr_start,chr_end,global_start,global_end
<int>,<int>,<chr>,<chr>,<int>,<int>
6,889,chromosome4,chromosome5,2879861,2921562
7,2859,chromosome5,chromosome5,2952587,3256268
8,1215,chromosome5,chromosome5,3378765,3415698
11,526,chromosome10,chromosome10,6441979,6477149
12,788,chromosome10,chromosome10,6479143,6516974
13,934,chromosome11,chromosome11,6557339,6708982
14,652,chromosome12,chromosome12,8231277,8243715


In [None]:
# Show all populations files registered so far (group name -> path)
pop_files

In [19]:
# NOTE: the loop below reuses `c` as its loop variable, so run it only after everything above has been prepared

In [20]:
# Submit one pixy job (dxy, fst and pi in 10 kb windows) for every combination of
# populations file and chromosome whose pi output does not exist yet.
# seq_along() so that an empty pop_files submits nothing (1:0 would iterate twice).
for(p in seq_along(pop_files)){
    pop_name=names(pop_files)[p]

    for(c in chrs){
        vcf_name=paste0("ALL.DP5-95.",c,".DP10.GQ20RGQ20.SNPsRef.vcf.gz")
        out_prefix=paste0(vcf_name,"-",pop_name,"-pixy")
        pi_file=paste0("~/data/trd/mapped_reads/",out_prefix,"_pi.txt")

        # The job gzips its outputs, so a finished run leaves "<prefix>_pi.txt.gz". Checking only
        # the uncompressed name (as before) resubmitted every finished job; accept either form.
        if(file.exists(pi_file) || file.exists(paste0(pi_file,".gz"))){
            next
        }

        cmd=paste0("cd ~/data/trd/mapped_reads/ && ",
               "pixy --n_cores 2 --stats dxy fst pi --populations ",
              pop_files[[p]], " --vcf ",
              vcf_name, " --output_prefix ",
              out_prefix," --window_size 10000 && ",
                  " gzip -f ",out_prefix,"*txt")
        execute_cmd_sbatch(cmd, mem="32G", cpu="2", time="long", env="bwaetc", jobname="pixy")
        # short pause between submissions
        Sys.sleep(1)
    }
}

[1] "sbatch -c 2 --mem=32G --job-name=pixy -p fast -t 119:59:00 --wrap '. ~/activate.sh bwaetc; cd ~/data/trd/mapped_reads/ && pixy --n_cores 2 --stats dxy fst pi --populations ~/data/trd/mapped_reads/ALL.vcf.gz-TRD_ChrisC7_3_0.7.popList --vcf ALL.DP5-95.chromosome4.DP10.GQ20RGQ20.SNPsRef.vcf.gz --output_prefix ALL.DP5-95.chromosome4.DP10.GQ20RGQ20.SNPsRef.vcf.gz-TRD_ChrisC7_3_0.7-pixy --window_size 10000'"
[1] "Submitted batch job 494280"
[1] "sbatch -c 2 --mem=32G --job-name=pixy -p fast -t 119:59:00 --wrap '. ~/activate.sh bwaetc; cd ~/data/trd/mapped_reads/ && pixy --n_cores 2 --stats dxy fst pi --populations ~/data/trd/mapped_reads/ALL.vcf.gz-TRD_ChrisC7_3_0.7.popList --vcf ALL.DP5-95.chromosome7.DP10.GQ20RGQ20.SNPsRef.vcf.gz --output_prefix ALL.DP5-95.chromosome7.DP10.GQ20RGQ20.SNPsRef.vcf.gz-TRD_ChrisC7_3_0.7-pixy --window_size 10000'"
[1] "Submitted batch job 494281"
[1] "sbatch -c 2 --mem=32G --job-name=pixy -p fast -t 119:59:00 --wrap '. ~/activate.sh bwaetc; cd ~/data/trd/ma