Goal: for every cross and every locus, list the strains that share Allele1 or Allele2, so that allele sharing can be explored dynamically across the phylogeny.

In [179]:
source("~/BrusselSprouts/scripts/functions.R")
library(tidytable)

# Every file under results/shiny/ whose name ends in "AF.csv.gz".
files <- list.files(
    path = "/home/jnrunge/data/TRD/results/shiny/",
    pattern = "AF.csv.gz$",
    full.names = TRUE
)

# For each table: write its (chr, pos) pairs as a headerless, tab-separated
# regions file, then - unless the subset VCF already exists - submit a job
# that extracts those loci from the full matrix VCF with `bcftools view -R`.
# NOTE(review): the regions file lives under ".../data/TRD/" while the input
# VCF is read from "~/data/trd/" (lower case) - confirm both paths are intended.
for (f in files) {
    df <- fread(f, data.table = TRUE)
    loci_file <- paste0(f, ".tmp.loci")
    subset_vcf <- paste0(f, ".loci.full.vcf.gz")

    # The regions file is (re)written even when the subset VCF is present.
    fwrite(select(df, chr, pos), loci_file, sep = "\t", col.names = FALSE)

    if (!file.exists(subset_vcf)) {
        cmd <- paste0("bcftools view -R ", loci_file, " -Oz -o ", subset_vcf, " ~/data/trd/full2489Matrix.vcf.gz")
        execute_cmd_sbatch(cmd, mem = "4G", cpu = "1", time = "long", env = "bwaetc", jobname = "getLoci")
        Sys.sleep(1)
    }
}

[1] "sbatch -c 1 --mem=4G --job-name=getLoci -p fast -t 119:59:00 --wrap '. ~/activate.sh bwaetc; bcftools view -R /home/jnrunge/data/TRD/results/shiny//ChrisC2-AF.csv.gz.tmp.loci -Oz -o /home/jnrunge/data/TRD/results/shiny//ChrisC2-AF.csv.gz.loci.full.vcf.gz ~/data/trd/full2489Matrix.vcf.gz'"
[1] "Submitted batch job 491762"
[1] "sbatch -c 1 --mem=4G --job-name=getLoci -p fast -t 119:59:00 --wrap '. ~/activate.sh bwaetc; bcftools view -R /home/jnrunge/data/TRD/results/shiny//ChrisC3-AF.csv.gz.tmp.loci -Oz -o /home/jnrunge/data/TRD/results/shiny//ChrisC3-AF.csv.gz.loci.full.vcf.gz ~/data/trd/full2489Matrix.vcf.gz'"
[1] "Submitted batch job 491763"
[1] "sbatch -c 1 --mem=4G --job-name=getLoci -p fast -t 119:59:00 --wrap '. ~/activate.sh bwaetc; bcftools view -R /home/jnrunge/data/TRD/results/shiny//ChrisC4-AF.csv.gz.tmp.loci -Oz -o /home/jnrunge/data/TRD/results/shiny//ChrisC4-AF.csv.gz.loci.full.vcf.gz ~/data/trd/full2489Matrix.vcf.gz'"
[1] "Submitted batch job 491764"
[1] "sbatch -c 1

In [182]:
# Submit one job per AF table; each job runs 01_getAlleleSharingStrains.R with
# the table's path as its only command-line argument.
for (f in files) {
    cmd <- paste0("Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R ", f)
    execute_cmd_sbatch(cmd, mem = "8G", cpu = "1", time = "long", env = "JupyteR4", jobname = "getAS")
    Sys.sleep(1)
}

[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC1-AF.csv.gz'"
[1] "Submitted batch job 491861"
[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC2-AF.csv.gz'"
[1] "Submitted batch job 491862"
[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC3-AF.csv.gz'"
[1] "Submitted batch job 491863"
[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC4-AF.csv.gz'"
[1] "Submitted batch job 491864"


In [None]:
# Read the per-cross subset VCF, dropping the "##" meta lines so that the
# "#CHROM ..." line becomes the header. `f` and `df` come from the session.
subset_vcf <- paste0(f, ".loci.full.vcf.gz")
vcf <- fread(cmd = paste0("zcat ", subset_vcf, " | grep -v ^##"), data.table = FALSE)
setDF(vcf)

# "<chromosome> <position>" keys used to match VCF rows against `df` rows.
vcf[["chrpos"]] <- paste(vcf[["#CHROM"]], vcf[["POS"]])
df[["chrpos"]] <- paste(df[["chr"]], df[["pos"]])

# Label every sample at one VCF row as homozygous for the cross's Allele1
# ("A1_hom"), homozygous for its Allele2 ("A2_hom"), or anything else ("Other").
#
# Reads two globals:
#   vcf - genotype table: reference allele in `REF`, alternate alleles in
#         `ALT...5`, per-sample genotype strings from column 10 onwards, and
#         a `chrpos` key column.
#   df  - per-cross table with `chrpos`, `alleles` (comma-separated) and the
#         0-based indices `Allele1` / `Allele2` into `alleles`.
#
# x: row index into `vcf`.
#
# Returns a one-row data.frame made of columns 1-2 of `vcf` followed by one
# label column for each of columns 10..ncol(vcf); NULL if the locus is not
# listed in `df`.
#
# NOTE(review): if `chrpos` was appended to `vcf` as its last column, it lies
# inside 10:ncol(vcf) and is therefore returned like a sample (always
# "Other") - confirm whether downstream code expects or drops that column.
getA1A2sharersPerLocus <- function(x){
    # match() yields the first matching row only, so a locus duplicated in
    # `df` cannot feed vectors into the scalar allele lookups below.
    which_df_row <- match(vcf$chrpos[x], df$chrpos)
    if(is.na(which_df_row)){
        return(NULL)
    }

    # Translate the cross's allele indices (relative to df$alleles) into
    # VCF allele indices (0 = REF, 1.. = ALT) by comparing allele strings.
    vcf_alleles <- strsplit(paste(vcf$REF[x], vcf$`ALT...5`[x], sep=","), ",", fixed=TRUE)[[1]]
    df_alleles <- strsplit(df$alleles[which_df_row], ",", fixed=TRUE)[[1]]
    A1 <- which(vcf_alleles == df_alleles[as.numeric(df$Allele1[which_df_row]) + 1]) - 1 # 0 = REF
    A2 <- which(vcf_alleles == df_alleles[as.numeric(df$Allele2[which_df_row]) + 1]) - 1

    # Parse the genotype properly instead of reading characters 1 and 3:
    # drop any ":"-separated subfields, then split on "/" or "|". Fixed
    # character positions misread two-digit allele indices (e.g. "1/10"
    # looked homozygous for allele 1, "10/10" was never recognised).
    sample_cols <- 10:ncol(vcf)
    genotypes <- sub(":.*$", "", as.character(unlist(vcf[x, sample_cols], use.names=FALSE)))
    calls <- strsplit(genotypes, "[/|]")

    # Positions (within sample_cols) whose calls are all `allele_index`.
    # At least two calls are required, so haploid or missing entries are
    # never counted; an allele absent from the VCF gives no positions.
    homs_for <- function(allele_index){
        if(length(allele_index) != 1 || is.na(allele_index)){
            return(integer(0))
        }
        target <- as.character(allele_index)
        which(vapply(calls, function(a) length(a) >= 2 && all(a == target), logical(1)))
    }
    A1_homs <- homs_for(A1)
    A2_homs <- homs_for(A2)

    vcf_return <- vcf[x, c(1, 2, sample_cols)]
    vcf_return[1, 3:ncol(vcf_return)] <- "Other"
    # Offset by 2 because the two leading columns precede the samples.
    vcf_return[1, 2 + A1_homs] <- "A1_hom"
    vcf_return[1, 2 + A2_homs] <- "A2_hom"
    vcf_return
}

# For one strain column of the global `vcf_translated`, report the share of
# loci carrying each label.
#
# x: name of the strain column (character).
# Returns a table with columns Strain (= x), Type (the label found in that
# column) and p (rows with that label / nrow(vcf_translated)).
summarise_strains <- function(x){
    vcf_translated %>%
        group_by(all_of(x)) %>%
        summarise(p = n() / nrow(vcf_translated)) %>%
        mutate(Strain = x) %>%
        rename(Type = all_of(x)) %>%
        select(Strain, Type, p)
}

# Look up row x of the global `df` in the global `vcf` by its `chrpos` key and
# translate the matching VCF row(s) with getA1A2sharersPerLocus().
#
# x: row index into `df`.
# Returns whatever getA1A2sharersPerLocus() returns for the matching VCF
# index, or NULL when the locus is absent from `vcf`.
wrapperDfLocus <- function(x){
    vcf_row <- which(vcf$chrpos == df$chrpos[x])
    if(length(vcf_row) > 0){
        return(getA1A2sharersPerLocus(vcf_row))
    }
    NULL
}

# Translate every locus of `df` (loci missing from `vcf` yield NULL and are
# skipped by rbindlist) and stack the one-row results into a single table.
per_locus <- lapply(seq_len(nrow(df)), wrapperDfLocus)
vcf_translated <- setDT(data.table::rbindlist(per_locus))

# A column imported as "ALT...<n>" (presumably a sample literally named "ALT"
# that collided with the VCF ALT field and received a positional suffix) is
# renamed back to "ALT". Matching any numeric suffix, rather than the
# hard-coded "ALT...204", keeps the rename working if the column position in
# the input VCF changes.
colnames(vcf_translated) <- sub("^ALT\\.\\.\\.[0-9]+$", "ALT", colnames(vcf_translated))


fwrite(vcf_translated, paste0(f,".allelesharing.csv.gz"))

In [175]:
# Prototype of the per-strain summary for the shiny app; there it still needs
# to be restricted to a selected chr/pos range. The Sys.time() pair measures
# how long the summary takes.
a <- Sys.time()

# Long format: one row per (locus, strain) with the label in `value`.
melted <- reshape2::melt(vcf_translated, id.vars = c("#CHROM","POS"))

# Loci per strain, and loci per strain and label.
tmp <- melted %>%
    group_by(variable) %>%
    summarise(nAll = n())
per_label <- melted %>%
    group_by(variable, value) %>%
    summarise(n = n())

# p = share of a strain's loci carrying each label.
vcf_translated_summary <- tmp %>%
    left_join(per_label, by = c("variable")) %>%
    mutate(p = n / nAll) %>%
    select(variable, value, p) %>%
    rename(Strain = variable, Type = value)
head(vcf_translated_summary)

b <- Sys.time()
b - a

Strain,Type,p
<fct>,<chr>,<dbl>
AAA,A1_hom,0.6
AAA,A2_hom,0.38
AAA,Other,0.02
AAB,A1_hom,0.57
AAB,A2_hom,0.41
AAB,Other,0.02


Time difference of 0.1242888 secs