Goal: for every cross and every locus, get the list of strains that share allele 1 or allele 2, so that the phylogeny can be explored dynamically with respect to allele sharing.

In [1]:
library(tidytable)
source("~/BrusselSprouts/scripts/functions.R")

# Step 1 (getLoci): for every per-cross "*AF.csv.gz" table, write its loci
# (chr, pos) to "<f>.tmp.loci" and submit a job that extracts those loci into
# "<f>.loci.full.vcf.gz" with bcftools. Files whose subset VCF is already
# newer than its inputs are skipped.
files=list.files(path = "/home/jnrunge/data/TRD/results/shiny/", pattern="AF.csv.gz$", full.names = TRUE)

initial_timedate=Sys.time()
jobname="getLoci"
scripts_dir="/home/jnrunge/data/trd/mapped_reads/scripts/"

# Drop a job list left over from an earlier run of the notebook; otherwise the
# exists() check below would start stale jobs when every file is skipped.
if(exists("sbatch_list", inherits = FALSE)){
    rm(sbatch_list)
}

for(f in files){
    # NOTE: `df` and `f` stay defined after the loop and are read by later cells.
    df=fread(f,data.table=TRUE)
    loci_file=paste0(f,".tmp.loci")
    loci_vcf=paste0(f,".loci.full.vcf.gz")
    fwrite(select(df,chr,pos),loci_file, sep="\t", col.names = FALSE)
    # make a subset vcf at those loci
    if(file.exists(loci_vcf)){
        loci_vcf_mtime=file.mtime(loci_vcf)
        # isTRUE() turns an NA mtime (missing reference file) into "not up to
        # date" instead of an error in if(); && is the scalar operator.
        # NOTE(review): freshness is checked against TRD.vcf.gz, but the
        # extraction below reads full2489Matrix.vcf.gz -- confirm which file
        # is the intended reference.
        if(isTRUE(loci_vcf_mtime>file.mtime("/home/jnrunge/data/trd/mapped_reads/TRD.vcf.gz")) &&
           isTRUE(loci_vcf_mtime>file.mtime(f))){
            next
        }
    }
    # Marker file; the submitted job removes it as its last command, and the
    # wait loop at the end polls for remaining markers.
    file.create(running_file<-paste0(f,".running_getLoci"))
    cmd=paste0("bcftools view -R ",loci_file, " -Oz -o ",loci_vcf," ~/data/trd/full2489Matrix.vcf.gz")
    sbatch_list=execute_complex_sbatch(c(cmd,
                                         paste0("rm -f ",running_file)), jobname = jobname, scripts_dir = scripts_dir, uniqueRunID = which(files==f), cores="1", mem="4G", time="long", env="bwaetc", initial_timedate = initial_timedate, jobs_simul = 10, jobs_total = 30)

}

if(exists("sbatch_list") && jobname == "getLoci"){
    print(sbatch_list)
    start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
}

# Block until no marker file is left. Guarded so that an empty `files` vector
# does not reach dirname(files[1]).
# NOTE(review): if a job dies before its "rm -f" step the marker is never
# removed and this loop will not terminate -- confirm how failures are handled.
while(length(files)>0 && length(list.files(path = dirname(files[1]), pattern = ".running_getLoci$"))>0){
    Sys.sleep(60)
}


Attaching package: 'tidytable'


The following objects are masked from 'package:stats':

    dt, filter, lag


The following object is masked from 'package:base':

    %in%



Attaching package: 'data.table'


The following objects are masked from 'package:tidytable':

    between, first, fread, last



Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:tidytable':

    across, add_count, add_tally, anti_join, arrange, between,
    bind_cols, bind_rows, c_across, case_match, case_when, coalesce,
    consecutive_id, count, cross_join, cume_dist, cur_column, cur_data,
    cur_group_id, cur_group_rows, dense_rank, desc, distinct, filter,
    first, full_join, group_by, group_cols, group_split, group_vars,
    if_all, if_any, if_else, inner_join, is_grouped_df, lag, last,
    lead, left_join, min_rank, mutate, n, n_distinct, na_if, nest_by,
    nest_join, nth, percent_rank, pic

[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-1.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-2.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-3.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-4.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-5.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-6.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-7.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-8.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-9.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-10.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-11.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-12.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-13.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-14.sbatch"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/getLoci-

In [182]:
# Step 2 (getAS): for every per-cross file, submit a job running
# 01_getAlleleSharingStrains.R unless "<f>.allelesharing.csv.gz" is already
# newer than its inputs. Relies on `files`, `scripts_dir` and
# `initial_timedate` defined by the getLoci cell.
jobname="getAS"

# The getLoci cell leaves its own `sbatch_list` behind. Without this reset, a
# run in which every file is skipped would pass the exists() check below and
# start the old getLoci job list again under the name "getAS".
if(exists("sbatch_list", inherits = FALSE)){
    rm(sbatch_list)
}

for(f in files){

    as_file=paste0(f,".allelesharing.csv.gz")
    if(file.exists(as_file)){
        as_file_mtime=file.mtime(as_file)
        # isTRUE() makes an NA mtime (e.g. subset VCF missing) mean "not up to
        # date" rather than an error in if(); && is the scalar operator.
        if(isTRUE(as_file_mtime>file.mtime("/home/jnrunge/data/trd/mapped_reads/TRD.vcf.gz")) &&
           isTRUE(as_file_mtime>file.mtime(paste0(f,".loci.full.vcf.gz")))){
            next
        }
    }

    cmd=paste0("Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R ", f)
    sbatch_list=execute_complex_sbatch(cmd, jobname = jobname, scripts_dir = scripts_dir, uniqueRunID = which(files==f), cores="1", mem="8G", time="long", env="bwaetc", initial_timedate = initial_timedate, jobs_simul = 10, jobs_total = 30)

}

if(exists("sbatch_list") && jobname == "getAS"){
    print(sbatch_list)
    start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
}

# Poll once a minute until the helper reports no running "getAS" jobs.
while(slurm_check_jobs_still_running("jnrunge","getAS")){
    Sys.sleep(60)
}

[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC1-AF.csv.gz'"
[1] "Submitted batch job 491861"
[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC2-AF.csv.gz'"
[1] "Submitted batch job 491862"
[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC3-AF.csv.gz'"
[1] "Submitted batch job 491863"
[1] "sbatch -c 1 --mem=8G --job-name=getAS -p fast -t 119:59:00 --wrap '. ~/activate.sh JupyteR4; Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R /home/jnrunge/data/TRD/results/shiny//ChrisC4-AF.csv.gz'"
[1] "Submitted batch job 491864"


In [None]:
# Read the per-cross subset VCF for `f`. `grep -v ^##` drops the "##" lines so
# that the "#CHROM ..." line is used as the column header.
vcf <- fread(
    cmd = paste0("zcat ", f, ".loci.full.vcf.gz", " | grep -v ^##"),
    data.table = FALSE
)
setDF(vcf)

# "<chromosome> <position>" key on both tables, used to match VCF records to
# the loci in `df`.
vcf$chrpos <- paste(vcf[["#CHROM"]], vcf[["POS"]])
df$chrpos <- paste(df[["chr"]], df[["pos"]])

# Classify every sample at one VCF record as homozygous for the cross's
# allele 1, homozygous for its allele 2, or anything else.
#
# x: row index into the global `vcf` (one record).
# Reads the globals `vcf` (with a `chrpos` column) and `df` (columns `chrpos`,
# `alleles`, `Allele1`, `Allele2`).
# Returns NULL when the record's position is not in `df`; otherwise a one-row
# data frame holding columns 1 and 2 of `vcf` followed by columns 10..ncol(vcf),
# each of the latter set to "A1_hom", "A2_hom" or "Other".
getA1A2sharersPerLocus=function(x){
    if(!(vcf$chrpos[x]%in%df$chrpos)){
        return(NULL)
    }

    # Use the first matching row if `df` lists the position more than once.
    which_df_row=which(df$chrpos==vcf$chrpos[x])[1]

    # ALT is taken by position (5th column of a VCF body) rather than by a
    # name such as "ALT...5", which only exists if duplicate column names were
    # repaired on import.
    vcf_alleles=strsplit(paste(vcf$REF[x],vcf[[5]][x],sep=","),",",fixed=TRUE)[[1]]
    df_alleles=strsplit(df$alleles[which_df_row],",",fixed=TRUE)[[1]]

    # Translate the cross's allele indices into the VCF's numbering (0 = REF).
    A1=which(vcf_alleles==df_alleles[as.numeric(df$Allele1[which_df_row])+1])-1 # 0 = REF
    A2=which(vcf_alleles==df_alleles[as.numeric(df$Allele2[which_df_row])+1])-1

    # Genotype = text before the first ":" of each sample field. Comparing the
    # whole genotype (instead of characters 1 and 3) keeps allele indices >= 10
    # correct, e.g. "1/10" is no longer taken as homozygous for allele 1.
    sample_cols=10:ncol(vcf)
    gt=sub(":.*$","",as.vector(t(vcf[x,sample_cols])))

    # Positions (within sample_cols) of samples homozygous for one allele
    # index; empty when the allele could not be located in the VCF record.
    homs_of=function(allele_index){
        if(length(allele_index)!=1 || is.na(allele_index)){
            return(integer(0))
        }
        which(gt==paste0(allele_index,"/",allele_index) |
              gt==paste0(allele_index,"|",allele_index))
    }
    A1_homs=homs_of(A1)
    A2_homs=homs_of(A2)

    # NOTE(review): `chrpos` is appended to `vcf` before this runs, so it is
    # part of sample_cols and comes back as a column labelled "Other". Kept to
    # leave the output columns unchanged -- confirm whether it should be dropped.
    vcf_return=vcf[x,c(1,2,sample_cols)]
    vcf_return[1,3:ncol(vcf_return)]="Other"
    if(length(A1_homs)>0){
        vcf_return[1,2+A1_homs]="A1_hom"
    }
    if(length(A2_homs)>0){
        vcf_return[1,2+A2_homs]="A2_hom"
    }
    return(vcf_return)
}

# Per-label share of loci for one strain column of the global `vcf_translated`.
#
# x: name of a strain column in `vcf_translated` (character scalar).
# Returns a table with columns Strain (= x), Type (the values found in that
# column) and p (rows in the group divided by nrow(vcf_translated)).
# NOTE(review): this passes all_of(x) to group_by(); whether that groups by the
# column named in x depends on which package's group_by() is in effect
# (tidytable vs. dplyr) -- confirm in the environment this runs in.
# NOTE(review): not called anywhere in the visible code.
summarise_strains=function(x){
    # x = strain
    return(summarise(group_by(vcf_translated, all_of(x)), p=n()/nrow(vcf_translated))%>%mutate(Strain=x)%>%rename(Type=all_of(x))%>%select(Strain,Type,p))
}

# Map row `x` of the global `df` to the VCF record(s) at the same `chrpos`
# and classify the samples there via getA1A2sharersPerLocus().
# Returns NULL when the locus has no record in the global `vcf`.
wrapperDfLocus <- function(x) {
    vcf_rows <- which(vcf$chrpos == df$chrpos[x])
    if (length(vcf_rows) > 0) {
        return(getA1A2sharersPerLocus(vcf_rows))
    }
    return(NULL)
}

# Classify every locus of `df` and stack the one-row results (loci without a
# VCF record return NULL and are dropped by rbindlist). seq_len() avoids the
# 1:0 sequence that 1:nrow(df) produces for an empty table.
vcf_translated=setDT(data.table::rbindlist(lapply(seq_len(nrow(df)),wrapperDfLocus)))

# A column whose name was repaired to "ALT...<n>" on import is renamed back to
# "ALT". Matching the pattern instead of the literal "ALT...204" keeps this
# working when the column sits at a different position.
alt_repaired=grepl("^ALT\\.\\.\\.[0-9]+$",colnames(vcf_translated))
colnames(vcf_translated)[alt_repaired]="ALT"


fwrite(vcf_translated, paste0(f,".allelesharing.csv.gz"))

In [175]:
# in shiny, needs chr pos selection
# Timed summary: for every strain, the fraction of loci carrying each label.
a <- Sys.time()
# Long format: one row per locus and strain column (`variable`), with the
# label in `value`.
melted <- reshape2::melt(vcf_translated, id.vars = c("#CHROM", "POS"))
# Number of loci per strain (denominator).
tmp <- melted %>%
    group_by(variable) %>%
    summarise(nAll = n())
# Count per strain and label, divided by the strain's total.
vcf_translated_summary <- tmp %>%
    left_join(
        melted %>%
            group_by(variable, value) %>%
            summarise(n = n()),
        by = c("variable")
    ) %>%
    mutate(p = n / nAll) %>%
    select(variable, value, p) %>%
    rename(Strain = variable, Type = value)
head(vcf_translated_summary)
b <- Sys.time()
b - a

Strain,Type,p
<fct>,<chr>,<dbl>
AAA,A1_hom,0.6
AAA,A2_hom,0.38
AAA,Other,0.02
AAB,A1_hom,0.57
AAB,A2_hom,0.41
AAB,Other,0.02


Time difference of 0.1242888 secs