For each cross and each locus, I want a list of the strains that share allele1 or allele2, so that I can dynamically explore the phylogeny with respect to allele sharing.

In other words: which strains share haplotypes across the genome, particularly at TRD loci.

In [1]:
library(tidytable)
source("~/BrusselSprouts/scripts/functions.R")

# Stage 1 ("getLoci"): for every *AF.csv.gz result file, write its (chr, pos)
# pairs to "<file>.tmp.loci" and queue a bcftools job that subsets the full
# VCF matrix to those positions, producing "<file>.loci.full.vcf.gz".
# execute_complex_sbatch() and start_sbatch_list() are not defined in this
# notebook; presumably they come from the sourced functions.R.
results_dir <- "/home/jnrunge/data/TRD/results/shiny"
files <- list.files(path = results_dir, pattern = "AF.csv.gz$", full.names = TRUE)

initial_timedate <- Sys.time()
jobname <- "getLoci"
scripts_dir <- "/home/jnrunge/data/trd/mapped_reads/scripts/"

# The reference timestamp does not depend on the loop variable, so read it once.
# NOTE(review): freshness is judged against TRD.vcf.gz, but the subset below is
# taken from full2489Matrix.vcf.gz -- confirm this is the intended reference.
reference_mtime <- file.mtime("/home/jnrunge/data/trd/mapped_reads/TRD.vcf.gz")

# Tracks whether THIS run queued anything. Testing exists("sbatch_list") alone
# would also be TRUE for a stale list left over from an earlier run or from
# another cell, which would then be submitted again.
jobs_queued <- FALSE

for (f in files) {
    loci_file <- paste0(f, ".tmp.loci")
    subset_vcf <- paste0(f, ".loci.full.vcf.gz")

    df <- fread(f, data.table = TRUE)
    fwrite(select(df, chr, pos), loci_file, sep = "\t", col.names = FALSE)

    # Skip files whose subset VCF is newer than both the reference VCF and the
    # input table.
    if (file.exists(subset_vcf)) {
        subset_mtime <- file.mtime(subset_vcf)
        if (subset_mtime > reference_mtime && subset_mtime > file.mtime(f)) {
            next
        }
    }

    # Marker file; the queued job deletes it as its last command, and the
    # polling loop at the bottom waits until no markers remain.
    running_file <- paste0(f, ".running_getLoci")
    file.create(running_file)

    cmd <- paste0("bcftools view -R ", loci_file, " -Oz -o ", subset_vcf,
                  " ~/data/trd/full2489Matrix.vcf.gz")
    sbatch_list <- execute_complex_sbatch(
        c(cmd, paste0("rm -f ", running_file)),
        jobname = jobname, scripts_dir = scripts_dir,
        uniqueRunID = which(files == f),
        cores = "1", mem = "4G", time = "long", env = "bwaetc",
        initial_timedate = initial_timedate,
        jobs_simul = 10, jobs_total = 30
    )
    jobs_queued <- TRUE
}

if (jobs_queued && exists("sbatch_list")) {
    print(sbatch_list)
    start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
}

# Block until every marker file is gone.
# NOTE(review): a job that dies before its trailing "rm -f" leaves its marker
# behind, and this loop then never ends -- remove the marker manually.
while (length(list.files(path = results_dir, pattern = ".running_getLoci$")) > 0) {
    Sys.sleep(60)
}


Attaching package: 'tidytable'


The following objects are masked from 'package:stats':

    dt, filter, lag


The following object is masked from 'package:base':

    %in%



Attaching package: 'data.table'


The following objects are masked from 'package:tidytable':

    between, first, fread, last



Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:tidytable':

    across, add_count, add_tally, anti_join, arrange, between,
    bind_cols, bind_rows, c_across, case_match, case_when, coalesce,
    consecutive_id, count, cross_join, cume_dist, cur_column, cur_data,
    cur_group_id, cur_group_rows, dense_rank, desc, distinct, filter,
    first, full_join, group_by, group_cols, group_split, group_vars,
    if_all, if_any, if_else, inner_join, is_grouped_df, lag, last,
    lead, left_join, min_rank, mutate, n, n_distinct, na_if, nest_by,
    nest_join, nth, percent_rank, pic

In [2]:
# Stage 2 ("getAS"): for every input file, queue an Rscript job running
# 01_getAlleleSharingStrains.R on it, unless "<file>.allelesharing.csv.gz"
# is already newer than both the reference VCF and the per-file loci VCF.
# Relies on `files` and `scripts_dir` defined in the previous cell.
initial_timedate <- Sys.time()
jobname <- "getAS"

# The reference timestamp does not depend on the loop variable, so read it once.
reference_mtime <- file.mtime("/home/jnrunge/data/trd/mapped_reads/TRD.vcf.gz")

# Tracks whether THIS run queued anything, so that an sbatch_list left over
# from the previous stage (or an earlier run) is never submitted again.
jobs_queued <- FALSE

for (f in files) {
    sharing_file <- paste0(f, ".allelesharing.csv.gz")

    if (file.exists(sharing_file)) {
        sharing_mtime <- file.mtime(sharing_file)
        if (sharing_mtime > reference_mtime &&
            sharing_mtime > file.mtime(paste0(f, ".loci.full.vcf.gz"))) {
            next
        }
    }

    # Marker file; the queued job deletes it as its last command, and the
    # polling loop at the bottom waits until no markers remain.
    running_file <- paste0(f, ".running_getAS")
    file.create(running_file)

    cmd <- paste0("Rscript ~/TRD/03_GenomicSignals/01_getAlleleSharingStrains.R ", f)
    sbatch_list <- execute_complex_sbatch(
        c(cmd, paste0("rm -f ", running_file)),
        jobname = jobname, scripts_dir = scripts_dir,
        uniqueRunID = which(files == f),
        cores = "1", mem = "8G", time = "long", env = "JupyteR4",
        initial_timedate = initial_timedate,
        jobs_simul = 10, jobs_total = 30
    )
    jobs_queued <- TRUE
}

if (jobs_queued && exists("sbatch_list")) {
    print(sbatch_list)
    start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
}

# Block until every marker file is gone.
# NOTE(review): a job that dies before its trailing "rm -f" leaves its marker
# behind, and this loop then never ends -- remove the marker manually.
while (length(list.files(path = dirname(files[1]), pattern = ".running_getAS$")) > 0) {
    Sys.sleep(60)
}