# The logic
There is a huge number of files from various runs in different states (raw BCL, demultiplexed, trimmed) on Engram, in different folder structures, requiring different things to be done to them. Unfortunately, almost every run is a bit of a snowflake in that regard, which has been a hassle over the years. Still, changing what's on Engram is not the best solution, as it is best preserved as is. Furthermore, we cannot download everything for space reasons. Instead, we will:

* Create empty table of status if needed in the working dir
* Check if run was processed, i.e. is available already as download or as trimmed or as mapped or as final mpileup or has been merged
* Download one run; mark run as downloaded
* Do somewhat individualized procedures to get them all into a trimmed state with the same file names; mark run as trimmed
* generate BAM; mark as mapped
* QC measures that are saved for each run; mark as QC'd
    * Per sample
        * Coverage
        * Mean MQ
        * % reads max MQ
        * Mean insert size
        * Mean read length
        * % duplicated
* Generate reduced mpileup.xz for each individual (loci of interest, i.e. variable loci that will be used as AHMM input, which will only change if the F0 pipeline is changed / run again with different data); mark as mpileup
    * Sub-folders per run!
    * More QC added to the already existing QC files / rows
        * Effective coverage
        * \# Variants reads (of loci of interest)
* Delete all intermediate files whenever possible; possibly run in batches within run, to keep space low
* Next run

In [65]:
# Upper bound on parallelism used throughout this notebook: it is passed to the
# sbatch helpers (jobs_simul / jobs_total) and used to derive the core counts
# handed to the BCL2FASTQ script.
max_cores<-100

In [66]:
# set important data in the config file first!
library(tidytable)
source("../../../config.R")
source("../../../extra-R-functions.R")
# this user needs to have access to engram and google sheet 'REDACTED'

Auto-refreshing stale OAuth token.

[32mv[39m Reading from [36mStatus of mouse sequencing[39m.

[32mv[39m Range '[33m'All founder files'[39m'.

[32mv[39m Reading from [36mStatus of mouse sequencing[39m.

[32mv[39m Range '[33m'All Fx files'[39m'.



In [67]:

#' Free space on the filesystem that contains `path`.
#'
#' Runs `df -P -BG <path>` and reads the "Available" column of the single
#' data line that follows the header.
#'
#' @param path A single path to an existing file or directory.
#' @return A numeric scalar: the available space as reported by `df` with a
#'   block size of `G` (the trailing "G" is stripped).
get_free_space <- function(path) {
  # system2() pastes its arguments into the command line without quoting them,
  # so a path containing spaces would otherwise be split into several
  # arguments. -P asks df for one output line per filesystem, which keeps the
  # field positions stable even for long device names.
  df_lines <- system2("df", c("-P", "-BG", shQuote(path)), stdout = TRUE)

  exit_status <- attr(df_lines, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`df` failed for path: ", path, call. = FALSE)
  }
  if (length(df_lines) < 2) {
    stop("Unexpected `df` output for path: ", path, call. = FALSE)
  }

  # Fields: filesystem, size, used, available, percent, mounted_on.
  # Only the first four are needed, so a mount point containing spaces is
  # harmless here.
  fields <- strsplit(trimws(df_lines[2]), "[[:space:]]+")[[1]]
  free_space_gb <- as.numeric(sub("G$", "", fields[4]))
  free_space_gb
}

# Usage: get_free_space("your_directory_path_here")


#' Convert sample IDs such as "m33_1" or "m76-9" to the padded form "m033_01".
#'
#' Each ID is split on "-" or "_". The first piece (with any "m" removed) and
#' the second piece are read as integers and formatted as "m%03d_%02d". Any
#' further pieces (e.g. "_S205") are ignored.
#'
#' @param id Character vector of IDs (vectorised; a single string works as
#'   before).
#' @return Character vector of standardized IDs, same length as `id`.
#' @details Stops with an error if either numeric part of an ID cannot be
#'   parsed, instead of silently returning a string containing "NA" that
#'   would later be used as a file name.
standardize_ids <- function(id) {
  id <- as.character(id)
  pieces <- strsplit(id, "[-_]")
  first_raw <- vapply(pieces, function(p) p[1], character(1))
  second_raw <- vapply(pieces, function(p) p[2], character(1))

  # as.integer() warns on non-numeric text; the NA check below reports the
  # offending IDs explicitly, so the warning itself is suppressed.
  num_part1 <- suppressWarnings(as.integer(gsub("m", "", first_raw)))
  num_part2 <- suppressWarnings(as.integer(second_raw))

  unparsable <- is.na(num_part1) | is.na(num_part2)
  if (any(unparsable)) {
    stop(
      "standardize_ids(): cannot parse ID(s): ",
      paste(id[unparsable], collapse = ", "),
      call. = FALSE
    )
  }

  sprintf("m%03d_%02d", num_part1, num_part2)
}

# Usage: standardize_ids("your_id_here")


In [68]:
# Set the status of one run in <folder_for_sequences_fx>status.tsv.
#
# run:        value matched against the `runID` column of the status table.
# new_status: status string written to every row whose runID equals `run`.
#
# Returns `new_status`, so callers can write `status <- updateStatus(...)`.
# Rows belonging to other runs are written back unchanged.
updateStatus <- function(run, new_status) {
    status_path <- paste0(folder_for_sequences_fx, "status.tsv")
    current <- fread(status_path)
    updated <- mutate(current, status = ifelse(runID == run, new_status, status))
    fwrite(updated, status_path)
    return(new_status)
}

In [69]:
# Input: index of the row of `files_df_fx` describing the run to process.
run <- 5 # row from the list

print(paste0("Run ", run))

eng_dir <- files_df_fx$`Engram folder`[run]

# Get the size of the run directory on Engram by running `du -sh .` over ssh.
# Manual equivalent:
# ssh -o "IdentitiesOnly=yes" -i ~/.ssh/engram -t jr3950@engram-xfer-01.rc.zi.columbia.edu "cd mnt/sequencing_data/ && du -sh ."
# NOTE(review): the user name is hard-coded here, whereas the download cell
# uses `columbia_username` - confirm whether this should use it as well.
cmd <- paste0('ssh -o "IdentitiesOnly=yes" -i ~/.ssh/engram -t jr3950@engram-xfer-01.rc.zi.columbia.edu "cd ', eng_dir, ' && du -sh ."')
dush_output <- system(command = cmd, intern = TRUE)

# `du -sh` prints e.g. "1.5T\t." - a possibly fractional number followed by a
# unit letter. Parse number and unit together so that "1.5T" is not read as
# size 1 with unit "5". If ssh printed extra lines, use the last line that
# starts with a digit.
du_lines <- grep("^[[:space:]]*[0-9]", dush_output, value = TRUE)
if (length(du_lines) == 0) {
    stop("Could not read a size from the `du -sh` output for ", eng_dir, ": ",
         paste(dush_output, collapse = " | "))
}
du_line <- du_lines[length(du_lines)]
du_match <- regmatches(
    du_line,
    regexec("^[[:space:]]*([0-9]+[.,]?[0-9]*)([A-Za-z]?)", du_line)
)[[1]]
size <- as.numeric(sub(",", ".", du_match[2], fixed = TRUE))
unit <- du_match[3]

# Conversion to (decimal) gigabytes; a missing unit letter is taken as bytes.
unit_to_gb <- c(b = 1e-9, k = 1e-6, m = 1e-3, g = 1, t = 1e3, p = 1e6, e = 1e9)
unit_key <- if (nzchar(unit)) tolower(unit) else "b"
if (is.na(size) || !(unit_key %in% names(unit_to_gb))) {
    stop("Could not interpret the `du -sh` size '", du_line, "' for ", eng_dir)
}
size_gb <- size * unit_to_gb[[unit_key]]

# Free space on the filesystem holding the parent of the working folder.
free_space <- get_free_space(dirname(folder_for_sequences_fx))






# check if this run was already processed

# Report whether run `x` has already been fully processed, registering the run
# in the status table if it is not there yet.
#
# x: run identifier, compared against the `runID` column of
#    <folder_for_sequences_fx>status.tsv.
#
# Returns TRUE only when the stored status of `x` is "MpileupDone"; FALSE
# otherwise.
#
# Side effects: creates `folder_for_sequences_fx` and/or status.tsv when they
# do not exist, and appends a row with status "Initialized" for a run that is
# not yet listed.
#
# The function now uses its argument `x` instead of silently reading the
# global `run`; the only call in this notebook passes `run`, so the result is
# unchanged there.
wasThisRunProcessed <- function(x) {
    status_path <- paste0(folder_for_sequences_fx, "status.tsv")
    new_row <- data.table(runID = x, status = "Initialized")

    if (!dir.exists(folder_for_sequences_fx)) {
        # recursive: also works when intermediate directories are missing
        dir.create(folder_for_sequences_fx, recursive = TRUE)
    }

    if (!file.exists(status_path)) {
        fwrite(new_row, status_path)
        return(FALSE)
    }

    status <- fread(status_path)
    if (!(x %in% status$runID)) {
        fwrite(bind_rows(status, new_row), status_path)
        return(FALSE)
    }

    # If the run is listed more than once, every row must say "MpileupDone".
    isTRUE(all(status$status[status$runID == x] == "MpileupDone"))
}


# Proceed only if the free local space exceeds 50 times the remote run size.
# NOTE(review): the factor 50 is taken as-is; presumably headroom for
# intermediate files - confirm.
if (!(free_space > (50 * size_gb))) {
    stop("Make more space on HDD")
} else {
    print("Enough space available")

    if (isTRUE(wasThisRunProcessed(run))) {
        print("Run seems to be processed already, skipping...")
    } else {
        # Load the stored status of this run; the cells below branch on it.
        status_df <- fread(paste0(folder_for_sequences_fx, "status.tsv"))
        status <- status_df$status[status_df$runID == run]

        print(paste("Run status:", status))
    }
}

[1] "Run 5"
[1] "Enough space available"
[1] "Run status: Initialized"


In [70]:
# Download all files from run
#
# Builds one rsync command (three variants, the later ones override the earlier
# ones), runs it, and marks the run as "Downloaded" only if rsync succeeded.
runDir <- paste0(folder_for_sequences_fx, "run", run)
bam_dir <- file.path(runDir, "bam")

if (status == "Initialized") {
    if (!dir.exists(runDir)) {
        dir.create(runDir)
    }

    # Row `run` of the sheet, read by direct indexing as for `Engram folder`.
    run_type <- files_df_fx$Type[run]
    need_more_than_gz <- files_df_fx$`Need to download more than gz`[run]

    # Common part of all rsync variants: ssh transport + remote "<folder>/*".
    rsync_prefix <- paste0("rsync -avzPhe \"ssh -o 'IdentitiesOnly=yes' -i /moto/home/", columbia_username, "/.ssh/engram\" ", columbia_username, "@engram-xfer-01.rc.zi.columbia.edu:", files_df_fx$`Engram folder`[run], "/*")

    # Default: only files with the expected ending, straight into runDir.
    cmd <- paste0(rsync_prefix, files_df_fx$`File ending`[run], " ", runDir, "/")

    if (run_type == "RawBCL") {
        # Raw BCL runs: fetch everything into runDir/BCL/.
        dir.create(paste0(runDir, "/BCL/"), showWarnings = FALSE)
        cmd <- paste0(rsync_prefix, " ", runDir, "/BCL/")
    }

    if (need_more_than_gz == TRUE) {
        # Fetch everything into runDir/Download/. Create the directory that is
        # actually used as the rsync target (previously "BCL/" was created).
        dir.create(paste0(runDir, "/Download/"), showWarnings = FALSE)
        cmd <- paste0(rsync_prefix, " ", runDir, "/Download/")
    }

    cmd_return <- system(command = cmd, intern = TRUE)
    print(tail(cmd_return))

    # With intern = TRUE a non-zero exit code is only reported through the
    # "status" attribute (plus a warning). Do not mark the run as downloaded
    # after a failed or partial transfer.
    rsync_status <- attr(cmd_return, "status")
    if (!is.null(rsync_status) && rsync_status != 0) {
        stop("rsync exited with status ", rsync_status, "; run ", run,
             " was not marked as downloaded")
    }

    if (need_more_than_gz == TRUE) {
        # Work in progress: the command is printed for inspection, not executed.
        cmd_rename <- paste0("mv -f ",runDir,"/",basename(files_df_fx$`Engram folder`[run]), " ",folder_for_sequences_fx,"run",run,"_tmp && rm -rf ",runDir," && mv -f ",folder_for_sequences_fx,"run",run,"_tmp ",runDir,"")

        print(cmd_rename)

        stop("check if it works next time")
    }

    status <- updateStatus(run, "Downloaded")
}

# change status

# change status

[1] "\r              0   0%    0.00kB/s    0:00:00  \r         24.17K 100%   42.22kB/s    0:00:00 (xfr#383, to-chk=1/384)"                                                                                                                                             
[2] "m76_9_S300_R1_001.fastq.gz"                                                                                                                                                                                                                                       
[3] "\r              0   0%    0.00kB/s    0:00:00  \r         12.44M  16%   11.86MB/s    0:00:05  \r         38.01M  49%   18.13MB/s    0:00:02  \r         68.67M  90%   21.77MB/s    0:00:00  \r         76.27M 100%   22.18MB/s    0:00:03 (xfr#384, to-chk=0/384)"
[4] ""                                                                                                                                                                                                          

In [71]:
# BCLtoFastq if needed
#
# For runs of Type "RawBCL" that have just been downloaded, submit the
# BCL2FASTQ script as a Slurm job via execute_cmd_sbatch().

# Core counts handed to the BCL2FASTQ script on its command line.
# NOTE(review): the sbatch request below asks for cpu="16" while `cores` is
# max_cores - confirm that this mismatch is intended.
cores<-max_cores
cores_rw<-round(0.5*max_cores)

if(files_df_fx%>%filter(run)%>%select(Type)%>%pull()=="RawBCL"){
    if(status=="Downloaded")
        {
        cmd=paste0("cd ",runDir, " && sh -xe ", Barn_Mice_dir, "01_Genotyping/02_Fx/01_ReadstoMpileup/01_BCL2FASTQ.sh ",runDir,"/BCL/",files_df_fx$`BCL dir`[run],
              " ", runDir, " BCL/", files_df_fx$`BCL2Fastq CSV`[run], " ",cores," ",cores_rw,"")
        print(cmd)
        # Record the job name in the global `jobname`: the waiting cell that
        # follows only waits (and sets "BCL2FastqDone") when
        # jobname == "BCL2Fastq". Passing the name as an argument alone does
        # not create that variable.
        jobname <- "BCL2Fastq"
        execute_cmd_sbatch(cmd, jobname=jobname, env=env_mapping_etc,cpu="16", activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh"),
                            time="long", mem="32G", acc=slurm_acc)
        # Short pause before the job queue is polled.
        Sys.sleep(5)
    }
}

In [72]:
# If the last recorded job name is "BCL2Fastq", poll Slurm once a minute until
# no such job is running any more, then record the new status.
if (exists("jobname") && jobname == "BCL2Fastq") {
    repeat {
        if (!slurm_check_jobs_still_running(columbia_username, jobname)) {
            break
        }
        Sys.sleep(60)
    }
    status <- updateStatus(run, "BCL2FastqDone")
}


If you want to diagnose why there are undetermined files, you can check the frequency of "N" in the barcode like this:

````bash

zcat Undetermined_S0_L001_R1_001.fastq.gz | awk 'NR%4==1' | cut -f 2 -d' ' | cut -f 4 -d":" | grep -o 'N' | wc -l
zcat Undetermined_S0_L001_R1_001.fastq.gz | awk 'NR%4==1' | wc -l

````

then divide the numbers and multiply by 16

In [73]:
# After BCL2Fastq there may be several fastq files per sample, one per lane
# (..._L001_..., ..._L002_...). If so, concatenate the lanes of each sample into
# a single "..._Lmerged_..." file and delete the per-lane files.
if (status == "BCL2FastqDone") {
    fastq_files <- list.files(runDir, "fastq.gz$", full.names = TRUE)

    # Lane token as it appears in the file name. Matching "_L<3 digits>_" on the
    # base name only avoids hitting an unrelated capital "L" elsewhere in the
    # path or in the sample name.
    lane_pattern <- "_L[0-9]{3}_"
    lane_free_name <- sub(lane_pattern, "_L000_", basename(fastq_files))

    # Is there more than one lane for any sample?
    if (anyDuplicated(lane_free_name) > 0) {
        is_lane1 <- grepl("_L001_", basename(fastq_files), fixed = TRUE)
        for (i in which(is_lane1)) {
            f <- fastq_files[i]
            all_files_of_that_sample <- fastq_files[lane_free_name == lane_free_name[i]]
            merged_file <- file.path(dirname(f), sub(lane_pattern, "_Lmerged_", basename(f)))

            # gzip files can be concatenated directly. `&&` ensures the lane
            # files are removed only if the concatenation succeeded.
            cmd <- paste0("cat ", paste(shQuote(all_files_of_that_sample), collapse = " "),
                          " > ", shQuote(merged_file),
                          " && ", "rm -f ", paste(shQuote(all_files_of_that_sample), collapse = " "))
            #print(cmd)
            merge_output <- system(command = cmd, intern = TRUE)

            merge_status <- attr(merge_output, "status")
            if (!is.null(merge_status) && merge_status != 0) {
                stop("Merging lanes failed for ", basename(f), " (exit status ", merge_status, ")")
            }
        }
    }
}

In [75]:
# Rename the downloaded fastq files to the standard mXXX_YY.<ending> form and
# bring the run into the "Trimmed" state: either by moving already-trimmed
# files into runDir/trimmed, or by submitting one trim_galore job per file.
initial_timedate <- Sys.time()

trimmed_dir <- paste0(runDir, "/trimmed")
if (!dir.exists(trimmed_dir)) {
    dir.create(trimmed_dir)
}

if (status %in% c("Downloaded", "Renamed", "BCL2FastqDone")) {
    file_ending <- files_df_fx$`File ending`[run]
    ending_pattern <- paste0(file_ending, "$")

    # First, we need to remove files that dont belong there, for example if the
    # lane or directory is not just our mice. Show the sheet's comment for this
    # run (inside this block the bare expression was not displayed).
    print(files_df_fx$Comment[run])

    # rename all files into the mXXX-YY.file format, to never deal with that again
    if (status != "Renamed") {
        FileNameRule <- files_df_fx$`ID From`[run]

        files <- list.files(path = runDir, pattern = ending_pattern, full.names = TRUE)
        files <- files[!grepl("Undetermined", files, fixed = TRUE)]

        if (files_df_fx$`Paired End`[run] == TRUE) {
            stop("not implemented, watch out when renaming & fixfastq / trimming!!")
        }

        # Part of the file name that carries the sample ID, per sheet rule.
        id_part <- switch(FileNameRule,
               'File Name before "_"' = sub("_.*", "", basename(files)),
               'File Name before second "_"' = sub("^(.*?_.*?)_.*", "\\1", basename(files)),
               'File Name before "_S"' = sub("_S.*", "", basename(files)),
               stop("Not specified")
               )
        base_files_new <- unlist(lapply(id_part, standardize_ids))
        files_new <- paste0(runDir, "/", base_files_new, ".", file_ending)

        # Two inputs mapping to the same target would overwrite each other.
        if (anyDuplicated(files_new) > 0) {
            stop("Renaming would map several files to the same name: ",
                 paste(unique(files_new[duplicated(files_new)]), collapse = ", "))
        }

        # seq_along(): nothing happens when no files were found.
        for (i in seq_along(files)) {
            print(paste(files[i], "-->", files_new[i]))
            if (!file.rename(files[i], files_new[i])) {
                stop("Could not rename ", files[i], " to ", files_new[i])
            }
        }
        status <- updateStatus(run = run, new_status = "Renamed")
    }

    # trim if needed, otherwise just mv to trimmed dir
    if (files_df_fx$Type[run] == "Trimmed") {
        # Files are already trimmed: move them into the trimmed directory.
        files <- list.files(path = runDir, pattern = ending_pattern, full.names = TRUE)
        files_new <- file.path(trimmed_dir, basename(files))

        for (i in seq_along(files)) {
            print(paste(files[i], "-->", files_new[i]))
            if (!file.rename(files[i], files_new[i])) {
                stop("Could not move ", files[i], " to ", files_new[i])
            }
        }

        # The sources have just been moved away, so verify the targets rather
        # than comparing against the (now empty) listing of runDir.
        if (!all(file.exists(files_new))) {
            stop("Not all files trimmed! Investigation needed.")
        }
        status <- updateStatus(run = run, new_status = "Trimmed")
    } else {
        files <- list.files(path = runDir, pattern = ending_pattern, full.names = TRUE)
        files <- files[!grepl("Undetermined", files, fixed = TRUE)]

        # single end: one sbatch per file (fix too-short reads, then trim_galore)
        for (f in files) {
            cmd <- paste0("cd ", runDir, " && ",
                   "Rscript ",jnr_general_scripts_dir,"FixTooShortFastq.R ",f," ",jnr_general_scripts_dir," && ",
                   ". ",Barn_Mice_dir,"activateEnv.sh ", env_mapping_etc," && trim_galore -o trimmed -j 1 ",f)

            #print(cmd)
            #stop()
            sbatch_list <- execute_complex_sbatch(list_of_cmds = cmd,jobname = jobname<-"fx_trim",
                                               scripts_dir = paste0(runDir,"/scripts"),
                                               uniqueRunID = str_replace_all(basename(f), fixed("."), "-"),cores = "1",mem = "6G",time = "short",
                                               env = env_jupyter, initial_timedate = initial_timedate,
                                               jobs_simul = max_cores,list_of_additional_flags=c(paste0("-A ",slurm_acc)),
                                              activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh"),
                                              Execute_Sbatches_Env=env_jupyter,username=columbia_username)
        }

        # Start and wait only if this cell actually created jobs; otherwise a
        # `sbatch_list` / `jobname` left over from an earlier cell or run
        # could be started by mistake.
        if (length(files) > 0) {
            jobname <- "fx_trim"
            start_sbatch_list(sbatch_list, max_cores, jobname, initial_timedate, columbia_username)
            while (slurm_check_jobs_still_running(columbia_username, jobname)) {
                Sys.sleep(60)
            }
        }

        # update status: every submitted file must have produced a trimmed
        # file. `files` is the list that was submitted (Undetermined excluded).
        trimmed_files <- list.files(path = trimmed_dir, pattern = ".gz$", full.names = TRUE)
        if (length(files) == length(trimmed_files)) {
            status <- updateStatus(run, "Trimmed")
        } else {
            stop("Not all files trimmed! Investigation needed.")
        }
    }
}

[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_1_S196_R1_001.fastq.gz --> /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m033_01.fastq.gz"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_10_S205_R1_001.fastq.gz --> /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m033_10.fastq.gz"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_11_S206_R1_001.fastq.gz --> /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m033_11.fastq.gz"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_12_S207_R1_001.fastq.gz --> /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m033_12.fastq.gz"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_13_S208_R1_001.fastq.gz --> /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m033_13.fastq.gz"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_14_S209_R1_001.fastq.gz --> /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m033_14.fastq.gz"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/m33_15_S210_R1_001.fastq.gz -

"running command 'Rscript ~/github/general/Execute_Sbatches.R '2023-05-17 10:50:54' /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/scripts/fx_trim.list 100 fx_trim jr3950 999' had status 1"


[1] "[1] \"Max Sbatches Total set. Checking...\""                                                                         
[2] "[1] \"Currently running 0 jobs. Can schedule 101 more.\""                                                            
[3] "[1] \"384 sbatches remaining! Running 101 more...\""                                                                 
[4] "[1] \"Checking /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run5/scripts/fx_trim-m033_01-fastq-gz.sbatch.inprogress\""
attr(,"status")
[1] 1
attr(,"errmsg")
[1] "Resource temporarily unavailable"


ERROR: Error in eval(expr, envir, enclos): Not all files trimmed! Investigation needed.


In [None]:
# Manual re-check after investigating a failed trimming step: the run is marked
# "Trimmed" only if runDir and the trimmed directory hold the same number of
# files.
files <- list.files(path = runDir, pattern = paste0(files_df_fx$`File ending`[run], "$"), full.names = TRUE)
trimmed_files <- list.files(path = trimmed_dir, pattern = ".gz$", full.names = TRUE)
if (length(files) != length(trimmed_files)) {
    stop("Not all files trimmed! Investigation needed.")
} else {
    status <- updateStatus(run, "Trimmed")
}

In [46]:
# Create one mapping job per trimmed fastq file that has no BAM yet.
# The jobs are only written here; the next cell starts them and waits.
initial_timedate <- Sys.time()

if (status == "Trimmed") {

    bam_dir <- file.path(runDir, "bam")
    dir.create(bam_dir)

    if (pull(select(filter(files_df_fx, run), `Paired End`)) == TRUE) {
        stop("not implemented, watch out")
    }

    files <- list.files(path = trimmed_dir, pattern = ".gz$", full.names = TRUE)

    for (f in files) {
        # Sample ID = file name up to the first ".", without "_trimmed".
        ID <- strsplit(basename(f), ".", fixed = TRUE)[[1]][1]
        ID <- sub("_trimmed", "", ID, fixed = TRUE)
        end_bam <- file.path(bam_dir, paste0(ID, ".bam"))
        print(end_bam)

        if (file.exists(end_bam)) {
            # already mapped, nothing to submit
            next
        }

        # input: jupyter_env samtools_env reference individualID fastQ1 fastQ2
        cmd <- paste0("cd ",bam_dir," && sh -xe ",Barn_Mice_dir,"01_Genotyping/02_Fx/01_ReadstoMpileup/01_Mapping.sh ",
                      env_jupyter, " ", env_mapping_etc, " ", ref_file, " ", ID, " ", f)

        sbatch_list <- execute_complex_sbatch(
            list_of_cmds = cmd,
            jobname = jobname<-"map-fx",
            scripts_dir = paste0(bam_dir,"/scripts"),
            uniqueRunID = ID,
            cores = "1",
            mem = "16G",
            time = "short",
            env = env_mapping_etc,
            initial_timedate = initial_timedate,
            jobs_simul = max_cores,
            list_of_additional_flags = c(paste0("-A ",slurm_acc)),
            activateEnvScript = paste0(Barn_Mice_dir,"activateEnv.sh"),
            Execute_Sbatches_Env = env_jupyter,
            username = columbia_username
        )
    }
}

[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/m032_01.bam"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_01.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/m032_02.bam"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_02.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/m032_03.bam"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_03.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/m032_04.bam"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_04.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/m032_05.bam"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_05.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/m032_06.bam"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_06.sbatch"
[1] "/moto/ziab/users/jr3950/data/genome

In [47]:
# Start the mapping jobs created above (if that was the last job type recorded)
# and poll Slurm once a minute until none of them is running any more.
if (exists("sbatch_list") && exists("jobname") && jobname == "map-fx") {
    start_sbatch_list(sbatch_list, max_cores, jobname, initial_timedate, columbia_username)
    repeat {
        if (!slurm_check_jobs_still_running(columbia_username, jobname)) {
            break
        }
        Sys.sleep(60)
    }
}


[1] "384 /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx.list"
  [1] "[1] \"Max Sbatches Total set. Checking...\""                                                                   
  [2] "[1] \"Currently running 0 jobs. Can schedule 101 more.\""                                                      
  [3] "[1] \"384 sbatches remaining! Running 101 more...\""                                                           
  [4] "[1] \"Checking /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_01.sbatch.inprogress\""
  [5] "[1] \"Currently running 0 jobs. Can schedule 101 more.\""                                                      
  [6] "[1] \"Submitted batch job 13727648\""                                                                          
  [7] "[1] \"Checking /moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/map-fx-m032_02.sbatch.inprogress\""
  [8] "[1] \"Currently running 1 jobs. Can schedule 100 more.\""                    

In [48]:
# Mark the run as "Mapped" once no trimmed fastq files are left in the trimmed
# directory (the error message below states that they are removed after a
# successful mapping job).
#
# Only the Trimmed -> Mapped transition is handled here. Without this guard,
# re-running the cell for a run that is already further along would write
# "Mapped" back into status.tsv and regress its status.
if (status == "Trimmed") {
    trimmed_dir_files_remaining <- list.files(path = trimmed_dir, pattern = ".gz$")
    if (length(trimmed_dir_files_remaining) > 0) {
        stop(paste0("Run ", run, " encountered an issue: not all trimmed files were removed after mapping jobs were done; likely some mappings failed. Maybe out of RAM?"))
    } else {
        status <- updateStatus(run, "Mapped")
    }
}



In [49]:
# Per-BAM QC: every BAM file of the run gets one row in the shared QC table.
# One 01_BAM-QC.sh job is created per BAM and the job list is started here;
# waiting for the jobs happens in a later cell.
initial_timedate <- Sys.time()

if (status == "Mapped") {
    # Create the QC table with its header line on first use.
    QC_file <- paste0(folder_for_sequences_fx, "QC.tsv")
    if (!file.exists(QC_file)) {
        writeLines("Sample\tDate\tCoverage\tMeanMQw0\tMeanMQwo0\tMQ30plusPercent\tMeanInsertSize\tMeanReadLength\tDuplicationPercent", QC_file)
    }

    bam_dir <- file.path(runDir, "bam")
    bam_files <- list.files(path = bam_dir, pattern = ".bam$", full.names = TRUE)
    # paths relative to folder_for_sequences_fx
    bam_files <- sub(folder_for_sequences_fx, "", bam_files, fixed = TRUE)

    QC_df <- fread(QC_file, header = TRUE)

    head(QC_df)

    for (bam in bam_files) {
        # A BAM that already has a QC row needs manual attention.
        if (nrow(QC_df) > 0 && bam %in% pull(select(QC_df, Sample))) {
            stop("check for timestamp")
        }

        cmd <- paste0("sh ",Barn_Mice_dir,"01_Genotyping/02_Fx/01_ReadstoMpileup/01_BAM-QC.sh ", folder_for_sequences_fx, " ", bam, " ", QC_file)
        sbatch_list <- execute_complex_sbatch(
            list_of_cmds = cmd,
            jobname = jobname<-"QC-bam-fx",
            scripts_dir = paste0(bam_dir,"/scripts"),
            uniqueRunID = gsub("/", "-", bam, fixed = TRUE),
            cores = "1",
            mem = "4G",
            time = "short",
            env = env_mapping_etc,
            initial_timedate = initial_timedate,
            jobs_simul = max_cores,
            jobs_total = max_cores,
            list_of_additional_flags = c(paste0("-A ",slurm_acc)),
            activateEnvScript = paste0(Barn_Mice_dir,"activateEnv.sh"),
            Execute_Sbatches_Env = env_jupyter,
            username = columbia_username
        )
    }

    if (exists("sbatch_list") && exists("jobname") && jobname == "QC-bam-fx") {
        start_sbatch_list(sbatch_list, max_cores, jobname, initial_timedate, columbia_username, max_cores)
    }
}



[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_01.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_02.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_03.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_04.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_05.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_06.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_07.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_08.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/QC-bam-fx-run4-bam-m032_09.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/b

In [50]:
# Create and start one `samtools mpileup | xz` job per BAM file, restricted to
# the positions listed in Founders.filtered.posfile_for_mpileup.
initial_timedate <- Sys.time()


# mpileup at loci of interest

if (status == "Mapped" || status == "BAM-QCed") {
    bam_dir <- file.path(runDir, "bam")
    bam_files <- list.files(path = bam_dir, pattern = ".bam$", full.names = TRUE)
    bam_files <- str_replace(bam_files, fixed(folder_for_sequences_fx), "") # relative paths

    mpileup_jobs_submitted <- 0L

    for (b in bam_files) {
        # `b` is relative to folder_for_sequences_fx and the job below writes
        # "<b>.mpileup.xz" after `cd folder_for_sequences_fx`, so the existence
        # check has to look in that same folder (it used folder_for_sequences).
        # NOTE(review): a truncated output left by a killed job is skipped too.
        if (file.exists(paste0(folder_for_sequences_fx, b, ".mpileup.xz"))) {
            next
        }
        cmd <- paste0("cd ",folder_for_sequences_fx," && samtools mpileup -s -l ",folder_for_sequences,"Founders.filtered.posfile_for_mpileup ",
           "-f ",ref_file," ",b," | xz -T 1 > ",b,".mpileup.xz")
        sbatch_list <- execute_complex_sbatch(list_of_cmds = cmd,jobname = jobname<-"mpileup-fx",
                                                   scripts_dir = paste0(bam_dir,"/scripts"),
                                                   uniqueRunID = str_replace_all(b, fixed("/"), "-"),cores = "1",mem = "4G",time = "short",
                                                   env = env_mapping_etc, initial_timedate = initial_timedate,
                                                   jobs_simul = max_cores,jobs_total=max_cores,list_of_additional_flags=c(paste0("-A ",slurm_acc)),
                                                  activateEnvScript=paste0(Barn_Mice_dir,"activateEnv.sh"),
                                                  Execute_Sbatches_Env=env_jupyter,username=columbia_username)
        mpileup_jobs_submitted <- mpileup_jobs_submitted + 1L
    }

    # Start the list only if this cell created jobs, so that a `sbatch_list` /
    # `jobname` left over from an earlier cell or run is never started here.
    if (mpileup_jobs_submitted > 0L) {
        jobname <- "mpileup-fx"
        start_sbatch_list(sbatch_list, max_cores, jobname, initial_timedate, columbia_username, max_cores)
    }
}

[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_01.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_02.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_03.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_04.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_05.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_06.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_07.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_08.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/mpileup-fx-run4-bam-m032_09.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_

In [51]:
# Sexing preparation: for every BAM that has no "<bam>.chrs.readCount.tsv.gz"
# yet, create a 01_SexingPrep.sh job, then start the job list.
initial_timedate <- Sys.time()

# get a per-chromosome good mapped reads count for later sexing
bam_files <- list.files(path = bam_dir, pattern = ".bam$", full.names = TRUE)

for (b in bam_files) {
    if (!file.exists(paste0(b, ".chrs.readCount.tsv.gz"))) {
        cmd <- paste0("sh -xe ",Barn_Mice_dir,"01_Genotyping/02_Fx/01_ReadstoMpileup/01_SexingPrep.sh ", b, " ", b, ".chrs.readCount.tsv")
        sbatch_list <- execute_complex_sbatch(
            list_of_cmds = cmd,
            jobname = jobname<-"fx-sexing-prep",
            scripts_dir = paste0(bam_dir,"/scripts"),
            uniqueRunID = gsub("/", "-", b, fixed = TRUE),
            cores = "1",
            mem = "4G",
            time = "short",
            env = env_mapping_etc,
            initial_timedate = initial_timedate,
            jobs_simul = max_cores,
            jobs_total = max_cores,
            list_of_additional_flags = c(paste0("-A ",slurm_acc)),
            activateEnvScript = paste0(Barn_Mice_dir,"activateEnv.sh"),
            Execute_Sbatches_Env = env_jupyter,
            username = columbia_username
        )
    }
}

if (exists("sbatch_list") && exists("jobname") && jobname == "fx-sexing-prep") {
    start_sbatch_list(sbatch_list, max_cores, jobname, initial_timedate, columbia_username, max_cores)
}

[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-prep--moto-ziab-users-jr3950-data-genomes-tmp_Fx-run4-bam-m032_01.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-prep--moto-ziab-users-jr3950-data-genomes-tmp_Fx-run4-bam-m032_02.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-prep--moto-ziab-users-jr3950-data-genomes-tmp_Fx-run4-bam-m032_03.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-prep--moto-ziab-users-jr3950-data-genomes-tmp_Fx-run4-bam-m032_04.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-prep--moto-ziab-users-jr3950-data-genomes-tmp_Fx-run4-bam-m032_05.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-prep--moto-ziab-users-jr3950-data-genomes-tmp_Fx-run4-bam-m032_06.bam.sbatch"
[1] "/moto/ziab/users/jr3950/data/genomes/tmp_Fx/run4/bam/scripts/fx-sexing-

In [52]:
# Block until the BAM-QC, mpileup and sexing-prep jobs have all left the queue.
# The job types are checked one after another, polling once a minute.
for (pending_jobname in c("QC-bam-fx", "mpileup-fx", "fx-sexing-prep")) {
    while (slurm_check_jobs_still_running(columbia_username, pending_jobname)) {
        Sys.sleep(60)
    }
}

In [60]:
# Completeness checks after all jobs have finished: every BAM file must have an
# mpileup file, a per-chromosome read-count file and a row in the QC table.
mpileup_files <- list.files(path = bam_dir, pattern = ".mpileup.xz$", full.names = TRUE)
if (length(mpileup_files) != length(bam_files)) {
    stop(paste0(run, ": unexpectedly missing mpileup files"))
}
status <- updateStatus(run, "Mpileup")

chrs_files <- list.files(path = bam_dir, pattern = "chrs.readCount.tsv.gz$", full.names = TRUE)
if (length(chrs_files) != length(bam_files)) {
    stop(paste0(run, ": unexpectedly missing chrs files"))
}

# Define the QC table path here as well: it is otherwise only set inside the
# status == "Mapped" branch of the BAM-QC cell, so it would be undefined when
# the notebook is resumed at a later status.
QC_file <- paste0(folder_for_sequences_fx, "QC.tsv")
QC_df <- fread(QC_file, header = TRUE, fill = TRUE, sep = "\t")

# Sample names in the QC table are paths relative to folder_for_sequences_fx,
# so the rows of this run start with "run<run>".
count_unique_files_in_this_run <- QC_df %>%
    filter(startsWith(Sample, paste0("run", run))) %>%
    select(Sample) %>%
    distinct() %>%
    nrow()

if (count_unique_files_in_this_run != length(bam_files)) {
    stop(paste0(run, ": unexpectedly missing QC data"))
}
status <- updateStatus(run, "BAM-QCed")


In [62]:
# mpileup QC
#
# For every mpileup file of this run, count its lines and store the count in
# the CountLinesMpileup column of the matching QC row, then write the QC table
# back and advance the status.

# Effective coverage (number of loci in mpileup / number loci that could have been in mpileup)

# fast enough to just run in here
if (status == "BAM-QCed") {

    # Add the column on first use.
    if (!("CountLinesMpileup" %in% colnames(QC_df))) {
        QC_df$CountLinesMpileup <- NA
    }

    for (m in mpileup_files) {
        # QC rows are keyed by the BAM path relative to folder_for_sequences_fx.
        sample_name_qc_df <- sub(folder_for_sequences_fx, "", m, fixed = TRUE)
        sample_name_qc_df <- sub(".mpileup.xz", "", sample_name_qc_df, fixed = TRUE)

        sample_is_known <- sample_name_qc_df %in% pull(select(QC_df, Sample))
        if (!sample_is_known) {
            stop(paste0(run, ": missing data for sample ", sample_name_qc_df, " in QC_df"))
        }

        line_count_cmd <- paste0("xzcat ", m, " | wc -l")
        lines_mpileup <- as.numeric(system(command = line_count_cmd, intern = TRUE))
        QC_df <- mutate(QC_df, CountLinesMpileup = ifelse(Sample == sample_name_qc_df, lines_mpileup, CountLinesMpileup))
    }

    tail(QC_df)
    fwrite(QC_df, QC_file, sep = "\t")
    status <- updateStatus(run, "Mpileup_QCed")
}

In [63]:
# Display the current pipeline status of this run.
status

In [64]:
# Final step: mark the run as done and delete intermediate data that is no
# longer needed (raw BCL, fastq files, trimmed reads, BAMs and job scripts).
if (status == "Mpileup_QCed") {
    # mark as mpileup
    status <- updateStatus(run, "MpileupDone")

    # remove "BCL" dir if it exists
    bcl_dir <- paste0(runDir, "/BCL")
    if (dir.exists(bcl_dir)) {
        unlink(bcl_dir, recursive = TRUE)
    }

    # remove *gz and *gziptest files in the runDir
    gz_gztest_files <- c(
        list.files(runDir, pattern = ".gz$", full.names = TRUE),
        list.files(runDir, pattern = ".gziptest$", full.names = TRUE)
    )
    if (length(gz_gztest_files) > 0) {
        file.remove(gz_gztest_files)
    }

    # remove trimmed dir
    trimmed_to_remove <- paste0(runDir, "/trimmed")
    if (dir.exists(trimmed_to_remove)) {
        unlink(trimmed_to_remove, recursive = TRUE)
    }

    # remove *bam and *bai in bam dir
    bam_bai_files <- c(
        list.files(paste0(runDir, "/bam"), pattern = ".bam$", full.names = TRUE),
        list.files(paste0(runDir, "/bam"), pattern = ".bai$", full.names = TRUE)
    )
    if (length(bam_bai_files) > 0) {
        file.remove(bam_bai_files)
    }

    # scripts dirs
    for (obsolete_scripts_dir in c(paste0(runDir, "/scripts"), paste0(runDir, "/bam/scripts"))) {
        if (dir.exists(obsolete_scripts_dir)) {
            unlink(obsolete_scripts_dir, recursive = TRUE)
        }
    }
}