* Samtools mpileup (variable sites) of all bams together
* Then load that file if possible
* Compare all samples pairwise across the variable sites (i.e. correlate their allele frequencies, AF)
* Check whether samples that are supposedly the same are also the best-correlated pairs


````bash
# Collect every file matching *markdup.bam in the current directory into "bamlist", one per line.
ls *markdup.bam > bamlist
# Pile up all BAMs listed in bamlist together against the R64 reference and gzip the text output.
# The filtering/format flags (-q 30, --no-output-ins, --no-output-del) are passed through as-is;
# see the samtools mpileup manual for their exact semantics.
samtools mpileup -f /home/jnrunge/data/TRD/R64_nucl.fasta -q 30 -b bamlist --no-output-ins --no-output-del | gzip > all_mpileup.txt.gz
````

In [2]:
source("~/BrusselSprouts/scripts/functions.R")


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




**IF RUN AGAIN, DELETE THE OLD `all_mpileup.txt.gz-*` OUTPUT FILES AND RE-COUNT THE NUMBER OF LINES IN THE MPILEUP (`lines_mpileup` BELOW)**


In [4]:
# Split the joint mpileup into batches and submit one cluster job per group of
# batches. Each job runs
#   Rscript ~/TRD/01_Mapping/01b_mpileup_prep.r <i_min> <i_max> <batch_size>
# and a job is skipped when its expected output file already exists.
initial_timedate <- Sys.time()
jobname <- "mpileup_prep"
scripts_dir <- "/home/jnrunge/data/trd/mapped_reads/scripts/"
# Hard-coded line count of the mpileup; must be updated by hand whenever the
# mpileup is regenerated.
lines_mpileup <- 11680783

batch_size <- 1000 # passed to the worker script as its third argument
i_size <- 10       # approximate number of batches handled by one job

# Total number of batches needed to cover every line.
is <- ceiling(lines_mpileup / batch_size)

# First index of every job, evenly spread over 0 .. (is - i_size).
# seq() rounds a fractional length.out up, so ceiling() makes that explicit
# without changing the result.
i_min <- round(seq(from = 0, to = is - i_size, length.out = ceiling(is / i_size)))
# Last index of every job: one before the next job's start; the final job
# simply spans i_size.
i_max <- i_min + c(diff(i_min) - 1, i_size)

for (i in seq_along(i_min)) {
  out_file <- paste0(
    "~/data/trd/mapped_reads/all_mpileup.txt.gz", "-", i_min[i], "-", i_max[i], ".gz"
  )
  if (file.exists(out_file)) {
    next
  }
  cmd <- paste0(
    "Rscript ~/TRD/01_Mapping/01b_mpileup_prep.r ",
    i_min[i], " ", i_max[i], " ", batch_size
  )

  sbatch_list <- execute_complex_sbatch(
    cmd,
    jobname = jobname, scripts_dir = scripts_dir, uniqueRunID = i_min[i],
    cores = "1", mem = "4G", time = "short", env = "JupyteR4",
    initial_timedate = initial_timedate, jobs_simul = 10, jobs_total = 30
  )
}

# Scalar condition, so use the short-circuiting && rather than elementwise &.
if (exists("sbatch_list") && jobname == "mpileup_prep") {
  print(sbatch_list)
  start_sbatch_list(sbatch_list, 10, jobname, initial_timedate)
}

ERROR: Error in eval(expr, envir, enclos): 


In [None]:
# Wait until every per-job chunk file has been written, then load and combine
# them into `all_mpileup`.
#
# The pattern is anchored to the chunk naming scheme
# "all_mpileup.txt.gz-<i_min>-<i_max>.gz" so that other files sharing the
# prefix (e.g. "all_mpileup.txt.gz-correlated.csv.gz", written further down)
# are never counted or loaded as chunks.
chunk_pattern <- "^all_mpileup\\.txt\\.gz-[0-9]+-[0-9]+\\.gz$"
list_chunk_files <- function() {
  list.files("~/data/trd/mapped_reads", chunk_pattern, full.names = TRUE)
}

files <- list_chunk_files()

# Poll once a minute until there are as many chunk files as submitted ranges.
while (length(files) < length(i_min)) {
  Sys.sleep(60)
  files <- list_chunk_files()
}

# Bind all chunks, drop the last column and remove duplicated rows.
all_mpileup <- fread_and_bind_files(files) %>%
  select(-ncol(.)) %>%
  distinct()

In [None]:
#all_mpileup=fread_and_bind_files(files)%>%select(-ncol(.))

In [None]:
# Preview the first six rows of the combined table.
head(all_mpileup, n = 6L)

In [None]:

# Calculate all pairwise correlations

# Columns to correlate: column 4 up to the second-to-last column.
sample_cols <- names(all_mpileup)[4:(ncol(all_mpileup) - 1)]

# Every ordered pair (ID1, ID2) of distinct columns, with an empty `cor`
# column to be filled in afterwards. Only the column names are needed here,
# so the pair table is built from them directly instead of reshaping the
# whole pileup to long format first; expand() still sorts and de-duplicates
# the names.
my_corrs <- data.frame(key = sample_cols, stringsAsFactors = FALSE) %>%
  expand(ID1 = key, ID2 = key, cor = NA) %>%
  filter(ID1 != ID2)

# Correlation between two columns of the global `all_mpileup` table.
#
# a, b: column names (character scalars) of `all_mpileup`.
# Returns the value of cor() computed on pairwise-complete observations.
#
# A disabled variant, kept for reference, set values equal to 0 or 1 to NA
# before correlating:
#   mutate(!!a := ifelse(!(!!sym(a) %in% c(0,1)), !!sym(a), NA))
#   mutate(!!b := ifelse(!(!!sym(b) %in% c(0,1)), !!sym(b), NA))
getCor <- function(a, b) {
  values_a <- pull(select(all_mpileup, all_of(a)))
  values_b <- pull(select(all_mpileup, all_of(b)))
  cor(values_a, values_b, use = "pairwise.complete.obs")
}

# Fill in the correlation of every (ID1, ID2) row. getCor() handles one pair
# at a time, so it is evaluated row by row; the rowwise grouping is dropped
# again afterwards so later verbs work on whole columns.
my_corrs <- my_corrs %>%
  rowwise() %>%
  mutate(cor = getCor(ID1, ID2)) %>%
  ungroup()

In [None]:
# Persist the full pairwise correlation table as a gzipped CSV.
fwrite(
  x = my_corrs,
  file = "~/data/trd/mapped_reads/all_mpileup.txt.gz-correlated.csv.gz",
  sep = ","
)

# <span style="color: white; background-color: red;">Merge decisions based on the correlations</span>

Results below.

* All As and Bs indeed belong together and will be merged.
* Other highly correlated pairs are assumed (for now) to carry no TRD signal: they look very similar because they share the same fixed sites, while the allele frequencies at the remaining sites hover around 0.5.
* JF3x1.m.sort.markdup.bam and YJNRC2.m.sort.markdup.bam will be merged as the result is as expected
* JF4 correlates best with the wrong sample, but it also showed no signal, so this is perhaps just a mis-hit. No merge!
* JG4 correlates well with C18, as expected. It had a signal, so a wrong hit is unlikely. Merge!

In [None]:
# Keep one row per unordered sample pair (the first occurrence wins), then
# retain only pairs with a correlation of at least 0.9, strongest first.
my_corrs_unique <- my_corrs %>%
  mutate(ID1_ID2 = paste(pmin(ID1, ID2), pmax(ID1, ID2), sep = "_")) %>%
  distinct(ID1_ID2, .keep_all = TRUE) %>%
  select(ID1, ID2, cor) %>%
  filter(cor >= 0.9) %>%
  arrange(desc(cor))

# are all A/B in here
is_a_sample <- grepl("A.m", my_corrs$ID1, fixed = TRUE)
A_IDs <- unique(my_corrs$ID1[is_a_sample])
summary(A_IDs %in% my_corrs_unique$ID1) # yes

# Strongest partner for each ID1.
my_corrs_unique %>% distinct(ID1, .keep_all = TRUE)

In [98]:
# Pairs that will be merged: every "A" sample with its partner, plus the two
# manually confirmed pairs starting with JG4 and JF3x1. The merged BAM is
# named after the YJNRC<number> identifier found in ID2.
to_merge <- my_corrs_unique %>%
  filter(
    grepl("A.m", ID1, fixed = TRUE) |
      ID1 == "JG4.m.sort.markdup.bam" |
      ID1 == "JF3x1.m.sort.markdup.bam"
  ) %>%
  mutate(final_name = paste0(stringr::str_extract(ID2, "(YJNRC\\d+)"), ".bam"))
to_merge

# All remaining BAMs are only renamed. `custom_final_name` holds manual
# overrides (typed as character from the start so that it combines cleanly
# with the string values below); every other file gets "<prefix>.bam", where
# <prefix> is the part of the file name before the first ".m".
renames <- data.table(
  file = naturalsort(unique(c(my_corrs$ID1, my_corrs$ID2))),
  custom_final_name = NA_character_
) %>%
  filter(!(file %in% c(to_merge$ID1, to_merge$ID2))) %>%
  mutate(
    # Manual overrides: JF4 takes over the YJNRC17 name and the original
    # YJNRC17 file is set aside under a "dontuse_" name.
    custom_final_name = case_when(
      file == "YJNRC17.m.sort.markdup.bam" ~ "dontuse_YJNRC17.bam",
      file == "JF4.m.sort.markdup.bam" ~ "YJNRC17.bam",
      TRUE ~ custom_final_name
    ),
    final_name = if_else(
      is.na(custom_final_name),
      paste0(stringr::str_match(file, "(.*?)\\.m")[, 2], ".bam"),
      custom_final_name
    )
  )
renames

# Target names must be unique across merges and renames; otherwise a later
# merge or rename would overwrite another sample's file.
final_names <- c(to_merge$final_name, renames$final_name)
summary(duplicated(final_names))
if (anyDuplicated(final_names) > 0L) {
  stop(
    "Duplicated target BAM names: ",
    paste(unique(final_names[duplicated(final_names)]), collapse = ", "),
    call. = FALSE
  )
}

ID1,ID2,cor,final_name
<chr>,<chr>,<dbl>,<chr>
YJNRC4A.m.sort.markdup.bam,YJNRC4B.m.sort.markdup.bam,0.9887472,YJNRC4.bam
YJNRC23A.m.sort.markdup.bam,YJNRC23B.m.sort.markdup.bam,0.9858588,YJNRC23.bam
YJNRC27A.m.sort.markdup.bam,YJNRC27B.m.sort.markdup.bam,0.9856123,YJNRC27.bam
YJNRC8A.m.sort.markdup.bam,YJNRC8B.m.sort.markdup.bam,0.9834498,YJNRC8.bam
YJNRC20A.m.sort.markdup.bam,YJNRC20B.m.sort.markdup.bam,0.9829513,YJNRC20.bam
YJNRC5A.m.sort.markdup.bam,YJNRC5B.m.sort.markdup.bam,0.9825846,YJNRC5.bam
YJNRC3A.m.sort.markdup.bam,YJNRC3B.m.sort.markdup.bam,0.9825085,YJNRC3.bam
YJNRC29A.m.sort.markdup.bam,YJNRC29B.m.sort.markdup.bam,0.9818513,YJNRC29.bam
YJNRC21A.m.sort.markdup.bam,YJNRC21B.m.sort.markdup.bam,0.9810434,YJNRC21.bam
YJNRC34A.m.sort.markdup.bam,YJNRC34B.m.sort.markdup.bam,0.9795701,YJNRC34.bam


file,custom_final_name,final_name
<chr>,<chr>,<chr>
ChrisC2.m.sort.markdup.bam,,ChrisC2.bam
ChrisC3.m.sort.markdup.bam,,ChrisC3.bam
ChrisC4.m.sort.markdup.bam,,ChrisC4.bam
ChrisC5.m.sort.markdup.bam,,ChrisC5.bam
ChrisC6.m.sort.markdup.bam,,ChrisC6.bam
ChrisC7.m.sort.markdup.bam,,ChrisC7.bam
ChrisC8.m.sort.markdup.bam,,ChrisC8.bam
JF4.m.sort.markdup.bam,YJNRC17.bam,YJNRC17.bam
YJNRC1.m.sort.markdup.bam,,YJNRC1.bam
YJNRC6.m.sort.markdup.bam,,YJNRC6.bam


   Mode   FALSE 
logical      38 

In [104]:
# Submit one `samtools merge` job per row of `to_merge`, writing the merged
# BAM as <final_name> inside ~/data/trd/mapped_reads.
jobname <- "merge"

# seq_len() keeps the loop from running (and submitting a bogus "NA" merge)
# when `to_merge` is empty.
for (i in seq_len(nrow(to_merge))) {
  cmd <- paste0(
    "cd ~/data/trd/mapped_reads && samtools merge -o ",
    to_merge$final_name[i], " ", to_merge$ID1[i], " ", to_merge$ID2[i]
  )
  print(cmd)

  sbatch_list <- execute_complex_sbatch(
    cmd,
    jobname = jobname, scripts_dir = scripts_dir,
    uniqueRunID = basename(to_merge$final_name[i]),
    cores = "1", mem = "6gb", time = "long", env = "bwaetc",
    initial_timedate = initial_timedate, jobs_simul = 3, jobs_total = 50
  )
}

# Only start jobs that this cell actually queued; otherwise a leftover
# `sbatch_list` from an earlier cell would be started under the "merge" name.
if (nrow(to_merge) > 0) {
  start_sbatch_list(sbatch_list, 5, jobname, initial_timedate)
}


[1] "cd ~/data/trd/mapped_reads && samtools merge -o YJNRC4.bam YJNRC4A.m.sort.markdup.bam YJNRC4B.m.sort.markdup.bam"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/merge-YJNRC4.bam.sbatch"
[1] "cd ~/data/trd/mapped_reads && samtools merge -o YJNRC23.bam YJNRC23A.m.sort.markdup.bam YJNRC23B.m.sort.markdup.bam"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/merge-YJNRC23.bam.sbatch"
[1] "cd ~/data/trd/mapped_reads && samtools merge -o YJNRC27.bam YJNRC27A.m.sort.markdup.bam YJNRC27B.m.sort.markdup.bam"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/merge-YJNRC27.bam.sbatch"
[1] "cd ~/data/trd/mapped_reads && samtools merge -o YJNRC8.bam YJNRC8A.m.sort.markdup.bam YJNRC8B.m.sort.markdup.bam"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/merge-YJNRC8.bam.sbatch"
[1] "cd ~/data/trd/mapped_reads && samtools merge -o YJNRC20.bam YJNRC20A.m.sort.markdup.bam YJNRC20B.m.sort.markdup.bam"
[1] "/home/jnrunge/data/trd/mapped_reads/scripts/merge-YJNRC20.bam.sbatch"
[1] "cd ~/data/trd/mapp

In [105]:
# Rename every non-merged BAM to its final name. file.rename() is vectorized,
# so all files are handled in one call (and nothing happens when `renames` is
# empty); failed renames are reported instead of being silently ignored.
rename_dir <- "~/data/trd/mapped_reads"
renamed_ok <- file.rename(
  file.path(rename_dir, renames$file),
  file.path(rename_dir, renames$final_name)
)
if (!all(renamed_ok)) {
  warning(
    "Could not rename: ",
    paste(renames$file[!renamed_ok], collapse = ", "),
    call. = FALSE
  )
}

In [109]:
# Manual gate: this always aborts the cell, so the deletion below only happens
# when it is run deliberately.
stop("All done?")

# Delete the input BAMs of all merged pairs, printing each name first.
merged_inputs <- naturalsort(unique(c(to_merge$ID1, to_merge$ID2)))
for (f in merged_inputs) {
  print(f)
  file.remove(file.path("~/data/trd/mapped_reads/", f))
}

[1] "JF3x1.m.sort.markdup.bam"
[1] "JG4.m.sort.markdup.bam"
[1] "YJNRC2.m.sort.markdup.bam"
[1] "YJNRC3A.m.sort.markdup.bam"
[1] "YJNRC3B.m.sort.markdup.bam"
[1] "YJNRC4A.m.sort.markdup.bam"
[1] "YJNRC4B.m.sort.markdup.bam"
[1] "YJNRC5A.m.sort.markdup.bam"
[1] "YJNRC5B.m.sort.markdup.bam"
[1] "YJNRC8A.m.sort.markdup.bam"
[1] "YJNRC8B.m.sort.markdup.bam"
[1] "YJNRC18.m.sort.markdup.bam"
[1] "YJNRC20A.m.sort.markdup.bam"
[1] "YJNRC20B.m.sort.markdup.bam"
[1] "YJNRC21A.m.sort.markdup.bam"
[1] "YJNRC21B.m.sort.markdup.bam"
[1] "YJNRC23A.m.sort.markdup.bam"
[1] "YJNRC23B.m.sort.markdup.bam"
[1] "YJNRC27A.m.sort.markdup.bam"
[1] "YJNRC27B.m.sort.markdup.bam"
[1] "YJNRC29A.m.sort.markdup.bam"
[1] "YJNRC29B.m.sort.markdup.bam"
[1] "YJNRC32A.m.sort.markdup.bam"
[1] "YJNRC32B.m.sort.markdup.bam"
[1] "YJNRC34A.m.sort.markdup.bam"
[1] "YJNRC34B.m.sort.markdup.bam"


In [None]:
# maybe recycle this:


# finally, rename BAMs
# need to add sequencing names to Crosses.xlsx
# for the moment:

# Map each BAM in `bam_files` (defined elsewhere) to its sample name and
# create symlinks <Jname> -> <bam> and <Jname>.bai -> <bam>.bai inside the
# BAMs' directory.
renaming <- data.frame(fastq = NA, bam = basename(bam_files), Jname = NA)
renaming$Jname <- c("YJNRC2.bam", "YJNRC17.bam", "YJNRC18.bam")
renaming
# The links are created with relative names, so work from the BAM directory.
setwd(dirname(bam_files[1]))
for (i in seq_len(nrow(renaming))) {
  # First the BAM itself (empty suffix), then its ".bai" index.
  for (suffix in c("", ".bai")) {
    cmd <- paste0(
      "ln -sf ", renaming$bam[i], suffix, " ", renaming$Jname[i], suffix
    )
    print(cmd)
    print(system(command = cmd, intern = TRUE))
  }
}