## Filter and merge BAMs from batches of single-end scGRO-seq datasets
Data will be consolidated into a uniform GenomicRanges format while preserving
the experiment ID and cell barcodes.

In [None]:
# Point R's library search path at the conda environment's R library, then
# echo the resulting search path so the notebook output records it.
.libPaths(new = "/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(plyranges)
    library(GenomicFiles)
    library(GenomicAlignments)
    library(rtracklayer)
    library(dtplyr)
    library(data.table)
    library(parallel)
    library(doParallel)
    library(ggridges)
});

In [None]:
# Parallelism: let data.table use 15 threads and register doParallel with 15.
setDTthreads(threads = 15)
registerDoParallel(15)

# Notebook plot defaults: 6 x 4 figure size, rendered as SVG.
options(repr.plot.width = 6, repr.plot.height = 4)
options(jupyter.plot_mimetypes = "image/svg+xml")

# Use the classic ggplot2 theme for every plot in this session.
theme_set(theme_classic())

In [None]:
# Base directory of the "PE" batch (v2p8_ssAdSpC3_PE); the trailing slash is
# part of the value.
PE_path <- "/net/bmc-lab2/data/lab/scgroseq/group/scGROseq_consolidated/v2/v2p8_ssAdSpC3_PE/"

# Experiment/plate names of the PE batch, kept as a list with one
# single-string element per experiment.
PE_exp <- as.list(c(
    # Exp230 / Exp234
    "Exp230_scGROseq_c04",
    "Exp234_scGROseq_c09",
    # Exp236
    "Exp236_scGROseq_c02",
    "Exp236_scGROseq_c03",
    "Exp236_scGROseq_c05",
    "Exp236_scGROseq_c06",
    "Exp236_scGROseq_c07",
    "Exp236_scGROseq_c08",
    "Exp236_scGROseq_c13",
    "Exp236_scGROseq_c14",
    "Exp236_scGROseq_c15",
    "Exp236_scGROseq_c16",
    # Exp246 / Exp256
    "Exp246_scGROseq_c03",
    "Exp246_scGROseq_c04",
    "Exp246_scGROseq_c08",
    "Exp256_scGROseq_c09",
    # Exp260 / Exp261
    "Exp260_scGROseq_c06",
    "Exp260_scGROseq_c08",
    "Exp260_scGROseq_c10",
    "Exp261_scGROseq_c11",
    # Exp263 / Exp263b
    "Exp263_scGROseq_c02",
    "Exp263_scGROseq_c03",
    "Exp263_scGROseq_c04",
    "Exp263_scGROseq_c08",
    "Exp263b_scGROseq_c04",
    "Exp263b_scGROseq_c07",
    "Exp263b_scGROseq_c10",
    # Exp264a
    "Exp264a_scGROseq_c04",
    "Exp264a_scGROseq_c05",
    "Exp264a_scGROseq_c06",
    "Exp264a_scGROseq_c07",
    "Exp264a_scGROseq_c10",
    "Exp264a_scGROseq_c14",
    "Exp264a_scGROseq_c16"
))

In [None]:
# Base directory of the "SE" batch (v2p8_ssAdSpC3_SE); the trailing slash is
# part of the value.
SE_path <- "/net/bmc-lab2/data/lab/scgroseq/group/scGROseq_consolidated/v2/v2p8_ssAdSpC3_SE/"

# Experiment/plate names of the SE batch, kept as a list with one
# single-string element per experiment.
SE_exp <- as.list(c(
    # Exp260b
    "Exp260b_scGROseq_c11",
    "Exp260b_scGROseq_c12",
    "Exp260b_scGROseq_c13",
    "Exp260b_scGROseq_c14",
    "Exp260b_scGROseq_c15",
    "Exp260b_scGROseq_c16",
    # Exp236
    "Exp236_scGROseq_c05",
    "Exp236_scGROseq_c06",
    "Exp236_scGROseq_c07",
    "Exp236_scGROseq_c08"
))

In [None]:
# Base directory of the "ssAd" batch (v2p8_ssAd); the trailing slash is part
# of the value.
ssAd_path <- "/net/bmc-lab2/data/lab/scgroseq/group/scGROseq_consolidated/v2/v2p8_ssAd/"

# Experiment/plate names of the ssAd batch, kept as a list with one
# single-string element per experiment.
ssAd_exp <- as.list(c(
    # Exp211
    "Exp211_scGROseq_c10",
    "Exp211_scGROseq_c12",
    "Exp211_scGROseq_c17",
    "Exp211_scGROseq_c18",
    # Exp208 / Exp93b
    "Exp208_scGROseq_c11",
    "Exp93b_scGROseq_c04"
))

In [None]:
# Base directory of the "hpAd" batch (v2p8_hpAd); the trailing slash is part
# of the value.
hpAd_path <- "/net/bmc-lab2/data/lab/scgroseq/group/scGROseq_consolidated/v2/v2p8_hpAd/"

# Experiment/plate names of the hpAd batch, kept as a list with one
# single-string element per experiment.
hpAd_exp <- as.list(c(
    # Exp149
    "Exp149_scGROseq_c01",
    "Exp149_scGROseq_c07",
    "Exp149_scGROseq_c09",
    # Exp153
    "Exp153_scGROseq_c01",
    "Exp153_scGROseq_c02",
    "Exp153_scGROseq_c03",
    "Exp153_scGROseq_c04",
    "Exp153_scGROseq_c11",
    # Exp156
    "Exp156_scGROseq_c09",
    "Exp156_scGROseq_c13",
    # Exp160
    "Exp160_scGROseq_c02",
    "Exp160_scGROseq_c03",
    "Exp160_scGROseq_c04",
    "Exp160_scGROseq_c07",
    "Exp160_scGROseq_c08",
    "Exp160_scGROseq_c10",
    "Exp160_scGROseq_c11",
    "Exp160_scGROseq_c12",
    "Exp160_scGROseq_c13",
    "Exp160_scGROseq_c14",
    "Exp160_scGROseq_c16",
    # Exp168
    "Exp168_scGROseq_c02",
    "Exp168_scGROseq_c06",
    "Exp168_scGROseq_c13",
    # Exp179 / Exp183 / Exp208 / Exp211
    "Exp179_scGROseq_c02",
    "Exp183_scGROseq_c03",
    "Exp208_scGROseq_c01",
    "Exp211_scGROseq_c08"
))

### Process the BAM files
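1. Each file name is used to generate an experiment ID (exp_ID).
2. The query (read) name and MAPQ are read for each alignment.
3. Strands are swapped to account for the adapter design (read 1 starts from the RNA 3' end).
4. The position of the RNA 3' end of each read is recorded as PolPos (the read ranges themselves are left unchanged).
5. exp_ID is attached to every read (as ExpID) so experiments can be merged later.
6. The cell barcode and UMI (BC_UMI) are extracted from the query name using a str_match regular expression.
7. Columns that are no longer needed (qname, qwidth, cigar, njunc) are dropped to conserve memory.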

In [None]:
# Load one scGRO-seq BAM file and reduce it to the fields needed to merge
# experiments.
#
# Arguments:
#   fpath  - directory that contains the "<fname>_align/" folder; a trailing
#            "/" is optional.
#   fname  - experiment name (e.g. "Exp230_scGROseq_c04"); also the prefix of
#            the BAM file name.
#   suffix - BAM file name suffix (e.g. "_merged2pass_dedup.bam").
#
# Returns the alignments read by read_bam() with the strand flipped and the
# extra columns PolPos, ExpID and BC_UMI; qname, qwidth, cigar and njunc are
# removed, mapq is kept.
#
# Stops with an error naming the path when the BAM file does not exist.
process_BAM <- function(fpath, fname, suffix) {
    # <fpath>/<fname>_align/<fname><suffix>; trailing slashes on fpath are
    # stripped first so both "dir" and "dir/" give the same path.
    bam_file <- file.path(
        sub("/+$", "", fpath),
        paste0(fname, "_align"),
        paste0(fname, suffix)
    )
    if (!file.exists(bam_file)) {
        stop("BAM file not found: ", bam_file, call. = FALSE)
    }

    # "Exp230_scGROseq_c04" -> "Exp230_c04"
    exp_ID <- sub("scGROseq_", "", fname, fixed = TRUE)

    alignments <- read_bam(bam_file) %>%
        select(qname, mapq) %>%
        # Disabled filters on leading/trailing soft clips in the CIGAR string:
        # filter( str_match( cigar, "^(\\d+)S" )[,2] %in% c(NA, "1", "2", "3", "4", "5", "6") ) %>%
        # filter( str_match( cigar,  "(\\d+)S$" )[,2] %in% c(NA, "1", "2", "3", "4", "5", "6") ) %>%
        # Flip the strand of every alignment.
        mutate(strand = ifelse(strand == "+", "-", "+")) %>%
        # Evaluated on the flipped strand: "+" takes `end`, otherwise `start`.
        mutate(PolPos = ifelse(strand == "+", end, start)) %>%
        mutate(ExpID = exp_ID) %>%
        # Capture exactly the last two "_"-separated fields of the read name.
        # [^\W_] is "\w without underscore", so extra underscores earlier in
        # the name can no longer be swallowed into the capture.
        mutate(BC_UMI = str_match(qname, "_([^\\W_]+_[^\\W_]+)$")[, 2])

    # Drop columns that are not needed downstream to save memory.
    alignments$qname <- NULL
    alignments$qwidth <- NULL
    alignments$cigar <- NULL
    alignments$njunc <- NULL

    alignments
}

Use `mclapply` ("multicore list apply", from the parallel package) to process each BAM in parallel.

In [None]:
# Run process_BAM on every PE experiment with mc.cores = 15. Each experiment
# name is handed to process_BAM as the only unnamed argument, so it binds to
# `fname`; `fpath` and `suffix` are forwarded by name.
scGRO_PE <- mclapply(
    X = PE_exp,
    FUN = process_BAM,
    fpath = PE_path,
    suffix = "_merged2pass_dedup.bam",
    mc.cores = 15
)

# mclapply() only warns when a worker fails and puts a "try-error" object
# (or NULL) in the result list. Stop here and name the affected experiments
# instead of failing later when the results are merged.
failed_PE <- vapply(
    scGRO_PE,
    function(res) is.null(res) || inherits(res, "try-error"),
    logical(1)
)
if (any(failed_PE)) {
    stop(
        "process_BAM failed for: ",
        paste(unlist(PE_exp)[failed_PE], collapse = ", "),
        call. = FALSE
    )
}

In [None]:
# Run process_BAM on every SE experiment with mc.cores = 15. Each experiment
# name is handed to process_BAM as the only unnamed argument, so it binds to
# `fname`; `fpath` and `suffix` are forwarded by name.
scGRO_SE <- mclapply(
    X = SE_exp,
    FUN = process_BAM,
    fpath = SE_path,
    suffix = "_merged2pass_dedup.bam",
    mc.cores = 15
)

# mclapply() only warns when a worker fails and puts a "try-error" object
# (or NULL) in the result list. Stop here and name the affected experiments
# instead of failing later when the results are merged.
failed_SE <- vapply(
    scGRO_SE,
    function(res) is.null(res) || inherits(res, "try-error"),
    logical(1)
)
if (any(failed_SE)) {
    stop(
        "process_BAM failed for: ",
        paste(unlist(SE_exp)[failed_SE], collapse = ", "),
        call. = FALSE
    )
}

In [None]:
# Run process_BAM on every hpAd experiment with mc.cores = 15. Each experiment
# name is handed to process_BAM as the only unnamed argument, so it binds to
# `fname`; `fpath` and `suffix` are forwarded by name.
scGRO_hpAd <- mclapply(
    X = hpAd_exp,
    FUN = process_BAM,
    fpath = hpAd_path,
    suffix = "_merged2pass_dedup.bam",
    mc.cores = 15
)

# mclapply() only warns when a worker fails and puts a "try-error" object
# (or NULL) in the result list. Stop here and name the affected experiments
# instead of failing later when the results are merged.
failed_hpAd <- vapply(
    scGRO_hpAd,
    function(res) is.null(res) || inherits(res, "try-error"),
    logical(1)
)
if (any(failed_hpAd)) {
    stop(
        "process_BAM failed for: ",
        paste(unlist(hpAd_exp)[failed_hpAd], collapse = ", "),
        call. = FALSE
    )
}

In [None]:
# Run process_BAM on every ssAd experiment with mc.cores = 15. Each experiment
# name is handed to process_BAM as the only unnamed argument, so it binds to
# `fname`; `fpath` and `suffix` are forwarded by name.
scGRO_ssAd <- mclapply(
    X = ssAd_exp,
    FUN = process_BAM,
    fpath = ssAd_path,
    suffix = "_merged2pass_dedup.bam",
    mc.cores = 15
)

# mclapply() only warns when a worker fails and puts a "try-error" object
# (or NULL) in the result list. Stop here and name the affected experiments
# instead of failing later when the results are merged.
failed_ssAd <- vapply(
    scGRO_ssAd,
    function(res) is.null(res) || inherits(res, "try-error"),
    logical(1)
)
if (any(failed_ssAd)) {
    stop(
        "process_BAM failed for: ",
        paste(unlist(ssAd_exp)[failed_ssAd], collapse = ", "),
        call. = FALSE
    )
}

Now that we have all of the data in one format, apply post-processing:
1. Merge all experiments via GRangesList and unlist
2. Sort reads.
3. Extract experiment, plate, cell, and UMI barcodes.

In [None]:
# Concatenate the four per-batch result lists (PE, SE, hpAd, ssAd), wrap them
# in a GRangesList, flatten that into a single object and sort it.
# NOTE(review): Exp236 plates c05-c08 are listed in both PE_exp and SE_exp and
# both get the same ExpID (e.g. "Exp236_c05"), so their reads cannot be told
# apart after this merge - confirm that this is intended.
scGRO <- sort(
    unlist(
        GRangesList(
            c(scGRO_PE, scGRO_SE, scGRO_hpAd, scGRO_ssAd)
        )
    )
)

In [None]:
# Split ExpID on "_" once and store field 1 as Exp and field 2 as Plate, both
# as factors (e.g. "Exp230_c04" -> Exp "Exp230", Plate "c04").
# vapply() guarantees a character vector even when there are no reads, where
# sapply() would return an empty list.
exp_id_parts <- strsplit(scGRO$ExpID, "_", fixed = TRUE)

scGRO$Exp <- factor(vapply(exp_id_parts, "[", character(1), 1L))
scGRO$Plate <- factor(vapply(exp_id_parts, "[", character(1), 2L))

# The split list has one element per read; free it right away.
rm(exp_id_parts)

In [None]:
# Split BC_UMI on "_" once and store field 1 as Cell and field 2 as UMI, both
# as factors. Reads whose BC_UMI is NA get NA in both columns.
# vapply() guarantees a character vector even when there are no reads, where
# sapply() would return an empty list.
bc_umi_parts <- strsplit(scGRO$BC_UMI, "_", fixed = TRUE)

scGRO$Cell <- factor(vapply(bc_umi_parts, "[", character(1), 1L))
scGRO$UMI <- factor(vapply(bc_umi_parts, "[", character(1), 2L))

# The split list has one element per read; free it right away.
rm(bc_umi_parts)

In [None]:
# Remove the two combined ID columns from the consolidated object.
scGRO$ExpID <- NULL
scGRO$BC_UMI <- NULL

In [None]:
# Show the in-memory size of the consolidated object in human-readable units.
object.size(scGRO) %>%
    format(units = "auto")

# Display the consolidated object.
scGRO

# Create the output directory if it is missing so that the save cannot fail
# on a missing folder after all the processing above, then write the object.
dir.create("../data", showWarnings = FALSE, recursive = TRUE)
saveRDS(scGRO, file = "../data/scGROv2p8_consolidated.rds")