## Optimize groHMM parameters on bulk PRO-seq data to define transcription units
groHMM paper: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0656-3

In [None]:
# Point this session at the package library of the "r422" conda environment,
# then echo the resulting library search path so it is visible in the output.
r422_library <- "/home/mahat/.conda/envs/r422/lib/R/library"
.libPaths(r422_library)
.libPaths()

In [None]:
suppressPackageStartupMessages({
    library(rtracklayer)
    library(groHMM)
    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    library(org.Mm.eg.db)
    library(plyranges)
    library(tidyverse)
    library(foreach)
    library(GenomicFeatures)
});
# Session-wide default for the number of parallel workers; later steps read it
# back through getOption("mc.cores").
n_workers <- getCores(16)
options(mc.cores = n_workers)

In [None]:
# groHMM errors out on seqnames that carry no alignments, so everything below
# is restricted to the standard mouse chromosomes.
chroms <- paste0("chr", c(1:19, "X", "Y", "M"))

# Build the reference annotation from the UCSC mm10 refGene table.
rgdb <- makeTxDbFromUCSC(genome = "mm10", tablename = "refGene")

# Pull every transcript together with its gene / transcript identifiers and
# keep only those located on the standard chromosomes.
ref_transcripts <- rgdb %>%
    transcripts(columns = c("gene_id", "tx_id", "tx_name")) %>%
    filter(seqnames %in% chroms)

# Drop the seqlevels that are no longer used after filtering.
seqlevels(ref_transcripts) <- seqlevelsInUse(ref_transcripts)

In [None]:
# Collapse the refGene transcripts into consensus annotations keyed by
# gene_id, using the worker count configured in options(mc.cores).
rgConsensus <- ref_transcripts %>%
    makeConsensusAnnotations(
        keytype = "gene_id",
        mc.cores = getOption("mc.cores")
    )

In [None]:
# Load the merged, de-duplicated bulk PRO-seq alignments.
infile <- "/net/bmc-lab2/data/lab/scgroseq/group/mES_PROseq_RNAseq_ATACseq/PROseq_mES/PROseq_mES_BRsComb_merged/PROseq_mES_BRsComb_dedup.bam"

# Keep alignments on the standard chromosomes with a mapping quality above 1.
seqlib <- read_bam(infile) %>%
    filter(seqnames %in% chroms & mapq > 1) %>%
    GRanges()

# Flip every read onto the opposite strand.
# invertStrand() swaps "+" and "-" directly on the run-length encoded strand,
# so the strand is not expanded into one value per read the way
# ifelse(strand(x) == "+", ...) does, and an unstranded "*" range stays "*"
# instead of silently becoming "+".
# NOTE(review): presumably the flip is needed because the library is sequenced
# antisense to the nascent RNA -- confirm against the library prep protocol.
seqlib <- invertStrand(seqlib)
seqlevels(seqlib) <- seqlevelsInUse(seqlib)

# Reduce each (strand-flipped) read to the single base at its end, sort the
# positions, and cache them on disk.
reads <- seqlib %>%
    resize(width = 1, fix = "end") %>%
    sort()
saveRDS(reads, file = "../group/notebooks/data/PROseq_mES_BRsComb_bulk.Rds")

In [None]:
# Grid of HMM settings to try (see the groHMM paper for what they mean).
# LP is handed to detectTranscripts() as LtProbB and UTS as UTS. Every LP
# value is paired with every UTS value, with UTS varying fastest.
lp_grid <- c(-50, -100, -200, -400)
uts_grid <- c(5, 10, 15)
tuneParams <- data.frame(
    LP = rep(lp_grid, each = length(uts_grid)),
    UTS = rep(uts_grid, times = length(lp_grid))
)
tuneParams

In [None]:
# Run groHMM once for every row of tuneParams, write each set of transcript
# calls to a BED file, and score the calls against the consensus annotations.

# The 50 bp windowed read counts do not depend on LtProbB or UTS, so they are
# computed once here rather than inside every detectTranscripts() call
# (50 is the window size detectTranscripts() uses when it is given raw reads).
Fp <- windowAnalysis(reads, strand = "+", windowSize = 50)
Fm <- windowAnalysis(reads, strand = "-", windowSize = 50)

# Iterate over the rows actually present in tuneParams instead of a
# hard-coded 1:12, so editing the grid cannot desynchronise the loop.
evals <- foreach(x = seq_len(nrow(tuneParams)), .combine = "rbind") %do% {
    LP <- tuneParams$LP[x]
    UTS <- tuneParams$UTS[x]

    hmm <- detectTranscripts(
        Fp = Fp,
        Fm = Fm,
        LtProbB = LP,
        UTS = UTS
    )

    # NOTE(review): this writes under ../data/ while the reads were saved
    # under ../group/notebooks/data/ -- confirm the intended output directory.
    write_bed(hmm$transcripts, file = paste0("../data/groHMM_mES_BRsComb_LP", LP, "_UTS", UTS, ".bed"))

    e <- evaluateHMMInAnnotations(hmm$transcripts, rgConsensus)
    # The value of the last expression is what foreach collects per iteration.
    e$eval
}

# Attach the evaluation metrics to the parameter combination they belong to.
evals <- cbind(tuneParams, evals)

In [None]:
# Summary of results: one row per LP / UTS combination from tuneParams, with
# the evaluateHMMInAnnotations() metrics for that run appended as columns.
evals