## Refine groHMM transcription units overlapping RefSeq genes

In [None]:
# Replace the library search path with this conda environment's R library,
# then print the resulting search path for the record.
# NOTE(review): the path is machine- and user-specific; adjust when running elsewhere.
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(rtracklayer)
    library(groHMM)
    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    library(org.Mm.eg.db)
    library(plyranges)
    library(tidyverse)
});

In [None]:
# Notebook-wide settings: mc.cores comes from groHMM's getCores(8);
# figures are rendered as small (3 x 3) SVG images.
options(
    mc.cores = getCores(8),
    repr.plot.width = 3,
    repr.plot.height = 3,
    jupyter.plot_mimetypes = "image/svg+xml"
)

# Every ggplot below uses the classic theme.
theme_set(theme_classic())

In [None]:
# Column names for the RefSeq table
# (format description: http://genome.ucsc.edu/FAQ/FAQformat#format9)
header_names <- c(
    "ID", "tx_name", "chr", "strand", "start", "end",
    "cdsStart", "cdsEnd", "Exons", "xStarts", "xEnds",
    "score", "name", "startStat", "endStat", "xFrames"
)

# Alternative remote sources that were previously used for this table:
#   http://s3.amazonaws.com/igv.org.genomes/mm10/ncbiRefSeq.sorted.txt.gz
#   http://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/refGene.txt.gz
ref_transcripts <- read_delim(
    "/net/bmc-lab2/data/lab/scgroseq/group/genes_enhancer_list/ncbiRefSeq.sorted.txt.gz",
    delim = "\t",
    col_names = header_names,
    show_col_types = FALSE
) %>%
    # drop the per-exon columns, then prefix gene names with "GN-"
    mutate(
        xStarts = NULL,
        xEnds = NULL,
        xFrames = NULL,
        name = paste0("GN-", name)
    ) %>%
    # keep only sequences whose name is at most 5 characters long (e.g. "chr19")
    filter(nchar(chr) <= 5) %>%
    GRanges()

# index the ranges by (prefixed) gene name
names(ref_transcripts) <- ref_transcripts$name

In [None]:
# number of transcripts before filtering
length(ref_transcripts)

# Transcripts whose tx_name starts with "NM" or "XM" go to the mRNA set;
# every other transcript goes to the ncRNA set.
ref_mrna <- ref_transcripts %>%
    filter(substr(tx_name, 1, 2) %in% c("NM", "XM"))

ref_ncrna <- ref_transcripts %>%
    filter(!(substr(tx_name, 1, 2) %in% c("NM", "XM")))

# Drop ncRNAs that overlap any mRNA gene to avoid mapping issues,
# then rebuild the combined transcript set.
ref_ncrna <- subsetByOverlaps(ref_ncrna, ref_mrna, invert = TRUE)
ref_transcripts <- c(ref_mrna, ref_ncrna)

# number of transcripts after filtering
length(ref_transcripts)

In [None]:
# Collapse mRNA isoforms: within each gene name, merge overlapping ranges
# in a strand-aware way.
ref_mrna <- reduce_ranges_directed(group_by(ref_mrna, name))
names(ref_mrna) <- ref_mrna$name
ref_mrna
# Optional export of the reduced mRNA set:
#saveRDS(ref_mrna, "/net/bmc-lab2/data/lab/scgroseq/group/notebooks/data/UCSC_mm10_ncbiRefSeq_mRNA.rds");

In [None]:
# Drop sequence levels (chromosomes) that no transcript uses.
seqlevels(ref_transcripts) <- seqlevelsInUse(ref_transcripts)

# rgConsensus is the per-gene reduced mRNA set, i.e. the full gene list
# regardless of dREG status. It is used later to find genes that were
# missed and need to be added back to the gene list.
rgConsensus <- ref_mrna
names(rgConsensus) <- rgConsensus$name

In [None]:
# Read the dREG peak calls (headerless BED-like table) and turn them into GRanges.
# The file is generated by calling dREG (Charles Danko Lab, Cornell) on bulk
# PROseq data, using the server: https://dreg.dnasequence.org/
dREG <- read.table(
    "../data/PROseq_mES_BRsComb.dREG.peak.full.bed",
    header = FALSE,
    stringsAsFactors = FALSE
)
colnames(dREG) <- c("chr", "start", "end", "score", "pval", "center")
dREG <- GRanges(dREG)
dREG

In [None]:
# Promoter windows around each RefSeq transcript start (upstream 500, downstream 500).
refseq_promoters <- promoters(ref_transcripts, upstream = 500, downstream = 500)

# A promoter is considered active when it overlaps at least one dREG peak.
active_promoters <- findOverlaps(refseq_promoters, dREG)
hasActivePro <- unique(queryHits(active_promoters))

# Transcripts (isoforms) whose promoter is active.
active_isoforms <- ref_transcripts[hasActivePro]
active_isoforms

In [None]:
# Load PROseq reads, keep those passing the quality flags, shrink each read to
# a single base anchored at its end, and keep only reads inside active isoforms.
PROseq <- readRDS("/net/bmc-lab2/data/lab/scgroseq/group/notebooks/data/PROv2p8_consolidated.rds") %>%
    filter(mapq >= 3 & miRQC & umiQC) %>%
    resize(width = 1, fix = "end") %>%
    subsetByOverlaps(active_isoforms)
#PROseq

# Read density per isoform = overlapping read count / isoform width.
active_isoforms$pol_density <- countOverlaps(active_isoforms, PROseq) / width(active_isoforms)
#hist(log(active_isoforms$pol_density))

# For every gene name keep the isoform with the highest density
# (sort descending, then take the first occurrence of each name).
active_genes <- active_isoforms %>%
    arrange(desc(pol_density)) %>%
    filter(!duplicated(name)) %>%
    mutate(dREG = TRUE)
active_genes

In [None]:
# Consensus genes that do not overlap any active gene are added back,
# flagged as having no dREG support.
missed_genes <- subsetByOverlaps(rgConsensus, active_genes, invert = TRUE)
missed_genes$dREG <- FALSE

# Active genes come first, so on a name clash the active entry is the one kept.
all_genes <- c(active_genes, missed_genes) %>%
    filter(!duplicated(name))
length(all_genes)
names(all_genes) <- all_genes$name
all_genes

In [None]:
# Resolve overlaps between genes by keeping the larger gene of each pair.
# Genes are first ordered from widest to narrowest, so a lower index always
# means a larger (or equal-width, earlier) gene.
filtered_genes <- all_genes[order(-width(all_genes))]

# Self-overlaps; keep only pairs where the query is the larger gene.
hits <- findOverlaps(filtered_genes, filtered_genes)
hits <- hits[queryHits(hits) < subjectHits(hits)]

# Drop every gene that appears as the smaller member of some overlapping pair.
# setdiff() is used instead of negative indexing because x[-integer(0)]
# selects nothing: with zero overlaps the old form discarded every gene.
filtered_genes <- filtered_genes[
    setdiff(seq_along(filtered_genes), subjectHits(hits))
] %>%
    sort() %>%
    select(c(dREG, name))

names(filtered_genes) <- filtered_genes$name
summary(filtered_genes$dREG)
filtered_genes

## Refine groHMM using active genes
1. Find overlaps between groHMM and active genes
2. Split groHMM TUs with 3 kbp gap if they "read through" into a second active gene
3. Extend genes using refined groHMM

In [None]:
# Read the optimized groHMM transcription-unit calls from BED.
HMM_transcripts <- read_bed(
    file = "/net/bmc-lab2/data/lab/scgroseq/group/notebooks/data/groHMM_mES_BRsComb_LP-50_UTS10.bed"
)

In [None]:
# Assign a gene name to each region based on overlap with `genes`.
#
# regions: ranges to annotate (passed as the query to findOverlaps()).
# genes:   ranges carrying a `name` metadata column (the subject).
#
# Returns a character vector with one entry per region: the name of an
# overlapping gene, or NA when the region overlaps no gene. When a region
# overlaps several genes, the later hit overwrites the earlier one, so the
# last hit reported by findOverlaps() for that region wins.
getGeneID <- function(regions, genes) {
    hits <- findOverlaps(regions, genes)
    # NA_character_ keeps the result a character vector even with zero hits
    geneIDs <- rep(NA_character_, length(regions))
    geneIDs[queryHits(hits)] <- as.character(genes$name[subjectHits(hits)])
    geneIDs
}

HMM_transcripts$name <- getGeneID(HMM_transcripts, filtered_genes)

In [None]:
# Count how many filtered genes each groHMM unit overlaps.
hits <- findOverlaps(HMM_transcripts, filtered_genes)

# Tabulate hits per query index and convert to a two-column data frame
# (index = groHMM unit index, count = number of overlapping genes).
numHits <- as.data.frame(table(queryHits(hits)), stringsAsFactors = FALSE)
colnames(numHits) <- c("index", "count")
numHits$index <- as.integer(numHits$index)
head(numHits)

In [None]:
# Histogram of the number of genes overlapped by each groHMM unit.
ggplot(numHits, aes(x = count)) +
    geom_histogram(binwidth = 1) +
    xlim(0, 8) +
    labs(
        title = "Genes per groHMM unit",
        x = "Number of genes",
        y = "Number of units"
    )

In [None]:
# 100 bp windows immediately upstream of every filtered gene that overlaps
# a groHMM unit.
activeGpro <- promoters(
    subsetByOverlaps(filtered_genes, HMM_transcripts),
    upstream = 100,
    downstream = 0
)

# Cut those windows out of the transcription units, which splits any unit
# that runs across the start of a gene; flatten the per-unit list back to
# a single set of ranges.
HMM_transcripts <- unlist(subtract(HMM_transcripts, activeGpro))

# Gene assignments must be recomputed for the split pieces.
HMM_transcripts$name <- getGeneID(HMM_transcripts, filtered_genes)

In [None]:
# Merge all groHMM pieces assigned to the same gene into one span:
# pieces without a gene are dropped, every remaining piece gets the
# min start / max end of its gene group, and one row per gene is kept.
HMM_transcripts <- HMM_transcripts %>%
    as.data.frame() %>%
    filter(!is.na(name)) %>%
    group_by(name) %>%
    mutate(
        start = min(start),
        end = max(end)
    ) %>%
    ungroup() %>%
    filter(!duplicated(name)) %>%
    GRanges()


In [None]:
# Split the merged units by strand.
plUnits <- HMM_transcripts[strand(HMM_transcripts) == "+"]
mnUnits <- HMM_transcripts[strand(HMM_transcripts) == "-"]

# Plus strand: the 5' end is the start coordinate, so take the start from the
# reference gene (filtered_genes; ref_transcripts was used here previously)
# and keep the end from the groHMM unit. Units left with end <= start are dropped.
plUnits <- data.frame(
    chr = seqnames(plUnits),
    start = start(filtered_genes[as.character(plUnits$name)]),
    end = end(plUnits),
    strand = "+",
    name = plUnits$name
) %>%
    filter(end > start) %>%
    GRanges()

In [None]:
# Minus strand: the 5' end is the end coordinate, so keep the start from the
# groHMM unit and take the end from the reference gene (filtered_genes;
# ref_transcripts was used here previously). Units left with end <= start
# are dropped.
mnUnits <- data.frame(
    chr = seqnames(mnUnits),
    start = start(mnUnits),
    end = end(filtered_genes[as.character(mnUnits$name)]),
    strand = "-",
    name = mnUnits$name
) %>%
    filter(end > start) %>%
    GRanges()

# Both strands together form the refined unit set.
refinedUnits <- c(plUnits, mnUnits)

In [None]:
# Within each gene name, merge overlapping ranges (strand-aware), keep the
# first range per name, and put the result in sorted order.
refinedUnits <- refinedUnits %>%
    group_by(name) %>%
    reduce_ranges_directed() %>%
    filter(!duplicated(name)) %>%
    sort()

refinedUnits

In [None]:
# Where the refined unit ended up narrower than its reference gene, fall back
# to the reference gene coordinates.
# xgene: refined units whose name is in the consensus (mRNA) gene set.
# xrefg: the matching reference ranges, taken from filtered_genes
#        (rgConsensus was used here previously).
xgene <- refinedUnits %>% filter(name %in% rgConsensus$name)
xrefg <- filtered_genes[as.character(xgene$name)]

# names of genes whose reference range is wider than the refined one
shortg <- xgene[width(xrefg) > width(xgene)]$name

# swap the shortened units for their reference ranges
refinedUnits <- refinedUnits %>%
    filter(!(name %in% shortg)) %>%
    append(xrefg[as.character(shortg)]) %>%
    sort()

In [None]:
# Consensus genes that overlap no refined unit were missed by the HMM;
# append them with their reference coordinates.
missed_genes <- subsetByOverlaps(rgConsensus, refinedUnits, invert = TRUE)
refinedUnits <- c(refinedUnits, missed_genes)

In [None]:
# Final overlap check: for every overlapping pair of units keep the larger one.
# Units are ordered from widest to narrowest, so a lower index is the larger unit.
refinedUnits <- refinedUnits[order(-width(refinedUnits))]
hits <- findOverlaps(refinedUnits, refinedUnits)
hits <- hits[queryHits(hits) < subjectHits(hits)]

# Show the overlapping pairs (larger unit, then smaller unit) for inspection.
refinedUnits[queryHits(hits)]$name
refinedUnits[subjectHits(hits)]$name

# Drop the smaller member of each pair. setdiff() is used instead of negative
# indexing because x[-integer(0)] selects nothing: with zero overlaps the old
# form discarded every unit.
refinedUnits <- refinedUnits[
    setdiff(seq_along(refinedUnits), subjectHits(hits))
] %>%
    sort()

In [None]:
# Index the refined units by gene name.
names(refinedUnits) = refinedUnits$name;

# Keep only units whose name is present in filtered_genes, drop repeated
# names, and copy each gene's dREG flag over from filtered_genes.
# NOTE(review): the bare `names` inside filter()/mutate() relies on plyranges
# exposing the range names in its evaluation scope -- confirm; `name` holds the
# same values after the assignment above.
refinedUnits  = refinedUnits %>%
    filter( names %in% names(filtered_genes) ) %>%
    filter( !duplicated(names) ) %>%
    mutate( dREG = filtered_genes[names]$dREG );
refinedUnits

# Persist the refined units as BED and as RDS.
write_bed(refinedUnits, file="../data/groHMM_dREG_refinedGenes_mES_mm10.bed");
write_rds(refinedUnits, file="../data/groHMM_dREG_refinedGenes_mES_mm10.rds");

In [None]:
# Score the refined units against the filtered gene annotation with groHMM
# and show the evaluation table.
e <- evaluateHMMInAnnotations(refinedUnits, filtered_genes)
e$eval

In [None]:
# How many refined units have a name found among the filtered genes ...
sum(refinedUnits$name %in% filtered_genes$name)
# ... and how many filtered genes have a name found among the refined units.
sum(filtered_genes$name %in% refinedUnits$name)

In [None]:
# Pair each refined unit with its reference gene from filtered_genes
# (an earlier version matched against rgConsensus instead).
xgene <- refinedUnits %>% filter(name %in% filtered_genes$name)
xrefg <- filtered_genes[as.character(xgene$name)]

# Widths before (oldw) and after (neww) refinement, and their difference.
gstats <- data.frame(
    oldw = width(xrefg),
    neww = width(xgene),
    deltaw = width(xgene) - width(xrefg)
)

In [None]:
# Distribution of reference gene widths (1 kbp bins from 0 to 50 kbp).
ggplot(gstats, aes(x = oldw)) +
    geom_histogram(breaks = (0:50) * 1000) +
    ylim(0, 1500) +
    labs(title = "Reference gene widths", x = "width")

In [None]:
# Distribution of refined gene widths (1 kbp bins from 0 to 50 kbp).
ggplot(gstats, aes(x = neww)) +
    geom_histogram(breaks = (0:50) * 1000) +
    ylim(0, 1500) +
    labs(title = "Refined gene widths", x = "width")

In [None]:
# Distribution of width changes (1 kbp bins from 0 to 30 kbp).
ggplot(gstats, aes(x = deltaw)) +
    geom_histogram(breaks = (0:30) * 1000) +
    ylim(0, 1500) +
    labs(title = "Change in gene widths", x = "width")

In [None]:
# Refined units that are more than 20 kbp wider than their reference gene.
xgene[gstats$deltaw > 20e3]

In [None]:
# Refined units as a percentage of the filtered gene set, followed by the
# two raw counts.
100 * length(refinedUnits) / length(filtered_genes)
length(refinedUnits)
length(filtered_genes)
# (original note, no code attached: limit to protein-coding genes)

In [None]:
# Percentage of the genome covered by refined units, after merging
# overlapping units in a strand-aware way.
# NOTE(review): the denominator 5.9E9 is hard-coded and looks like a
# two-strand genome size -- confirm it matches the mm10 assembly used here.
txn <- reduce_ranges_directed(refinedUnits)
100 * sum(width(txn)) / 5.9E9

### Save dREG filtered genes
Same length and names as the groHMM + dREG filtered genes

In [None]:
# Restrict the dREG-filtered reference genes to those that survived
# refinement, so this set lines up with refinedUnits by name.
filtered_genes <- filter(filtered_genes, name %in% names(refinedUnits))
filtered_genes

# Persist as RDS and as BED.
write_rds(filtered_genes, file = "../data/dREG_refinedGenes_mES_mm10.rds")
write_bed(filtered_genes, file = "../data/dREG_refinedGenes_mES_mm10.bed")