## Refine groHMM transcription units overlapping candidate enhancers (gene-distal dREG peaks)

In [None]:
# Register the package library of the "r422" conda environment, then show the
# resulting library search path.
conda_r_library <- "/home/mahat/.conda/envs/r422/lib/R/library"
.libPaths(conda_r_library)
.libPaths()

In [None]:
suppressPackageStartupMessages({
    library(rtracklayer)
    library(plyranges)
    library(tidyverse)
    library(foreach)
});

In [None]:
# Notebook display defaults: 3 x 3 plots emitted as SVG, and the classic
# ggplot2 theme for every figure below.
plot_defaults <- list(
    repr.plot.width = 3,
    repr.plot.height = 3,
    jupyter.plot_mimetypes = "image/svg+xml"
)
options(plot_defaults)
theme_set(theme_classic())

In [None]:
## Load the two refined gene sets from RDS, strip the range names, and display
## each object.
dREGrefingedGenes <- readRDS("../data/dREG_refinedGenes_mES_mm10.rds")
names(dREGrefingedGenes) <- NULL
dREGrefingedGenes

groHMMrefingedGenes <- readRDS("../data/groHMM_dREG_refinedGenes_mES_mm10.rds")
names(groHMMrefingedGenes) <- NULL
groHMMrefingedGenes

In [None]:
# Split the groHMM-refined genes on their `dREG` flag, keeping only the `name`
# metadata column in each subset.
inactivegenes <- select(filter(groHMMrefingedGenes, !dREG), name)
inactivegenes

activegenes <- select(filter(groHMMrefingedGenes, dREG), name)
activegenes

# Length distributions of the two subsets.
summary(width(activegenes))
summary(width(inactivegenes))

# # specify maximum gene length to be used for removing overlapping dREG enhancers
# # No need to remove dREG after 30kb since we dont use gene signal after 30kb
# geneLength = 30000;
# truncgenes = activegenes;

# longf = which( width(truncgenes) > geneLength );
# truncgenes[longf] = truncgenes[longf] %>%
#     resize( width = geneLength, fix="start" );
# summary(width(truncgenes))

In [None]:
# Load the raw groHMM calls and drop their score/name columns.
raw_HMM <- read_bed(file = "../data/groHMM_mES_BRsComb_LP-50_UTS10.bed")
raw_HMM$score <- NULL
raw_HMM$name <- NULL
summary(width(raw_HMM))

# Cap every call at 5 kb, anchored at its start (fix = "start"), to limit
# unnecessary overlap with dREG peaks.
# NOTE(review): this step was previously described as a 10 kb cap, but the
# code uses 5 kb -- confirm the intended value.
max_hmm_width <- 5000
longf <- which(width(raw_HMM) >= max_hmm_width)
raw_HMM[longf] <- resize(raw_HMM[longf], width = max_hmm_width, fix = "start")
summary(width(raw_HMM))
raw_HMM

In [None]:
# Load dREG peak calls and convert them to GRanges.
# This file is generated by calling dREG (Charles Danko Lab, Cornell) on bulk
# PROseq data, using the server: https://dreg.dnasequence.org/
dREG <- read.table(
    "../data/PROseq_mES_BRsComb.dREG.peak.full.bed",
    header = FALSE,
    stringsAsFactors = FALSE
)
colnames(dREG) <- c("chr", "start", "end", "score", "pval", "center")

# BED starts are 0-based, while GRanges coordinates are 1-based and closed.
# GRanges(<data.frame>) would take `start` verbatim, making every peak 1 bp too
# long on the left and inconsistent with the read_bed() import used for the
# groHMM calls. Convert explicitly instead; the extra columns (score, pval,
# center) are kept as metadata.
# NOTE(review): `center` is carried over exactly as stored in the file -- check
# its coordinate convention before using it as a position.
dREG <- makeGRangesFromDataFrame(
    dREG,
    keep.extra.columns = TRUE,
    starts.in.df.are.0based = TRUE
)
dREG

In [None]:
# Number of candidate regions before filtering.
length(raw_HMM)
length(dREG)

# Drop every region that overlaps an active gene or lies within `maxgap`
# (1500 bp) of one, on either strand. invert = TRUE keeps the regions that do
# NOT overlap. TRUE is spelled out because `T` is an ordinary, reassignable
# variable.
# NOTE(review): this step was previously described as a 500 bp exclusion
# window, but the code uses 1500 bp -- confirm the intended distance.
HMMenh <- subsetByOverlaps(
    raw_HMM, activegenes,
    invert = TRUE, ignore.strand = TRUE, maxgap = 1500
)
dREGenh <- subsetByOverlaps(
    dREG, activegenes,
    invert = TRUE, ignore.strand = TRUE, maxgap = 1500
)

# Number of gene-distal regions that remain.
length(HMMenh)
length(dREGenh)

# DEACTIVATING THIS FOR NOW
# # add enhancers on non-gene strand supported by HMM and dREG
# HMM_nts = truncgenes;
# strand(HMM_nts) = ifelse( strand(truncgenes) == "+", "-", "+" );
# geneenh = subsetByOverlaps( raw_HMM, dREG, ignore.strand=T, maxgap=500 ) %>%
#     subsetByOverlaps( HMM_nts );

# HMMenh = append(HMMenh, geneenh);

# length(geneenh);
# length(HMMenh);

In [None]:
# Distance from each gene-distal dREG peak to its nearest active gene,
# ignoring strand. (TRUE spelled out: `T` is reassignable.)
enh_dist <- as.data.frame(
    distanceToNearest(dREGenh, activegenes, ignore.strand = TRUE)
)

# Histogram of those distances in 500 bp bins, x-axis limited to 0-50 kb.
ggplot(enh_dist, aes(x = distance)) +
    geom_histogram(binwidth = 500) +
    xlim(0, 5E4) +
    # ylim(0, 600) +
    labs(
        title = "Enhancer-gene distance",
        x = "Distance to nearest active gene",
        y = "Number of enhancers"
    )

In [None]:
# Pair each gene-distal dREG peak (query) with every gene-distal groHMM call
# (subject) that overlaps it or lies within 500 bp of it.
hits <- findOverlaps(dREGenh, HMMenh, maxgap = 500)

# Use the public Hits accessors rather than reaching into the @from/@to slots.
# number of distinct dREG peaks with at least one groHMM partner
length(unique(queryHits(hits)))
# number of distinct groHMM calls with at least one dREG partner
length(unique(subjectHits(hits)))
# inspect examples of groHMM/dreg enhancers that aren't in agreement

In [None]:
# this block was not here before, I added it:
# Keep only the groHMM calls that have a dREG peak within 500 bp (the subject
# side of `hits`), dropping duplicated ranges.
# NOTE: this overwrites HMMenh, so `hits` no longer indexes it afterwards;
# re-run the findOverlaps() cell before re-running this one.
HMMenh <- unique(HMMenh[subjectHits(hits)])
summary(width(HMMenh))
summary(width(dREGenh))
HMMenh

In [None]:
# Build the enhancer set:
#   1. take the dREG peaks that have no HMMenh range within 500 bp and resize
#      them to 3 kb around their center (3 kb was described as the average
#      HMMenh width),
#   2. append all HMMenh ranges,
#   3. make everything unstranded and merge overlapping ranges,
#   4. filter against active genes again after the extension,
#   5. drop chrM.
# NOTE(review): an earlier note said this version does not require dREGenh to
# be non-overlapping with HMMenh, but step 1 does exclude dREG peaks near
# HMMenh (invert = TRUE) -- confirm which behaviour is intended.
enhancers <- dREGenh %>%
    subsetByOverlaps(HMMenh, invert = TRUE, ignore.strand = TRUE, maxgap = 500) %>%
    resize(width = 3000, fix = "center") %>%
    append(HMMenh) %>%
    mutate(strand = "*") %>%
    reduce_ranges() %>%
    subsetByOverlaps(activegenes, invert = TRUE, ignore.strand = TRUE) %>%
    filter(seqnames != "chrM")
summary(width(enhancers))
enhancers

In [None]:
# Pad enhancers shorter than 3 kb out to 3 kb around their center.
short_enh <- enhancers %>%
    filter(width < 3000) %>%
    resize(width = 3000, fix = "center")

# Replace the short originals with their padded versions, merge any overlaps
# the padding created, and name each enhancer "<seqname>-<start>".
#
# Only the short originals are removed here (width >= 3000 keeps the rest).
# Removing by overlap with `short_enh` would also discard any >= 3 kb enhancer
# that a padded neighbour now touches, so that region would be lost instead of
# merged by reduce_ranges().
enhancers <- enhancers %>%
    filter(width >= 3000) %>%
    append(short_enh) %>%
    reduce_ranges() %>%
    mutate(name = paste0(seqnames, "-", start))
enhancers

In [None]:
# Combine the groHMM-refined genes with the enhancers, sort the result,
# display it, and export it as BED.
features <- sort(append(groHMMrefingedGenes, enhancers))
features
write_bed(features, file = "../data/groHMM_dREG_refinedFeatures_mES_mm10.bed")

In [None]:
# `features` (groHMMrefingedGenes + enhancers) is already built and written to
# ../data/groHMM_dREG_refinedFeatures_mES_mm10.bed by the preceding cell, so the
# identical recomputation and re-export that used to be repeated here is gone.

# Same combination, but starting from `dREGrefingedGenes`: sort, display, and
# export as BED.
dREGfeatures <- sort(append(dREGrefingedGenes, enhancers))
dREGfeatures
write_bed(dREGfeatures, file = "../data/dREG_refinedFeatures_mES_mm10.bed")

In [None]:
# Number of ranges in `enhancers`.
length(enhancers)

In [None]:
# Histogram of enhancer widths in 100 bp bins, x-axis limited to 0-7 kb.
enh_size <- data.frame(w = width(enhancers))

ggplot(enh_size, aes(x = w)) +
    geom_histogram(binwidth = 100) +
    xlim(0, 7000) +
    labs(
        title = "Enhancer widths",
        x = "Width",
        y = "Number of enhancers"
    )

In [None]:
# Estimate the % of the genome covered by transcribed enhancers.
# Dividing the covered bp by 3E7 is the same as (bp / 3E9) * 100, i.e. a
# percentage of a genome taken to be ~3 Gb -- TODO confirm that genome size.
enhancer_bp <- sum(width(enhancers))
enhancer_bp / 3E7

In [None]:
# Check for any overlaps among features;
# there should be none, to avoid double-counting reads.
# Overlap `features` with itself and discard the trivial self-hits. Each
# overlapping pair shows up twice (A-B and B-A), so sum(ovpairs) is twice the
# number of pairs. The public Hits accessors are used instead of the @from/@to
# slots.
hits <- findOverlaps(features, features)
ovpairs <- queryHits(hits) != subjectHits(hits)
sum(ovpairs)
features[queryHits(hits)[ovpairs]]
features[subjectHits(hits)[ovpairs]]

## THE OVERLAP IS ONLY WITH INACTIVE GENES. WITHOUT INACTIVE GENES, THERE ARE NO OVERLAPPING FEATURES.
summary(features[queryHits(hits)[ovpairs]]$dREG)
summary(features[subjectHits(hits)[ovpairs]]$dREG)

In [None]:
# Pull out the features whose name begins with "GN-" and look at their widths.
test <- filter(features, substr(name, 1, 3) == "GN-")
summary(width(test))
test