In [None]:
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(ggExtra)
    library(matrixStats)
    library(plyranges)
    library(viridis)
    library(data.table)
    library(ggridges)
    library(ggbeeswarm)
    library(ggpointdensity)
    library(doParallel)
});
source("./scGRO_functions.r");

In [None]:
# Notebook display defaults: 4 x 4 figures, rendered as SVG.
options(
  repr.plot.width = 4,
  repr.plot.height = 4,
  jupyter.plot_mimetypes = "image/svg+xml"
)

# Global ggplot2 theme: classic theme with bold black axis text, a centred bold
# title, and a grey panel border drawn instead of the axis lines.
theme_set(
  theme_classic() +
    theme(
      axis.title.x = element_text(color = "black", size = 14, face = "bold"),
      axis.title.y = element_text(color = "black", size = 14, face = "bold"),
      axis.text = element_text(color = "black", size = 12, face = "bold"),
      plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
      axis.line = element_blank(),
      # axis.ticks = element_blank()
      panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
    )
)

In [None]:
# Fit a simple linear regression of `y` on `x` and return the fitted equation
# together with r^2 as a plotmath string (suitable for labels drawn with
# parse = TRUE).
# Adapted from https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA
#
# x: numeric vector, the predictor.
# y: numeric vector, the response (same length as `x`).
# Returns a length-one character vector holding the deparsed expression
# "y = a + b * x, r^2 = r2", with all three numbers formatted to 2 digits.
lm_eqn <- function(x, y) {
  fit <- lm(y ~ x)
  fit_coefs <- unname(coef(fit))

  # Values are formatted up front so they enter the expression as strings.
  label_values <- list(
    a = format(fit_coefs[1], digits = 2),
    b = format(fit_coefs[2], digits = 2),
    r2 = format(summary(fit)$r.squared, digits = 2)
  )

  label <- substitute(
    italic(y) == a + b %.% italic(x) * "," ~ ~italic(r)^2 ~ "=" ~ r2,
    label_values
  )
  as.character(as.expression(label))
}

In [None]:
# Load the gene annotation that carries a per-gene dREG call; genes whose
# `dREG` flag is FALSE are kept as the "unexpressed" set used for the
# expression filter.
genesWithdREGstatus <- readRDS("../data/dREG_refinedGenes_mES_mm10.rds")
genesWithdREGstatus
unexpressed <- filter(genesWithdREGstatus, dREG == FALSE)
# unexpressed

In [None]:
# Total number of bases trimmed from each gene (half from the start, half from
# the end) to remove the effect of paused Pol II at the TSS and TES.
trimEndLength <- 1000
# Maximum gene length (after trimming) used for the G-E correlation.
maxGeneLengthAfterTrim <- 10000

In [None]:
# Load the annotated features and index them by their BED `name` column.
features <- read_bed("../data/dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")
# features = read_bed("../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed");
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_OSNenhancersPlusSEs_v2.bed");
names(features) <- features$name
features$name <- NULL
features$score <- NULL
length(features)

# Gene windows: keep "GN-" features that are long enough, then shrink each one
# around its centre by `trimEndLength` bases in total.
genes <- features %>%
  filter(substr(names, 1, 3) == "GN-") %>%
  # filter( names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == T]) |
  #     !names %in% names(genesWithdREGstatus) ) %>%
  # chrM features are dropped - resizing chrM dREG results in negative granges:
  filter(seqnames != "chrM") %>%
  # minimum gene length:
  filter(width >= maxGeneLengthAfterTrim + trimEndLength) %>%
  anchor_center() %>%
  mutate(width = width - trimEndLength)

# Truncate genes that are still longer than the cap, keeping their start.
longf <- which(width(genes) > maxGeneLengthAfterTrim)
genes[longf] <- resize(genes[longf], width = maxGeneLengthAfterTrim, fix = "start")
summary(width(genes))
genes

In [None]:
# Load dREG peak calls and build enhancer windows from them.
dREG <- read.table(
  "../data/PROseq_mES_BRsComb.dREG.peak.full.bed",
  header = FALSE,
  stringsAsFactors = FALSE
)
colnames(dREG) <- c("chr", "leftEnd", "rightEnd", "score", "pval", "center")
dREG <- dREG %>%
  filter(chr != "chrM") %>%
  # each peak becomes a window of +/- 1500 nt around its centre
  mutate(start = center - 1500, end = center + 1500) %>%
  ## removing dREG peaks with score lower than the 1st quantile
  # filter( score > summary(score)[2] );
  GRanges() %>%
  # drop dREG peaks that overlap any "GN-" feature
  subsetByOverlaps(features[substr(names(features), 1, 3) == "GN-"], invert = TRUE)

# Peaks are named "<chromosome>-<centre>".
names(dREG) <- paste0(seqnames(dREG), "-", dREG$center)
dREG$leftEnd <- NULL
dREG$rightEnd <- NULL
dREG$pval <- NULL

# Mark overlapping dREG peaks: every peak is keyed by the name of the last
# peak listed for it in the self-overlap hits (a peak always overlaps itself).
# NOTE(review): this key is not transitive - for a chain A~B, B~C the peaks A
# and B can receive different keys, so overlapping peaks may both survive the
# filter below. Confirm this is intended.
hits <- findOverlaps(dREG)
dREG$overlappingEnh <- NA
dREG$overlappingEnh[queryHits(hits)] <- names(dREG)[subjectHits(hits)]

# Within each key, keep the peak(s) with the highest score.
dREG <- dREG %>%
  group_by(overlappingEnh) %>%
  filter(score == max(score)) %>%
  ungroup()
dREG

# Antisense window: upstream side of the peak on the "-" strand, excluding
# the 250 nt next to the centre (paused peak).
dREGantis <- mutate(dREG, end = center - 251, strand = "-")
# names(dREGantis) = paste0(names(dREGantis), "_as")

# Sense window: downstream side of the peak on the "+" strand, excluding
# the 250 nt next to the centre (paused peak).
dREGsense <- mutate(dREG, start = center + 251, strand = "+")
# names(dREGsense) = paste0(names(dREGsense), "_s")

# Both windows of a peak keep the same name.
enhancers <- sort(c(dREGantis, dREGsense))
summary(width(enhancers))
enhancers

In [None]:
# From here on `features` is the analysis set: gene windows plus enhancer windows.
features <- c(genes, enhancers)

In [None]:
# Filtered count matrix (presumably features x cells - confirm against the
# file); show its dimensions and the distribution of per-column totals.
counts <- readRDS("../data/scGROv2p8_mapq3qc_max10kbp_filtered_counts.rds")
dim(counts)
hist(colSums(counts))

In [None]:
# # filter cells for a tighter range of expression
# counts = counts[, colSums(counts) > 500]
# dim(counts)

In [None]:
# Per-row mean against per-row variance of the count matrix, coloured by local
# point density and restricted to the [0, 1] x [0, 1] corner.
ggplot(
  data.frame(cmean = rowMeans(counts), cvar = apply(counts, 1, var)),
  aes(x = cmean, y = cvar)
) +
  geom_pointdensity(size = 0.5) +
  scale_color_viridis_c() +
  ggtitle("scGROseq counts") +
  # scale_x_log10() +
  # scale_y_log10() +
  xlab("Mean") +
  ylab("Variance") +
  xlim(0, 1) +
  ylim(0, 1)

In [None]:
# Percentage of all counts that map to unexpressed genes (rows of `counts`
# whose name appears in `names(unexpressed)`).
# `drop = FALSE` keeps the subset two-dimensional even when exactly one row
# matches, so `dim()` always reports rows x columns.
dim(counts[rownames(counts) %in% names(unexpressed), , drop = FALSE])
sum(counts[rownames(counts) %in% names(unexpressed), , drop = FALSE]) / sum(counts) * 100

In [None]:
# Consolidated scGRO-seq reads.
scGRO <- readRDS("../data/scGROv2p8_consolidated.rds")
scGRO

In [None]:
# Keep reads that pass every QC flag, tag each with a cell ID that is unique
# across experiments (Exp-Plate-Cell), collapse each read to the single
# position at its end, and keep only reads from cells present in `counts`
# that fall inside one of the features.
scGRO <- scGRO %>%
  filter(umiQC & plateQC & cellQC & countQC & miRQC) %>%
  mutate(cellID = factor(paste(Exp, Plate, Cell, sep = "-"))) %>%
  # filter( !(Exp == "Exp236" & Plate %in% paste0("c0", 5:8)) ) %>%
  # filter( !(Exp == "Exp260b") ) %>%
  resize(width = 1, fix = "end") %>%
  select(cellID) %>%
  filter(cellID %in% colnames(counts)) %>%
  subsetByOverlaps(features)
names(scGRO) <- NULL
scGRO

In [None]:
# Remove factor levels of cells that no longer have any read.
scGRO$cellID <- droplevels(scGRO$cellID)

# Label every read with the name of a feature it overlaps; when a read
# overlaps several features, the last one listed in `hits` wins.
hits <- findOverlaps(scGRO, features)
scGRO$feature <- NA
scGRO$feature[queryHits(hits)] <- names(features)[subjectHits(hits)]
scGRO

In [None]:
# Number of distinct feature labels carried by the reads.
n_distinct(scGRO$feature)

In [None]:
# Constants for the genome-wide burst-rate estimate.
# burst_window_size = 30000;
pol_ii_speed <- 2500 * 60 # per hour
total_cells <- n_distinct(scGRO$cellID)
nalleles <- 2 * total_cells
# Not the true capture rate of scGRO, but the rate assumed for the regions of
# the genome used in this analysis. Most drop-outs are paused Pol II: although
# the capture efficiency looks like 5% based on transcribing Pol II, the gene
# body capture efficiency is likely 10%.
capture_efficiency <- 0.10
# Hours needed to transcribe each feature window, indexed by feature name.
gene_txn_time <- setNames(width(features) / pol_ii_speed, names(features))

In [None]:
# Per-feature bursting kinetics from all reads in the feature windows.
# N_bursts is the number of distinct cells with at least one read in the
# feature; N_reads / N_bursts is the observed burst size.
bursts_10k <- scGRO %>%
  # mutate( burstID = paste(cellID, floor(start/burst_window_size)) ) %>%
  as.data.table() %>%
  select(-end, -width) %>%
  group_by(feature) %>%
  summarise(N_reads = n(), N_bursts = n_distinct(cellID)) %>% # , N_bursts=n_distinct(burstID) ) %>%
  mutate(
    chr = as.character(seqnames(features[feature])),
    start = start(features[feature]),
    end = end(features[feature]),
    # Enhancers consist of a sense and an antisense window sharing one name,
    # so their total length is twice the width of a single window.
    length = ifelse(
      substr(feature, 1, 3) == "GN-",
      width(features[feature]),
      2 * width(features[feature])
    ),
    meanExpPerKb = N_reads / total_cells / length * 1000
  ) %>%
  mutate(
    burst_size = N_reads / N_bursts,
    # adjust for capture efficiency
    burst_size = 1 + (burst_size - 1) / capture_efficiency
  ) %>%
  # mutate( burst_rate = N_bursts/gene_txn_time[feature]/nalleles ) %>%
  mutate(
    # chrX / chrY features are normalised to one allele per cell, others to two
    burst_rate = N_bursts / gene_txn_time[feature] /
      ifelse(chr %in% c("chrX", "chrY"), total_cells, 2 * total_cells),
    # probability of detecting a burst, capped at 1 when used as a divisor
    detect_rate = burst_size * capture_efficiency,
    burst_rate = burst_rate / ifelse(detect_rate > 1, 1, detect_rate)
  ) %>%
  select(
    feature, chr, start, end, length, meanExpPerKb,
    N_reads, N_bursts, burst_rate, burst_size
  )
dim(bursts_10k)

In [None]:
# Repeat the bursting estimates using only the first 5 kb of each feature.
# Features shorter than 5 kb are left unchanged.
features_5k <- features
longf <- width(features_5k) >= 5000
features_5k[longf] <- resize(features_5k[longf], width = 5000, fix = "start")

# Reads inside the shortened windows. Their `feature` label is the one already
# stored on `scGRO`; it is not recomputed here.
scGRO_5k <- subsetByOverlaps(scGRO, features_5k)

burst_window_size_5k <- 5000
# Hours needed to transcribe each shortened window, indexed by feature name.
gene_txn_time_5k <- width(features_5k) / pol_ii_speed
names(gene_txn_time_5k) <- names(features_5k)

bursts_5k <- scGRO_5k %>%
  # mutate( burstID = paste(cellID, floor(start/burst_window_size)) ) %>%
  as.data.table() %>%
  select(-end, -width) %>%
  group_by(feature) %>%
  summarise(N_reads_5k = n(), N_bursts_5k = n_distinct(cellID)) %>% # , N_bursts=n_distinct(burstID) ) %>%
  mutate(chr = as.character(seqnames(features[feature]))) %>%
  mutate(start = start(features[feature]), end = end(features[feature])) %>%
  # Enhancers consist of a sense and an antisense window sharing one name,
  # so their total length is twice the width of a single window.
  # The shortened windows are used so the length matches the reads counted.
  mutate(
    length = ifelse(
      substr(feature, 1, 3) == "GN-",
      width(features_5k[feature]),
      2 * width(features_5k[feature])
    )
  ) %>%
  mutate(meanExpPerKb_5k = N_reads_5k / total_cells / length * 1000) %>%
  mutate(burst_size_5k = N_reads_5k / N_bursts_5k) %>%
  # adjust for capture efficiency
  mutate(burst_size_5k = 1 + (burst_size_5k - 1) / capture_efficiency) %>%
  # mutate( burst_rate_5k = N_bursts_5k/gene_txn_time_5k[feature]/nalleles ) %>%
  # Bursts are counted inside the shortened windows, so they must be divided
  # by the transcription time of those windows (gene_txn_time_5k); using the
  # full-length time would underestimate the rate of every truncated feature.
  mutate(
    burst_rate_5k = N_bursts_5k / gene_txn_time_5k[feature] /
      ifelse(chr %in% c("chrX", "chrY"), total_cells, 2 * total_cells)
  ) %>%
  mutate(detect_rate = burst_size_5k * capture_efficiency) %>%
  mutate(burst_rate_5k = burst_rate_5k / ifelse(detect_rate > 1, 1, detect_rate)) %>%
  select(feature, N_reads_5k, N_bursts_5k, burst_rate_5k, burst_size_5k)
dim(bursts_5k)

In [None]:
# Combine the full-window and 5 kb bursting estimates, label each feature as
# Gene ("GN-" prefix) or Enhancer, drop rows with any missing value, and order
# by the number of bursts.
bursts <- bursts_10k %>%
  left_join(bursts_5k, by = "feature") %>%
  mutate(
    Type = ifelse(substr(feature, 1, 3) == "GN-", "Gene", "Enhancer"),
    Type = factor(Type, levels = c("Gene", "Enhancer"))
  ) %>%
  na.omit() %>% # without = 24083
  arrange(desc(N_bursts))
dim(bursts)
head(bursts)

fwrite(bursts, file = "../data/scGROv2p8_max10kbp_max5kbp_burst_rate_0p10_captureEfficiency.csv")

In [None]:
# Min / median / mean / max of burst frequency, time between bursts
# (1 / frequency) and burst size, per feature type, for the full-window
# ("10") and 5 kb ("5") estimates.
summaryTable <- bursts %>%
  mutate(
    Type = ifelse(substr(feature, 1, 2) == "GN", "Gene", "Enhancer"),
    Type = factor(Type, levels = c("Gene", "Enhancer"))
  )

# summary of burst frequency:
br10 <- summaryTable %>%
  group_by(Type) %>%
  summarise(min(burst_rate), median(burst_rate), mean(burst_rate), max(burst_rate))
# summary of time between bursts:
bi10 <- summaryTable %>%
  group_by(Type) %>%
  summarise(min(1 / burst_rate), median(1 / burst_rate), mean(1 / burst_rate), max(1 / burst_rate))
# summary of burst size:
bs10 <- summaryTable %>%
  group_by(Type) %>%
  summarise(min(burst_size), median(burst_size), mean(burst_size), max(burst_size))

# same three summaries for the 5 kb estimates:
br5 <- summaryTable %>%
  group_by(Type) %>%
  summarise(min(burst_rate_5k), median(burst_rate_5k), mean(burst_rate_5k), max(burst_rate_5k))
bi5 <- summaryTable %>%
  group_by(Type) %>%
  summarise(min(1 / burst_rate_5k), median(1 / burst_rate_5k), mean(1 / burst_rate_5k), max(1 / burst_rate_5k))
bs5 <- summaryTable %>%
  group_by(Type) %>%
  summarise(min(burst_size_5k), median(burst_size_5k), mean(burst_size_5k), max(burst_size_5k))

# Stack the six tables; all of them take the column names of `br10`.
brAll <- rbind(
  br10,
  setNames(bi10, names(br10)),
  setNames(bs10, names(br10)),
  setNames(br5, names(br10)),
  setNames(bi5, names(br10)),
  setNames(bs5, names(br10))
)
brAll
fwrite(brAll, file = "../data/scGROv2p8_burst_kinetics_0p10_captureEfficiency_burstWindow.csv")

##### capture efficiency = 0.1 
## 10 kb truncated:
Type	min(burst_rate)	median(burst_rate)	mean(burst_rate)	max(burst_rate)
Gene	0.009491268	      1.0208386	            1.278291	       18.36560
Enhancer	0.011389522	  0.6833713	            1.180078	       83.99151

Type	min(1/burst_rate)	median(1/burst_rate)	mean(1/burst_rate)	max(1/burst_rate)
Gene	   0.05444961	         0.9795868	             2.312488	        105.36
Enhancer   0.01190597	         1.4633333	             1.978099	        87.80

Type	min(burst_size)	median(burst_size)	mean(burst_size)	max(burst_size)
Gene	      1	              1	                 1.231106	        6.033829
Enhancer	  1	              1	                 1.044373	        70.950363
## 5 kb truncated:
Type	min(burst_rate_5k)	median(burst_rate_5k)	mean(burst_rate_5k)	max(burst_rate_5k)
Gene	    0.00284738	         0.5694761	             0.7902558	         18.33713
Enhancer	0.01138952	         0.6833713	             1.1800781	         83.99151

Type	min(1/burst_rate_5k)	median(1/burst_rate_5k)	mean(1/burst_rate_5k)	max(1/burst_rate_5k)
Gene	     0.05453416	                1.756000	          4.123305	               351.2
Enhancer	 0.01190597	                1.463333	          1.978099	               87.8

Type	min(burst_size_5k)	median(burst_size_5k)	mean(burst_size_5k)	max(burst_size_5k)
Gene	         1	                 1	                    1.120727	      11.00000
Enhancer	     1	                 1	                    1.044373	      70.95036


##### capture efficiency = 0.05
## 10 kb truncated:
Type	min(burst_rate)	median(burst_rate)	mean(burst_rate)	max(burst_rate)
Gene	    0.01035411	    1.825775	       2.151973	           36.73121
Enhancer	0.01138952	    1.366743	       2.266185	           141.41944

Type	min(1/burst_rate)	median(1/burst_rate)	mean(1/burst_rate)	max(1/burst_rate)
Gene	   0.027224806	         0.5477127	             1.116992	         96.58
Enhancer   0.007071164	         0.7316667	             1.011312	         87.80

Type	min(burst_size)	median(burst_size)	mean(burst_size)	max(burst_size)
Gene	       1	           1	               1.468401	        11.06766
Enhancer	   1	           1	               1.088745	        140.90073
## 5 kb truncated:
Type	min(burst_rate_5k)	median(burst_rate_5k)	mean(burst_rate_5k)	max(burst_rate_5k)
Gene	    0.00284738	           1.082005	               1.416489	        36.67426
Enhancer	0.01138952	           1.366743	               2.266185	        141.41944

Type	min(1/burst_rate_5k)	median(1/burst_rate_5k)	mean(1/burst_rate_5k)	max(1/burst_rate_5k)
Gene	     0.027267081	            0.9242105	           2.144820	              351.2
Enhancer	 0.007071164	            0.7316667	           1.011312	              87.8

Type	min(burst_size_5k)	median(burst_size_5k)	mean(burst_size_5k)	max(burst_size_5k)
Gene	         1	                 1	                    1.241454	      21.0000
Enhancer	     1	                 1	                    1.088745	      140.9007

In [None]:
# Bursting estimates of features listed as unexpressed, highest burst rate first.
# NOTE(review): this matches against `unexpressed$name`, whereas the count
# filter elsewhere in this notebook uses `names(unexpressed)` - confirm that
# `unexpressed` really has a `name` column and that it uses the same naming
# as `feature`.
bkg_bursts <- bursts %>%
  filter(feature %in% unexpressed$name) %>%
  arrange(-burst_rate)
# head(bkg_bursts)

In [None]:
# Gene lists for GSEA, ranked by burst frequency and then by burst size.
# Gene names are the feature names with the "GN-" prefix removed; each list is
# written as a single headerless column.

# ranked by burst frequency:
rankedList <- bursts %>%
  filter(substr(feature, 1, 3) == "GN-") %>%
  mutate(name = sub("GN-", "", feature)) %>%
  arrange(desc(burst_rate))
dim(rankedList)
rankedList[1:2, ]
out <- data.frame(Gene = rankedList$name)
# BurstFreq = test$burst_rate );
out[1:10, ]
fwrite(out, file = "../data/BurstFrequency_rankedList.txt", sep = "\t", col.names = FALSE)

# ranked by burst size:
rankedList <- bursts %>%
  filter(substr(feature, 1, 3) == "GN-") %>%
  mutate(name = sub("GN-", "", feature)) %>%
  arrange(desc(burst_size))
dim(rankedList)
rankedList[1:2, ]
out <- data.frame(Gene = rankedList$name)
# BurstFreq = test$burst_rate );
out[1:10, ]
fwrite(out, file = "../data/BurstSize_rankedList.txt", sep = "\t", col.names = FALSE)

In [None]:
# Observed burst size (reads per active cell), i.e. before the
# capture-efficiency correction.
ggplot(bursts, aes(x = N_reads / N_bursts)) +
  geom_histogram(binwidth = 0.01) +
  scale_y_log10() +
  xlim(0.99, 1.5) +
  xlab("Average reads/ feature / active cell") +
  ylab("Number of features")

# Estimated burst size, i.e. after the capture-efficiency correction.
ggplot(bursts, aes(x = burst_size)) +
  geom_histogram(binwidth = 0.25) +
  scale_y_log10() +
  xlim(0, 10) +
  xlab("Estimated burst size") +
  ylab("Number of features")

In [None]:
# Burst-size distributions from the full feature windows:
# genes only, genes vs enhancers (faceted / overlaid), ECDF, and a KS test.

# Genes only.
bs <- bursts %>%
  filter(Type == "Gene") %>%
  ggplot(aes(x = burst_size)) +
  geom_histogram(bins = 30, color = "white", fill = "#39568C") +
  xlim(0, 7.5) +
  scale_y_log10() +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst size") +
  ylab("Number of features")
bs
ggsave(bs, filename = "../plots/Burst_size_Genes.pdf", width = 4, height = 4)

# Genes and enhancers, side by side.
bs <- bursts %>%
  ggplot(aes(x = burst_size, fill = Type)) +
  geom_histogram(bins = 16, color = "white") +
  xlim(0, 7.5) +
  scale_y_log10() +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst size") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  # facet title:
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
bs
ggsave(bs, filename = "../plots/Burst_size_Genes+Enhancers_facet.pdf", width = 8, height = 4)

# Genes and enhancers, stacked vertically.
bs <- bursts %>%
  ggplot(aes(x = burst_size, fill = Type)) +
  geom_histogram(bins = 16, color = "white") +
  xlim(0, 7.5) +
  scale_y_log10() +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst size") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type, ncol = 1) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  # facet title:
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
bs
ggsave(bs, filename = "../plots/Burst_size_Genes+Enhancers_facet_vertical.pdf", width = 4, height = 8)

# Genes and enhancers overlaid in one panel.
bs <- bursts %>%
  ggplot(aes(x = burst_size, fill = Type)) +
  geom_histogram(bins = 16, position = "identity", alpha = 0.5) + #
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(legend.position = c(0.8, 0.8)) +
  xlim(0, 7.5) +
  scale_y_log10() +
  coord_cartesian(ylim = c(1.5, 10000)) +
  xlab("Burst size") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = ""))
bs
ggsave(bs, filename = "../plots/Burst_size_Genes+Enhancers.pdf", width = 4, height = 4)

# Empirical cumulative distribution per feature type.
# The y axis of an ECDF is a cumulative fraction, not a count.
bursts %>%
  ggplot(aes(x = burst_size, color = Type)) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 10)) +
  guides(color = guide_legend(title = "")) +
  scale_color_manual(values = c("#39568C", "#1F968B")) +
  stat_ecdf(geom = "step", linewidth = 0.7) +
  ggtitle("Burst size from 10kb gene body") +
  xlab("Burst size") +
  ylab("Cumulative fraction of features")

# One-sided two-sample Kolmogorov-Smirnov test, Gene vs Enhancer.
ks.test(burst_size ~ Type, bursts, alternative = "greater", exact = FALSE)

In [None]:
# Burst-frequency distributions from the full feature windows:
# genes only, genes vs enhancers (faceted / overlaid), ECDF, and a KS test.

# Genes only.
br <- bursts %>%
  filter(Type == "Gene") %>%
  ggplot(aes(x = burst_rate)) +
  geom_histogram(bins = 30, color = "white", fill = "#39568C") +
  scale_x_log10(limits = c(0.01, 20)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Number of features")
br
ggsave(br, filename = "../plots/Burst_rate_Genes.pdf", width = 4, height = 4)

# Genes and enhancers, side by side.
br <- bursts %>%
  ggplot(aes(x = burst_rate, fill = Type)) +
  geom_histogram(bins = 16, color = "white") +
  scale_x_log10(limits = c(0.01, 20)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  # facet title:
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
br
ggsave(br, filename = "../plots/Burst_rate_Genes+Enhancers_facet.pdf", width = 8, height = 4)

# Genes and enhancers, stacked vertically.
br <- bursts %>%
  ggplot(aes(x = burst_rate, fill = Type)) +
  geom_histogram(bins = 16, color = "white") +
  scale_x_log10(limits = c(0.01, 20)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type, ncol = 1) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  # facet title:
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
br
ggsave(br, filename = "../plots/Burst_rate_Genes+Enhancers_facet_vertical.pdf", width = 4, height = 8)

# Genes and enhancers overlaid in one panel.
br <- bursts %>%
  ggplot(aes(x = burst_rate, fill = Type)) +
  geom_histogram(bins = 16, position = "identity", alpha = 0.5) + #
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 20)) +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = ""))
br
ggsave(br, filename = "../plots/Burst_rate_Genes+Enhancers.pdf", width = 4, height = 4)

# Empirical cumulative distribution per feature type.
# The y axis of an ECDF is a cumulative fraction, not a count.
bursts %>%
  ggplot(aes(x = burst_rate, color = Type)) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 20)) +
  guides(color = guide_legend(title = "")) +
  scale_color_manual(values = c("#39568C", "#1F968B")) +
  stat_ecdf(geom = "step", linewidth = 0.7) +
  ggtitle("Bursts frequency") +
  xlab("Burst frequency (per hour)") +
  ylab("Cumulative fraction of features")

# One-sided two-sample Kolmogorov-Smirnov test, Gene vs Enhancer.
ks.test(burst_rate ~ Type, bursts, alternative = "greater", exact = FALSE)

In [None]:
# Time between bursts (1 / burst frequency) from the full feature windows:
# genes only, genes vs enhancers (faceted / overlaid), ECDF, and a KS test.

# Genes only.
tbb <- bursts %>%
  filter(Type == "Gene") %>%
  ggplot(aes(x = 1 / burst_rate)) +
  geom_histogram(bins = 30, color = "white", fill = "#39568C") +
  scale_x_log10(limits = c(0.1, 20)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Number of features")
tbb
ggsave(tbb, filename = "../plots/Time_between_bursts_Genes.pdf", width = 4, height = 4)

# Genes and enhancers, side by side.
tbb <- bursts %>%
  ggplot(aes(x = 1 / burst_rate, fill = Type)) +
  geom_histogram(bins = 16, color = "white") +
  scale_x_log10(limits = c(0.1, 20)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
tbb
ggsave(tbb, filename = "../plots/Time_between_bursts_Genes+Enhancers_facet.pdf", width = 8, height = 4)

# Genes and enhancers, stacked vertically.
tbb <- bursts %>%
  ggplot(aes(x = 1 / burst_rate, fill = Type)) +
  geom_histogram(bins = 16, color = "white") +
  scale_x_log10(limits = c(0.1, 20)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type, ncol = 1) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
tbb
ggsave(tbb, filename = "../plots/Time_between_bursts_Genes+Enhancers_facet_vertical.pdf", width = 4, height = 8)

# Genes and enhancers overlaid in one panel (displayed only, not saved).
# position = "identity" draws the two semi-transparent histograms on top of
# each other, like the other overlay plots, instead of stacking them.
bursts %>%
  # filter(N_bursts >= 10 & burst_size > 1) %>%
  ggplot(aes(x = 1 / burst_rate, fill = Type)) +
  geom_histogram(bins = 16, position = "identity", alpha = 0.5) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(legend.position = c(0.8, 0.8)) +
  scale_x_log10(limits = c(0.1, 20)) +
  # xlim(0, 10) +
  # ggtitle("Time until burst (hour) from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = ""))

# Empirical cumulative distribution per feature type.
# The y axis of an ECDF is a cumulative fraction, not a count.
bursts %>%
  ggplot(aes(x = 1 / burst_rate, color = Type)) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 10)) +
  guides(color = guide_legend(title = "")) +
  scale_color_manual(values = c("#39568C", "#1F968B")) +
  stat_ecdf(geom = "step", linewidth = 0.7) +
  # ggtitle("Time until burst (hour) from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Cumulative fraction of features")

# One-sided two-sample Kolmogorov-Smirnov test, Gene vs Enhancer.
ks.test(1 / burst_rate ~ Type, bursts, alternative = "greater", exact = FALSE)

In [None]:
# Compare burst frequency from the full windows with the 5 kb estimate, using
# only features whose two estimates differ.
subset <- bursts$burst_rate_5k != bursts$burst_rate
# Regression label: full-window rate (y) against 5 kb rate (x), fitted on the
# untransformed values.
txt <- lm_eqn(bursts$burst_rate_5k[subset], bursts$burst_rate[subset])
dim(bursts)

# Number of features entering the comparison.
corr10v5 <- filter(bursts, subset)
dim(corr10v5)

corr10v5 <- bursts %>%
  filter(subset) %>%
  ggplot(aes(x = burst_rate_5k, y = burst_rate)) +
  geom_pointdensity() +
  annotate(
    geom = "label", label.size = NA, x = 0, y = Inf,
    hjust = 0, vjust = 1, label = txt, parse = TRUE, fill = NA
  ) +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_viridis() +
  theme(
    legend.position = c(0.875, 0.175),
    legend.key.size = unit(5, "mm"),
    legend.title = element_text(size = 0),
    legend.background = element_blank()
  ) +
  xlab("Burst frequency (5 kb burst window)") +
  ylab("Burst frequency (10 kb burst window)")
corr10v5
ggsave(corr10v5, filename = "../plots/Correlation_10kb_vs_5kb.pdf", width = 4, height = 4)

In [None]:
# Burst-size distributions from the first 5 kb of each feature:
# genes vs enhancers (faceted / overlaid), ECDF, and a KS test.

# Genes and enhancers, side by side.
bs <- bursts %>%
  ggplot(aes(x = burst_size_5k, fill = Type)) +
  geom_histogram(bins = 25, color = "white") +
  xlim(0, 7.5) +
  scale_y_log10() +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 5kb gene body") +
  xlab("Burst size") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  # facet title:
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
bs
ggsave(bs, filename = "../plots/Burst_size_5k_Genes+Enhancers_facet.pdf", width = 8, height = 4)

# Genes and enhancers overlaid in one panel.
bs <- bursts %>%
  ggplot(aes(x = burst_size_5k, fill = Type)) +
  geom_histogram(bins = 15, position = "identity", alpha = 0.5) + #
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(legend.position = c(0.8, 0.8)) +
  xlim(0, 7) +
  scale_y_log10() +
  coord_cartesian(ylim = c(1.5, 10000)) +
  # ggtitle("Bursts from 5kb gene body") +
  xlab("Burst size") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = ""))
bs
ggsave(bs, filename = "../plots/Burst_size_5k_Genes+Enhancers.pdf", width = 4, height = 4)

# Empirical cumulative distribution per feature type.
# The y axis of an ECDF is a cumulative fraction, not a count.
bursts %>%
  ggplot(aes(x = burst_size_5k, color = Type)) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 10)) +
  guides(color = guide_legend(title = "")) +
  scale_color_manual(values = c("#39568C", "#1F968B")) +
  stat_ecdf(geom = "step", linewidth = 0.7) +
  # ggtitle("Burst size from 5kb gene body") +
  xlab("Burst size") +
  ylab("Cumulative fraction of features")

# One-sided two-sample Kolmogorov-Smirnov test, Gene vs Enhancer.
ks.test(burst_size_5k ~ Type, bursts, alternative = "greater", exact = FALSE)

In [None]:
# Burst-frequency distributions from the first 5 kb of each feature:
# genes vs enhancers (faceted / overlaid), ECDF, and a KS test.

# Genes and enhancers, side by side.
br <- bursts %>%
  ggplot(aes(x = burst_rate_5k, fill = Type)) +
  geom_histogram(bins = 25, color = "white") +
  scale_x_log10(limits = c(0.01, 10)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  # facet title:
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
br
ggsave(br, filename = "../plots/Burst_rate_5k_Genes+Enhancers_facet.pdf", width = 8, height = 4)

# Genes and enhancers overlaid in one panel.
br <- bursts %>%
  ggplot(aes(x = burst_rate_5k, fill = Type)) +
  geom_histogram(bins = 25, position = "identity", alpha = 0.5) + #
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 20)) +
  # ggtitle("Bursts from 5kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = ""))
br
ggsave(br, filename = "../plots/Burst_rate_5k_Genes+Enhancers.pdf", width = 4, height = 4)

# Empirical cumulative distribution per feature type.
# The y axis of an ECDF is a cumulative fraction, not a count.
bursts %>%
  ggplot(aes(x = burst_rate_5k, color = Type)) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 10)) +
  guides(color = guide_legend(title = "")) +
  scale_color_manual(values = c("#39568C", "#1F968B")) +
  stat_ecdf(geom = "step", linewidth = 0.7) +
  # ggtitle("Bursts from 5kb gene body") +
  xlab("Burst frequency (per hour)") +
  ylab("Cumulative fraction of features")

# One-sided two-sample Kolmogorov-Smirnov test, Gene vs Enhancer.
ks.test(burst_rate_5k ~ Type, bursts, alternative = "greater", exact = FALSE)

In [None]:
# Time between bursts (1 / burst frequency) from the first 5 kb of each
# feature: genes vs enhancers (faceted / overlaid), ECDF, and a KS test.

# Genes and enhancers, side by side.
tbb <- bursts %>%
  ggplot(aes(x = 1 / burst_rate_5k, fill = Type)) +
  geom_histogram(bins = 25, position = "identity", color = "white") +
  scale_x_log10(limits = c(0.1, 100)) +
  theme(legend.position = "none") +
  # ggtitle("Bursts from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = "")) +
  facet_wrap(~Type) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 0, face = "bold")
  )
tbb
ggsave(tbb, filename = "../plots/Time_between_bursts_5k_Genes+Enhancers_facet.pdf", width = 8, height = 4)

# Genes and enhancers overlaid in one panel.
tbb <- bursts %>%
  # filter(N_bursts >= 10 & burst_size > 1) %>%
  ggplot(aes(x = 1 / burst_rate_5k, fill = Type)) +
  geom_histogram(bins = 25, position = "identity", alpha = 0.5) +
  scale_fill_manual(values = c("#39568C", "#1F968B")) +
  theme(legend.position = c(0.8, 0.8)) +
  scale_x_log10(limits = c(0.1, 100)) +
  # xlim(0, 10) +
  # ggtitle("Time until burst (hour) from 5kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Number of features") +
  guides(fill = guide_legend(title = ""))
tbb
ggsave(tbb, filename = "../plots/Time_between_bursts_5k_Genes+Enhancers.pdf", width = 4, height = 4)

# Empirical cumulative distribution per feature type.
# The y axis of an ECDF is a cumulative fraction, not a count.
bursts %>%
  ggplot(aes(x = 1 / burst_rate_5k, color = Type)) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_log10(limits = c(0.01, 10)) +
  guides(color = guide_legend(title = "")) +
  scale_color_manual(values = c("#39568C", "#1F968B")) +
  stat_ecdf(geom = "step", linewidth = 0.7) +
  # ggtitle("Time until burst (hour) from 10kb gene body") +
  xlab("Time until burst (hour)") +
  ylab("Cumulative fraction of features")

# One-sided two-sample Kolmogorov-Smirnov test, Gene vs Enhancer.
ks.test(1 / burst_rate_5k ~ Type, bursts, alternative = "greater", exact = FALSE)

In [None]:
# Burst size against burst frequency, restricted to features whose estimated
# burst size is above 1.
test <- filter(bursts, burst_size > 1)

# Genes and enhancers side by side.
out <- ggplot(test, aes(x = burst_rate, y = burst_size)) +
  geom_pointdensity() +
  scale_color_viridis() +
  # annotate(geom="label", label.size=NA, x=-Inf, y=Inf, hjust=0, vjust=1, label=txtG, parse=T, fill=NA) +
  # annotate(geom="label", label.size=NA, x=Inf, y=Inf, hjust=0, vjust=1, label=txtE, parse=T, fill=NA) +
  # stat_smooth(method = 'lm') +
  xlim(0, 10) +
  ylim(0, 10) +
  xlab("Burst frequency (per hour)") +
  ylab("Burst size") +
  facet_wrap(~Type) +
  theme(
    legend.position = c(0.875, 0.75),
    legend.key.size = unit(5, "mm"),
    legend.title = element_text(size = 0),
    legend.background = element_blank(),
    strip.background = element_blank(),
    strip.text = element_text(size = 14, face = "bold")
  )
out
ggsave(out, filename = "../plots/Burst_size_vs_burst_frequency_facet.pdf", width = 8, height = 4)

# Genes only, annotated with the regression of burst size on burst frequency.
txt <- lm_eqn(
  test$burst_rate[test$Type == "Gene"],
  test$burst_size[test$Type == "Gene"]
)
out <- test %>%
  filter(Type == "Gene") %>%
  ggplot(aes(x = burst_rate, y = burst_size)) +
  geom_pointdensity() +
  scale_color_viridis() +
  annotate(
    geom = "label", label.size = NA, x = -Inf, y = Inf,
    hjust = 0, vjust = 1, label = txt, parse = TRUE, fill = NA
  ) +
  # stat_smooth(method = 'lm') +
  xlim(0, 10) +
  ylim(0, 10) +
  xlab("Burst frequency (per hour)") +
  ylab("Burst size") +
  theme(
    legend.position = c(0.875, 0.75),
    legend.key.size = unit(5, "mm"),
    legend.title = element_text(size = 0),
    legend.background = element_blank()
  )
out
ggsave(out, filename = "../plots/Burst_size_vs_burst_frequency_Genes.pdf", width = 4, height = 4)

# Enhancers only, annotated the same way.
txt <- lm_eqn(
  test$burst_rate[test$Type == "Enhancer"],
  test$burst_size[test$Type == "Enhancer"]
)
out <- test %>%
  filter(Type == "Enhancer") %>%
  ggplot(aes(x = burst_rate, y = burst_size)) +
  geom_pointdensity() +
  scale_color_viridis() +
  annotate(
    geom = "label", label.size = NA, x = -Inf, y = Inf,
    hjust = 0, vjust = 1, label = txt, parse = TRUE, fill = NA
  ) +
  # stat_smooth(method = 'lm') +
  xlim(0, 10) +
  ylim(0, 10) +
  xlab("Burst frequency (per hour)") +
  ylab("Burst size") +
  theme(
    legend.position = c(0.875, 0.75),
    legend.key.size = unit(5, "mm"),
    legend.title = element_text(size = 0),
    legend.background = element_blank()
  )
out
ggsave(out, filename = "../plots/Burst_size_vs_burst_frequency_Enhancers.pdf", width = 4, height = 4)

In [None]:
# THIS LENGTH EFFECT ON BURST KINETICS IS ONLY WHEN GENES ARE NOT SELECTED FOR 10 KB:

# Rows of `bursts` typed as genes; used for the regression labels below.
bursts_genes_only = bursts[bursts$Type == "Gene", ];

# Burst frequency as a function of gene length.
# lm_eqn(x, y) fits y ~ x, so length is passed first to match the plot
# axes (x = length, y = burst_rate); otherwise the printed intercept and
# slope describe the inverse regression.
txt = lm_eqn(bursts_genes_only$length, bursts_genes_only$burst_rate)
out = bursts %>%
    filter( Type == "Gene") %>%
    ggplot(aes(x=length, y=burst_rate)) +
    # ggplot(aes(x=length, y=1+(N_reads/N_bursts-1)/capture_efficiency)) +
    geom_pointdensity() +
    scale_color_viridis() +
    # Label pinned to the top-left corner of the panel.
    annotate(geom="label", label.size=NA, x=-Inf, y=Inf, hjust=0, vjust=1, label=txt, parse=TRUE, fill=NA) +
    # stat_smooth(method = 'lm') +
    # scale_x_log10(limits = c(1000, 30000)) +
    xlim(0, maxGeneLengthAfterTrim) +
    ylim(0, 40) +
    xlab("Gene length") +
    ylab("Burst frequency (per hour)") +
    theme(legend.position = c(0.875, 0.75),    legend.key.size = unit(5, 'mm'), 
          legend.title = element_text(size=0), legend.background = element_blank())
    # facet_wrap(~Type) +
    # theme(strip.background = element_blank(),
    #     strip.text = element_text(size = 14, face="bold"))
out
# ggsave(out, filename = "../plots/BurstFrequency_vs_geneLength_Genes.pdf", width=4, height=4); # _whenGeneNotLimitedTo10kb

# Burst size as a function of gene length (same argument order: x = length).
txt = lm_eqn(bursts_genes_only$length, bursts_genes_only$burst_size)
out = bursts %>%
    filter( Type == "Gene") %>%
    ggplot(aes(x=length, y=burst_size)) +
    geom_pointdensity() +
    scale_color_viridis() +
    annotate(geom="label", label.size=NA, x=-Inf, y=Inf, hjust=0, vjust=1, label=txt, parse=TRUE, fill=NA) +
    # stat_smooth(method = 'lm') +
    # scale_x_log10(limits = c(1000, 30000)) +
    xlim(0, maxGeneLengthAfterTrim) +
    ylim(0, 8) +
    xlab("Gene length") +
    ylab("Burst size") +
    theme(legend.position = c(0.875, 0.75),    legend.key.size = unit(5, 'mm'), 
          legend.title = element_text(size=0), legend.background = element_blank())
    # facet_wrap(~Type) +
    # theme(strip.background = element_blank(),
    #     strip.text = element_text(size = 14, face="bold"))
out
# ggsave(out, filename = "../plots/BurstSize_vs_geneLength_Genes.pdf", width=4, height=4); # _whenGeneNotLimitedTo10kb

In [None]:
# Violin plot of burst frequency per feature Type (log10 y axis, quartile lines).
ggplot(bursts, aes(x = Type, y = burst_rate, fill = Type)) +
    geom_violin(show.legend = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
    scale_y_log10(limits = c(0.05, 5)) +
    labs(title = "Bursts from 10kb gene body", y = "Bursts per hour") +
    guides(fill = guide_legend(title = ""))

In [None]:
# Regression label for THIS plot (burst_size_5k ~ burst_rate_5k). Previously
# the annotation reused whatever `txt` an earlier cell had left behind, so the
# printed equation described a different pair of variables.
# Non-finite rows are dropped first because lm() rejects Inf values.
rows_5k = bursts %>%
    filter(is.finite(burst_rate_5k), is.finite(burst_size_5k));
txt = lm_eqn(rows_5k$burst_rate_5k, rows_5k$burst_size_5k);

# Density scatter of the 5 kb burst-size estimate against the 5 kb burst-rate estimate.
bursts %>%
    # filter(N_bursts >= 10 & burst_size > 1) %>%
    ggplot(aes(x=burst_rate_5k, y=burst_size_5k)) +
    geom_pointdensity(show.legend = FALSE) +
    scale_color_viridis() +
    annotate(geom = "label", x = 6, y = 11, label = txt, parse=TRUE) +
    #stat_smooth(method = 'lm') +
    xlim(0, 10) +
    ylim(0, 12) +
    xlab("Estimated burst rate from 5kb \n of gene body (per hour)") +
    ylab("Estimated burst size")

In [None]:
# Genes only, so the regression label is computed on the same rows that are plotted.
bursts_genes_only = bursts %>%
    filter(Type == "Gene");

# lm_eqn(x, y) fits y ~ x: burst_rate is the x axis and N_reads the y axis.
# NOTE: the fit is on untransformed values, whereas both axes are drawn on a
# log10 scale.
txt = lm_eqn( bursts_genes_only$burst_rate, bursts_genes_only$N_reads );

bursts_genes_only %>%
    ggplot( aes(x=burst_rate, y=N_reads) ) +
    geom_pointdensity(show.legend = FALSE) +
    annotate(geom = "label", x = 0.3, y = 3e4, label = txt, parse=TRUE) +
    scale_x_log10() +
    scale_y_log10() +
    scale_color_viridis() +
    xlab("Burst frequency from 10kbp gene body") +
    ylab("Number of reads in gene")

In [None]:
# Genes only, so the regression label is computed on the same rows that are plotted.
bursts_genes_only = bursts %>%
    filter(Type == "Gene");

# Fit N_reads ~ N_bursts (x = N_bursts, y = N_reads, matching the axes).
txt = lm_eqn( bursts_genes_only$N_bursts, bursts_genes_only$N_reads );

bursts_genes_only %>%
    ggplot( aes(x=N_bursts, y=N_reads) ) +
    geom_pointdensity(show.legend = FALSE) +
    # Pin the label to the top-left corner: the former position (0.3, 3e4)
    # lies outside the 10-200 axis limits, so the label was never drawn.
    annotate(geom = "label", x = -Inf, y = Inf, hjust = 0, vjust = 1, label = txt, parse=TRUE) +
    xlim(10, 200) +
    ylim(10, 200) +
    scale_color_viridis() +
    xlab("Number of cells transcribing") +
    ylab("Number of reads in gene")

In [None]:
# Enhancers only, so the regression label is computed on the same rows that are plotted.
bursts_enhancers_only = bursts %>%
    filter(Type == "Enhancer");

# lm_eqn(x, y) fits y ~ x: burst_rate is the x axis and N_reads the y axis.
# NOTE: the fit is on untransformed values, whereas both axes are drawn on a
# log10 scale.
txt = lm_eqn( bursts_enhancers_only$burst_rate, bursts_enhancers_only$N_reads );

bursts_enhancers_only %>%
    ggplot( aes(x=burst_rate, y=N_reads) ) +
    geom_pointdensity(show.legend = FALSE) +
    annotate(geom = "label", x = 0.3, y = 3e4, label = txt, parse=TRUE) +
    scale_x_log10() +
    scale_y_log10() +
    scale_color_viridis() +
    xlab("Burst frequency") +
    ylab("Number of reads in enhancer")

In [None]:
# Density scatter of N_bursts against burst frequency for all features (log-log axes).
ggplot(bursts, aes(x = burst_rate, y = N_bursts)) +
    geom_pointdensity(show.legend = FALSE) +
    scale_color_viridis() +
    scale_x_log10() +
    scale_y_log10() +
    labs(x = "Burst frequency from 10kbp gene body", y = "Number of bursts")

In [None]:
# Genes only: N_bursts against burst frequency (log10 x axis, linear y axis capped at 1000).
gene_only = filter(bursts, Type == "Gene")
ggplot(gene_only, aes(x = burst_rate, y = N_bursts)) +
    geom_pointdensity(show.legend = FALSE) +
    scale_color_viridis() +
    scale_x_log10() +
    ylim(0, 1000) +
    labs(title = "Genes, 10kbp", x = "Bursts per hour", y = "Number of bursts")

In [None]:
# Enhancers only: N_bursts against burst frequency (log10 x axis, linear y axis capped at 1000).
enhancer_only = filter(bursts, Type == "Enhancer")
ggplot(enhancer_only, aes(x = burst_rate, y = N_bursts)) +
    geom_pointdensity(show.legend = FALSE) +
    scale_color_viridis() +
    scale_x_log10() +
    ylim(0, 1000) +
    labs(title = "Enhancers", x = "Bursts per hour", y = "Number of bursting cells")

## Promoter elements & burst frequencies

In [None]:
# Read one promoter-element BED file and name each range "GN-<name>", where
# <name> is the BED name column with its first "_1" occurrence removed.
read_promoter_element_bed = function(bed_path) {
    ranges = read_bed(bed_path)
    names(ranges) = paste0("GN-", sub("_1", "", ranges$name))
    ranges
}

# Each set is loaded, then a text tag and the number of ranges are displayed.
tata = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_with_TATA-box_in_promoters.bed")
tata
"TATA"
length(tata)

notata = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_withOUT_TATA-box_in_promoters.bed")
"No TATA"
length(notata)

inr = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_with_Inr_in_promoters.bed")
"Inr"
length(inr)

noinr = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_withOUT_Inr_in_promoters.bed")
"No Inr"
length(noinr)

gc = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_with_GC-box_in_promoters.bed")
"GC"
length(gc)

nogc = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_withOUT_GC-box_in_promoters.bed")
"No GC"
length(nogc)

tatainr = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_with_TATA-box-AND-Inr_in_promoters.bed")
"TATA + Inr"
length(tatainr)

notatainr = read_promoter_element_bed("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/mm10_genes_withOUT_TATA-box-AND-Inr_in_promoters.bed")
"No TATA + No Inr"
length(notatainr)

In [None]:
# Load the pause-button annotation and keep only entries whose PauseButton flag is TRUE.
PauseButton = filter(
    readRDS("~/group/mES_PROseq_RNAseq_ATACseq/scRNAseq_mES_Larrson_Sandberg_2019/CoPRO_MEF_NonHS_PauseButtons.rds"),
    PauseButton
)
PauseButton

# Promoter windows of `genes` as returned by promoters() with
# upstream = 100 and downstream = 400.
gene_promoters = promoters(genes, upstream = 100, downstream = 400)
gene_promoters
# active_promoters = findOverlaps( gene_promoters, pb);
# active_promoters

# Strand-aware split of the promoter windows: those overlapping a pause
# button (pb) and those that do not (nopb).
pb = subsetByOverlaps(gene_promoters, PauseButton, ignore.strand = FALSE)
pb
nopb = subsetByOverlaps(gene_promoters, PauseButton, ignore.strand = FALSE, invert = TRUE)
nopb
# # overlap with dREG to identify which promoters are active
# hasActivePro = queryHits(active_promoters) %>% unique();
# active_isoforms = ref_transcripts[ hasActivePro ];
# active_isoforms;

In [None]:
# For each promoter-element set: number of features whose name is in the set,
# followed by a summary of their widths.
features_in_tata = features[names(features) %in% names(tata)]
length(features_in_tata)
summary(width(features_in_tata))
features_in_notata = features[names(features) %in% names(notata)]
length(features_in_notata)
summary(width(features_in_notata))

features_in_tatainr = features[names(features) %in% names(tatainr)]
length(features_in_tatainr)
summary(width(features_in_tatainr))
features_in_notatainr = features[names(features) %in% names(notatainr)]
length(features_in_notatainr)
summary(width(features_in_notatainr))

features_in_pb = features[names(features) %in% names(pb)]
length(features_in_pb)
summary(width(features_in_pb))
features_in_nopb = features[names(features) %in% names(nopb)]
length(features_in_nopb)
summary(width(features_in_nopb))

In [None]:
# tata = allBursts[allBursts$feature %in% names(tata),];
# dim(tata)
# notata = allBursts[allBursts$feature %in% names(notata),];
# dim(notata)
# inr = allBursts[allBursts$feature %in% names(inr),];
# dim(inr)
# noinr = allBursts[allBursts$feature %in% names(noinr),];
# dim(noinr)
# gc = allBursts[allBursts$feature %in% names(gc),];
# dim(gc)
# nogc = allBursts[allBursts$feature %in% names(nogc),];
# dim(nogc)
# tatainr = allBursts[allBursts$feature %in% names(tatainr),];
# dim(tatainr)
# notatainr = allBursts[allBursts$feature %in% names(notatainr),];
# dim(notatainr)

In [None]:
# Annotate every row of `bursts` with its promoter-element class.
# For each element, a feature found in the "with" set gets the element label,
# one found in the "without" set gets the "No_" label, and anything else is NA.
burstsWpElement = mutate(
    bursts,
    GC = case_when(
        feature %in% names(gc) ~ "GC",
        feature %in% names(nogc) ~ "No_GC",
        TRUE ~ NA_character_
    ),
    TATA = case_when(
        feature %in% names(tata) ~ "TATA",
        feature %in% names(notata) ~ "No_TATA",
        TRUE ~ NA_character_
    ),
    Inr = case_when(
        feature %in% names(inr) ~ "Inr",
        feature %in% names(noinr) ~ "No_Inr",
        TRUE ~ NA_character_
    ),
    TATA_Inr = case_when(
        feature %in% names(tatainr) ~ "TATA_Inr",
        feature %in% names(notatainr) ~ "No_TATA_Inr",
        TRUE ~ NA_character_
    ),
    PB = case_when(
        feature %in% names(pb) ~ "PB",
        feature %in% names(nopb) ~ "No_PB",
        TRUE ~ NA_character_
    )
)
burstsWpElement[1:5, ]

In [None]:
# Display order of the element classes (used as factor levels).
myorder = c(
    "TATA", "No_TATA",
    "Inr", "No_Inr",
    "TATA_Inr", "No_TATA_Inr",
    "PB", "No_PB"
)

# Long format: one row per (feature, element column) for columns TATA..PB,
# dropping rows without a class and ordering Group by `myorder`.
allElements = pivot_longer(burstsWpElement, TATA:PB, names_to = "Source", values_to = "Group")
allElements = filter(allElements, !is.na(Group))
allElements = mutate(allElements, Group = factor(Group, levels = myorder))
allElements[1:5, ]

In [None]:
# One fill colour per element class, in the order of the Group factor levels.
element_palette = c(
    "#3e6790", "#6485a6", "#646673", "#868892",
    "#b0863d", "#bf9e63", "#2b6464", "#558383"
)

# Violin plot of burst size (> 1 only) per promoter-element class, median line drawn.
tataenrich = allElements %>%
    filter(burst_size > 1) %>%
    ggplot(aes(x = Group, y = burst_size, fill = Group)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = element_palette) +
    scale_y_log10(limits = c(1, 4)) +
    labs(x = "", y = "Bursts size")
tataenrich
ggsave(filename = "../plots/Promoter_elements_burst_size.pdf", plot = tataenrich, width = 6.5, height = 4)

In [None]:
# Element-present vs. element-absent pairs compared below.
pair_tata = c("TATA", "No_TATA")
pair_inr = c("Inr", "No_Inr")
pair_tata_inr = c("TATA_Inr", "No_TATA_Inr")
pair_pb = c("PB", "No_PB")

# Two-sample t-tests: difference in mean burst size within each pair.
t.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata)
t.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_inr)
t.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata_inr)
t.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_pb)

# F-tests: difference in burst-size variance within each pair.
var.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata)
var.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_inr)
var.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata_inr)
var.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_pb)

In [None]:
# One fill colour per element class, in the order of the Group factor levels.
element_palette = c(
    "#3e6790", "#6485a6", "#646673", "#868892",
    "#b0863d", "#bf9e63", "#2b6464", "#558383"
)

# Violin plot of burst frequency per promoter-element class, median line drawn.
tataenrich = ggplot(allElements, aes(x = Group, y = burst_rate, fill = Group)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = element_palette) +
    scale_y_log10(limits = c(0.05, 10)) +
    labs(x = "", y = "Burst frequency (per hour)")
tataenrich
ggsave(filename = "../plots/Promoter_elements_burst_frequency.pdf", plot = tataenrich, width = 6.5, height = 4)

In [None]:
# Element-present vs. element-absent pairs compared below.
pair_tata = c("TATA", "No_TATA")
pair_inr = c("Inr", "No_Inr")
pair_tata_inr = c("TATA_Inr", "No_TATA_Inr")
pair_pb = c("PB", "No_PB")

# For each pair: one-sided KS test, then a Wilcoxon rank-sum test, on burst frequency.
ks.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata, alternative = "greater", exact = FALSE)
wilcox.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata)

ks.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_inr, alternative = "greater", exact = FALSE)
wilcox.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_inr)

ks.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata_inr, alternative = "greater", exact = FALSE)
wilcox.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata_inr)

ks.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_pb, alternative = "greater", exact = FALSE)
wilcox.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_pb)

In [None]:
# Element-present vs. element-absent pairs compared below.
pair_tata = c("TATA", "No_TATA")
pair_inr = c("Inr", "No_Inr")
pair_tata_inr = c("TATA_Inr", "No_TATA_Inr")
pair_pb = c("PB", "No_PB")

# Two-sample t-tests: difference in mean burst frequency within each pair.
t.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata)
t.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_inr)
t.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata_inr)
t.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_pb)

# F-tests: difference in burst-frequency variance within each pair.
var.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata)
var.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_inr)
var.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_tata_inr)
var.test(burst_rate ~ Group, data = allElements, subset = Group %in% pair_pb)

In [None]:
# One fill colour per element class, in the order of the Group factor levels.
element_palette = c(
    "#3e6790", "#6485a6", "#646673", "#868892",
    "#b0863d", "#bf9e63", "#2b6464", "#558383"
)

# Violin plot of the time between bursts (1 / burst_rate) per promoter-element class.
tataenrich = ggplot(allElements, aes(x = Group, y = 1 / burst_rate, fill = Group)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = element_palette) +
    scale_y_log10(limits = c(0.1, 100)) +
    labs(x = "", y = "Time between bursts")
tataenrich
ggsave(filename = "../plots/Promoter_elements_burst_interval.pdf", plot = tataenrich, width = 6.5, height = 4)

In [None]:
# Element-present vs. element-absent pairs compared below.
pair_tata = c("TATA", "No_TATA")
pair_inr = c("Inr", "No_Inr")
pair_tata_inr = c("TATA_Inr", "No_TATA_Inr")
pair_pb = c("PB", "No_PB")

# For each pair: one-sided KS test, then a Wilcoxon rank-sum test, on burst size.
ks.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata, alternative = "greater", exact = FALSE)
wilcox.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata)

ks.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_inr, alternative = "greater", exact = FALSE)
wilcox.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_inr)

ks.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata_inr, alternative = "greater", exact = FALSE)
wilcox.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_tata_inr)

ks.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_pb, alternative = "greater", exact = FALSE)
wilcox.test(burst_size ~ Group, data = allElements, subset = Group %in% pair_pb)

In [None]:
# Rows carrying a TATA / No_TATA call.
has_tata_call = burstsWpElement$TATA %in% c("TATA", "No_TATA")
tata_df = burstsWpElement[has_tata_call, ]

# One-sided two-sample KS test of burst frequency between the two TATA classes.
ptest = ks.test(burst_rate ~ TATA, data = tata_df, alternative = "less", exact = FALSE)
ptest

# Colours for the two TATA classes.
tata_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: burst frequency by TATA class.
tataenrich = ggplot(tata_df, aes(x = TATA, y = burst_rate, fill = TATA)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.01, 10)) +
    labs(title = "Role of TATA element in Bursts", x = "", y = "Bursts per hour")

tataenrich
ggsave(filename = "../plots/burstFreq_TATA.pdf", plot = tataenrich, width = 4, height = 4)

# Violin plot: time until burst (1 / burst_rate) by TATA class.
tataenrich = ggplot(tata_df, aes(x = TATA, y = 1 / burst_rate, fill = TATA)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.1, 100)) +
    labs(title = "Role of TATA element in Bursts", x = "", y = "Time until burst (hour)")
tataenrich
ggsave(filename = "../plots/TimeBetweenBursts_TATA.pdf", plot = tataenrich, width = 4, height = 4)

# Empirical CDF of burst frequency, one curve per TATA class.
ggplot(tata_df, aes(x = burst_rate, color = TATA)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(0.01, 10)) +
    scale_color_manual(values = c("#39568CFF", "#1F968BFF")) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of TATA element in Bursts", x = "Bursts per hour", y = "Fraction of genes")


In [None]:
# Rows carrying a TATA / No_TATA call.
has_tata_call = burstsWpElement$TATA %in% c("TATA", "No_TATA")
tata_df = burstsWpElement[has_tata_call, ]

# One-sided two-sample KS test of the 5 kb burst-frequency estimate between TATA classes.
ptest = ks.test(burst_rate_5k ~ TATA, data = tata_df, alternative = "less", exact = FALSE)
ptest

# Colours for the two TATA classes.
tata_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: 5 kb burst frequency by TATA class.
ggplot(tata_df, aes(x = TATA, y = burst_rate_5k, fill = TATA)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.01, 10)) +
    labs(title = "Role of TATA element in Bursts", x = "", y = "Bursts per hour from 5K")

# Empirical CDF of the 5 kb burst frequency, one curve per TATA class.
ggplot(tata_df, aes(x = burst_rate_5k, color = TATA)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(0.01, 10)) +
    scale_color_manual(values = tata_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of TATA element in Bursts", x = "Bursts per hour from 5k", y = "Fraction of genes")

In [None]:
# One-sided two-sample KS test of burst size between the two TATA classes
# (run on all rows of tata_df; the plots below keep burst_size > 1 only).
ptest = ks.test(burst_size ~ TATA, data = tata_df, alternative = "less", exact = FALSE)
ptest

# Colours for the two TATA classes.
tata_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: burst size by TATA class.
tataenrich = tata_df %>%
    filter(burst_size > 1) %>%
    ggplot(aes(x = TATA, y = burst_size, fill = TATA)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(1, 5)) +
    labs(title = "Role of TATA element in Bursts", x = "", y = "Burst size")
tataenrich
ggsave(filename = "../plots/BurstSize_TATA.pdf", plot = tataenrich, width = 4, height = 4)

# Empirical CDF of burst size, one curve per TATA class.
tata_df %>%
    filter(burst_size > 1) %>%
    ggplot(aes(x = burst_size, color = TATA)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(1, 10)) +
    scale_color_manual(values = tata_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of TATA element in Bursts", x = "Burst size", y = "Fraction of genes")

In [None]:
# One-sided two-sample KS test of the 5 kb burst-size estimate between TATA classes
# (run on all rows of tata_df; the plots below keep burst_size_5k > 1 only).
ptest = ks.test(burst_size_5k ~ TATA, data = tata_df, alternative = "less", exact = FALSE)
ptest

# Colours for the two TATA classes.
tata_colours = c("#39568CFF", "#1F968BFF")

# Jittered points underneath a violin: 5 kb burst size by TATA class.
tata_df %>%
    filter(burst_size_5k > 1) %>%
    ggplot(aes(x = TATA, y = burst_size_5k, fill = TATA)) +
    geom_jitter(
        width = 0.4, height = 0,
        shape = 21, size = 0.5, alpha = 0.1, color = "grey"
    ) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(1, 5)) +
    labs(title = "Role of TATA element in Bursts", x = "", y = "Burst size from 5k")

# Empirical CDF of the 5 kb burst size, one curve per TATA class.
tata_df %>%
    filter(burst_size_5k > 1) %>%
    ggplot(aes(x = burst_size_5k, color = TATA)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(1, 5)) +
    scale_color_manual(values = tata_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of TATA element in Bursts", x = "Burst size from 5k", y = "Fraction of genes")

In [None]:
# One-sided two-sample KS test of gene length between the two TATA classes.
ptest = ks.test(length ~ TATA, data = tata_df, alternative = "less", exact = FALSE)
ptest

# Colours for the two TATA classes.
tata_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: gene length by TATA class, y axis capped at maxGeneLengthAfterTrim.
ggplot(tata_df, aes(x = TATA, y = length, fill = TATA)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(1, maxGeneLengthAfterTrim)) +
    labs(title = "Gene length & TATA", x = "", y = "Length of gene")

# Empirical CDF of gene length, one curve per TATA class.
ggplot(tata_df, aes(x = length, color = TATA)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(1, maxGeneLengthAfterTrim)) +
    scale_color_manual(values = tata_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Gene length & TATA", x = "Gene length", y = "Fraction of genes")

In [None]:
# Rows carrying an Inr / No_Inr call.
has_inr_call = burstsWpElement$Inr %in% c("Inr", "No_Inr")
inr_df = burstsWpElement[has_inr_call, ]

# One-sided two-sample KS test of burst frequency between the two Inr classes.
ptest = ks.test(burst_rate ~ Inr, data = inr_df, alternative = "greater", exact = FALSE)
ptest

# Colours for the two Inr classes.
inr_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: burst frequency by Inr class.
tataenrich = ggplot(inr_df, aes(x = Inr, y = burst_rate, fill = Inr)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = inr_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.01, 10)) +
    labs(title = "Role of Inr element in Bursts", x = "", y = "Bursts per hour")
tataenrich
ggsave(filename = "../plots/BurstFrequency_Inr.pdf", plot = tataenrich, width = 4, height = 4)

# Empirical CDF of burst frequency, one curve per Inr class.
ggplot(inr_df, aes(x = burst_rate, color = Inr)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(0.01, 10)) +
    scale_color_manual(values = inr_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of Inr element in Bursts", x = "Bursts per hour", y = "Fraction of genes")

In [None]:
# Rows carrying a GC / No_GC call.
has_gc_call = burstsWpElement$GC %in% c("GC", "No_GC")
gc_df = burstsWpElement[has_gc_call, ]

# One-sided two-sample KS test of burst frequency between the two GC classes.
ptest = ks.test(burst_rate ~ GC, data = gc_df, alternative = "greater", exact = FALSE)
ptest

# Colours for the two GC classes.
gc_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: burst frequency by GC class.
ggplot(gc_df, aes(x = GC, y = burst_rate, fill = GC)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = gc_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.01, 10)) +
    labs(title = "Role of GC element in Bursts", x = "", y = "Bursts per hour")

# Empirical CDF of burst frequency, one curve per GC class.
ggplot(gc_df, aes(x = burst_rate, color = GC)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(0.01, 10)) +
    scale_color_manual(values = gc_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of GC element in Bursts", x = "Bursts per hour", y = "Fraction of genes")

In [None]:
# Rows carrying a TATA_Inr / No_TATA_Inr call.
has_tata_inr_call = burstsWpElement$TATA_Inr %in% c("TATA_Inr", "No_TATA_Inr")
tata_inr_df = burstsWpElement[has_tata_inr_call, ]

# One-sided two-sample KS test of burst frequency between the two TATA+Inr classes.
ptest = ks.test(burst_rate ~ TATA_Inr, data = tata_inr_df, alternative = "less", exact = FALSE)
ptest

# Colours for the two TATA+Inr classes.
tata_inr_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: burst frequency by TATA+Inr class.
tatainrenrich = ggplot(tata_inr_df, aes(x = TATA_Inr, y = burst_rate, fill = TATA_Inr)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_inr_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.01, 10)) +
    labs(title = "Role of TATA+Inr element in Bursts", x = "", y = "Bursts per hour")
tatainrenrich
ggsave(filename = "../plots/BurstFrequency_TATA+Inr.pdf", plot = tatainrenrich, width = 4, height = 4)

# Violin plot: time until burst (1 / burst_rate) by TATA+Inr class.
tatainrenrich = ggplot(tata_inr_df, aes(x = TATA_Inr, y = 1 / burst_rate, fill = TATA_Inr)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_inr_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(0.1, 100)) +
    labs(title = "Role of TATA+Inr element in Bursts", x = "", y = "time until burst")
tatainrenrich
ggsave(filename = "../plots/TimeBetweenBursts_TATA+Inr.pdf", plot = tatainrenrich, width = 4, height = 4)

# Empirical CDF of burst frequency, one curve per TATA+Inr class.
ggplot(tata_inr_df, aes(x = burst_rate, color = TATA_Inr)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(0.01, 10)) +
    scale_color_manual(values = tata_inr_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of TATA+Inr element in Bursts", x = "Bursts per hour", y = "Fraction of genes")

In [None]:
# # r function to cbind various length columns:
# cbind.fill <- function(...){
#     nm <- list(...)
#     nm <- lapply(nm, as.matrix)
#     n <- max(sapply(nm, nrow))
#     do.call(cbind, lapply(nm, function (x)
#     rbind(x, matrix(, n-nrow(x), ncol(x)))))
# }
# comb = cbind.fill(tata$burst_rate, notata$burst_rate, tatainr$burst_rate, notatainr$burst_rate)

In [None]:
# Rows carrying a TATA_Inr / No_TATA_Inr call.
has_tata_inr_call = burstsWpElement$TATA_Inr %in% c("TATA_Inr", "No_TATA_Inr")
tata_inr_df = burstsWpElement[has_tata_inr_call, ]

# One-sided two-sample KS test of burst size between the two TATA+Inr classes
# (run on all rows; the plots below keep burst_size > 1 only).
ptest = ks.test(burst_size ~ TATA_Inr, data = tata_inr_df, alternative = "greater", exact = FALSE)
ptest

# Colours for the two TATA+Inr classes.
tata_inr_colours = c("#39568CFF", "#1F968BFF")

# Violin plot: burst size by TATA+Inr class.
tatainrenrich = tata_inr_df %>%
    filter(burst_size > 1) %>%
    ggplot(aes(x = TATA_Inr, y = burst_size, fill = TATA_Inr)) +
    geom_violin(show.legend = FALSE, draw_quantiles = 0.5) +
    scale_fill_manual(values = tata_inr_colours, name = "Promoter\nStatus") +
    scale_y_log10(limits = c(1, 5)) +
    labs(title = "Role of TATA+Inr element in Bursts", x = "", y = "Burst size")
tatainrenrich
ggsave(filename = "../plots/BurstSize_TATA+Inr.pdf", plot = tatainrenrich, width = 4, height = 4)

# Empirical CDF of burst size, one curve per TATA+Inr class.
tata_inr_df %>%
    filter(burst_size > 1) %>%
    ggplot(aes(x = burst_size, color = TATA_Inr)) +
    stat_ecdf(geom = "step", linewidth = 1.5) +
    scale_x_log10(limits = c(1, 5)) +
    scale_color_manual(values = tata_inr_colours) +
    guides(color = guide_legend(title = "")) +
    labs(title = "Role of TATA+Inr element in Bursts", x = "Burst size", y = "Fraction of genes")

In [None]:
# Median burst size per TATA+Inr class, rounded to two decimals.
# The previous version averaged the rounded values (mean(round(...))), which
# contradicted the `dataMedian` / `MD` naming and the median lines drawn on
# the violin plots. NA burst sizes are ignored.
dataMedian = tata_inr_df %>%
    group_by(TATA_Inr) %>%
    summarise(MD = round(median(burst_size, na.rm = TRUE), digits = 2))
dataMedian

In [None]:
# Rows carrying a TATA_Inr / No_TATA_Inr call.
tata_inr_df = burstsWpElement[burstsWpElement$TATA_Inr %in% c("TATA_Inr", "No_TATA_Inr"),];

# One-sided two-sample KS test of meanExpPerKb between the two TATA+Inr classes.
ptest = ks.test(meanExpPerKb ~ TATA_Inr, tata_inr_df, alternative = "greater", exact=FALSE);
ptest

# Violin plot: meanExpPerKb by TATA+Inr class, median line drawn.
tata_inr_df %>%
    # filter(burst_size > 1) %>%
    ggplot(aes(x=TATA_Inr, y = meanExpPerKb, fill = TATA_Inr)) +
    geom_violin(show.legend = FALSE, draw_quantiles = c(0.5)) +
    # geom_jitter(
    #     height = 0, width = 0.4, alpha=0.1, size=0.5, shape=21, color="red") +
    scale_fill_manual(values=c("#39568CFF","#1F968BFF"), name="Promoter\nStatus") +
    # geom_text(data = median, aes(x = TATA_Inr, y = med, label = med), size = 3, vjust = -1.5) +
    scale_y_log10(limits = c(0.001, 1)) +
    ggtitle("Role of TATA+Inr element in Bursts") +
    xlab("") +
    ylab("Mean Expression");

# Empirical CDF of meanExpPerKb, one curve per TATA+Inr class.
tata_inr_df %>%
    # filter(burst_size > 1) %>%
    ggplot(aes(x=meanExpPerKb, color=TATA_Inr)) +
    # Same range as the violin plot above. The former limits c(1, 10) were
    # carried over from the burst-size plots and excluded the 0.001-1 range
    # in which meanExpPerKb is displayed.
    scale_x_log10(limits = c(0.001, 1)) +
    guides(color=guide_legend(title="")) +
    scale_color_manual( values = c("#39568CFF","#1F968BFF") ) +
    stat_ecdf(geom="step", linewidth=1.5) +
    ggtitle("Role of TATA+Inr element in Bursts") +
    xlab("Mean Expression") +
    ylab("Fraction of genes");

In [None]:
# Show the first six rows of the TATA+Inr subset.
head(tata_inr_df, n = 6L)