In [None]:
# Point R at the package library of the "r422" conda environment
# (machine-specific absolute path), then print the active library paths.
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

## 1. Load scGRO-seq reads and custom feature annotations

In [None]:
suppressMessages({
    library(tidyverse)
    library(scales)
    library(foreach)
    library(doParallel)
    library(data.table)
    library(rtracklayer)
    library(plyranges)
    library(ggpointdensity)
});
source("./scGRO_functions.r")

In [None]:
# Notebook display defaults: 4 x 4 figures rendered as SVG.
options(
    jupyter.plot_mimetypes = "image/svg+xml",
    repr.plot.height = 4,
    repr.plot.width = 4
)

# Global ggplot2 theme: classic theme with bold black axis text/titles,
# a centred bold plot title, no axis lines and a grey panel border instead.
theme_set(
    theme_classic() +
        theme(
            plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
            axis.title.x = element_text(color = "black", size = 14, face = "bold"),
            axis.title.y = element_text(color = "black", size = 14, face = "bold"),
            axis.text = element_text(color = "black", size = 12, face = "bold"),
            # axis.ticks = element_blank()
            axis.line = element_blank(),
            panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
        )
)

In [None]:
# Length cap in bp: features are clipped to this width and inter-molecule
# distances above it are discarded further down.
length_limit <- 10000

In [None]:
# Feature annotation. Previously used alternatives:
#features = readRDS("../data/mES_BRsComb_genes_v1.rds");
# features = import.bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_v1.bed");
features <- read_bed("../data/dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")
names(features) <- features$name
length(features)

# Keep features whose name starts with "GN" (presumably genes - confirm
# against the BED naming scheme) and that are at least length_limit + 1 kbp
# wide; then shrink each one by 1 kbp in total around its centre,
# i.e. 500 bp off each end.
features <- features %>%
    filter(substr(name, 0, 2) == "GN") %>%
    filter(width >= length_limit + 1000) %>%
    anchor_center() %>%
    mutate(width = width - 1000)
length(features)

# Clip every feature that is still wider than length_limit to its first
# length_limit bp (resize with fix = "start").
longf <- which(width(features) > length_limit)
features[longf] <- resize(features[longf], width = length_limit, fix = "start")
summary(width(features))

In [None]:
# Load dREG peak calls and keep those with pval < 0.01 as a GRanges object.
dREG <- read.table(
    "../data/PROseq_mES_BRsComb.dREG.peak.full.bed",
    header = FALSE,
    stringsAsFactors = FALSE
)
dim(dREG)
colnames(dREG) <- c("chr", "start", "end", "score", "pval", "center")
dREG <- GRanges(filter(dREG, pval < 0.01))
summary(width(dREG))

# Feature "bodies": features of at least 3 kbp, anchored at their 3' end and
# shortened by 2 kbp, so the first 2 kbp after the feature start is excluded.
fbody <- features %>%
    filter(width >= 3000) %>%
    # filter(width <= length_limit) %>%
    anchor_3p() %>%
    mutate(width = width - 2000)

# Features whose body overlaps one of the retained dREG peaks.
dREG_genes <- features[names(subsetByOverlaps(fbody, dREG))]
length(dREG_genes)

In [None]:
# Load the consolidated scGRO-seq reads and the filtered count matrix.
scGRO <- readRDS("../data/scGROv2p8_consolidated.rds")
counts <- readRDS("../data/scGROv2p8_mapq3qc_max10kbp_filtered_counts.rds")
# scGRO  = readRDS("../jay_m/data_jay_m/scGROv2p9_filtered_consolidated.rds");
# counts = readRDS( "../jay_m/data_jay_m/scGROv2p9_mapq3qc_filtered_counts.rds" );

# Histogram of the column sums of `counts` (one value per column / cell).
ggplot(data.frame(x = colSums(counts)), aes(x = x)) +
    geom_histogram(binwidth = 100, boundary = 0) +
    xlab("Reads per cell")
# head(scGRO)
# head(counts)

In [None]:
# Optional per-cell read-depth filter (currently disabled):
# counts = counts[,colSums(counts) >= 1000 & colSums(counts) <= 2000];
# Report the dimensions of the count matrix.
dim(counts)

In [None]:
# Keep reads passing all five QC flags, build a cell ID that is unique across
# experiments (Exp-Plate-Cell), and reduce every read to the single base at
# its "end" (resize with fix = "end").
scGRO <- scGRO %>%
    filter(umiQC & miRQC & countQC & cellQC & plateQC) %>%
    # filter( umiQC & miRQC ) %>%
    #filter( !(Exp == "Exp236" & Plate %in% paste0("c0", 5:8)) ) %>%
    # filter( !(Exp == "Exp260b") ) %>%
    mutate(cellID = factor(paste(Exp, Plate, Cell, sep = "-"))) %>%
    resize(width = 1, fix = "end") %>%
    select(cellID)

# Restrict to cells present in the count matrix and to reads that overlap a
# feature (default strand handling of subsetByOverlaps), then sort.
scGRO <- scGRO %>%
    filter(cellID %in% colnames(counts)) %>%
    subsetByOverlaps(features) %>% #, ignore.strand=T) %>%
    sort()

In [None]:
# Annotate each read with the name of the feature it overlaps.
# If a read overlaps several features, the last hit wins (assignments are
# applied in hit order).
hits <- findOverlaps(scGRO, features) #, ignore.strand=T );
scGRO$gene <- NA
scGRO$gene[queryHits(hits)] <- names(features)[subjectHits(hits)]
#scGRO$gene = names(features)[scGRO$gene];

In [None]:
# Keep only reads from genes that are detected in at least 10 distinct cells.
scGRO <- scGRO %>%
    group_by(gene) %>%
    filter(n_distinct(cellID) >= 10) %>%
    ungroup()
scGRO

## 2. Compute distances between molecules in the same cell and the same gene
### This step takes a long time; if the distances were already calculated, just read the CSV file below:

In [None]:
# Remove unused cellID levels so that `cells` and `ccounts` only cover cells
# that still have reads.
scGRO$cellID <- droplevels(scGRO$cellID)

# 1-bp range at the start of every feature.
TSS <- resize(features, width = 1, fix = "start")

# Permutation settings: number of permutations, the cell universe, and the
# number of reads per cell (used as sampling weights).
Nperm <- 200
cells <- levels(scGRO$cellID)
ccounts <- table(scGRO$cellID)

In [None]:
# Tabulate how many (gene, cell) combinations carry N reads, for each N.
obs <- scGRO
obs_counts <- as.data.table(obs) %>%
    group_by(gene, cellID) %>%
    summarise(N = n(), .groups = "drop") %>%
    select(N) %>%
    table
obs_counts

# Distances between reads of the same gene in the same cell. Reads are ordered
# by gene, cell and position before distance_to_neighbor() (defined in
# scGRO_functions.r) is applied; only distances in (50, length_limit] are kept.
# Only (gene, cell) groups with more than one read can contribute a distance.
read_order <- order(obs$gene, obs$cellID, start(obs))
obs <- obs[read_order] %>%
    distance_to_neighbor() %>%
    filter(distance > 50 & distance <= length_limit)
obs$source <- "Observed"
obs[1:4, ]

In [None]:
# Turn the burst-size frequency table into a data frame with columns
# N (reads per gene per cell), Ns (number of gene-cell combinations) and simID.
obs_counts <- as.data.frame(obs_counts)
# as.data.frame() on a table stores N as a factor; convert through character
# so N holds the real read counts rather than the factor level codes
# (the two differ as soon as some burst size is absent).
obs_counts$N <- as.integer(as.character(obs_counts$N))
obs_counts$simID <- 0  # 0 marks the observed (non-simulated) data
colnames(obs_counts)[2] <- "Ns"
obs_counts

# Collapse to three rows: N = 1, N = 2, and N >= 3 (the "3+" class).
# Rows are selected by the value of N, not by position, so the result does not
# depend on how many distinct burst sizes happen to be present.
obs_counts_table <- rbind(
    obs_counts[obs_counts$N <= 2, 2:3],
    colSums(obs_counts[obs_counts$N >= 3, 2:3])
)
rownames(obs_counts_table) <- 1:3
obs_counts_table

In [None]:
# Build the "simulated" data set: Nperm stacked copies of the reads, each read
# given a random cell drawn with probability proportional to that cell's
# observed read count. No seed is set, so every run yields a new permutation.
sim <- rep(scGRO, Nperm)
sim$cellID <- as.factor(
    sample(cells, size = length(sim), replace = TRUE, prob = ccounts / sum(ccounts))
)
# Copy k of the read set (k = 1..Nperm) forms simulation k.
sim$simID <- rep(1:Nperm, each = length(scGRO))

In [None]:
# Number of reads per (simulation, gene, cell) combination.
sim_counts <- as.data.table(sim) %>%
    group_by(simID, gene, cellID) %>%
    summarise(N = n(), .groups = "drop")

dim(sim_counts)
sim_counts[1:4, ]

In [None]:
# For every simulation, count how many (gene, cell) combinations have N reads.
# Result: one row per N (ascending), one column per simulation.
sim_counts_table <- sim_counts %>%
    as.data.table %>%
    group_by(simID, N) %>%
    summarise(Ns = dplyr::n(), .groups = "drop") %>%
    # gives each simulation a column, with rows as the N (1:10 ...)
    pivot_wider(., names_from = simID, values_from = Ns) %>%
    arrange(N) %>%
    select(-N)

# A burst size that never occurred in a simulation is a count of 0, not NA.
sim_counts_table[is.na(sim_counts_table)] <- 0
sim_counts_table

# Collapse to three rows: N = 1, N = 2, and everything from the third row on
# summed into a "3+" class. Using "all rows except the first two" instead of
# a fixed 3:14 range avoids NA rows (fewer burst sizes than expected) and
# silently dropped rows (more burst sizes than expected).
sim_counts_table <- rbind(
    sim_counts_table[1:2, ],
    colSums(sim_counts_table[-(1:2), ])
)
rownames(sim_counts_table) <- 1:3
sim_counts_table

In [None]:
# Observed vs. simulated frequencies of 1, 2 and 3+ molecules per gene per cell.
#   obs      observed number of gene-cell combinations
#   sim      mean over simulations;  sd_sim: standard deviation over simulations
#   obs_sim  (obs - sim) / obs, in percent
#   emp_pVal percentage of simulations whose count exceeds the observed count
all_counts <- data.frame(
    # N = obs_counts$N,
    obs = obs_counts_table$Ns,
    sim = round(rowMeans(sim_counts_table), digits = 2),
    sd_sim = round(apply(sim_counts_table, 1, sd), digits = 2)
    ) %>%
    mutate(obs_sim = (obs - sim) / obs * 100) %>%
    # Divide by the actual number of simulations (columns) rather than a
    # hard-coded 200, so the value stays correct if Nperm is changed.
    mutate(emp_pVal = rowSums(sim_counts_table > obs) / ncol(sim_counts_table) * 100)
all_counts

fwrite(all_counts, file="../data/PolII_frequency_per_gene_per_cell_observed_200simulations.csv")

In [None]:
# Reload the previously written all_counts table; the simulation takes a long
# time, which is why the result was saved to disk.
all_counts <- fread(
    "../data/PolII_frequency_per_gene_per_cell_observed_200simulations.csv"
)
all_counts

In [None]:
# Long-format table for plotting: three burst-size classes (rows of
# all_counts) for "observed" followed by the same three for "permuted".
#   Ns       raw number of gene-cell combinations
#   obs_sim  Ns relative to the permuted mean (permuted rows are 1)
#   sd       permuted standard deviation relative to the permuted mean
all_counts_df <- data.frame(
    # N = obs_counts$N,
    class = rep(c("observed", "permuted"), each = 3),
    # `rep()` has no `n` argument; the original `rep(1:3, n=2)` only worked
    # because data.frame() recycled the length-3 vector. Repeat explicitly.
    N = rep(1:3, times = 2),
    Ns = c(all_counts$obs, all_counts$sim),
    obs_sim = c(all_counts$obs / all_counts$sim, all_counts$sim / all_counts$sim),
    sd = c(rep(0, times = 3), all_counts$sd_sim / all_counts$sim)
    )
all_counts_df

In [None]:
# all_counts_df = data.frame(
#     # N = obs_counts$N,
#     class = rep(c("observed", "permuted"), each=4),
#     N = rep(1:4, n=2),
#     Ns = c(obs_counts_table$Ns, round(rowMeans(sim_counts_table), digits = 2)),
#     sd = c(rep(0, each=4), round(apply(sim_counts_table, 1, sd), digits = 2))
#     )
# all_counts

In [None]:
# Observed and permuted burst-size ratios side by side (permuted is 1).
out <- ggplot(all_counts_df, aes(x = N, y = obs_sim, fill = class)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    coord_cartesian(ylim = c(0.75, 1.05)) +
    labs(x = "Burst size", y = "Ratio of bursts over Permuted") +
    theme(legend.title = element_text(size = 0), legend.background = element_blank())
out
ggsave(filename = "../plots/PolII_burst_sizes_observed_vs_simulation.pdf", plot = out, width = 5, height = 4)

# Same ratio, observed rows only (first three rows), with a reference line at 1.
out <- ggplot(all_counts_df[1:3, ], aes(x = N, y = obs_sim, fill = class)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    coord_cartesian(ylim = c(0.75, 1.05)) +
    labs(x = "Burst size", y = "Ratio of bursts \n Observed/Permuted") +
    geom_hline(yintercept = 1, color = "#666666", linetype = 2) +
    theme(legend.position = "none")
out
ggsave(filename = "../plots/PolII_burst_sizes_observed_vs_simulation_showObservedOnly.pdf", plot = out, width = 4, height = 4)

# Raw counts, observed vs. permuted, one figure per burst-size class.
# Class 1
out <- ggplot(filter(all_counts_df, N == 1), aes(x = N, y = Ns, fill = class)) +
    geom_bar(width = 0.8, stat = "identity", position = position_dodge(width = 0.9)) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    labs(x = "", y = "Number of bursts")
out
ggsave(filename = "../plots/PolII_burst_size_1_observed_vs_simulation.pdf", plot = out, width = 4, height = 4)

# Class 2
out <- ggplot(filter(all_counts_df, N == 2), aes(x = N, y = Ns, fill = class)) +
    geom_bar(width = 0.8, stat = "identity", position = position_dodge(width = 0.9)) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    labs(x = "", y = "Number of bursts")
out
ggsave(filename = "../plots/PolII_burst_size_2_observed_vs_simulation.pdf", plot = out, width = 4, height = 4)

# Class 3
out <- ggplot(filter(all_counts_df, N == 3), aes(x = N, y = Ns, fill = class)) +
    geom_bar(width = 0.8, stat = "identity", position = position_dodge(width = 0.9)) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    labs(x = "", y = "Number of bursts")
out
ggsave(filename = "../plots/PolII_burst_size_3_observed_vs_simulation.pdf", plot = out, width = 4, height = 4)

In [None]:
# Two-class version of the burst-size table: class 1 is the first row of
# all_counts, class 2 merges its second and third rows. Rows 1-2 are observed,
# rows 3-4 permuted.
all_counts_df2 <- data.frame(
    # N = obs_counts$N,
    class = rep(c("observed", "permuted"), each = 2),
    # `rep()` has no `n` argument; the original `rep(1:2, n=2)` relied on
    # data.frame() recycling. Repeat explicitly instead.
    N = rep(1:2, times = 2),
    Ns = c(all_counts$obs[1], all_counts$obs[2] + all_counts$obs[3],
           all_counts$sim[1], all_counts$sim[2] + all_counts$sim[3])
    )

# Every count relative to the permuted count of the same class
# (rows 3-4 are the permuted values, so their ratio is 1).
all_counts_df2$obs_sim <- all_counts_df2$Ns / rep(all_counts_df2$Ns[3:4], times = 2)

all_counts_df2

In [None]:
# Ratio of observed over permuted counts for the two merged burst-size
# classes, with a dashed reference line at 1.
out <- ggplot(all_counts_df2, aes(x = N, y = obs_sim, fill = class)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    geom_hline(yintercept = 1, color = "#666666", linetype = 2) +
    coord_cartesian(ylim = c(0.95, 1.05)) +
    labs(x = "Burst size", y = "Ratio of bursts over Permuted") +
    theme(legend.title = element_text(size = 0), legend.background = element_blank())
out
ggsave(filename = "../plots/PolII_burst_sizes_1_and_1plus_observed_vs_simulation.pdf", plot = out, width = 5, height = 4)

In [None]:
# Distances between reads of the same gene in the same (permuted) cell, per
# simulation: order by simulation, gene, cell and position, then keep only
# distances in (50, length_limit].
sim_order <- order(sim$simID, sim$gene, sim$cellID, start(sim))
sim <- sim[sim_order] %>%
    distance_to_neighbor() %>%
    filter(distance > 50 & distance <= length_limit)
sim[1:4, ]

In [None]:
# # summarize the distribution of single and multiple reads per gene per cell in obs vs sim data:
# gcounts = left_join(
#         as.data.frame(obs_counts, stringsAsFactors = F),
#         as.data.frame(sim_counts, stringsAsFactors = F),
#         by="N", suffix = c(".Obs", ".Sim")
#     ) %>%
#     mutate(N=as.integer(N)) %>%
#     pivot_longer(2:3, values_to = "count", names_to="source") %>%
#     mutate( source=substr(source, 6, 15) );
# gcounts$count[gcounts$source == "Sim"] = gcounts$count[gcounts$source == "Sim"]/Nperm;
# gcounts[1:4, ]

In [None]:
# # Plot to see if the obs data has more multiple reads per gene per cell than simulated:
# gcounts %>%
#     ggplot(aes(x=N, y=count, color=source)) +
#     geom_point(position="identity", stat="identity", alpha=0.5) +
#     scale_y_log10(limits=c(1,1e6)) +
#     xlim(1, 10) +
#     xlab("Reads per gene per cell") +
#     ylab("Total incidence") +
#     theme(legend.position = c(0.8, 0.9)) +
#     guides(color=guide_legend(title=""));

# # gcounts %>%
# #     group_by(source) %>%
# #     mutate(freq=count/sum(count, na.rm=T)) %>%
# #     ungroup() %>%
# #     ggplot(aes(x=N, y=freq, color=source)) +
# #     geom_point(position="identity", stat="identity", alpha=0.5) +
# #     scale_y_log10() +
# #     xlim(0, 10) +
# #     xlab("Reads per gene per cell") +
# #     ylab("Fraction of reads") +
# #     theme(legend.position = c(0.8, 0.9)) +
# #     guides(color=guide_legend(title=""));

In [None]:
# Synthetic "Uniform" data set: reads keep their gene and cell, but their
# position is redrawn uniformly within length_limit bp of the feature start.
# Only reads in features at least length_limit wide are used.
syn <- scGRO %>%
    filter(width(features[gene]) >= length_limit)
# Stack Nperm copies of the filtered reads.
syn <- rep(syn, Nperm)
# syn$cellID = sample.int(length(cells), size=length(syn), replace=T, prob=ccounts/sum(ccounts));
# Size simID from the filtered read set; the original used length(scGRO),
# which no longer matches as soon as the width filter removes any read.
syn$simID <- rep(seq_len(Nperm), each = length(syn) %/% Nperm)
# New start = feature start + random offset in 1..length_limit. The slot is
# written directly, so the width of every range is left unchanged.
# NOTE(review): the offset is added regardless of strand - confirm this is
# intended for minus-strand features. No seed is set, so results vary per run.
syn@ranges@start = start(TSS[syn$gene]) + sample.int(length_limit, size = length(syn), replace = TRUE)

syn <- syn[order(syn$simID, syn$gene, syn$cellID, start(syn))] %>%
    distance_to_neighbor() %>%
    filter(distance > 50 & distance <= length_limit)
syn$source <- "Uniform"
syn[1:4, ]

In [None]:
# Stack the three distance tables. Every observed row is repeated Nperm times
# so that all three sources carry the same Nperm-fold excess.
sim$source <- "Permuted"
obs_rows <- rep(1:nrow(obs), each = Nperm)
distances <- rbind(obs[obs_rows, ], sim, syn)
dim(distances)

# fwrite(distances,file="../data/scGROv2p8_distances_between_polymerases_observed_permuted_synthetic.csv");

## Read the distances file with observed, permuted, and synthetic uniform data:

In [None]:
# Load the saved distance table (observed, permuted and synthetic sources).
distances <- fread(
    "../data/scGROv2p8_distances_between_polymerases_observed_permuted_synthetic.csv"
)
dim(distances)
head(distances)

In [None]:
# Row-level logical filters on `distances`.
# distanceF: distance strictly between 50 and length_limit.
# NOTE(review): the distance tables were built with `<= length_limit`; this
# filter uses `<` and is not referenced in the cells visible here - confirm.
distanceF <- distances$distance > 50 & distances$distance < length_limit
# dregF: TRUE for rows whose gene is NOT among the dREG-overlapping features.
dregF <- !(distances$gene %in% names(dREG_genes))
# sourceF: rows from the Observed or Permuted sets.
sourceF <- distances$source %in% c("Observed", "Permuted")

# summary(as.factor(distances[,"source"]))

In [None]:
# Features that appear both in the distance table and in the count matrix.
xfeatures <- unique(names(
    features %>%
        filter(names %in% distances$gene & names %in% rownames(counts))
))

# Density per feature: mean count across cells per kbp of feature width.
pdensity <- data.frame(
    density = rowMeans(counts[xfeatures, ]) / width(features[xfeatures]) * 1000
)
rownames(pdensity) <- xfeatures

# Distribution of densities on a log10 x axis.
ggplot(pdensity, aes(x = density)) +
    geom_histogram(bins = 40, boundary = 0.01) +
    scale_x_log10() +
    labs(x = "Polymerase per kbp", y = "Number of features")

In [None]:
# Rank features into 100 bins by decreasing density (Qrank 1 = highest).
generanks <- pdensity %>%
    #filter(density >= 0.01) %>%
    mutate(Qrank = ntile(-density, 100))
#table(higenes$Qrank)
dim(generanks)
head(generanks)

# Row filters on `distances` by expression-rank group.
# NOTE(review): Qrank 6 belongs to both q1 (2:6) and q2 (6:18), and Qrank 1 is
# in none of the groups - confirm whether the overlap/exclusion is intended.
expressF <- distances$gene %in% rownames(generanks)[generanks$Qrank %in% 6:100]
q1filter <- distances$gene %in% rownames(generanks)[generanks$Qrank %in% 2:6]
q2filter <- distances$gene %in% rownames(generanks)[generanks$Qrank %in% 6:18]
q3filter <- distances$gene %in% rownames(generanks)[generanks$Qrank %in% 19:100]

# Number of distance rows in each group.
sum(q1filter)
sum(q2filter)
sum(q3filter)

In [None]:
# Distance distributions for all sources, restricted to rows passing dregF.
# Histogram counts are divided by Nperm because the table holds an Nperm-fold
# excess of rows.

# Histogram, full distance range.
out <- ggplot(
        filter(distances, dregF),
        aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)
    ) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_histogram_all.pdf", plot = out, width = 4, height = 4)

# Density, full distance range.
out <- ggplot(filter(distances, dregF), aes(x = distance/1000, fill = source)) +
    geom_density(alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Density") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_density_all.pdf", plot = out, width = 4, height = 4)

# Histogram, distances up to 2500 bp.
out <- ggplot(
        filter(distances, dregF & distance <= 2500),
        aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)
    ) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_upto2500_histogram_all.pdf", plot = out, width = 4, height = 4)


# Density, distances up to 2500 bp.
out <- ggplot(
        filter(distances, dregF & distance <= 2500),
        aes(x = distance/1000, fill = source)
    ) +
    geom_density(alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Density") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_upto2500_density_all.pdf", plot = out, width = 4, height = 4)

In [None]:
# Observed vs. Permuted histograms (rows passing dregF), counts divided by Nperm.

# Full distance range.
out <- distances %>%
    filter(dregF) %>%
    filter(source %in% c("Observed", "Permuted")) %>%
    group_by(source) %>%
    ggplot(aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_histogram_Obs+Per.pdf", plot = out, width = 4, height = 4)

# Distances up to 2500 bp.
out <- distances %>%
    filter(dregF & distance <= 2500) %>%
    filter(source %in% c("Observed", "Permuted")) %>%
    group_by(source) %>%
    ggplot(aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_upto2500_histogram_Obs+Per.pdf", plot = out, width = 4, height = 4)


# Density of inter-polymerase distances, Observed vs. Permuted (rows passing
# dregF). The axis labels previously read "Time between events (hours)" /
# "Number of events", a leftover from another analysis; they now describe
# what is plotted.
out = distances %>%
    filter(dregF) %>%
    filter( source %in% c("Observed", "Permuted") ) %>%
    ggplot(aes(x=distance/1000, fill=source)) +
    geom_density(alpha=0.5) +
    scale_fill_manual( values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5)) ) +
    theme(legend.position = c(0.8, 0.8)) +
    # ggtitle("Consecutive Pol2") +
    xlab("Distance between RNA Polymerases (kbp)") +
    ylab("Density") +
    #xlim(0, 15) +
    guides(fill=guide_legend(title=""));
out
ggsave(out, filename = "../plots/PolII_distances_density_Obs+Per.pdf", width=4, height=4);

# One-sided two-sample Kolmogorov-Smirnov test, Observed vs. Permuted.
# NOTE(review): if Observed rows are stored Nperm times in `distances` (as the
# count/Nperm normalisation of the histograms suggests), the sample size seen
# by the test is inflated - confirm before quoting the p-value.
ks.test(distance ~ source, distances, subset=source %in% c("Observed", "Permuted") & dregF, alternative = "greater", exact=F);


# Observed vs. Uniform histograms (rows passing dregF), counts divided by Nperm.

# Full distance range.
out <- distances %>%
    filter(dregF) %>%
    filter(source %in% c("Observed", "Uniform")) %>%
    ggplot(aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_histogram_Obs+Uni.pdf", plot = out, width = 4, height = 4)

# Distances up to 2500 bp.
out <- distances %>%
    filter(dregF & distance <= 2500) %>%
    filter(source %in% c("Observed", "Uniform")) %>%
    ggplot(aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_upto2500_histogram_Obs+Uni.pdf", plot = out, width = 4, height = 4)

# Density of inter-polymerase distances, Observed vs. Uniform (rows passing
# dregF). The axis labels previously read "Time between events (hours)" /
# "Number of events", a leftover from another analysis; they now describe
# what is plotted.
out = distances %>%
    filter(dregF) %>%
    filter( source %in% c("Observed", "Uniform") ) %>%
    ggplot(aes(x=distance/1000, fill=source)) +
    geom_density(alpha=0.5) +
    scale_fill_manual( values = c(rgb(0.7,0.1,0.5,0.5), rgb(0.5,0.5,0.5,0.25)) ) +
    theme(legend.position = c(0.8, 0.8)) +
    # ggtitle("Consecutive Pol2") +
    xlab("Distance between RNA Polymerases (kbp)") +
    ylab("Density") +
    #xlim(0, 15) +
    guides(fill=guide_legend(title=""));
out
ggsave(out, filename = "../plots/PolII_distances_density_Obs+Uni.pdf", width=4, height=4);

# One-sided two-sample Kolmogorov-Smirnov test, Observed vs. Uniform.
# NOTE(review): if Observed rows are stored Nperm times in `distances` (as the
# count/Nperm normalisation of the histograms suggests), the sample size seen
# by the test is inflated - confirm before quoting the p-value.
ks.test(distance ~ source, distances, subset=source %in% c("Observed", "Uniform") & dregF, alternative = "greater", exact=F);

In [None]:
# q1 expression group (rows passing q1filter and dregF).

# Histogram of all sources, counts divided by Nperm.
ggplot(
        filter(distances, q1filter & dregF),
        aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)
    ) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.6) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))

# Observed vs. Permuted histogram, distances up to 2500 bp.
distances %>%
    filter(q1filter & dregF & distance <= 2500) %>%
    filter(source %in% c("Observed", "Permuted")) %>%
    ggplot(aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.6) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))


# Density of all sources; saved to disk.
out <- ggplot(filter(distances, q1filter & dregF), aes(x = distance/1000, fill = source)) +
    geom_density(alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    ggtitle("Top-third expressed genes") +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Density of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_density_Obs+Uni_q1ExpFilter.pdf", plot = out, width = 4, height = 4)

# One-sided KS test, Observed vs. Permuted, within this group.
out <- ks.test(distance ~ source, distances, subset = q1filter & sourceF & dregF, alternative = "greater", exact = F)
out

In [None]:
# q2 expression group (rows passing q2filter and dregF).

# Histogram of all sources, counts divided by Nperm.
ggplot(
        filter(distances, q2filter & dregF),
        aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)
    ) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.6) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))

# Observed vs. Permuted histogram, distances up to 2500 bp.
distances %>%
    filter(q2filter & dregF & distance <= 2500) %>%
    filter(source %in% c("Observed", "Permuted")) %>%
    ggplot(aes(x = distance/1000, y = after_stat(count/Nperm), fill = source)) +
    geom_histogram(binwidth = 0.05, boundary = 0, position = position_identity(), alpha = 0.6) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Number of pairs") +
    guides(fill = guide_legend(title = ""))


# Density of all sources; saved to disk.
out <- ggplot(filter(distances, q2filter & dregF), aes(x = distance/1000, fill = source)) +
    geom_density(alpha = 0.5) +
    scale_fill_manual(values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25))) +
    theme(legend.position = c(0.8, 0.8)) +
    ggtitle("Mid-third expressed genes") +
    labs(x = "Distance between RNA Polymerases (kbp)", y = "Density of pairs") +
    guides(fill = guide_legend(title = ""))
out
ggsave(filename = "../plots/PolII_distances_density_Obs+Uni_q2ExpFilter.pdf", plot = out, width = 4, height = 4)

# One-sided KS test, Observed vs. Permuted, within this group.
out <- ks.test(distance ~ source, distances, subset = q2filter & sourceF & dregF, alternative = "greater", exact = F)
out

In [None]:
# q3 expression group (rows passing q3filter and dregF).
# The y-axis labels of the first histogram and of the density plot were
# swapped relative to the q1/q2 cells; they now match what each plot shows.

# Histogram of all sources, counts divided by Nperm.
distances %>%
    filter(q3filter & dregF) %>%
    ggplot(aes(x=distance/1000, y=after_stat(count/Nperm), fill=source)) +
    geom_histogram(binwidth=0.05, boundary=0, position=position_identity(), alpha=0.6) +
    scale_fill_manual( values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25)) ) +
    theme(legend.position = c(0.8, 0.8)) +
    # ggtitle("Consecutive Pol2") +
    xlab("Distance between RNA Polymerases (kbp)") +
    ylab("Number of pairs") +
    #xlim(0, 15) +
    guides(fill=guide_legend(title=""));

# Observed vs. Permuted histogram, distances up to 2500 bp.
distances %>%
    filter(q3filter & dregF & distance <= 2500) %>%
    filter( source %in% c("Observed", "Permuted") ) %>%
    ggplot(aes(x=distance/1000, y=after_stat(count/Nperm), fill=source)) +
    geom_histogram(binwidth=0.05, boundary=0, position=position_identity(), alpha=0.6) +
    scale_fill_manual( values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25)) ) +
    theme(legend.position = c(0.8, 0.8)) +
    # ggtitle("Consecutive Pol2") +
    xlab("Distance between RNA Polymerases (kbp)") +
    ylab("Number of pairs") +
    #xlim(0, 15) +
    guides(fill=guide_legend(title=""));

# Density of all sources; saved to disk.
out = distances %>%
    filter(q3filter & dregF) %>%
    ggplot(aes(x=distance/1000, fill=source)) +
    geom_density(alpha=0.5) +
    scale_fill_manual( values = c(rgb(0.7,0.1,0.5,0.5), rgb(0,0.2,0.7,0.5), rgb(0.5,0.5,0.5,0.25)) ) +
    theme(legend.position = c(0.8, 0.8)) +
    ggtitle("Bottom-third expressed genes") +
    xlab("Distance between RNA Polymerases (kbp)") +
    ylab("Density of pairs") +
    #xlim(0, 15) +
    guides(fill=guide_legend(title=""));
out
ggsave(out, filename = "../plots/PolII_distances_density_Obs+Uni_q3ExpFilter.pdf", width=4, height=4);

# One-sided KS test, Observed vs. Permuted, within this group.
out = ks.test(distance ~ source, distances, subset = q3filter & sourceF & dregF, alternative = "greater", exact=F);
out

In [None]:
# Difference between the empirical CDFs of Observed and Uniform distances.
# Note: unlike the other plots, no dregF filter is applied here.
pcdf <- distances %>%
    filter(source %in% c("Observed", "Uniform")) %>%
    select(distance, source) %>%
    arrange(distance)

# Number of observed pairs for the axis label. Observed rows are stored Nperm
# times in `distances` (they are replicated when the table is assembled, and
# the histograms divide counts by Nperm for the same reason), so divide the
# row count by Nperm to report the real number of pairs.
Npol <- sum(pcdf$source == "Observed") / Nperm

# One row per (source, distance) with the ECDF evaluated at that distance.
pcdf <- pcdf %>%
    group_by(source) %>%
    mutate(cdf = ecdf(distance)(distance)) %>%
    unique

# Keep only distances present in both sources; both tables stay sorted by
# distance, so their rows line up for the subtraction.
observed <- filter(pcdf, source == "Observed")
uniform  <- filter(pcdf, source == "Uniform")
observed <- filter(observed, distance %in% uniform$distance)
uniform  <- filter(uniform, distance %in% observed$distance)
observed$diff <- observed$cdf - uniform$cdf

data.frame(observed) %>%
    ggplot(aes(x = distance, y = diff)) +
    ggtitle("Enriched Pol2 spacing (All)") +
    xlab("Distance between molecules (bp)") +
    ylab(paste0("Fraction of pairs (N=", Npol, ")")) +
    geom_hline(yintercept = 0, col = "#999999") +
    # NOTE(review): the 4200 bp marker is unexplained in this notebook - confirm.
    geom_vline(xintercept = 4200, col = "#999999") +
    geom_line()

In [None]:
# Observed = distances %>%
#     filter(dregF) %>%
#     filter( source == "Observed") %>%
#     select(distance) %>%
#     fit(Histogram,50:50:10000);
# dim(Observed)

# Uniform = distances %>%
#     filter(dregF) %>%
#     filter( source == "Uniform") %>%
#     select(distance);
# dim(Uniform)

# h1 = fit(Histogram, Observed, 50:50:10000);
# h2 = fit(Histogram, Uniform, 50:50:10000);

# ObsMinusUni = h1.weights .- h2.weights


In [None]:
# Show the first five rows of the distance table.
distances[1:5,]

In [None]:
# Observed / Permuted ratio of pair counts in 500-bp distance bins up to
# 10 kbp, for rows passing dregF whose `type` is "++" or "--".
#
# tabulate() truncates non-integer input, so the original `tabulate(d/500)`
# silently dropped every distance below 500 bp (bin 0) and left the last bin
# with d == 10000 only. ceiling() assigns distance d to bin k when
# 500 * (k - 1) < d <= 500 * k, so the 20 bins tile (0, 10000] exactly.
dobs <- distances %>%
    filter(dregF & distance <= 10000 & source == "Observed") %>%
    filter(type %in% c("++", "--")) %>%
    select(distance) %>%
    unlist(use.names = FALSE)
dobs <- tabulate(ceiling(dobs / 500), nbins = 20)

dper <- distances %>%
    filter(dregF & distance <= 10000 & source == "Permuted") %>%
    filter(type %in% c("++", "--")) %>%
    select(distance) %>%
    unlist(use.names = FALSE)
dper <- tabulate(ceiling(dper / 500), nbins = 20)

out <- rbind(dobs, dper) %>%
    t %>%
    as.data.frame %>%
    # Bars are centred on the bin midpoints (0.25, 0.75, ... kbp).
    ggplot(aes(x = (1:20) / 2 - 0.25, y = dobs / dper)) +
    # ggtitle("Observed / Permuted") +
    geom_bar(stat = "identity", fill = rgb(0.7,0.1,0.5,0.5)) +
    geom_hline(yintercept = 1, color = "black") +
    geom_hline(yintercept = 1.1, color = "steelblue4") +
    xlab("Distance between molecules (kbp)") +
    ylab("Number of Pol II pairs \n Observed/Permuted") +
    coord_cartesian(ylim = c(0.75, 1.33))
out
ggsave(out, filename = "../plots/PolII_distances_density_Observed-Permuted.pdf", width = 6, height = 4)

In [None]:
# Observed / Permuted ratio of pair counts in 50-bp distance bins, for rows
# passing dregF whose `type` is "++" or "--".
# tabulate() truncates distance / 50 to an integer, so bin k holds distances
# in [50 * k, 50 * k + 50); with nbins = 50 anything from 2550 bp up is
# ignored. Bar k is drawn at x = k / 20 kbp.
dobs <- unlist(
    distances %>%
        filter(dregF & distance <= 10000 & source == "Observed") %>%
        filter(type %in% c("++", "--")) %>%
        select(distance),
    use.names = FALSE
)
dobs <- tabulate(dobs/50, nbins = 50)

dper <- unlist(
    distances %>%
        filter(dregF & distance <= 10000 & source == "Permuted") %>%
        filter(type %in% c("++", "--")) %>%
        select(distance),
    use.names = FALSE
)
dper <- tabulate(dper/50, nbins = 50)

out <- as.data.frame(t(rbind(dobs, dper))) %>%
    ggplot(aes(x = (1:50)/20, y = dobs/dper)) +
    # ggtitle("Observed / Permuted") +
    geom_bar(stat = "identity", fill = rgb(0.7,0.1,0.5,0.5)) +
    scale_fill_manual(values = rgb(0.7,0.1,0.5,0.5)) +
    geom_hline(yintercept = 1, color = "#666666", linetype = 2) +
    # geom_hline(yintercept=1.1, color="yellow") +
    labs(
        x = "Distance between RNA polymerases (kb)",
        y = "Ratio of RNA polymerase pairs \n Observed/Permuted"
    ) +
    coord_cartesian(ylim = c(0.7, 1.3))
out
ggsave(filename = "../plots/PolII_distances_density_Observed-Permuted_2500bp.pdf", plot = out, width = 5, height = 4)

In [None]:
# Overall Observed / Permuted ratio: total binned pair count in dobs divided
# by the total in dper (uses whichever binning was computed last).
sum(dobs)/sum(dper)

In [None]:
# Wilcoxon rank-sum test comparing the per-bin counts of dobs and dper.
# NOTE(review): the two vectors are matched bin by bin, so a paired test
# (paired = TRUE) may be what is intended - confirm.
wilcox.test(dobs, dper)