In [None]:
# Point R at the library directory of the `r422` conda environment, then echo
# the library search path that is actually in effect.
.libPaths(new = "/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(ggExtra)
    library(matrixStats)
    library(plyranges)
    library(viridis)
    library(data.table)
    library(ggbio)
    library(ggridges)
    library(ggbeeswarm)
    library(ggpointdensity)
});
source("./scGRO_functions.r");

In [None]:
# Notebook display settings: default plot canvas of 4 x 12, rendered as SVG.
options(
  repr.plot.width = 4,
  repr.plot.height = 12,
  jupyter.plot_mimetypes = "image/svg+xml"
)

# Default ggplot2 theme for all plots created after this cell: theme_classic()
# with bold black axis titles/text, a centred bold plot title, and a grey panel
# border drawn in place of the axis lines.
theme_set(
  theme_classic() +
    theme(
      axis.title.x = element_text(color = "black", size = 14, face = "bold"),
      axis.title.y = element_text(color = "black", size = 14, face = "bold"),
      axis.text    = element_text(color = "black", size = 12, face = "bold"),
      plot.title   = element_text(face = "bold", size = 14, hjust = 0.5),
      axis.line    = element_blank(),
      # axis.ticks = element_blank()
      panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
    )
)

In [None]:
# Get the fitted line's equation and r^2 as a plotmath string.
# Adapted from https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA
#
# Args:
#   x, y: numeric vectors of equal length (predictor and response); the model
#         fitted is lm(y ~ x).
# Returns:
#   A length-one character vector containing a deparsed plotmath expression of
#   the form  y = a + b . x,  r^2 = R  (or  y = a - b . x  for a negative
#   slope). Intercept and slope are formatted with digits = 2, r^2 with
#   digits = 3. The string can be turned back into an expression with
#   parse(text = ...).
lm_eqn <- function(x, y) {
  m <- lm(y ~ x)
  slope <- unname(coef(m)[2])

  vals <- list(
    a = format(unname(coef(m)[1]), digits = 2),
    # The sign of the slope is carried by the operator chosen below, so only
    # its magnitude is formatted; this avoids labels such as "a + -0.8 . x".
    b = format(abs(slope), digits = 2),
    r2 = format(summary(m)$r.squared, digits = 3)
  )

  # isTRUE() keeps the original "+" form when the slope is NA (e.g. constant x).
  eq <- if (isTRUE(slope < 0)) {
    substitute(
      italic(y) == a - b %.% italic(x) * "," ~ ~italic(r)^2 ~ "=" ~ r2,
      vals
    )
  } else {
    substitute(
      italic(y) == a + b %.% italic(x) * "," ~ ~italic(r)^2 ~ "=" ~ r2,
      vals
    )
  }

  as.character(as.expression(eq))
}

In [None]:
# Feature annotation (BED) used by the cells below. The two commented-out
# calls are alternative annotation files kept for reference.
#features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed");
# features = read_bed("../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed");
features <- read_bed(
  "../data/dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed"
)
# Name each range after its `name` column so features can be looked up as
# features["GN-..."]; the `score` column is dropped.
names(features) <- features$name
features$score <- NULL
features

# # select genes longer than 5kb from each feature
# # truncate genes longer than 30kb to 30kb
# # (NOTE: the commented-out code below uses 3000 and 10000 bp, not 5 kb / 30 kb)
# features = features %>%
#     filter( width(features) >= 3000 | substr(name, 0, 2) != "GN" ) %>%
#     anchor_center() %>%
#     mutate(width=width-1000);
# length(features)

# longf = which( width(features) > 10000 );
# features[longf] = features[longf] %>%
#      resize( width = 10000, fix="start" );
# length(features)

In [None]:
# Load the consolidated scGRO object and the filtered counts object (v2p8).
# The v2p9 inputs below are an alternative data set, left commented out.
scGRO <- readRDS("../data/scGROv2p8_consolidated.rds")
counts <- readRDS("../data/scGROv2p8_mapq3qc_filtered_counts.rds")
# scGRO  = readRDS("../jay_m/data_jay_m/scGROv2p9_filtered_consolidated.rds");
# counts = readRDS( "../jay_m/data_jay_m/scGROv2p9_mapq3qc_filtered_counts.rds" );
# hist(colSums(counts))
# head(scGRO)
# head(counts)

In [None]:
# Prepare the per-read ranges used by every plot below:
#   1. keep records that pass all five QC flags;
#   2. build a cell ID that is unique across experiments (Exp-Plate-Cell);
#   3. shrink every range to width 1, anchored at its end (fix = "end");
#   4. keep only cells that appear as column names of `counts`;
#   5. keep `cellID` as the only metadata column.
scGRO <- scGRO %>%
  filter(umiQC & plateQC & cellQC & countQC & miRQC) %>%
  mutate(cellID = factor(paste(Exp, Plate, Cell, sep = "-"))) %>%
  # filter( !(Exp == "Exp236" & Plate %in% paste0("c0", 5:8)) ) %>%
  # filter( !(Exp == "Exp260b") ) %>%
  resize(width = 1, fix = "end") %>%
  filter(cellID %in% colnames(counts)) %>%
  # subsetByOverlaps(features) %>%
  select(cellID)
# Drop range names, then display the result.
names(scGRO) <- NULL
scGRO

In [None]:
# Control data set: the same ranges as `scGRO`, but with the cell labels
# randomly permuted across records (unused factor levels dropped first).
# NOTE(review): sample() draws from the session RNG and no seed is set in the
# visible cells, so the permutation differs from run to run.
scrambled <- scGRO
scrambled$cellID <- sample(droplevels(scGRO$cellID))

In [None]:
# Inspect the annotation: the full feature set, the "GN-Nanog" entry, and the
# widths of the "GN-Pou5f1", "GN-Sox2" and "GN-Nanog" entries.
features
features["GN-Nanog"]
width(features["GN-Pou5f1"])
width(features["GN-Sox2"])
width(features["GN-Nanog"])

In [None]:
# Polymerase view over the "GN-Npm1" feature: real data first, then the
# label-scrambled control with identical settings.
plot_polymerase_view(scGRO, features["GN-Npm1"],
                     max.cells = 200, min.rpc = 2, sortcells = FALSE)
plot_polymerase_view(scrambled, features["GN-Npm1"],
                     max.cells = 200, min.rpc = 2, sortcells = FALSE)
# ggsave(filename="../plots/Npm1_PolPosPerCell.pdf", width=4, height=8, units="in")

# Explicit minus-strand window on chr11 (saved as "Npm1_extended"), real data
# and then the scrambled control.
# plot_polymerase_view( scGRO, GRanges("chr11:33143012-33166451:-"), max.cells=500, min.rpc=2, sortcells = F );
plot_polymerase_view(scGRO, GRanges("chr11:33148622-33164684:-"),
                     max.cells = 500, min.rpc = 2, sortcells = FALSE)
# No `plot` argument is given, so ggsave() writes ggplot2's last plot.
ggsave(filename = "../plots/Npm1_extended_PolPosPerCell.pdf", width = 4, height = 12, units = "in")
plot_polymerase_view(scrambled, GRanges("chr11:33148622-33164684:-"),
                     max.cells = 500, min.rpc = 2, sortcells = FALSE)

In [None]:
# Re-source the helper definitions (presumably to pick up edits to
# scGRO_functions.r without restarting the session).
source("./scGRO_functions.r")
# plot_polymerase_view( scGRO, features["GN-Hells"], max.cells=2000, min.rpc=2, sortcells = F );
# Plus-strand window on chr19 (the commented-out calls suggest this is the
# Hells locus), then the same coordinates with no strand specified.
plot_polymerase_view(scGRO, GRanges("chr19:38923054-38999071:+"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)
# ggsave(filename="../plots/Hells_PolPosPerCell.pdf", width=8, height=8, units="in")
plot_polymerase_view(scGRO, GRanges("chr19:38923054-38999071"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)
# ggsave(filename="../plots/Hells_PolPosPerCell_2plus.pdf", width=8, height=8, units="in")

In [None]:
# Enhancer: window on chr3, no strand specified.
plot_polymerase_view(scGRO, GRanges("chr3:96433609-96438833"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)
# ggsave(filename="../plots/Enhancer_chr3_PolPosPerCell_shortGraph.pdf", width=8, height=8, units="in")

In [None]:
# plot_polymerase_view( scrambled, features["GN-Pnn"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Gadd45a"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Ubald1"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Mars2"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Apex1"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Polr3d"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Psmb4"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Rpl38"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Rpl24"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Ssrp1"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Ddx39"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Cdt1"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Rbm15"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Purb"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Arf6"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Tuba4a"], max.cells=1000, min.rpc=1, sortcells = F );

In [None]:
# chr11 minus-strand window (files are named "Npm1") drawn with different
# max.cells values: the scrambled control at 100 cells, then real data at
# 150, 100, 75 and 50 cells. Each view is followed by ggsave(), which writes
# ggplot2's last plot because no `plot` argument is given.
plot_polymerase_view(scrambled, GRanges("chr11:33143012-33166451:-"),
                     max.cells = 100, min.rpc = 2, sortcells = FALSE)
ggsave(filename = "../plots/Npm1_PolPosPerCell_scrambled_100cells.pdf", width = 4, height = 8, units = "in")

plot_polymerase_view(scGRO, GRanges("chr11:33143012-33166451:-"),
                     max.cells = 150, min.rpc = 2, sortcells = FALSE)
ggsave(filename = "../plots/Npm1_PolPosPerCell_150cells.pdf", width = 4, height = 8, units = "in")

plot_polymerase_view(scGRO, GRanges("chr11:33143012-33166451:-"),
                     max.cells = 100, min.rpc = 2, sortcells = FALSE)
ggsave(filename = "../plots/Npm1_PolPosPerCell_100cells.pdf", width = 4, height = 8, units = "in")

plot_polymerase_view(scGRO, GRanges("chr11:33143012-33166451:-"),
                     max.cells = 75, min.rpc = 2, sortcells = FALSE)
ggsave(filename = "../plots/Npm1_PolPosPerCell_75cells.pdf", width = 4, height = 8, units = "in")

plot_polymerase_view(scGRO, GRanges("chr11:33143012-33166451:-"),
                     max.cells = 50, min.rpc = 2, sortcells = FALSE)
ggsave(filename = "../plots/Npm1_PolPosPerCell_50cells.pdf", width = 4, height = 8, units = "in")
# plot_polymerase_view( scrambled, features["GN-Ahsa1"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Ptbp1"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Rbm15"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Emc4"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-C1qbp"], max.cells=1000, min.rpc=1, sortcells = F );
# plot_polymerase_view( scrambled, features["GN-Ftsj3"], max.cells=1000, min.rpc=1, sortcells = F );

In [None]:
# Scrambled control over the "GN-Cobl" feature, then real data over an
# explicit plus-strand window on chr11.
plot_polymerase_view(scrambled, features["GN-Cobl"],
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)
plot_polymerase_view(scGRO, GRanges("chr11:12196530-12510234:+"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)

In [None]:
#Armc1
plot_polymerase_view(scGRO, GRanges("chr3:19103396-19180323:-"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)

In [None]:
#Wwtr1
plot_polymerase_view(scGRO, GRanges("chr3:57399333-57601749:-"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)

In [None]:
# "GN-Nanog" feature, then an explicit chr6 window with no strand specified.
plot_polymerase_view(scGRO, features["GN-Nanog"],
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)
plot_polymerase_view(scGRO, GRanges("chr6:122700399-122722175"),
                     max.cells = 2000, min.rpc = 1, sortcells = FALSE)
# ggsave(filename="../plots/scGRO_Nanog_PolPosPerCell.pdf", width=7.5, height=10, units="in")

In [None]:
# "GN-Malat1" with a stricter min.rpc (5) and at most 100 cells.
plot_polymerase_view(scGRO, features["GN-Malat1"],
                     max.cells = 100, min.rpc = 5, sortcells = FALSE)

In [None]:
# "GN-Malat1" again with min.rpc = 3 and up to 1000 cells.
plot_polymerase_view(scGRO, features["GN-Malat1"],
                     max.cells = 1000, min.rpc = 3, sortcells = FALSE)

In [None]:
# chrX plus-strand window; `sortcells` is left at the function's default.
plot_polymerase_view(scGRO, GRanges("chrX:13038829-13199170:+"),
                     max.cells = 2000, min.rpc = 1)

In [None]:
# Short plus-strand window on chr10.
plot_polymerase_view(scGRO, GRanges("chr10:40258291-40258900:+"),
                     max.cells = 1000, min.rpc = 1, sortcells = FALSE)

In [None]:
# Scrambled control over "GN-Vbp1" (`sortcells` left at its default).
plot_polymerase_view(scrambled, features["GN-Vbp1"],
                     max.cells = 1000, min.rpc = 1)
# GN-Vbp1:
# Real data over an explicit chrX window; max.cells and min.rpc are left at
# the function's defaults.
plot_polymerase_view(scGRO, GRanges("chrX:75502070-75563748:+"),
                     sortcells = FALSE)

In [None]:
# Scrambled-label control views, one annotated feature per cell. Calls that
# omit `sortcells` use the function's default for it.
plot_polymerase_view(scrambled, features["GN-Esrrb"],
                     max.cells = 1000, min.rpc = 1)

In [None]:
plot_polymerase_view(scrambled, features["GN-Otx2"],
                     max.cells = 1000, min.rpc = 1)

In [None]:
plot_polymerase_view(scrambled, features["GN-Gli2"],
                     max.cells = 100, min.rpc = 1, sortcells = FALSE)

In [None]:
plot_polymerase_view(scrambled, features["GN-Exoc4"],
                     max.cells = 1000, min.rpc = 1, sortcells = FALSE)

In [None]:
plot_polymerase_view(scrambled, features["GN-Pvt1"],
                     max.cells = 1000, min.rpc = 1, sortcells = FALSE)

In [None]:
plot_polymerase_view(scrambled, features["GN-Actb"],
                     max.cells = 1000, min.rpc = 1, sortcells = FALSE)

In [None]:
# The next two cells contain the same "GN-Vbp1" call.
plot_polymerase_view(scrambled, features["GN-Vbp1"],
                     max.cells = 1000, min.rpc = 1)

In [None]:
plot_polymerase_view(scrambled, features["GN-Vbp1"],
                     max.cells = 1000, min.rpc = 1)

In [None]:
# Scatter plot, with marginal histograms, of polymerase distance from the
# start of a gene versus distance from the start of an enhancer, pairing
# records that share a cell ID.
#
# Args:
#   reads:       ranges object with a `cellID` metadata column (e.g. `scGRO`).
#   gene, enh:   names of the gene and enhancer entries to look up in `feats`.
#   feats:       named ranges indexed by `gene` / `enh`. Defaults to the global
#                `features` object, so existing three-argument calls behave
#                as before.
#   gene.max.kb: upper limit of the x axis (gene distance, kb). Default 50.
#   enh.max.kb:  upper limit of the y axis (enhancer distance, kb). Default 20.
#                Points outside the axis limits are not drawn.
# Returns:
#   The object returned by ggMarginal(p, type = "histogram").
marginal_corr <- function(reads, gene, enh, feats = features,
                          gene.max.kb = 50, enh.max.kb = 20) {
  query <- feats[c(gene, enh)]
  # 1-bp range at the start of each feature; distances are measured to these.
  qstart <- promoters(query, upstream = 0, downstream = 1)

  # Records overlapping the gene, each annotated with its distance to the
  # gene start. The distanceToNearest() result is stored as a single column
  # named `gene`; as.data.frame() below exposes it as `gene.distance`.
  genepol <- subsetByOverlaps(reads, query[1]) %>%
    mutate(gene = distanceToNearest(., qstart[1]))
  genepol$gstrand <- as.character(strand(genepol))

  # Same for the enhancer (`enh.distance`).
  enhpol <- subsetByOverlaps(reads, query[2]) %>%
    mutate(enh = distanceToNearest(., qstart[2]))
  enhpol$estrand <- as.character(strand(enhpol))

  # Restrict both tables to cells that have records in both features.
  genepol <- mcols(genepol) %>%
    as.data.frame() %>%
    filter(cellID %in% enhpol$cellID) %>%
    select(cellID, gene.distance, gstrand)

  enhpol <- mcols(enhpol) %>%
    as.data.frame() %>%
    filter(cellID %in% genepol$cellID) %>%
    select(cellID, enh.distance, estrand)

  # Joining on cellID alone pairs every gene record of a cell with every
  # enhancer record of that cell, so one cell can contribute several points.
  p <- left_join(genepol, enhpol, by = "cellID") %>%
    ggplot(aes(x = gene.distance / 1000, y = enh.distance / 1000)) +
    geom_point() +
    xlim(0, gene.max.kb) +
    ylim(0, enh.max.kb) +
    xlab("Gene distance transcribed (kb)") +
    ylab("Enhancer distance transcribed (kb)") +
    theme(legend.position = "none")

  ggMarginal(p, type = "histogram")
}

In [None]:
# Gene vs. enhancer distance plot for "GN-Sox2" and "Sox2_105kbDn_pl".
marginal_corr(scGRO, gene = "GN-Sox2", enh = "Sox2_105kbDn_pl")

In [None]:
# Gene vs. enhancer distance plot for "GN-Pou5f1" and "Pou5f1_25kbUp_mn".
marginal_corr(scGRO, gene = "GN-Pou5f1", enh = "Pou5f1_25kbUp_mn")