In [None]:
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
# .libPaths(c("/home/mahat/.conda/envs/r422/lib/R/library",
#             "/net/bmc-lab2/data/lab/scgroseq/group/software/R/x86_64-pc-linux-gnu-library/4.2"))
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(dplyr)
    library(matrixStats)
    library(foreach)
    library(doParallel)
    library(plyranges)
    library(viridis)
    library(ggpointdensity)
    library(Matrix)
    library(data.table)
    library(Hmisc)
    library(gplots)
    library(rstatix)
    library(scattermore)
    library(igraph)
    library(ggraph)
    library(network)
    library(sna)
    library(statnet)
    library(rtracklayer)
    library(clusterProfiler)
    library(stringr)
    library(org.Mm.eg.db)
    library(circlize)
    library(gplots)
    library(RIdeogram)
    library(enrichplot)
    library(GOSemSim)
    library(dynamicTreeCut)
    library(WGCNA)
    library(ggrastr)
});
source("./scGRO_functions.r");

In [None]:
registerDoParallel(16);
options(
    repr.plot.width=8,
    repr.plot.height=8,
    jupyter.plot_mimetypes = "image/svg+xml",
    digits=5
);
theme_set(theme_classic() +
    theme(
        axis.title.x = element_text(color="black", size=14, face="bold"), 
        axis.title.y = element_text(color="black", size=14, face="bold"),
        axis.text = element_text(color="black", size=12, face="bold"),
        plot.title = element_text(face="bold", size=14, hjust = 0.5),
        axis.line = element_blank(),
        # axis.ticks = element_blank()
        panel.border = element_rect(colour = "grey", fill=NA, linewidth=1)
    )
);

In [None]:
# Gene annotation that carries a per-gene dREG flag; it is consulted by the
# gene filter further down.
genesWithdREGstatus = readRDS("../data/groHMM_dREG_refinedGenes_mES_mm10.rds")
genesWithdREGstatus

# Feature annotation read from BED.
# Feature sets used previously, kept for reference:
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed");
# features = read_bed("../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed");
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_OSNenhancersPlusSEs_v2.bed");
features = read_bed("../data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")

# Promote the BED name column to range names so features can be indexed by
# name, then drop the name and score metadata columns.
names(features) = features$name
features$name = NULL
features$score = NULL
features

In [None]:
# Gene-length census (number of genes below each length cutoff):
#   < 1500 nt: 2931
#   < 1000 nt: 2209
#   <  500 nt:  484
#   <  200 nt:  147

# Short Gm### genes: names starting with "GN-Gm" and a width strictly
# between 200 and 1000 nt.
shortGm = features %>%
    filter( startsWith(names, "GN-Gm") & width > 200 & width < 1000 )
length(shortGm)
summary(width(shortGm))
names(shortGm)[1:5]

# Highly expressed "blacklist" genes to be removed from correlations:
# names matching "GN-Sn[oh]" first, followed by names matching "GN-Rp[ls]".
blacklist = names(features)[
    unlist( lapply( c("GN-Sn[oh]", "GN-Rp[ls]"), grep, x = names(features) ) )
]
length(blacklist)
summary(width(features[ blacklist ]))
blacklist[c(1:5, 110:115)]

In [None]:
# Specify feature length (half at the start and other half at the end)
# to be trimmed to eliminate the effect of paused Pol II at TSS and TES
trimEndLength =  1000;
# specify maximum gene length to be used for G-E correlation:
maxLength = 10000;

In [None]:
# Select the genes used for the gene-gene correlation and trim them.
# Uses the parameters set in the cell above:
#   trimEndLength - total length removed from a long gene (half per end)
#   maxLength     - maximum gene length kept after trimming
genes = features %>%
    # genes only (feature names carry a "GN-" prefix)
    filter( substr(names, 1, 3) == "GN-" ) %>%
    # remove genes less than 200 nt (mostly Gm genes)
    filter( width >= 200 ) %>%
    # keep genes flagged dREG == TRUE, plus genes absent from the dREG table
    filter( names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == TRUE]) |
           !names %in% names(genesWithdREGstatus) ) %>%
    # remove the highly expressed blacklist genes
    filter( !names %in% blacklist ) %>%
    # remove the short Gm genes collected in shortGm
    filter( !names %in% names(shortGm) );

# Trimming is done in two rounds. Each round shortens every gene that is
# still at least trimEndLength wide by trimEndLength / 2 around its centre,
# i.e. trimEndLength / 4 from either end. With trimEndLength = 1000:
#   width >= 1500 : 1000 nt removed in total (500 nt from each end)
#   width 1000-1499 : 500 nt removed in total (250 nt from each end)
#   width < 1000  : untouched
trimStep = trimEndLength / 2;
for (trimRound in 1:2) {
    trimGenes = which( width(genes) >= trimEndLength );
    genes[trimGenes] = genes[trimGenes] %>%
        anchor_center() %>%
        mutate( width = width - trimStep );
}

# cap long genes at maxLength, keeping the start of the range fixed
longf = which( width(genes) >= maxLength );
genes[longf] = genes[longf] %>%
    resize( width = maxLength, fix="start" );
length(genes)
# summary(width(genes %>% filter(width <1000)))
summary(width(genes))

In [None]:
# # create 10kbp time-bins within each feature
# timebins1 = resize( genes,  width= 10000, fix="start" );

# # intersect with original features to trim excess
# # (some features are <30 kbp)
# timebins1 = pintersect( timebins1, genes );

In [None]:
# Consolidated scGRO object and the pre-filtered count matrix.
scGRO  = readRDS("../data/scGROv2p8_consolidated.rds")
counts = readRDS("../data/scGROv2p8_mapq3qc_max10kbp_filtered_counts.rds")
# optional per-cell depth filter, currently disabled:
#counts = counts[,colSums(counts) >= 1000 ];
dim(counts)

# The columns of the count matrix define the set of cells used downstream.
allcells = colnames(counts)

In [None]:
table(substr(allcells, 0, 6));

In [None]:
# Keep reads that pass the mapq threshold and every QC flag
scGRO = filter( scGRO, mapq >= 3 & countQC & umiQC & plateQC & cellQC & miRQC )
# ... and that overlap one of the selected genes
scGRO = subsetByOverlaps( scGRO, genes )
# Merge experiment ID, plate and cell barcode into a cell ID that is unique
# across experiments
scGRO = mutate( scGRO, cellID = factor( paste( Exp, Plate, Cell, sep="-") ) )
# Restrict to the cells present in the count matrix
scGRO = filter( scGRO, cellID %in% allcells )
# optional: scGRO = select( scGRO, cellID )

scGRO

In [None]:
# Build the gene x cell count matrix: entry [g, c] is the number of reads
# of cell c that overlap gene g.
#
# The previous version filtered the complete read set once per cell inside
# a foreach loop. A single findOverlaps() pass yields every (gene, read)
# pair; sparseMatrix() sums repeated (i, j) pairs, which gives the same
# per-cell overlap counts without the per-cell scan.
# NOTE(review): assumes allcells contains no duplicated IDs - confirm.
counts = local({
    hits = findOverlaps( genes, scGRO );
    # column index of the cell each overlapping read belongs to
    cellIdx = match( as.character(scGRO$cellID[ subjectHits(hits) ]), allcells );
    # ignore reads whose cell is not listed in allcells
    keep = !is.na(cellIdx);
    sparseMatrix(
        i = queryHits(hits)[keep],
        j = cellIdx[keep],
        x = 1,
        dims = c( length(genes), length(allcells) ),
        # rows = genes, columns = cells = cellIDs
        dimnames = list( names(genes), allcells )
    );
});
dim(counts)

In [None]:
# Histogram of the per-gene detection rate: for every gene, the fraction of
# cells in which at least one read was counted (rowMeans(counts > 0)).
# The labels previously described read counts, which is not what is plotted.
data.frame(
    x=rowMeans(counts>0)
) %>%
    ggplot( aes(x=x) ) +
    # binwidth is in log10 units because of the log-scaled x axis
    geom_histogram(binwidth=0.05) +
    scale_x_log10() +
    ggtitle("Gene detection rate") +
    xlab("Fraction of cells with >= 1 read") +
    ylab("Number of genes");

In [None]:
# only retain features with counts in 0.1% or more cells;
# drop = FALSE keeps the result a matrix even if a single gene passes
observed = counts[ rowMeans(counts>0) >= 0.001, , drop = FALSE ];
dim(observed)

In [None]:
# Niters = 100;
# Ncells = ncol(observed);
# Ngenes = nrow(observed);
# Nreads = colSums(observed);

# # sampling probability = read count / total reads in each cell
# bin1total = colSums(observed[, allcells]);
# pmatrix1 = t(t(observed[, allcells]) / bin1total);

# # average across cells
# pvector1 = rowMeans(pmatrix1);

## GxG Correlation:

In [None]:
# transforming the matrix to calculate corr, hmisc package reqires genes in columns and cells in rows
obsx = as.matrix(t(observed))
dim(obsx)
obsx[1:5,1:5]

In [None]:
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# Turn a square correlation matrix and its p-value matrix into a long table
# with one row per entry of the upper triangle (diagonal excluded).
#
# cormat : matrix of the correlation coefficients; its row names label both
#          members of each pair
# pmat   : matrix of the correlation p-values, same layout as cormat
#
# Returns a data.frame with columns geneA (row label), geneB (column label),
# corr and pVal, ordered column by column through the upper triangle.
flattenCorrMatrix = function(cormat, pmat) {
  # linear (column-major) positions of the upper-triangle entries
  upperIdx = which(upper.tri(cormat))
  nRows = nrow(cormat)
  labels = rownames(cormat)
  data.frame(
    # row and column of a linear index i are (i - 1) %% nRows + 1 and
    # (i - 1) %/% nRows + 1
    geneA = labels[(upperIdx - 1) %% nRows + 1],
    geneB = labels[(upperIdx - 1) %/% nRows + 1],
    corr = cormat[upperIdx],
    pVal = pmat[upperIdx]
  )
}

#### Pearson correlation on non-binarized data:

In [None]:
# Pearson correlation between all gene pairs with Hmisc::rcorr, which
# returns the coefficient matrix ($r) and the p-value matrix ($P).
corrP = rcorr(obsx, type = "pearson")

# Long table of gene pairs with Bonferroni-adjusted p-values and the ranks
# of coefficient and p-value, then written to disk.
corrPflat = flattenCorrMatrix( corrP$r, corrP$P ) %>%
    mutate(
        pAdj = p.adjust(pVal, method = "bonferroni"),
        corrRank = rank(corr),
        pValRank = rank(pVal)
    )
dim(corrPflat)
fwrite(corrPflat, file="../data/scGROv2p8_GxG_max10kbp_pearson_correlation.csv.gz")

In [None]:
# Diagnostic plots for the Pearson results. corrPlot is reused for every
# figure and ends up holding the coefficient-vs-p-value plot.

# coefficient against its rank, y axis limited to [-0.1, 0.3]
corrPlot = ggplot(corrPflat, aes(x = corrRank, y = corr)) +
    geom_point(aes(color = pAdj), pch = '.') +
    scale_color_viridis() +
    ylim(-0.1, 0.3) +
    labs(title = "GxG Pearson Corr", x = "Rank", y = "Pearson corr. coeff.")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_pearson_corrCoeff_rank.png", plot = corrPlot, width=4, height=4)

# p-value against its rank
corrPlot = ggplot(corrPflat, aes(x = pValRank, y = pVal)) +
    geom_point(aes(color = pAdj), pch = '.') +
    scale_color_viridis() +
    labs(title = "GxG Pearson Corr", x = "Rank", y = "Pearson corr. p-value")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_pearson_corrPvalue_rank.png", plot = corrPlot, width=4, height=4)

# p-value against coefficient
corrPlot = ggplot(corrPflat, aes(x = corr, y = pVal)) +
    geom_point(aes(color = pAdj), pch = '.') +
    scale_color_viridis() +
    labs(title = "GxG Pearson Corr", x = "Pearson corr. coeff.", y = "Pearson corr. p-value")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_pearson_corrCoeff_vs_pvalue.png", plot = corrPlot, width=4, height=4)

In [None]:
# To resume from disk instead of recomputing:
# corrPflat = fread("../data/scGROv2p8_GxG_max10kbp_pearson_correlation.csv.gz");

# Pairs with 0.15 <= corr < 1 and adjusted p-value <= 0.05
corrPearson = filter( corrPflat, corr >= 0.15, corr < 1, pAdj <= 0.05 )
nrow(corrPearson)
head(corrPearson)

# Same selection with the looser cutoff corr >= 0.125, for comparison
test = filter( corrPflat, corr >= 0.125, corr < 1, pAdj <= 0.05 )
nrow(test)
head(test)

#### Spearman correlation on non-binarized data:

In [None]:
# Spearman correlation between all gene pairs with Hmisc::rcorr, which
# returns the coefficient matrix ($r) and the p-value matrix ($P).
corrS = rcorr(obsx, type = "spearman")

# Long table of gene pairs with Bonferroni-adjusted p-values and the ranks
# of coefficient and p-value, then written to disk.
corrSflat = flattenCorrMatrix( corrS$r, corrS$P ) %>%
    mutate(
        pAdj = p.adjust(pVal, method = "bonferroni"),
        corrRank = rank(corr),
        pValRank = rank(pVal)
    )
dim(corrSflat)
fwrite(corrSflat, file="../data/scGROv2p8_GxG_max10kbp_spearman_correlation.csv.gz")

In [None]:
# Diagnostic plots for the Spearman results. corrPlot is reused for every
# figure and ends up holding the coefficient-vs-p-value plot.

# coefficient against its rank, y axis limited to [-0.1, 0.3]
corrPlot = ggplot(corrSflat, aes(x = corrRank, y = corr)) +
    geom_point(aes(color = pAdj), pch = '.') +
    scale_color_viridis() +
    ylim(-0.1, 0.3) +
    labs(title = "GxG spearman Corr", x = "Rank", y = "Spearman corr. coeff.")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_spearman_corrCoeff_rank.png", plot = corrPlot, width=4, height=4)

# p-value against its rank
corrPlot = ggplot(corrSflat, aes(x = pValRank, y = pVal)) +
    geom_point(aes(color = pAdj), pch = '.') +
    scale_color_viridis() +
    labs(title = "GxG spearman Corr", x = "Rank", y = "Spearman corr. p-value")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_spearman_corrPvalue_rank.png", plot = corrPlot, width=4, height=4)

# p-value against coefficient
corrPlot = ggplot(corrSflat, aes(x = corr, y = pVal)) +
    geom_point(aes(color = pAdj), pch = '.') +
    scale_color_viridis() +
    labs(title = "GxG spearman Corr", x = "Spearman corr. coeff.", y = "Spearman corr. p-value")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_spearman_corrCoeff_vs_pvalue.png", plot = corrPlot, width=4, height=4)

In [None]:
# To resume from disk instead of recomputing:
# corrSflat = fread("../data/scGROv2p8_GxG_max10kbp_spearman_correlation.csv.gz");

# Pairs with 0.15 <= corr < 1 and adjusted p-value <= 0.05
corrSpearman = filter( corrSflat, corr >= 0.15, corr < 1, pAdj <= 0.05 )
nrow(corrSpearman)
head(corrSpearman)

# Same selection with the looser cutoff corr >= 0.125, for comparison
test = filter( corrSflat, corr >= 0.125, corr < 1, pAdj <= 0.05 )
nrow(test)
head(test)

#### Pearson/Spearman on binary matrix
##### Pearson and Spearman on binary data give the same result

In [None]:
# Binarize the observed counts: every count above 1 is capped at 1.
obsb = observed
obsb[obsb > 1] = 1
# rcorr expects genes in columns and cells in rows, so densify and transpose
obsb = t(as.matrix(obsb))
dim(obsb)
obsb[1:5, 1:5]

In [None]:
# Pearson correlation on the binarized matrix with Hmisc::rcorr, which
# returns the coefficient matrix ($r) and the p-value matrix ($P).
corrB = rcorr(obsb, type = "pearson")

# Long table of gene pairs with adjusted p-values and ranks, written to disk.
# NOTE(review): p-values are FDR-adjusted here, whereas the Pearson and
# Spearman cells on the raw counts use Bonferroni - confirm this is intended.
corrBflat = flattenCorrMatrix( corrB$r, corrB$P ) %>%
    mutate(
        pAdj = p.adjust(pVal, method = "fdr"),
        corrRank = rank(corr),
        pValRank = rank(pVal)
    )
dim(corrBflat)
fwrite(corrBflat, file="../data/scGROv2p8_GxG_max10kbp_binary_correlation.csv.gz")

In [None]:
# Pairs with 0.15 <= corr < 1 and adjusted p-value <= 0.05
corrBinary = filter( corrBflat, corr >= 0.15, corr < 1, pAdj <= 0.05 )
nrow(corrBinary)
head(corrBinary)

# Same selection with the looser cutoff corr >= 0.125, for comparison
test = filter( corrBflat, corr >= 0.125, corr < 1, pAdj <= 0.05 )
nrow(test)
head(test)

In [None]:
# Diagnostic plots for the binary-matrix correlation. corrPlot is reused for
# every figure and ends up holding the coefficient-vs-p-value plot.

# coefficient against its rank, y axis limited to [-0.1, 0.3]
corrPlot = ggplot(corrBflat, aes(x = corrRank, y = corr)) +
    geom_point(aes(color = pAdj), pch = 16) +
    scale_color_viridis() +
    ylim(-0.1, 0.3) +
    labs(title = "GxG Binary Corr", x = "Rank", y = "Binary corr. coeff.")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_rank.png", plot = corrPlot, width=4, height=4)

# p-value against its rank
corrPlot = ggplot(corrBflat, aes(x = pValRank, y = pVal)) +
    geom_point(aes(color = pAdj), pch = 16) +
    scale_color_viridis() +
    labs(title = "GxG Binary Corr", x = "Rank", y = "Binary corr. p-value")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_binary_corrPvalue_rank.png", plot = corrPlot, width=4, height=4)

# p-value against coefficient; no title, legend placed inside the panel
corrPlot = ggplot(corrBflat, aes(x = corr, y = pVal)) +
    geom_point(aes(color = pAdj), pch = 16) +
    scale_color_viridis() +
    theme(
        legend.position = c(0.875, 0.5),
        legend.key.size = unit(5, 'mm'),
        legend.title = element_text(size=14),
        legend.background = element_blank()
    ) +
    labs(x = "Pearson correlation coefficient", y = "Pearson correlation p-value")
ggsave(filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_vs_pvalue.png", plot = corrPlot, width=4, height=4)

In [None]:
# Reload the binary correlation table from disk.
corrBflat = fread("../data/scGROv2p8_GxG_max10kbp_binary_correlation.csv.gz")
dim(corrBflat)

# Rasterised variant, kept for reference:
# corrPlot = corrBflat %>%
#     ggplot(aes( x = corr, y = pVal)) +
#     ggrastr::rasterise(geom_point(pch = ".", aes(color = pAdj))) +
#     scale_color_viridis(option = "H", direction = -1) +
#     # ggtitle("GxG Binary Corr") +
#     theme(legend.position = c(0.875, 0.5),   legend.key.size = unit(4, 'mm'), 
#           legend.title = element_text(size=14), legend.background = element_blank()) +
#     xlab("Pearson correlation coefficient") +
#     ylab("Pearson correlation \n p-value") ;
# ggsave(corrPlot, filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_vs_pvalue_lowHeight_raster.pdf", width=4, height=2)

# p-value against coefficient, reversed viridis "E" palette, legend inside
# the panel
corrPlot = ggplot(corrBflat, aes(x = corr, y = pVal)) +
    geom_point(aes(color = pAdj), pch = 16) +
    scale_color_viridis(option = "E", direction = -1) +
    theme(
        legend.position = c(0.875, 0.5),
        legend.key.size = unit(2, 'mm'),
        legend.title = element_text(size=12),
        legend.background = element_blank()
    ) +
    labs(x = "Correlation coefficient", y = "Correlation \n p-value")

# the same plot saved at full height (tiff) and at low height (tiff, png)
ggsave(filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_vs_pvalue.tiff", plot = corrPlot, width=4, height=4)
ggsave(filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_vs_pvalue_lowHeight.tiff", plot = corrPlot, width=4, height=2)
ggsave(filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_vs_pvalue_lowHeight.png", plot = corrPlot, width=4, height=2)

In [None]:
ggsave(corrPlot, filename = "../plots/scGRO_GxG_max10kbp_binary_corrCoeff_vs_pvalue_midHeight.tiff", width=4, height=3)

#### Read Chi-square GxG correlation

In [None]:
corrChiSquare = fread(file="../data/scGROv2p8_mapq3qc_max10kbp_1Kpermuted_chisq_empp.csv.gz");
dim(corrChiSquare)
corrChiSquare[1:4, ]

#### Merge the 4 GxG tests: three types of correlation and a chi-square test

In [None]:
# corrPflat = fread(file="../data/scGROv2p8_GxG_max10kbp_pearson_correlation.csv.gz")
# dim(corrPflat)
# corrSflat = fread(file="../data/scGROv2p8_GxG_max10kbp_spearman_correlation.csv.gz")
# dim(corrSflat)
# corrBflat = fread(file="../data/scGROv2p8_GxG_max10kbp_binary_correlation.csv.gz")
# dim(corrBflat)

In [None]:
# Merge the four gene-pair tests into one table keyed on (geneA, geneB).
# Columns are picked by name rather than by position so the cell works
# whether the flat tables are the in-memory data.frames or data.tables
# re-read with fread(). dplyr::select is namespaced because other attached
# packages also export a select().
# Resulting columns: corr.P / pAdj.P (Pearson), corr.S / pAdj.S (Spearman),
# corr / pAdj (binary), followed by the chi-square columns.
corr = left_join(
        dplyr::select(corrPflat, geneA, geneB, corr, pAdj),
        dplyr::select(corrSflat, geneA, geneB, corr, pAdj),
        by = c("geneA", "geneB"), suffix = c(".P", ".S")
    ) %>%
    left_join(., dplyr::select(corrBflat, geneA, geneB, corr, pAdj), by = c("geneA", "geneB") ) %>%
    left_join(., corrChiSquare, by = c("geneA", "geneB") ) #%>%
    # na.omit();
dim(corr)
corr[1:4, ]

# fwrite(corr, file="../data/scGROv2p8_GxG_max10kbp_ALL_correlation.csv.gz")

## Read the merged file:

In [None]:
corr = fread("../data/scGROv2p8_GxG_max10kbp_ALL_correlation.csv.gz");
dim(corr)

In [None]:
corr[1:5, ]

# Pairs that have a binary correlation but no empirical (permutation)
# p-value. Missing values must be tested with is.na(): comparing against the
# string "NA" evaluates to NA for missing entries, so filter() dropped every
# row and the result was always empty.
test = corr %>%
    filter( is.na(emp_p_pos) & !is.na(corr) );
    # na.omit();
dim(test)
head(test, 10)

### Co-transcribed genes on the same chromosome:

In [None]:
# Keep only gene pairs whose two members lie on the same chromosome.
chr_corr = corr %>%
    filter(as.character(seqnames(features[geneA])) == as.character(seqnames(features[geneB])));
dim(chr_corr)
# lookup gene attributes; ranges are in the same row order as chr_corr.
# (The two lookups were previously swapped, so lengthA / strA described
# geneB and vice versa.)
geneA = features[ chr_corr$geneA ];
geneB = features[ chr_corr$geneB ];
chr_corr$lengthA = width(geneA);
chr_corr$lengthB = width(geneB);
chr_corr$strA    = as.character(strand(geneA));
chr_corr$strB    = as.character(strand(geneB));

# compute distance between gene promoters: resize to the 1-nt start of each
# range, drop the strand so pgap() compares positions only, and take the
# width of the gap in between (in bp)
proA = resize(geneA, width=1, fix="start");
proB = resize(geneB, width=1, fix="start");
strand(proA) = "*";
strand(proB) = "*";
chr_corr$distance = width(pgap( proA, proB ));

# chr_corr = chr_corr %>% 
#     filter(distance < 10e6);
nrow(chr_corr)
chr_corr[1:4, ]

In [None]:
# Histograms of promoter-to-promoter distance for same-chromosome pairs up
# to 2.5 Mb apart. distance is in bp, so the axes are labelled in bp (they
# previously said kb). Each ggsave() call writes the plot built just above.

# all pairs
chr_corr %>%
    filter( distance <= 2500000 ) %>%
    ggplot( aes(x=distance)) +
    geom_histogram(bins = 50, color="white", fill="#445577") +
    # xlim(0, 2500000) +
    # scale_x_log10() +
    ggtitle("All genes on either strand") +
    xlab("Distance between pair (bp)") +
    ylab("Number of feature pairs");
ggsave(filename="../plots/GxG_distance_between_allGenes.pdf", width=4, height=4, units="in")

# correlated pairs (corr > 0.1 and adjusted p < 0.05)
chr_corr %>%
    filter( corr > 0.1 & pAdj < 0.05 ) %>%
    # filter( corr > 0.1 & pAdj < 0.05 & emp_p_pos < 0.05 ) %>%
    filter( distance <= 2500000 ) %>%
    ggplot( aes(x=distance)) +
    geom_histogram(bins = 50, color="white", fill="#445577") +
    # xlim(0, 2500000) +
    # scale_x_log10() +
    ggtitle("Correlated genes on either strand") +
    xlab("Distance between pair (bp)") +
    ylab("Number of feature pairs");
ggsave(filename="../plots/GxG_distance_between_corrGenes.pdf", width=4, height=4, units="in")


# correlated pairs on opposite strands; the histogram shows counts, so the
# y axis is labelled as a number of pairs rather than a density
chr_corr %>%
    filter( corr > 0.1 & pAdj < 0.05 ) %>%
    # filter( corr > 0.1 & pAdj < 0.05 & emp_p_pos < 0.05 ) %>%
    filter( distance <= 2500000 ) %>%
    filter( strA != strB ) %>%
    ggplot( aes(x=distance)) +
    geom_histogram(bins = 50, color="white", fill="#445577") +
    xlim(0, 2500000) +
    xlab("Distance between genes in different strand (bp)") +
    ylab("Number of gene pairs");
ggsave(filename="../plots/GxG_distance_between_corrGenes_diffStrands.pdf", width=4, height=4, units="in")

# correlated pairs on the same strand
chr_corr %>%
    filter( corr > 0.1 & pAdj < 0.05 ) %>%
    # filter( corr > 0.1 & pAdj < 0.05 & emp_p_pos < 0.05 ) %>%
    filter( distance <= 2500000 ) %>%
    filter( strA == strB ) %>%
    ggplot( aes(x=distance)) +
    geom_histogram(bins = 50, color="white", fill="#445577") +
    xlim(0, 2500000) +
    xlab("Distance between genes in same strand (bp)") +
    ylab("Number of gene pairs");
ggsave(filename="../plots/GxG_distance_between_corrGenes_sameStrands.pdf", width=4, height=4, units="in")

In [None]:
# Same-chromosome pairs at most 2.5 Mb apart, sorted by geneA.
chr_corrF = arrange( filter( chr_corr, distance <= 2500000 ), geneA )
dim(chr_corrF)
summary(chr_corrF$distance)

# Label every pair so that all vs significant pairs can be drawn as
# overlapping histograms: "correlated pairs" need corr >= 0.075,
# pAdj <= 0.05 and emp_p_pos <= 0.05.
# NOTE(review): a pair that passes the first two tests but has a missing
# emp_p_pos gets class NA - confirm this is intended.
chr_corrF$class = with(
    chr_corrF,
    ifelse( corr >= 0.075 & pAdj <= 0.05 & emp_p_pos <= 0.05,
            "correlated pairs", "uncorrelated pairs" )
)
summary(as.factor(chr_corrF$class))
chr_corrF[1:4, ]

In [None]:
# Distance distribution of correlated vs uncorrelated pairs.
# distance is in bp, so the x axes are labelled in bp (they previously said
# kb).

# Overlaid histograms, each normalised to a density so the two classes are
# comparable despite their different sizes.
chr_corrF %>%
    ggplot( aes(x=distance, fill=class)) +
    geom_histogram(bins = 25, position = "identity", 
                   alpha = 0.5, 
                   # color = "white",
                   # mapping = aes(y = after_stat(count))) +
                   # mapping = aes(y = after_stat(ncount))) +
                   # mapping = aes(y = after_stat(count/sum(count)))) +
                   mapping = aes(y = after_stat(density))) +
    # geom_density() +
    # scale_x_continuous(limits = c(0, 2500000)) +
    # scale_x_continuous(breaks = seq(0, 2500000, 100000)) +
    # stat_bin(aes(y=..count.., label=..count..), geom="text", vjust=2) +
    xlim(0, 2500000) +
    theme(legend.position = c(0.8, 0.9), 
          legend.text = element_text(size=10, face="bold"),
          legend.key.size = unit(0.5, 'cm')) +
    guides(fill=guide_legend(title="")) +
    # scale_x_log10() +
    # ggtitle("Co-expressed genes") +
    xlab("Distance between gene pairs (bp)") +
    ylab("Normalized gene pairs")
ggsave(filename="../plots/GxG_distance_corr-uncorr.pdf", width=6, height=6, units="in")

# One facet per class with free scales. These panels show raw counts, so
# the y axis is labelled as a number of pairs.
chr_corrF %>%
    ggplot( aes(x=distance, fill=class)) +
    geom_histogram(bins = 25, position = "identity") +
    xlim(0, 2500000) +
    theme(legend.position = "none") +
    guides(fill=guide_legend(title="")) +

    facet_wrap(~class, scales = "free") +
    scale_fill_manual(values=c("#1F968B", "darkgray")) +
    theme(strip.background = element_blank(),
        strip.text = element_text(size = 14, face="bold")) +

    # ggtitle("Co-expressed genes") +
    xlab("Distance between gene pairs (bp)") +
    ylab("Number of gene pairs")
ggsave(filename="../plots/GxG_distance_corr-uncorr_facets.pdf", width=8, height=6, units="in")

# two-sample Kolmogorov-Smirnov test of distance between the two classes
ks.test(distance ~ class, chr_corrF);

In [None]:
# Correlated vs uncorrelated distance distributions, split by strand.
# Each KS test now runs on the same strand-specific subset as the plot above
# it; both previously tested the full chr_corrF table and therefore repeated
# the all-pairs result. distance is in bp, so the axes are labelled in bp.

# pairs on different strands
chr_corrF %>%
    filter( strA != strB ) %>%
    ggplot( aes(x=distance, fill=class)) +
    geom_histogram(bins = 25, position = "identity", alpha = 0.5, 
                   mapping = aes(y = after_stat(density))) +
    xlim(0, 2500000) +
    theme(legend.position = c(0.8, 0.9), 
          legend.text = element_text(size=10, face="bold"),
          legend.key.size = unit(0.5, 'cm')) +
    guides(fill=guide_legend(title="")) +
    xlab("Distance between genes in different strand (bp)") +
    ylab("Density of gene pairs");
ggsave(filename="../plots/GxG_distance_between_corr-uncorr_Genes_diffStrands.pdf", width=4, height=4, units="in")
ks.test(distance ~ class, data = filter(chr_corrF, strA != strB));

# pairs on the same strand
chr_corrF %>%
    filter( strA == strB ) %>%
    ggplot( aes(x=distance, fill=class)) +
    geom_histogram(bins = 25, position = "identity", alpha = 0.5, 
                   mapping = aes(y = after_stat(density))) +    
    xlim(0, 2500000) +
    theme(legend.position = c(0.8, 0.9), 
          legend.text = element_text(size=10, face="bold"),
          legend.key.size = unit(0.5, 'cm')) +
    guides(fill=guide_legend(title="")) +
    xlab("Distance between genes in same strand (bp)") +
    ylab("Density of gene pairs");
ggsave(filename="../plots/GxG_distance_between_corr-uncorr_Genes_sameStrands.pdf", width=4, height=4, units="in")
ks.test(distance ~ class, data = filter(chr_corrF, strA == strB));

In [None]:
# Adjusted p-value against promoter distance for correlated pairs on
# different strands, coloured by local point density.
# distance is in bp, so the x axis is labelled in bp (it previously said kb).
chr_corr %>%
    filter( strA != strB ) %>%
    filter( distance <= 2500000 ) %>%
    filter( corr > 0.1 & pAdj < 0.05 ) %>%
    ggplot( aes(x=distance, y=-log10(pAdj)) ) +
    geom_pointdensity(size=0.5) +
    xlim(0, 2500000) +
    scale_color_viridis() +
    ggtitle("Gene pairs on different strands") +
    xlab("Distance between pair (bp)") +
    ylab("-log10 P");
In [None]:
# Adjusted p-value against promoter distance for correlated pairs on the
# same strand, coloured by local point density.
# distance is in bp, so the x axis is labelled in bp (it previously said kb).
chr_corr %>%
    filter( strA == strB ) %>%
    filter( distance <= 2500000 ) %>%
    filter( corr > 0.1 & pAdj < 0.05 ) %>%
    ggplot( aes(x=distance, y=-log10(pAdj)) ) +
    geom_pointdensity(size=0.5) +
    xlim(0, 2500000) +
    scale_color_viridis() +
    ggtitle("Gene pairs on same strands") +
    xlab("Distance between pair (bp)") +
    ylab("-log10 P");

### Dendrogram & dynamic tree cut

In [None]:
# corrP$r[1:4,1:4]
# dim(corrP$r)

### Gene modules

In [None]:
corr = fread("../data/scGROv2p8_GxG_max10kbp_ALL_correlation.csv.gz");
dim(corr)

In [None]:
# Edge list for a quick network preview: strongly correlated pairs only.
# (The same-chromosome and distance constraints below are disabled.)
corrF = corr %>%
    # filter(as.character(seqnames(genes[geneA])) == as.character(seqnames(genes[geneB]))) %>%
    # filter( distance <= 500000 ) %>%
    filter( corr > 0.2 & pAdj < 0.05 ) %>%
    # removing pairs with Gm genes, as these could be overlapping with enhancers:
    filter( !startsWith(geneA, "GN-Gm") ) %>%
    filter( !startsWith(geneB, "GN-Gm") ) %>%
    # removing genes whose name ends in "Rik":
    filter( !endsWith(geneA, "Rik") ) %>%
    filter( !endsWith(geneB, "Rik") );
dim(corrF)

# undirected graph; the first two columns (geneA, geneB) define the edges
ggnetR = corrF %>%
    # mutate( Gene = sub("GN-", "", Gene) ) %>%
    graph_from_data_frame(directed = F);

# The layout argument of igraph's plot is `layout`; the previous `vlayout`
# is not a plot parameter, so the sphere layout was never applied.
# NOTE(review): this changes how the figure looks compared with earlier
# runs - confirm the sphere layout is the one wanted.
plot(ggnetR, 
     layout = layout_on_sphere(ggnetR),
     vertex.shape = 'none',
     vertex.label = NA,
     # vertex.color = "#cee2f4",
     # vertex.size=0.1,
     # vertex.label.family = "Helvetica",
     # vertex.label.font = 1,
     # vertex.label.color = "white",
     # vertex.frame.color = "white",
     ## 80 is 50% transparency
     edge.color = "#46849980",
     edge.arrow.size = .1,  
     edge.width = 0.5
     )

In [None]:
## THE PLOTS IN PAPER WERE MADE WITH CRITERIA ##
# corrF = corr %>% filter( corr > 0.1 & corr < 1 & pAdj < 0.05 ); # 227,701

## TRYING NEW THRESHOLD INCLUDING THE EMP P VALUE ##
corrF = filter( corr, corr > 0.1, pAdj < 0.05, emp_p_pos < 0.05 ) # 137,418

dim(corrF)
corrF[1:4,]

# Pair counts recorded for the earlier criteria:
# corr > 0.10 & corr < 1 & pAdj <= 0.05 = 227701
# corr > 0.15 & corr < 1 & pAdj <= 0.05 = 49064
# corr > 0.20 & corr < 1 & pAdj <= 0.05 = 7384

In [None]:
# Summarise one Leiden clustering of `network` at the given resolution.
#
# network    : igraph graph to cluster
# resolution : value passed as resolution_parameter to cluster_leiden()
#
# Returns a numeric vector of length 2:
#   [1] number of modules with at least 10 members
#   [2] total number of vertices contained in those modules
optimize_resolution = function(network, resolution) {
    clustering = cluster_leiden(
        network,
        resolution_parameter = resolution,
        objective_function = "modularity"
    )

    # members per module, then only the modules that reach the size cutoff
    module_sizes = table(as.vector(membership(clustering)))
    large_modules = module_sizes[module_sizes >= 10]

    c(length(large_modules), as.numeric(sum(large_modules)))
}

In [None]:
# Undirected graph of the selected pairs, with the "GN-" prefix removed from
# both gene columns (the first two columns define the edges).
ggnetR = corrF %>%
    mutate( across( c(geneA, geneB), ~ sub("GN-", "", .x) ) ) %>%
    graph_from_data_frame(directed = FALSE)
length(ggnetR)
ggnetR

In [None]:
# Draw the gene network as edges only (no vertex shapes or labels).
# The layout argument of igraph's plot is `layout`; the previous `vlayout`
# is not a plot parameter, so the sphere layout was never applied.
# NOTE(review): this changes how the figure looks compared with earlier
# runs - confirm the sphere layout is the one wanted.
plot(ggnetR, 
     layout = layout_on_sphere(ggnetR),
     vertex.shape = 'none',
     vertex.label = NA,
     # vertex.color = "#cee2f4",
     # vertex.size=0.1,
     # vertex.label.family = "Helvetica",
     # vertex.label.font = 1,
     # vertex.label.color = "white",
     # vertex.frame.color = "white",
     ## 80 is 50% transparency
     edge.color = "#46849980",
     edge.arrow.size = .1,  
     edge.width = 0.5
     )

In [None]:
# Small illustrative network: a 12-vertex preferential-attachment graph
# clustered by edge betweenness and drawn with a Fruchterman-Reingold layout.
set.seed(3)
# sample_pa() is the current name of the generator; barabasi.game() is its
# deprecated alias
g <- sample_pa(12, m=4, directed=FALSE)
eb <- cluster_edge_betweenness(g)
plot(eb, g, 
     layout=layout_with_fr,
     # vlayout = layout_on_sphere(ggnetR),
     # rescale = FALSE,
     vertex.label = NA,
     # vertex.label.family = "Helvetica",
     # vertex.label.font = 1,
     edge.arrow.size = .1, 
     # edge.color = "gray", 
     edge.width = 3,
     vertex.color = "#cee2f4",
     vertex.size=10, 
     # vertex.label.color = "white",
     vertex.frame.color = "white")
# ggsave(filename="../plots/dummy_GxG_network.pdf", width=16, height=16, units="in")

In [None]:
# Vertex indices of each community, keeping only communities with more than
# one member, passed to plot() as the groups to mark.
MG = lapply(unique(eb$membership), function(m) which(eb$membership == m))
MG = MG[lengths(MG) != 1]
plot(eb, g, mark.groups=MG, layout=layout_with_fr) 

In [None]:
# Sweep the Leiden resolution from 0.1 to 20 in steps of 0.1 and record, for
# each value, the two numbers returned by optimize_resolution(): V1 = number
# of modules with >= 10 genes, V2 = genes contained in those modules.
# vapply() replaces purrr::map_dfc(), which is superseded and emits a
# name-repair message for every unnamed result; it also checks that each run
# returns exactly two numbers.
resolutions = seq(from = 0.1, to = 20, by = 0.1);
optimization_resultsR = vapply(
    resolutions,
    optimize_resolution,
    FUN.VALUE = numeric(2),
    network = ggnetR
) %>%
    t() %>%
    cbind( resolution = resolutions ) %>% 
    as.data.frame()
optimization_resultsR

In [None]:
# Leiden clusterings (modularity objective) of the gene network at a few
# resolutions; the suffix of each name is the resolution (p = decimal point).
# NOTE(review): no random seed is set before these calls, so module
# assignments may differ between runs - consider calling set.seed() first.
ggmods0p1 = cluster_leiden(ggnetR, resolution_parameter = 0.1, objective_function = "modularity")
ggmods1   = cluster_leiden(ggnetR, resolution_parameter = 1,   objective_function = "modularity")
ggmods2   = cluster_leiden(ggnetR, resolution_parameter = 2,   objective_function = "modularity")
ggmods2p5 = cluster_leiden(ggnetR, resolution_parameter = 2.5, objective_function = "modularity")
ggmods3   = cluster_leiden(ggnetR, resolution_parameter = 3,   objective_function = "modularity")
ggmods5   = cluster_leiden(ggnetR, resolution_parameter = 5,   objective_function = "modularity")

In [None]:
# Histogram of module membership (one bar per module number, log-scaled
# counts) for the clusterings at resolution 1, 2, 2.5, 3 and 5, drawn in
# that order. The resolution-0.1 clustering (ggmods0p1) is not plotted.
invisible(lapply(
    list(ggmods1, ggmods2, ggmods2p5, ggmods3, ggmods5),
    function(mods) {
        modulePlot = data.frame(
            module = unlist(as.vector(membership(mods)))
        ) %>%
            ggplot(aes(x=module)) +
            geom_histogram(binwidth=1) +
            scale_y_log10() +
            xlab("Module number") +
            ylab("Number of genes")
        # explicit print: plots built inside a function are not auto-displayed
        print(modulePlot)
    }
))

In [None]:
ggmodUSED = ggmods2

In [None]:
# Write one row per gene with its module number and the size of that module,
# sorted by module.
data.frame(
    gene_ID = names(membership(ggmodUSED)),
    module = unlist(as.vector(membership(ggmodUSED)))
) %>%
    # mod.size = number of genes sharing the module
    add_count( module, name = "mod.size" ) %>%
    mutate( gene_ID = sub("GN-", "", gene_ID, fixed = TRUE) ) %>%
    arrange( module ) %>%
    fwrite("../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_withEmpPpos_res2.csv")

In [None]:
# only keep modules with more than 10 genes; from here on ggmodUSED is the
# list of member-gene vectors (one element per module), no longer the
# clustering object
ggmodUSED = groups(ggmodUSED)[lengths(groups(ggmodUSED)) > 10]
length(ggmodUSED)

In [None]:
# Translate the gene symbols of every module to Entrez IDs; names are
# stripped so that each module is a plain vector of IDs.
suppressMessages({
    entrezids = lapply(ggmodUSED, function(symbols) {
        unname( mapIds(org.Mm.eg.db, keys = symbols, column = 'ENTREZID', keytype = 'SYMBOL') )
    })
})

In [None]:
universe = entrezids %>% unlist %>% unique;

In [None]:
# uni = sub("GN-", "", rownames(observed))
# uni[1:4]
# univ = mapIds(org.Mm.eg.db, uni, 'ENTREZID', 'SYMBOL');
# univ = unname(univ)
# universe = univ %>% unlist %>% unique;
# universe[1:4]

In [None]:
# GO over-representation analysis run separately for every module.
res = compareCluster(
    geneClusters = entrezids,
    fun = "enrichGO",          # ORA function applied to each cluster
    # everything below is passed on to enrichGO
    OrgDb = "org.Mm.eg.db",
    keyType = "ENTREZID",
    ont = "ALL",               # BP, CC, MF, or ALL for all ontologies
    pvalueCutoff = 0.01,
    qvalueCutoff = 0.01,
    pAdjustMethod = "BH",      # p-values are adjusted within clusters
    universe = universe,
    minGSSize = 5,
    maxGSSize = 1000
)
# show gene symbols instead of Entrez IDs in the result
res = setReadable(res, OrgDb = org.Mm.eg.db, keyType = "ENTREZID")
dim(res)

In [None]:
res %>%
    arrange(Cluster, pvalue) %>%
    head(n=514)

In [None]:
# saveRDS(res, "../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2.rds")

saveRDS(res, "../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_withEmpPpos_res2.rds")

In [None]:
# Write the enrichment table, sorted by cluster and p-value, as a
# tab-separated file.
# Earlier output name, kept for reference:
# "../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_enrichGO.csv"
res %>%
    arrange(Cluster, pvalue) %>%
    # mutate(geneID=NULL) %>%
    write.table(
        "../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_withEmpPpos_res2_enrichGO.csv",
        sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE
    )

### Read the Cluster Profiler result:

In [None]:
res = readRDS("../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2.rds")

# res = readRDS("../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_withEmpPpos_res2.rds")

In [None]:
res %>%
    arrange(Cluster, pvalue) %>%
    head(n=300)

In [None]:
# select non-redundant GO terms for display:
# Select non-redundant GO terms for display (IDs that appear twice in the
# list have no additional effect).
results = filter(
    res,
    ID %in% c(
        "GO:0003729", "GO:0008380", "GO:1990904", "GO:0006325", "GO:0000375",
        "GO:0006417", "GO:0016570", "GO:0031570", "GO:0042752", "GO:0003730",
        "GO:0006403", "GO:1990830", "GO:0016607", "GO:0045944", "GO:0016605",
        "GO:0007049", "GO:0009048", "GO:0006511", "GO:0031060", "GO:0006413",
        "GO:0031507", "GO:0022402", "GO:0006376", "GO:0005685", "GO:0016573",
        "GO:0016605", "GO:1901796", "GO:0045454", "GO:0000075", "GO:0022904",
        "GO:1901987", "GO:0034063", "GO:0007623", "GO:0005643", "GO:0032200",
        "GO:0042752", "GO:0006281", "GO:0003743", "GO:0010564", "GO:0010494"
    )
) #%>%
    # mutate(geneID=NULL);
dim(results)
results %>%
    arrange(Cluster, pvalue) %>%
    head( n = 100 )

In [None]:
# Export the selected GO terms, ordered by cluster then p-value, as tab-separated text.
# NOTE: the output keeps its ".csv" extension although the separator is a tab.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
results %>%
    arrange(Cluster, pvalue) %>%
    # mutate(geneID=NULL) %>%
    # fwrite("../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_enrichGO.csv")
    write.table("../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_enrichGO_selected.csv", sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
    # write.table("../data/scGROv2p8_GxGmodules_max10kbp_binary_corr_withEmpPpos_res2_enrichGO_selected.csv", sep='\t', quote=F, col.names=T, row.names=F)

In [None]:
# Dot plot of the selected GO terms, one column per cluster, coloured by adjusted p-value.
dotplot(
    results,
    x = "Cluster",
    by = "geneRatio",
    color = "p.adjust",
    size = NULL,
    split = NULL,
    showCategory = 50,
    includeAll = TRUE,
    title = "",
    font.size = 12,
    label_format = 30
)
# ggsave() is called without `plot`, so it writes the most recently created ggplot.
ggsave(
    filename = "../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_new_enrichGO.pdf",
    units = "in", width = 12, height = 12
)
# ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_withEmpPpos_res2_new_enrichGO.pdf", width=12, height=12, units="in")

In [None]:
# Bar plot of the numeric gene ratio per selected GO term, filled by q-value.
# Terms appear on the y axis in the order they occur after sorting by cluster and p-value.
bplot = ggplot(
        data = results %>%
            arrange(Cluster, pvalue) %>%
            data.frame() %>%
            # "a/b" strings in GeneRatio are split into Overlap (a) and Total (b) ...
            separate(col = GeneRatio, into = c("Overlap", "Total"), sep = "/") %>%
            # ... and turned back into a numeric ratio
            mutate(GeneRatio = as.numeric(Overlap) / as.numeric(Total)),
        mapping = aes(x = GeneRatio, y = fct_inorder(Description), fill = qvalue)
    ) +
    geom_col() +
    scale_fill_viridis() +
    ylab(NULL)
bplot
ggsave(
    filename = "../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_barplot.pdf",
    units = "in", width = 10, height = 6
)

In [None]:
# Enrichment map of the selected GO terms: term-term similarity first, then the map itself.
eplot = pairwise_termsim(results)
emapplot(
    eplot,
    showCategory = 35,
    layout.params = list(layout = NULL, coords = NULL),
    edge.params = list(show = TRUE, min = 0.1),
    pie.params = list(pie = "equal", legend_n = 5),
    cex.params = list(category_node = 1, category_label = 1, line = 1, pie2axis = 1, label_group = 1),
    hilight.params = list(category = NULL, alpha_hilight = 1, alpha_no_hilight = 0.3),
    cluster.params = list(cluster = FALSE, method = stats::kmeans, n = NULL, legend = FALSE, label_style = "shadowtext", label_words_n = 4, label_format = 30)
)
ggsave(
    filename = "../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_emapplot.pdf",
    units = "in", width = 6, height = 6
)

In [None]:
# to remove redundant terms:
# clusterProfiler:: is spelled out because igraph, which is attached as well, also exports a
# simplify(); an unqualified call would depend on the order in which the packages were attached.
simplifiedResults = clusterProfiler::simplify(res, cutoff=0.6, by="p.adjust", select_fun=min)
# term-term similarity of the reduced set, then its enrichment map (20 categories)
eplotS = pairwise_termsim(simplifiedResults)
emapplot(eplotS, showCategory = 20)
# ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_emapplot_simplified.pdf", width=6, height=6, units="in")

In [None]:
# Gene-concept network of the selected GO terms (15 categories, Kamada-Kawai "kk" layout).
cplot = cnetplot(
    results,
    showCategory = 15,
    layout = "kk",
    circular = FALSE,
    colorEdge = FALSE,
    node_label = "all",
    shadowtext = "all",
    color_category = "#E5C494",
    color_gene = "#B3B3B3"
    # alternative ways of passing the same settings, kept for reference:
    # foldChange = NULL,
    # cex_category = 1,
    # cex_gene = 1,
    # cex_label_category = 1,
    # cex_label_gene = 1,
    # color.params = list(foldChange = NULL, edge = FALSE, category = "#E5C494", gene ="#B3B3B3"),
    # cex.params = list(category_node = 1, gene_node = 1, category_label = 1, gene_label = 1),
    # hilight.params = list(category = NULL, alpha_hilight = 1, alpha_no_hilight = 0.3)
)
# cplot is only assigned, not printed; ggsave() without `plot` writes the most recently created ggplot.
ggsave(
    filename = "../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_cnetplot.pdf",
    units = "in", width = 12, height = 12
)

### network diagram using igraph:

In [None]:
# Names exported by the attached igraph package that start with "layout_", minus the first match.
layouts = ls("package:igraph", pattern = "^layout_")[-1]
layouts

In [None]:
# Plot the co-expression network among the genes annotated to one GO term.
#
# Arguments:
#   corrMatrix : data frame of gene pairs with columns geneA, geneB, corr and pAdj.
#   GOresult   : enrichment result handed to getGOgenes().
#   GOterm     : GO identifier handed to getGOgenes(), e.g. "GO:0007049".
#                The row names of the getGOgenes() result are matched against geneA/geneB
#                after the "GN-" prefix has been stripped.
#   minCorr    : a pair becomes an edge when corr > minCorr (default 0.1; e.g. 0.14 for a sparser plot).
#   maxPAdj    : ... and pAdj <= maxPAdj (default 0.05).
#   strongCorr : edges with corr > strongCorr are drawn in "tomato", all others in "gray" (default 0.15).
#   layout     : igraph layout function or coordinate matrix; the default, igraph::layout_nicely,
#                is the layout igraph picks when none is given.
#   file       : optional output path. A name ending in ".svg" is written with svg(), anything
#                else with pdf(). ggsave() cannot be used because igraph draws with base graphics.
#   width, height : size of the saved figure in inches (only used when `file` is given).
#
# Returns (invisibly) the igraph object that was drawn, so the figure can be re-plotted later.
networkPlot = function(corrMatrix, GOresult, GOterm,
                       minCorr = 0.1, maxPAdj = 0.05, strongCorr = 0.15,
                       layout = igraph::layout_nicely,
                       file = NULL, width = 12, height = 12 ){
    genesList = getGOgenes(GOresult, GOterm);
    goGenes = rownames(genesList);

    edges = corrMatrix %>%
        filter( corr > minCorr & pAdj <= maxPAdj ) %>%
        mutate( geneA = sub("GN-", "", geneA), geneB = sub("GN-", "", geneB) ) %>%
        # keep a pair only when both partners belong to the GO term
        filter( geneA %in% goGenes & geneB %in% goGenes );

    net = graph_from_data_frame(edges, directed = FALSE);

    if (nrow(edges) == 0) {
        warning(sprintf("networkPlot: no gene pair of %s passes the filters; nothing to plot", GOterm),
                call. = FALSE);
        return(invisible(net));
    }

    # edge colour depends on the correlation strength
    E(net)$color = ifelse(E(net)$corr > strongCorr, "tomato", "gray");

    # Resolve the layout once so that the on-screen and the saved figure share the same coordinates.
    coords = if (is.function(layout)) layout(net) else layout;

    drawNet = function(){
        plot(net,
             # `layout` is the argument name igraph's plot method reads;
             # an unknown name would be ignored without any message
             layout = coords,
             vertex.label.family = "Helvetica",
             vertex.label.font = 2,
             edge.arrow.size = .1,
             edge.width = 3,
             vertex.color = "#cee2f4",
             vertex.frame.color = "white",
             vertex.label.color = "black");
    }

    # draw on the active (notebook) device
    drawNet();

    # optionally draw a second time into a file device
    if (!is.null(file)) {
        if (grepl("\\.svg$", file, ignore.case = TRUE)) {
            grDevices::svg(file, width = width, height = height);
        } else {
            grDevices::pdf(file, width = width, height = height);
        }
        # close the file device even if drawing fails
        on.exit(grDevices::dev.off(), add = TRUE);
        drawNet();
    }

    return(invisible(net));
}

In [None]:
# Cell-cycle network (GO:0007049), built from corrF.
# USE (presumably the edge filter inside networkPlot for this figure):
# filter( corr > 0.14 & pAdj <= 0.05 ) %>%

cell_cycle = networkPlot(corrMatrix = corrF, GOresult = results, GOterm = "GO:0007049");
# ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_0007049_network.svg", width=12, height=12, units="in")

# SAVING WITH ggsave() DOES NOT WORK: the network is drawn with base graphics, not ggplot2.
# SAVE MANUALLY AS SVG BY RIGHT CLICKING (or wrap the call in svg(...) / dev.off()).

In [None]:
# Network for GO:1901987, built from corr.
cc_phaseTransition = networkPlot(corrMatrix = corr, GOresult = results, GOterm = "GO:1901987");
# ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_1901987_network.pdf", width=12, height=12, units="in")

In [None]:
# Network for GO:0006281, built from corr.
DNA_repair = networkPlot(corrMatrix = corr, GOresult = results, GOterm = "GO:0006281");
# ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_0006281_network.pdf", width=12, height=12, units="in")

In [None]:
# Network for GO:0008380, built from corrF.
splicing = networkPlot(corrMatrix = corrF, GOresult = results, GOterm = "GO:0008380") #%>%
    # ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_0008380_network.pdf", width=12, height=12, units="in")

In [None]:
# Network for GO:0042752, built from corr.
circadian = networkPlot(corrMatrix = corr, GOresult = results, GOterm = "GO:0042752") #%>%
    # ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_0042752_network.pdf", width=12, height=12, units="in")

In [None]:
# Network for GO:1990830, built from corr.
LIF = networkPlot(corrMatrix = corr, GOresult = results, GOterm = "GO:1990830") #%>%
    # ggsave(filename="../plots/scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_1990830_network.pdf", width=12, height=12, units="in")

### make bed file of genes in the GO class for motif discovery with HOMER

In [None]:
# Promoter windows of GO-term genes, restricted to genes whose co-expressed partner is in the
# same GO term.
#
# Arguments:
#   corrMatrix : gene-pair table with columns geneA, geneB, corr and pAdj.
#   GOresult, GOterm : forwarded to getGOgenes(); the row names of its result are compared
#                      with geneA/geneB after removing "GN-".
# Reads the global `features` object.
# Returns a data.frame of promoters(upstream = 1000, downstream = 500) for the retained features.
bedGOgenes = function( corrMatrix, GOresult, GOterm ){
    goSymbols = rownames(getGOgenes(GOresult, GOterm));
    dropPrefix = function(ids) sub("GN-", "", ids);

    # Pairs passing corr >= 0.1 & pAdj <= 0.05 with both partners in the GO term. Because of this
    # threshold, relatively strongly co-expressed genes are kept even when corr (not corrF) is supplied.
    coexPairs = filter(
        corrMatrix,
        corr >= 0.1 & pAdj <= 0.05,
        dropPrefix(geneA) %in% goSymbols,
        dropPrefix(geneB) %in% goSymbols
    );
    pairedIDs = c(coexPairs$geneA, coexPairs$geneB);

    # `name` gets the ID without "GN-"; selection is done on the prefixed `names` values
    renamedFeatures = mutate(features, name = sub("GN-", "", names));
    keptFeatures = filter(renamedFeatures, names %in% pairedIDs);

    promoterWindows = promoters(keptFeatures, upstream = 1000, downstream = 500);
    data.frame(promoterWindows)
}

In [None]:
# make a directory
# write bed files of gene promoters in each GO IDs:
dir.name = "scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_promoters"
# showWarnings = FALSE: re-running the cell must not warn when the directory already exists
dir.create(paste0("../data/", dir.name), showWarnings = FALSE);

suppressMessages({
    # The same GO ID may occur in several clusters; unique() writes each file only once.
    # A plain loop is used because only the side effect (the BED file) is wanted.
    for (id in unique(data.frame(results)$ID)) {
        # use corr instead of corrF. bedGOgenes() filters corr >= 0.1 & pAdj <= 0.05
        GO_promoters = bedGOgenes(corr, results, id);
        # file name is the GO ID with ":" replaced by "_"
        export.bed(GO_promoters, con=paste0( "../data/", dir.name, "/", sub(":", "_", id), ".bed"));
    }
});

### Ideogram

In [None]:
# Input tables for the ideogram: mm10 karyotype and mm10 gene density.
mm10_karyotype = fread(input = "../data/mm10_karyotype.csv");
mm10_gene_density = fread(input = "../data/mm10_gene_density.csv");

In [None]:
# Genes of GO:0006281 as returned by getGOgenes(); displayed below and used as ideogram labels.
GO_0006281 = getGOgenes(results, "GO:0006281");
# GO_0006281[1:4, ]
GO_0006281

In [None]:
# Ideogram of mm10 with gene density overlaid and the GO:0006281 genes drawn as markers; written as SVG.
ideogram(
    karyotype = mm10_karyotype,
    overlaid = mm10_gene_density,
    label_type = "marker",
    label = GO_0006281,
    output = "../plots/Ideogram_GO_0006281_scGROv2p8_GxGmodules_max10kbp_binary_corr_res.svg"
)
# svg2pdf("../plots/Ideogram_ggmodules_3xtimebins10kbp_ChrConstrain_res4.svg")