In [None]:
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
# .libPaths(c("/home/mahat/.conda/envs/r422/lib/R/library",
#             "/net/bmc-lab2/data/lab/scgroseq/group/software/R/x86_64-pc-linux-gnu-library/4.2"))
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(dplyr)
    library(matrixStats)
    library(foreach)
    library(doParallel)
    library(plyranges)
    library(viridis)
    library(ggpointdensity)
    library(Matrix)
    library(data.table)
    library(Hmisc)
    library(gplots)
    library(rstatix)
    library(scattermore)
    library(igraph)
    library(ggraph)
    library(clusterProfiler)
    library(rtracklayer)
    library(stringr)
    library(org.Mm.eg.db)
    library(circlize)
    library(gplots)
    library(RIdeogram)
    library(enrichplot)
    library(GOSemSim)
    library(dynamicTreeCut)
    library(WGCNA)
});
source("./scGRO_functions.r");

In [None]:
# Register the doParallel backend with 16 cores for the %dopar% loops below.
registerDoParallel(16)

# Notebook defaults: 8 x 6 inch SVG figures, 5 significant digits when printing.
options(
    repr.plot.width = 8,
    repr.plot.height = 6,
    jupyter.plot_mimetypes = "image/svg+xml",
    digits = 5
)

# Global ggplot2 theme: classic theme with bold black axis text, centred bold
# titles, no axis lines, and a grey panel border drawn instead.
theme_set(
    theme_classic() +
        theme(
            axis.title.x = element_text(color = "black", size = 14, face = "bold"),
            axis.title.y = element_text(color = "black", size = 14, face = "bold"),
            axis.text = element_text(color = "black", size = 12, face = "bold"),
            plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
            axis.line = element_blank(),
            # axis.ticks = element_blank()
            panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
        )
)

In [None]:
# Gene annotation carrying a per-gene dREG flag; used below as an expression filter.
genesWithdREGstatus <- readRDS("../data/groHMM_dREG_refinedGenes_mES_mm10.rds")
genesWithdREGstatus

# Feature set (groHMM-extended genes plus enhancers) read from BED.
# Earlier inputs, kept for reference:
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed");
# features = read_bed("../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed");
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_OSNenhancersPlusSEs_v2.bed");
features <- read_bed("../data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")

# Index the ranges by feature name (the name column is kept) and drop the
# BED score column.
names(features) <- features$name
# features$name=NULL;
features$score <- NULL
features

In [None]:
# Gene length tallies recorded from an earlier run:
#   < 1500 nt: 2931    < 1000 nt: 2209    < 500 nt: 484    < 200 nt: 147

# Short "GN-Gm..." genes: gene features (prefix "GN-") longer than 200 nt
# but shorter than 1 kb whose name starts with "GN-Gm".
shortGm <- features %>%
    filter(substr(names, 0, 3) == "GN-" & width > 200) %>%
    filter(width < 1000 & substr(names, 0, 5) == "GN-Gm")
length(shortGm)
summary(width(shortGm))
names(shortGm)[1:5]

# Highly expressed "blacklist" genes excluded from the correlations: names
# matching "GN-Sn[oh]" followed by names matching "GN-Rp[ls]"
# (presumably snoRNA/host genes and ribosomal protein genes; verify).
blacklist <- local({
    featureNames <- names(features)
    picked <- c(
        grep("GN-Sn[oh]", featureNames),
        grep("GN-Rp[ls]", featureNames)
    )
    featureNames[picked]
})
length(blacklist)
summary(width(features[blacklist]))
blacklist[c(1:5, 110:115)]

In [None]:
# Build the gene set used for counting.
# Steps, in order:
#   1. keep gene features (name prefix "GN-") of at least 1500 nt;
#   2. keep genes flagged dREG-positive, plus genes absent from the dREG
#      annotation altogether;
#   3. drop blacklisted genes and the short Gm genes;
#   4. trim 500 nt from the 5' end (3' end anchored, width reduced by 500).
# `TRUE` is spelled out because the alias `T` can be reassigned.
genes <- features %>%
    filter(substr(names, 0, 3) == "GN-") %>%
    # remove genes less than 1500 nt
    filter(width >= 1500) %>%
    # filter genes with dREG peaks
    filter(
        names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == TRUE]) |
            !names %in% names(genesWithdREGstatus)
    ) %>%
    # filter highly expressed genes
    filter(!names %in% blacklist) %>%
    # filter Gm genes shorter than 1 kb
    filter(!names %in% names(shortGm)) %>%
    # remove 500 nt at the 5' end
    anchor_3p() %>%
    mutate(width = width - 500)
summary(width(genes))
genes

In [None]:
# Enhancers: every non-gene feature (name not starting with "GN-") of at
# least 3 kb.
enhancers <- features %>%
    filter(substr(names, 0, 3) != "GN-" & width >= 3000)
summary(width(enhancers))
# Tally enhancers by the first three characters of their name.
table(substr(enhancers$name, 0, 3))
enhancers

In [None]:
# Load dREG peaks and keep, for every enhancer, the overlapping peak(s) with
# the highest dREG score.
# NOTE(review): the BED start is passed to GRanges() unchanged; if the file is
# 0-based, starts are off by one relative to read_bed() output. Confirm.
dREG <- read.table(
    "../data/PROseq_mES_BRsComb.dREG.peak.full.bed",
    header = FALSE,
    stringsAsFactors = FALSE
)
colnames(dREG) <- c("chr", "start", "end", "score", "pval", "center")
# Alternative kept for reference: use the "center" column as start
# colnames(dREG) = c("chr", "leftEnd", "rightEnd", "score", "pval", "start");
# dREG$end = dREG$start;
dREG <- GRanges(dREG)
summary(dREG$score)
# Optional score filter, currently disabled:
# dREG = dREG %>%
#    filter( score > median(score) );

# Label each dREG peak with an overlapping enhancer (strand ignored).
# Accessor functions are used instead of reading the Hits slots directly.
# A peak overlapping several enhancers keeps the last one assigned.
hits <- findOverlaps(dREG, enhancers, ignore.strand = TRUE)
dREG$assignedEnhancer <- NA_character_
dREG$assignedEnhancer[queryHits(hits)] <- names(enhancers)[subjectHits(hits)]

# Keep only the best-scoring peak per enhancer.
# NOTE(review): ties in `score` leave more than one peak for an enhancer.
dREGfilt <- dREG %>%
    # drop peaks that do not overlap an enhancer (explicit test rather than
    # na.omit(), whose behaviour on a GRanges is not clearly defined)
    filter(!is.na(assignedEnhancer)) %>%
    group_by(assignedEnhancer) %>%
    filter(score == max(score)) %>%
    ungroup() %>%
    # data.table form for the join in the next step
    as.data.table
dim(dREGfilt)
table(substr(dREGfilt$assignedEnhancer, 0, 3))
length(unique(dREGfilt$assignedEnhancer))
dREGfilt[1:4, ]

In [None]:
# Attach the best dREG peak (its "center" coordinate) to every enhancer.
# Columns are selected by name rather than by position (previously 6:9), so
# the join keeps working if the dREG table gains or reorders columns.
# Enhancers without a matching peak get NA and are dropped by na.omit().
Ecenter <- left_join(
    as.data.table(enhancers),
    dREGfilt[, c("score", "pval", "center", "assignedEnhancer"), with = FALSE],
    by = c("name" = "assignedEnhancer")
) %>%
    na.omit() # %>%
    # GRanges()
# names(Ecenter) = Ecenter$name;
# Only the peak centre is needed downstream.
Ecenter$pval <- NULL
Ecenter$score <- NULL
Ecenter[1:4, ]

In [None]:
# ## join SE and dREG with dREG center:
# # convert center to start and end
# Ecenter = left_join( as.data.table(enhancers)[,c(1,6)], dREGfilt[,c(6:9)], by=c("name"="assignedEnhancer") ) %>%
#       # some NAs come from missing entries
#       na.omit() %>%
#       mutate( start = center ) %>%
#       mutate( end = center ) %>%
#       GRanges()# %>%
# names(Ecenter) = Ecenter$name;
# # SEcenter$name = NULL;
# Ecenter$center = NULL;

# # create 1500 nt bin of Enhancer in sense direction
# Edown = resize(Ecenter, width = 1500, fix="start") %>%
#     mutate( strand = "+" ) %>%
#     subsetByOverlaps( genes, invert = TRUE );
# length(Edown)
# # create 1500 nt bin of Enhancer in anti-sense direction
# Eup = resize(Ecenter, width = 1500, fix="end") %>%
#     mutate( strand = "-" ) %>%
#     subsetByOverlaps( genes, invert = TRUE );
# length(Eup)

In [None]:
# Sense-direction enhancer bin ("+" strand): starts at the dREG centre and
# extends to the enhancer end, or to centre + 1750 when the enhancer ends
# sooner. The 3' end is then anchored and 250 nt removed from the 5' side.
Edown <- Ecenter %>%
    mutate(
        strand = "+",
        start = center,
        end = pmax(end, center + 1750)
    ) %>%
    GRanges() %>%
    anchor_3p() %>%
    mutate(width = width - 250)
    # no overlap filter against genes here: genes are filtered against
    # enhancers later
    # subsetByOverlaps( genes, invert = TRUE );
names(Edown) <- Edown$name
Edown$center <- NULL
Edown

# Anti-sense enhancer bin ("-" strand): mirror image of the above, ending at
# the dREG centre and reaching back to the enhancer start, or to
# centre - 1750 when the enhancer starts later.
Eup <- Ecenter %>%
    mutate(
        strand = "-",
        end = center,
        start = pmin(start, center - 1750)
    ) %>%
    GRanges() %>%
    anchor_3p() %>%
    mutate(width = width - 250)
names(Eup) <- Eup$name
Eup$center <- NULL
Eup

In [None]:
# Pool both enhancer bins with the genes into one sorted feature set.
expressedFeatures <- sort(c(Eup, Edown, genes))
summary(width(expressedFeatures))
expressedFeatures

# Cap every feature of 10 kb or more at 10 kb, measured from its start
# (strand-aware resize).
longf <- which(width(expressedFeatures) >= 10000)
expressedFeatures[longf] <- resize(
    expressedFeatures[longf],
    width = 10000,
    fix = "start"
)
summary(width(expressedFeatures))
expressedFeatures

# Export as BED so the features can be inspected in IGV.
write_bed(expressedFeatures, file = "../data/ExpressedFeatures_4c_ExG_All_Corr.bed")

In [None]:
# Exploratory check for overlapping features.
# NOTE(review): findOverlaps() on a single object also reports each range
# overlapping itself, so presumably every feature receives a non-NA value
# here - confirm this is intended. The column added to expressedFeatures
# persists for the rest of the notebook.
hits <- findOverlaps(expressedFeatures)
expressedFeatures$overlappingFeature <- NA
# record, per feature, the name of an overlapping feature (last hit wins)
expressedFeatures$overlappingFeature[hits@from] <- names(expressedFeatures)[hits@to]

test <- expressedFeatures %>%
    na.omit() %>%
    group_by(overlappingFeature) %>%
    filter(n_distinct(name) == 1) %>%
    ungroup()
test

In [None]:
# Consolidated scGRO reads and the pre-filtered count matrix; the matrix
# column names define the set of cells analysed below.
scGRO <- readRDS("../data/scGROv2p8_consolidated.rds")
counts <- readRDS("../data/scGROv2p8_mapq3qc_max10kbp_filtered_counts.rds")
#counts = counts[,colSums(counts) >= 1000 ];
dim(counts)
allcells <- colnames(counts)
# Tally cells by the first six characters of their ID.
table(substr(allcells, 0, 6))

In [None]:
# Keep reads that pass every QC flag with mapq >= 3 and overlap an expressed
# feature, then tag each read with a cell ID built as Exp-Plate-Cell, which
# is unique across experiments. Reads from cells outside `allcells` are dropped.
scGRO <- scGRO %>%
    filter(mapq >= 3 & countQC & umiQC & plateQC & cellQC & miRQC) %>%
    subsetByOverlaps(expressedFeatures) %>%
    mutate(cellID = factor(paste(Exp, Plate, Cell, sep = "-"))) %>%
    filter(cellID %in% allcells) # %>%
    # select( cellID );
scGRO

In [None]:
# Build the feature x cell count matrix in parallel: one sparse column per
# cell, joined column-wise with cbind2.
counts <- foreach(id = allcells, .combine = "cbind2") %dopar% {
    # reads belonging to this cell
    cellReads <- scGRO %>% filter(cellID == id)
    # number of this cell's reads overlapping each feature
    perFeature <- countOverlaps(expressedFeatures, cellReads)
    # single-column sparse matrix handed to cbind2
    Matrix(cbind(perFeature), sparse = TRUE)
}
# rows = features, columns = cells
colnames(counts) <- allcells
rownames(counts) <- names(expressedFeatures)
dim(counts)

In [None]:
# Collapse rows that share a row name by summing them: the sense and
# anti-sense bins of an enhancer carry the same name.
counts <- rowsum(counts, row.names(counts))
dim(counts)
# rowSums(counts)
# colSums(counts)

In [None]:
# Histogram of the per-feature detection rate: the fraction of cells in which
# a feature has at least one read (rowMeans of counts > 0), on a log10 x-axis.
# Title and x-axis label now describe that quantity; they previously said
# "Reads per ...", which is not what is plotted.
data.frame(
    x = rowMeans(counts > 0)
) %>%
    ggplot(aes(x = x)) +
    geom_histogram(binwidth = 0.05) +
    scale_x_log10() +
    ggtitle("Feature detection rate") +
    xlab("Fraction of cells with reads") +
    ylab("Number of features")

In [None]:
# Keep features detected (count > 0) in at least 0.1% of cells.
observed <- counts[rowMeans(counts > 0) >= 0.001, ]
dim(observed)

# Tally the retained features by name prefix.
test <- substr(rownames(observed), 0, 3)
summary(as.factor(test))

## ExG Correlation:

In [None]:
# Hmisc::rcorr() expects variables (features) in columns and observations
# (cells) in rows, so transpose and densify the count matrix.
obsx <- as.matrix(t(observed))
dim(obsx)
obsx[1:5, 1:5]

In [None]:
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# Flatten the upper triangle (diagonal excluded) of a correlation matrix and
# its matching p-value matrix into a long data frame, one row per pair, in
# column-major order.
#
# cormat : matrix of the correlation coefficients
# pmat   : matrix of the correlation p-values, same dimensions as cormat
#
# Returns a data.frame with columns:
#   Gene     - row label of the pair
#   Enhancer - column label of the pair
#   corr     - correlation coefficient
#   pVal     - p-value
#
# Column labels come from colnames(cormat), falling back to rownames(cormat)
# when there are none. For symmetric output such as Hmisc::rcorr() this gives
# the same labels as using the row names for both.
flattenCorrMatrix <- function(cormat, pmat) {
    # both matrices are indexed with the same mask, so the shapes must agree
    stopifnot(identical(dim(cormat), dim(pmat)))

    ut <- upper.tri(cormat)
    colLabels <- colnames(cormat)
    if (is.null(colLabels)) {
        colLabels <- rownames(cormat)
    }

    data.frame(
        Gene = rownames(cormat)[row(cormat)[ut]],
        Enhancer = colLabels[col(cormat)[ut]],
        corr = cormat[ut],
        pVal = pmat[ut]
    )
}

#### Pearson correlation on non-binarized data:

In [None]:
# # use hmisc package to calculate correlation and p-value:
# corrP = rcorr(obsx, type = "pearson")

#### Spearman correlation on non-binarized data:

In [None]:
# # use hmisc package to calculate correlation and p-value:
# corrS = rcorr(obsx, type = "spearman")

#### Pearson/Spearman on binary matrix
##### Pearson and Spearman on binary data give the same result

In [None]:
# Binarise the observed counts: any value above 1 becomes 1, so each entry
# ends up 0 (no read) or 1 (at least one read).
obsb <- observed
obsb[obsb > 1] <- 1
# rcorr() wants features in columns and cells in rows: transpose and densify.
obsb <- as.matrix(t(obsb))
dim(obsb)
obsb[1:5, 1:5]

In [None]:
# Pairwise Pearson correlations and p-values on the binary matrix
# (Hmisc::rcorr); the result holds the $r and $P matrices used below.
corrB <- rcorr(obsb, type = "pearson")

In [None]:
# Long-format table of all feature pairs with FDR-adjusted p-values and the
# ranks of the coefficient and of the p-value.
corrBflat <- flattenCorrMatrix(corrB$r, corrB$P) %>%
    mutate(
        pAdj = p.adjust(pVal, method = "fdr"),
        corrRank = rank(corr),
        pValRank = rank(pVal)
    )
dim(corrBflat)

# Persist the full pair table.
fwrite(corrBflat, file = "../data/scGROv2p8_ExG_max10kbp_ALL_correlation_0p001Exp.csv.gz")

In [None]:
# Three diagnostic scatter plots of the binary correlations, all coloured by
# adjusted p-value and written as 4 x 4 inch PNGs. `corrPlot` is reused and
# ends up holding the last plot.

# 1. coefficient against its rank (y limited to -0.1 .. 0.3)
corrPlot <- ggplot(corrBflat, aes(x = corrRank, y = corr)) +
    geom_point(pch = '.', aes(color = pAdj)) +
    ylim(-0.1, 0.3) +
    scale_color_viridis() +
    ggtitle("ExG Binary Corr") +
    xlab("Rank") +
    ylab("Binary corr. coeff.")
ggsave(
    filename = "../plots/scGRO_ExG_max10kbp_0p001Exp_binary_corrCoeff_rank.png",
    plot = corrPlot, width = 4, height = 4
)

# 2. p-value against its rank
corrPlot <- ggplot(corrBflat, aes(x = pValRank, y = pVal)) +
    geom_point(pch = '.', aes(color = pAdj)) +
    scale_color_viridis() +
    ggtitle("ExG Binary Corr") +
    xlab("Rank") +
    ylab("Binary corr. p-value")
ggsave(
    filename = "../plots/scGRO_ExG_max10kbp_0p001Exp_binary_corrPvalue_rank.png",
    plot = corrPlot, width = 4, height = 4
)

# 3. p-value against coefficient
corrPlot <- ggplot(corrBflat, aes(x = corr, y = pVal)) +
    geom_point(pch = '.', aes(color = pAdj)) +
    scale_color_viridis() +
    ggtitle("ExG Binary Corr") +
    xlab("Binary corr. coeff.") +
    ylab("Binary corr. p-value")
ggsave(
    filename = "../plots/scGRO_ExG_max10kbp_0p001Exp_binary_corrCoeff_vs_pvalue.png",
    plot = corrPlot, width = 4, height = 4
)

In [None]:
# flatten the corr matrixes:
# corrPflatRaw = flattenCorrMatrix( corrP$r, corrP$P )
# corrSflatRaw = flattenCorrMatrix( corrS$r, corrS$P )
# corrBflatRaw = flattenCorrMatrix( corrB$r, corrB$P )
# dim(corrBflatRaw)

# # join correlation and chi-square stat files by genes and enhancers
# corr = left_join(   corrPflatRaw, corrSflatRaw, by=c("Gene", "Enhancer"), suffix = c(".P", ".S") ) %>%
#           left_join(., corrBflatRaw, by=c("Gene", "Enhancer") ) %>%
#           left_join(., corrChiSquare, by=c("Gene", "Enhancer") );

In [None]:
# Reduce the all-pairs table to gene-enhancer pairs, with the gene always in
# the `Gene` column and the enhancer in `Enhancer`.
#
# flattenCorrMatrix() labels each pair by (row, column) of the upper triangle,
# so which member comes first depends on how the feature names sort. Both
# orientations are handled symmetrically: any non-"GN-" name paired with a
# "GN-" name is an enhancer-gene pair. Previously the enhancer-first case only
# accepted names starting with "chr", which would drop other enhancer names
# that sort before "GN-".

# enhancer in the first column, gene in the second: swap them
corr_ENH <- corrBflat %>%
    filter(substr(Gene, 0, 3) != "GN-" & substr(Enhancer, 0, 3) == "GN-") %>%
    # switch the columns to the Gene - Enhancer order:
    relocate(Gene, .after = Enhancer) %>%
    rename(Enhancer = Gene, Gene = Enhancer)
dim(corr_ENH)

# gene already in the first column, enhancer in the second
corr_INT <- corrBflat %>%
    filter(substr(Gene, 0, 3) == "GN-" & substr(Enhancer, 0, 3) != "GN-")
dim(corr_INT)

# Combine both orientations and recompute the FDR over gene-enhancer pairs only.
corrRaw <- data.table(rbind(corr_ENH, corr_INT)) %>%
    # filter(as.character(seqnames(expressedFeatures[Gene])) == as.character(seqnames(expressedFeatures[Enhancer]))) %>%
    mutate(pAdj = p.adjust(pVal, method = "fdr")) %>%
    # na.omit();
    arrange(Gene)
dim(corrRaw)
table(substr(corrRaw$Gene, 0, 3))
table(substr(corrRaw$Enhancer, 0, 3))
corrRaw[1:6, ]

# Look up length and strand of both members of every pair.
# NOTE(review): expressedFeatures holds two ranges per enhancer name (the
# sense and anti-sense bins share a name); name-based indexing presumably
# returns only the first match, so lengthE/strE describe one bin - confirm.
Gene <- expressedFeatures[corrRaw$Gene]
Enhancer <- expressedFeatures[corrRaw$Enhancer]
corrRaw$lengthG <- width(Gene)
corrRaw$lengthE <- width(Enhancer)
corrRaw$strG <- as.character(strand(Gene))
corrRaw$strE <- as.character(strand(Enhancer))

# # compute distance between gene promoters
# proG = resize(Gene, width=1, fix="start");
# proE = resize(Enhancer, width=1, fix="center");
# strand(proG) = "*";
# strand(proE) = "*";
# corrRaw$distance = width(pgap( proG, proE ));
# corrRaw$distance = ifelse(as.character(seqnames(proG)) == as.character(seqnames(proE)), width(pgap( proG, proE )), "NA");
corrRaw[1:4, ]

## Merge the 4 ExG tests: three types of correlation and a chi-square

In [None]:
# Permutation-based chi-square results for the same gene-enhancer pairs.
corrChiSquare <- fread(file = "../data/scGROv2p8_ExG_1Kpermuted_chisq_empp.csv.gz")
dim(corrChiSquare)
corrChiSquare[1:4, ]

# Add the binary-correlation columns to the chi-square table, then discard
# every row that still contains an NA.
corr <- na.omit(
    left_join(corrChiSquare, corrRaw, by = c("Gene", "Enhancer"))
)
dim(corr)
corr[1:6, ]

fwrite(corr, file = "../data/scGROv2p8_ExG_max10kbp_ALL_correlation_0p001Exp_with_chisq_empp.csv.gz")

In [None]:
# # CRISPR verified genes and thier SEs bins:
# corrRawF = corrRaw %>%
#     # filter for GN- and thier SEs:
#     filter( substr(Gene, 0, 3) == "GN-" & substr(Enhancer, 0, 3) == "INT" ) %>%
#     # Ensure that the CGN and SEs are proper pairs:
#     filter( paste0(sub("_[^_]+$", "", Gene), sub("_[^_]+$", "", Enhancer)) %in% paste0(SEassignment$assignedGene, SEassignment$name) ) %>%
#     filter( pAdj < 0.05 ) %>%
#     arrange( Gene );
# dim(corrRawF)
# corrRawF
# corrRawF$Enh_Gene

# SE_gene_bin_hist = corrRawF %>%
#     ggplot(aes(x = Enh_Gene)) +
#     geom_histogram() +
#     geom_histogram(bins = 21, position = "identity", color = "white") +
#     geom_density() +
#     xlim(-10, 10) +
#     theme(legend.position="none") +
#     ggtitle("Binary (SE-Gene)") +
#     xlab("Bin difference") +
#     ylab("Number of pairs") +
#     scale_fill_manual(values=c("#39568CFF")) +
#     theme(strip.background = element_blank(),
#         strip.text = element_text(size = 14, face="bold"))
# SE_gene_bin_hist

# summary_bin_hist = data.frame(
#     Neg = sum(corrRawF$Enh_Gene < 0)/length(corrRawF$Enh_Gene),
#     Zero = sum(corrRawF$Enh_Gene == 0)/length(corrRawF$Enh_Gene),
#     Pos = sum(corrRawF$Enh_Gene > 0)/length(corrRawF$Enh_Gene)
#     ) %>%
#     pivot_longer(1:3, names_to = 'Category', values_to = 'Incidence') %>%
#     # use fct_inorder from forcats (part of tidyverse) to keep the original order of the data:
#     ggplot(aes(x = fct_inorder(Category), y = Incidence)) +
#     # geom_bar(stat="identity", fill=c("#9e9a75","gray", "#41533b")) +
#     geom_col(fill=c("#9e9a75","gray", "#41533b")) +
#     theme(legend.position="none") +
#     ggtitle("Binary (SE-Gene)") +
#     xlab("Class") +
#     ylab("Fraction of pairs") +
#     # scale_fill_manual(values=c("#9e9a75","gray", "#41533b")) +
#     theme(strip.background = element_blank(),
#         strip.text = element_text(size = 14, face="bold"))
# summary_bin_hist

## Read corr file:

In [None]:
# Reload the merged correlation + chi-square table written above; this
# replaces the in-memory corrRaw.
corrRaw <- fread("../data/scGROv2p8_ExG_max10kbp_ALL_correlation_0p001Exp_with_chisq_empp.csv.gz")
dim(corrRaw)

### Co-transcribed genes & enhancers in same chromosomes:

In [None]:
# Keep pairs whose gene and enhancer sit on the same chromosome.
chr_corr <- corrRaw %>%
    filter(
        as.character(seqnames(expressedFeatures[Gene])) ==
            as.character(seqnames(expressedFeatures[Enhancer]))
    )
dim(chr_corr)
chr_corr[1:4, ]

# Ranges of both members of every retained pair.
# NOTE(review): enhancer names occur twice in expressedFeatures (sense and
# anti-sense bins); name lookup presumably resolves to the first - confirm.
Gene <- expressedFeatures[chr_corr$Gene]
Enhancer <- expressedFeatures[chr_corr$Enhancer]

# Distance in bp: gap between the gene start (strand-aware 1-nt range) and
# the centre of the enhancer range, with strand removed from both.
proG <- resize(Gene, width = 1, fix = "start")
proE <- resize(Enhancer, width = 1, fix = "center")
strand(proG) <- "*"
strand(proE) <- "*"
chr_corr$distance <- width(pgap(proG, proE))
# corrRaw$distance = ifelse(as.character(seqnames(proG)) == as.character(seqnames(proE)), width(pgap( proG, proE )), "NA");
chr_corr[1:4, ]

In [None]:
# Histograms of gene-enhancer distance for all same-chromosome pairs and for
# three increasingly strict definitions of "correlated".
# `distance` is in base pairs (x-axis limit 2,500,000), so the axis is
# labelled in bp; it previously said kb.

# Draw one 40-bin distance histogram, limited to 0 - 2.5 Mb.
#   pairs : table with a numeric `distance` column (bp)
#   title : plot title
plotPairDistance <- function(pairs, title) {
    ggplot(pairs, aes(x = distance)) +
        geom_histogram(bins = 40, fill = "#445577", col = "white") +
        xlim(0, 2500000) +
        ggtitle(title) +
        xlab("Distance between G-E (bp)") +
        ylab("Number of feature pairs")
}

# all pairs
plotPairDistance(chr_corr, "All features on same chromosome")

# correlation coefficient and FDR thresholds
chr_corr %>%
    filter(corr > 0.05 & pAdj < 0.05) %>%
    plotPairDistance("Correlated features on same chromosome")

# empirical (permutation) p-value threshold only
chr_corr %>%
    filter(emp_p_pos < 0.05) %>%
    plotPairDistance("Correlated features on same chromosome")

# all three criteria combined
chr_corr %>%
    filter(corr >= 0.075 & pAdj <= 0.05 & emp_p_pos <= 0.05) %>%
    plotPairDistance("Correlated features on same chromosome")

In [None]:
# All gene-enhancer pairs within 2.5 Mb on the same chromosome, split into
# correlated and uncorrelated pairs, then compared by distance.
chr_corrAll <- chr_corr %>%
    filter(distance <= 2500000) %>%
    arrange(Gene)
dim(chr_corrAll)

# Class label used to overlay "correlated" against "uncorrelated" pairs.
# The pAdj criterion is included so this definition matches the one used for
# the SE, TE and module analyses elsewhere in this notebook (it was missing
# here only).
# chr_corrAll$class = ifelse(chr_corrAll$corr > 0.05 & chr_corrAll$pAdj < 0.05, "correlated pairs", "uncorrelated pairs")
chr_corrAll$class <- ifelse(
    chr_corrAll$corr >= 0.075 &
        chr_corrAll$pAdj <= 0.05 &
        chr_corrAll$emp_p_pos <= 0.05 &
        chr_corrAll$obsCotrans >= 0.0005,
    "correlated pairs",
    "uncorrelated pairs"
)

summary(as.factor(chr_corrAll$class))
chr_corrAll[1:4, ]

# Pearson coefficient against observed co-transcription, correlated pairs only.
chr_corrAll %>%
    filter(class == "correlated pairs") %>%
    ggplot(aes(x = corr, y = obsCotrans)) +
    geom_pointdensity(show.legend = FALSE) +
    scale_x_log10(limits = c(0.001, 1)) +
    scale_y_log10(limits = c(0.001, 1)) +
    scale_color_viridis() +
    xlab("Pearson correlation coefficient") +
    ylab("Observed co-transcription")

# Overlaid density histograms of distance per class. `distance` is in bp, so
# the axis is labelled bp (it previously said kb).
# Other y mappings tried: after_stat(count), after_stat(ncount),
# after_stat(count/sum(count)).
chr_corrAll %>%
    ggplot(aes(x = distance, fill = class)) +
    geom_histogram(
        bins = 25, position = "identity",
        alpha = 0.5,
        # color = "white",
        mapping = aes(y = after_stat(density))
    ) +
    # geom_density() +
    # xlim(0, 2500000) +
    theme(
        legend.position = c(0.7, 0.85),
        legend.text = element_text(size = 10, face = "bold"),
        legend.key.size = unit(0.7, 'cm')
    ) +
    guides(fill = guide_legend(title = "")) +
    # scale_x_log10() +
    # ggtitle("Co-expressed genes") +
    xlab("Gene-Enhancer distance (bp)") +
    ylab("Density")
# saves the plot built just above (ggsave defaults to the last ggplot)
ggsave(filename = "../plots/GxE_distance_between_co-transcribed_pairs_max10kbp_0p001Exp.pdf", width = 4, height = 4, units = "in")

# Same data as count histograms, one facet per class with free scales.
chr_corrAll %>%
    ggplot(aes(x = distance, fill = class)) +
    geom_histogram(bins = 25, position = "identity") +
    # xlim(0, 2500000) +
    theme(legend.position = "none") +
    guides(fill = guide_legend(title = "")) +
    facet_wrap(~class, scales = "free") +
    scale_fill_manual(values = c("#1F968B", "darkgray")) +
    theme(
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    ) +
    # ggtitle("Co-expressed genes") +
    xlab("Distance between G-E (bp)") +
    ylab("Number of pairs")
# ggsave(filename="../plots/GxE_distance_between_co-transcribed_pairs_max10kbp_0p001Exp_facets.pdf", width=8, height=6, units="in")

# Two-sample Kolmogorov-Smirnov test of distance between the two classes.
ks.test(distance ~ class, chr_corrAll)

In [None]:
# Gene - super-enhancer pairs (enhancer names starting with "INT") within
# 2.5 Mb, split into correlated and uncorrelated pairs.
chr_corrSE <- chr_corr %>%
    filter(distance <= 2500000) %>%
    filter(substr(Enhancer, 0, 3) == "INT") %>%
    arrange(Gene)
dim(chr_corrSE)

# Class label used to overlay "correlated" against "uncorrelated" pairs.
# chr_corrSE$class = ifelse(chr_corrSE$corr > 0.05 & chr_corrSE$pAdj < 0.05, "correlated pairs", "uncorrelated pairs") # chr_corrF$corr >= 0.05 &
chr_corrSE$class <- ifelse(
    chr_corrSE$corr >= 0.075 &
        chr_corrSE$pAdj <= 0.05 &
        chr_corrSE$emp_p_pos <= 0.05 &
        chr_corrSE$obsCotrans >= 0.0005,
    "correlated pairs",
    "uncorrelated pairs"
)

summary(as.factor(chr_corrSE$class))
chr_corrSE[1:4, ]

# Overlaid density histograms of distance per class. `distance` is in bp, so
# the axis is labelled bp (it previously said kb).
# Other y mappings tried: after_stat(count), after_stat(ncount),
# after_stat(count/sum(count)).
chr_corrSE %>%
    ggplot(aes(x = distance, fill = class)) +
    geom_histogram(
        bins = 25, position = "identity",
        alpha = 0.5,
        # color = "white",
        mapping = aes(y = after_stat(density))
    ) +
    # geom_density() +
    # xlim(0, 2500000) +
    theme(
        legend.position = c(0.7, 0.85),
        legend.text = element_text(size = 10, face = "bold"),
        legend.key.size = unit(0.7, 'cm')
    ) +
    guides(fill = guide_legend(title = "")) +
    # scale_x_log10() +
    # ggtitle("Co-expressed genes") +
    xlab("Gene-SE distance (bp)") +
    ylab("Density")
# saves the plot built just above (ggsave defaults to the last ggplot)
ggsave(filename = "../plots/GxE_distance_between_co-transcribed_Gene-SE_max10kbp_0p001Exp.pdf", width = 4, height = 4, units = "in")

# Same data as count histograms, one facet per class with free scales.
chr_corrSE %>%
    ggplot(aes(x = distance, fill = class)) +
    geom_histogram(bins = 25, position = "identity") +
    # xlim(0, 2500000) +
    theme(legend.position = "none") +
    guides(fill = guide_legend(title = "")) +
    facet_wrap(~class, scales = "free") +
    scale_fill_manual(values = c("#1F968B", "darkgray")) +
    theme(
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    ) +
    # ggtitle("Co-expressed genes") +
    xlab("Distance between Gene & SE (bp)") +
    ylab("Number of pairs")
# ggsave(filename="../plots/GxE_distance_between_co-transcribed_Gene-SE_facets_max10kbp_0p001Exp.pdf", width=8, height=6, units="in")

# Two-sample Kolmogorov-Smirnov test of distance between the two classes.
ks.test(distance ~ class, chr_corrSE)

In [None]:
# Distance summary for correlated gene-SE pairs only.
# The distance vector is subset directly: with a data.table,
# `DT[i, chr_corrSE$distance]` evaluates the j expression against the whole
# table and returns every distance, ignoring the row filter.
summary(chr_corrSE$distance[chr_corrSE$class == "correlated pairs"])

In [None]:
# Number of correlated genes per super-enhancer: take the correlated
# gene-SE pairs within 2.5 Mb and count how often each SE appears.
test <- chr_corr %>%
    filter(distance <= 2500000) %>%
    filter(substr(Enhancer, 0, 3) == "INT") %>%
    # filter( corr > 0.05 & pAdj < 0.05 ) %>%
    filter(
        corr >= 0.075 &
            pAdj <= 0.05 &
            emp_p_pos <= 0.05 &
            obsCotrans >= 0.0005
    ) %>%
    arrange(Gene)
dim(test)
# table -> data frame; the columns come out as counts.Var1 / counts.Freq
test <- data.frame(counts = table(test$Enhancer))
test[1:4, ]

# Histogram of genes-per-SE counts, one bin per integer.
ggplot(test, aes(x = counts.Freq)) +
    geom_histogram(binwidth = 1) +
    theme(
        legend.position = c(0.7, 0.85),
        legend.text = element_text(size = 10, face = "bold"),
        legend.key.size = unit(0.7, 'cm')
    ) +
    guides(fill = guide_legend(title = "")) +
    # scale_x_log10() +
    # ggtitle("Co-expressed genes") +
    xlab("Frequency of genes correlated with SE") +
    ylab("Number of SE")
ggsave(filename = "../plots/GxE_Frequency_of_genes_correlated_with_SE.pdf", width = 5, height = 4, units = "in")

In [None]:
# Gene - TE pairs (enhancer names starting with "chr") within 2.5 Mb, split
# into correlated and uncorrelated pairs.
chr_corrTE <- chr_corr %>%
    filter(distance <= 2500000) %>%
    filter(substr(Enhancer, 0, 3) == "chr") %>%
    arrange(Gene)
dim(chr_corrTE)

# Class label used to overlay "correlated" against "uncorrelated" pairs.
# chr_corrTE$class = ifelse(chr_corrTE$corr > 0.075 & chr_corrTE$pAdj < 0.05, "correlated pairs", "uncorrelated pairs")
chr_corrTE$class <- ifelse(
    chr_corrTE$corr >= 0.075 &
        chr_corrTE$pAdj <= 0.05 &
        chr_corrTE$emp_p_pos <= 0.05 &
        chr_corrTE$obsCotrans >= 0.0005,
    "correlated pairs",
    "uncorrelated pairs"
)

summary(as.factor(chr_corrTE$class))
dim(chr_corrTE)
chr_corrTE[1:4, ]

# Overlaid density histograms of distance per class. `distance` is in bp, so
# both axis labels in this block now say bp (they previously said kb).
# Other y mappings tried: after_stat(count), after_stat(ncount),
# after_stat(count/sum(count)).
chr_corrTE %>%
    ggplot(aes(x = distance, fill = class)) +
    geom_histogram(
        bins = 25, position = "identity",
        alpha = 0.5,
        # color = "white",
        mapping = aes(y = after_stat(density))
    ) +
    # geom_density() +
    # xlim(0, 2500000) +
    theme(
        legend.position = c(0.7, 0.9),
        legend.text = element_text(size = 10, face = "bold"),
        legend.key.size = unit(0.7, 'cm')
    ) +
    guides(fill = guide_legend(title = "")) +
    # scale_x_log10() +
    # ggtitle("Co-expressed genes") +
    xlab("Gene-Enhancer distance (bp)") +
    ylab("Density")
# saves the plot built just above (ggsave defaults to the last ggplot)
ggsave(filename = "../plots/GxE_distance_between_co-transcribed_Gene-TE_max10kbp_0p001Exp.pdf", width = 4, height = 4, units = "in")

# Same data as count histograms, one facet per class with free scales.
chr_corrTE %>%
    ggplot(aes(x = distance, fill = class)) +
    geom_histogram(bins = 25, position = "identity") +
    # xlim(0, 2500000) +
    theme(legend.position = "none") +
    guides(fill = guide_legend(title = "")) +
    facet_wrap(~class, scales = "free") +
    scale_fill_manual(values = c("#1F968B", "darkgray")) +
    theme(
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    ) +
    # ggtitle("Co-expressed genes") +
    xlab("Distance between Gene & TE (bp)") +
    ylab("Number of pairs")
# ggsave(filename="../plots/GxE_distance_between_co-transcribed_Gene-TE_facets.pdf", width=8, height=6, units="in")

# Two-sample Kolmogorov-Smirnov test of distance between the two classes.
ks.test(distance ~ class, chr_corrTE)

In [None]:
# Distance summary for correlated gene-TE pairs only.
# The distance vector is subset directly: with a data.table,
# `DT[i, chr_corrTE$distance]` evaluates the j expression against the whole
# table and returns every distance, ignoring the row filter.
summary(chr_corrTE$distance[chr_corrTE$class == "correlated pairs"])

In [None]:
# Density scatter of distance (kb) against -log10 adjusted p-value for
# correlated pairs within 2.5 Mb.
# NOTE(review): the title mentions gene pairs on different strands, while the
# data plotted are gene-enhancer pairs - looks like a carried-over title;
# confirm.
chr_corr %>%
    filter(distance <= 2500000) %>%
    filter(corr > 0.05 & pAdj < 0.05) %>%
    ggplot(aes(x = distance / 1000, y = -log10(pAdj))) +
    geom_pointdensity(size = 0.5) +
    # xlim(0, 2000) +
    scale_color_viridis() +
    ggtitle("Gene pairs on different strands") +
    xlab("Distance between pair (kb)") +
    ylab("-log10 P")

### OSN custom enhancers:

In [None]:
# query_genes = c("GN-Nanog","GN-Sox2","GN-Sox2long", "GN-Pou5f1", "GN-Klf4");
# query_SE = c("INT_STITCHED_3342", "INT_STITCHED_3347", "INT_STITCHED_3348", "INT_STITCHED_3349", "INT_STITCHED_1480", "INT_STITCHED_1482", "INT_STITCHED_7784", "INT_STITCHED_1973")

# # query = c(query_genes, query_SE, query_enh);

In [None]:
# OSN_corr = corr %>%
#     filter( distance <= 500000 ) %>%
#     filter( pAdj < 0.05 ) %>%
#     filter( Gene %in% query_genes & substr(Enhancer, 0, 3) == "INT" );
#     # filter( Gene %in% query_genes & substr(Enhancer, 0, 3) %in% c("Klf", "Pou", " Sox", "Nan") );      
# OSN_corr

### Dendrogram & dynamic tree cut

In [None]:
# corrP$r[1:4,1:4]
# dim(corrP$r)

In [None]:
# Same-chromosome gene-enhancer pairs from the merged table, limited to
# enhancers named "chr..." or "INT..." (this leaves out the custom enhancers
# at Sox2, Nanog, Klf4 and Pou5f1).
corrF <- fread("../data/scGROv2p8_ExG_max10kbp_ALL_correlation_0p001Exp_with_chisq_empp.csv.gz") %>%
    filter(
        as.character(seqnames(expressedFeatures[Gene])) ==
            as.character(seqnames(expressedFeatures[Enhancer]))
    ) %>%
    # filter( distance <= 500000 ) %>%
    filter(substr(Enhancer, 0, 3) %in% c("chr", "INT"))
dim(corrF)

# Keep correlated pairs only. Pair counts noted for the thresholds tried:
#   corr > 0.05  & pAdj < 0.05                                          -> 1545874
#   corr > 0.1   & pAdj < 0.05                                          -> 126666
#   corr > 0.075 & pAdj < 0.05 & emp_p_pos < 0.05 & obsCotrans > 0.0005 -> 49,143
# NOTE(review): these are strict inequalities, while the distance analyses
# above use >= / <= with the same cut-offs.
corrF <- corrF %>%
    filter(corr > 0.075 & pAdj < 0.05 & emp_p_pos < 0.05 & obsCotrans > 0.0005)
dim(corrF)
table(substr(corrF$Gene, 0, 3))
table(substr(corrF$Enhancer, 0, 3))
summary(corrF$corr)
summary(corrF$emp_p_pos)
corrF[1:4, ]

In [None]:
# corrF = corrF %>%
#     # filter( corr > 0.05 & pAdj < 0.05 ); 
#     # 1545874 
#     # filter( corr > 0.1 & pAdj < 0.05 ); 
#     # 126666
#     filter( corr > 0.1 & pAdj < 0.05 & emp_p_pos < 0.05 )
# dim(corrF)
# table(substr(corrF$Gene, 0, 3));
# table(substr(corrF$Enhancer, 0, 3));
# summary(corrF$corr)
# summary(corrF$emp_p_pos)
# corrF[1:4,]

### Gene_x_Enhancer modules

In [None]:
# Undirected graph built from the correlated pairs; graph_from_data_frame()
# uses the first two columns of corrF as the edge list and keeps the
# remaining columns as edge attributes.
# (gene names keep their "GN-" prefix: mutate( Gene = sub("GN-", "", Gene) ) is not applied)
ggnetR <- graph_from_data_frame(corrF, directed = FALSE)

In [None]:
# Draw the gene-enhancer network on a sphere layout, inline and to PDF.
#
# Two fixes relative to the previous version:
#   * the layout is passed as `layout`; `vlayout` is not a plot.igraph
#     argument, so the sphere layout was never applied;
#   * the figure is written through a pdf() device. ggsave() saves the most
#     recent ggplot object, not a base-graphics plot, so the PDF did not
#     contain this network.

# Computed once so the inline and saved figures share the same coordinates.
networkLayout <- layout_on_sphere(ggnetR)

# Render the network on the currently active graphics device.
drawNetwork <- function() {
    plot(
        ggnetR,
        layout = networkLayout,
        vertex.shape = "none",
        vertex.label = NA,
        # vertex.color = "#cee2f4",
        # vertex.size=0.1,
        # vertex.label.family = "Helvetica",
        # vertex.label.font = 1,
        # vertex.label.color = "white",
        # vertex.frame.color = "white",
        # edge.color = "gray",
        edge.arrow.size = .1,
        edge.width = 0.5
    )
}

# inline display
drawNetwork()

# 16 x 16 inch PDF; the device is closed even if plotting fails
pdf("../plots/ExG_igraph.pdf", width = 16, height = 16)
invisible(tryCatch(drawNetwork(), finally = dev.off()))

In [None]:
# Run Leiden clustering (modularity objective) at a given resolution and
# report how much of the network lands in sizeable modules.
#
# network    : igraph graph to cluster
# resolution : resolution_parameter passed to cluster_leiden()
#
# Returns a numeric vector of length 2:
#   [1] number of modules with at least 10 members
#   [2] total number of vertices contained in those modules
optimize_resolution <- function(network, resolution) {
    clustering <- cluster_leiden(
        network,
        resolution_parameter = resolution,
        objective_function = "modularity"
    )

    # module sizes as a data frame with columns Var1 (module) and Freq (size)
    moduleSizes <- as.data.frame(table(as.vector(membership(clustering))))
    largeModules <- moduleSizes[moduleSizes$Freq >= 10, , drop = FALSE]

    c(nrow(largeModules), as.numeric(sum(largeModules$Freq)))
}

In [None]:
# Sweep the Leiden resolution from 0.5 to 20 in steps of 0.5. Each run yields
# (number of modules with >= 10 members, vertices in those modules); the
# results are column-bound, transposed to one row per resolution, and the
# resolution itself is appended as a column.
resolutionGrid <- seq(from = 0.5, to = 20, by = 0.5)
optimization_resultsR <- purrr::map_dfc(
    .x = resolutionGrid,
    .f = optimize_resolution,
    network = ggnetR
) %>%
    t() %>%
    cbind(resolution = resolutionGrid) %>%
    as.data.frame()
optimization_resultsR

In [None]:
# Leiden clusterings of the network (modularity objective) at a handful of
# resolutions; resolutions 0.5, 15 and 20 were tried previously and are
# disabled.
# NOTE(review): no random seed is set in this notebook section, so module
# assignments may differ between runs.
ggmods1 <- cluster_leiden(ggnetR, resolution_parameter = 1, objective_function = "modularity")
ggmods1p5 <- cluster_leiden(ggnetR, resolution_parameter = 1.5, objective_function = "modularity")
ggmods2 <- cluster_leiden(ggnetR, resolution_parameter = 2, objective_function = "modularity")
ggmods5 <- cluster_leiden(ggnetR, resolution_parameter = 5, objective_function = "modularity")
ggmods10 <- cluster_leiden(ggnetR, resolution_parameter = 10, objective_function = "modularity")
# ggmods0p5 = ggnetR %>%
#     cluster_leiden(resolution_parameter = 0.5, objective_function = "modularity");
# ggmods15 = ggnetR %>%
#     cluster_leiden(resolution_parameter = 15, objective_function = "modularity");
# ggmods20 = ggnetR %>%
#     cluster_leiden(resolution_parameter = 20, objective_function = "modularity");

In [None]:
# Histogram of module membership for one clustering result: one bin per module
# number, bar height = number of network members assigned to that module
# (log10 y-axis).
#
# Args:
#   modules: clustering result accepted by membership() (e.g. the output of
#            cluster_leiden()).
#
# Returns the ggplot object, so a top-level call displays the plot.
plotModuleSizes = function(modules) {
    data.frame(
        module = unlist(as.vector(membership(modules)))
    ) %>%
        ggplot(aes(x=module)) +
        geom_histogram(binwidth=1) +
        scale_y_log10() +
        xlab("Module number") +
        ylab("Number of genes")
}

# one plot per candidate resolution (1, 1.5, 2, 5, 10)
plotModuleSizes(ggmods1)

plotModuleSizes(ggmods1p5)

plotModuleSizes(ggmods2)

plotModuleSizes(ggmods5)

plotModuleSizes(ggmods10)

In [None]:
# clustering result used for the downstream cells: Leiden at resolution 1
ggmodUSED = ggmods1

In [None]:
# Export the module assignment of every network member: one row per member
# with its module number and the size of that module (mod.size).
# NOTE: ggmodUSED must still be the clustering object here (membership() is
# called on it), and the file name hard-codes "res1" - keep it in sync with the
# resolution assigned to ggmodUSED.
data.frame(
    gene_ID = names(membership(ggmodUSED)),
    module = unlist(as.vector(membership(ggmodUSED)))
) %>%
    group_by(module) %>%
    mutate( mod.size = dplyr::n() ) %>%
    ungroup() %>%
    # strip the "GN-" prefix (literal match, first occurrence only)
    mutate( gene_ID = sub("GN-", "", gene_ID, fixed = TRUE) ) %>%
    arrange(module) %>%
    fwrite("../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_withEmpPpos_res1.csv");

In [None]:
# only keep modules with more than 10 members:
# NOTE: this replaces the clustering object in ggmodUSED by a plain list of
# member names per module, so the cell cannot be re-run without first
# re-assigning ggmodUSED to a clustering result.
ggmodGroups = groups(ggmodUSED)
ggmodUSED = ggmodGroups[lengths(ggmodGroups) > 10]
length(ggmodUSED)
ggmodUSED[1]

In [None]:
# For every module keep only the members whose name starts with "GN-"
# and strip that prefix from them.
ggmodUSEDgenes = lapply(ggmodUSED, function(members) {
        isGene = substr(members, 1, 3) == "GN-";
        unname(sub("GN-", "", members[isGene]))
    })
ggmodUSEDgenes[1]

In [None]:
# Map the gene symbols of every module to Entrez IDs for the GO enrichment.
# mapIds() returns NA for symbols without an Entrez entry; those are dropped so
# that neither the gene clusters nor the universe carry NA entries.
suppressMessages({
    entrezids = lapply(ggmodUSEDgenes, function(symbols) {
        ids = mapIds(org.Mm.eg.db, keys = symbols, column = "ENTREZID", keytype = "SYMBOL");
        ids = unname(ids);
        return(ids[!is.na(ids)]);
    })
});

# background set for the enrichment: all mapped genes of the retained modules
universe = entrezids %>% unlist %>% unique;

In [None]:
# GO over-representation analysis, run separately for every module.
# Everything after `fun` is forwarded to enrichGO.
res = compareCluster(
    geneClusters = entrezids,
    fun = "enrichGO",
    OrgDb = "org.Mm.eg.db",
    keyType = "ENTREZID",
    ont = "ALL",            # BP, CC, MF, or ALL for all ontologies
    pvalueCutoff = 0.01,
    qvalueCutoff = 0.05,
    pAdjustMethod = "BH",   # p-values are adjusted within clusters
    universe = universe,
    minGSSize = 5,
    maxGSSize = 1000
);
# make the result human-readable (Entrez IDs -> gene symbols)
res = setReadable(res, OrgDb = org.Mm.eg.db, keyType = "ENTREZID");
dim(res)

In [None]:
# saveRDS(res, file = "../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_res2.rds");

# store the enrichment result on disk (a commented-out readRDS() of this file
# exists further down for reloading it)
saveRDS(res, file = "../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_withEmpPpos_res1.rds");

In [None]:
# preview the first 100 rows of the enrichment result
# (optionally drop the long gene list first with mutate(geneID=NULL))
head(res, n = 100)

In [None]:
# Export the enrichment table sorted by cluster and p-value.
# NOTE: the file is TAB-separated although its name ends in ".csv"; the name is
# kept unchanged so existing readers of this path keep working.
res %>%
    arrange(Cluster, pvalue) %>%
    # mutate(geneID=NULL) %>%
    # fwrite("../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_res1_enrichGO.csv")
    write.table("../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_withEmpPpos_res1_enrichGO.csv", sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)

In [None]:
# res = readRDS("../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_res2.rds")
# dim(res)

# res = readRDS("../data/scGROv2p8_ExGmodules_max10kbp_binary_corr_withEmpPpos_res1.rds");
# dim(res)

### make bed file of genes in the GO class for motif discovery with HOMER

In [None]:
# Build 500-nt BED-style intervals for the members of one module.
#
# Args:
#   corr:              correlation table with `Gene` and `Enhancer` columns.
#   expressedFeatures: ranges object that can be indexed by feature name.
#   submodule:         feature names of the module (flattened with unlist()).
#
# Returns a data.frame with one row per retained feature.
bedModules = function( corr, expressedFeatures, submodule ){
    memberIDs = unlist(submodule);

    # module members, restricted to features present in the correlation table
    members = expressedFeatures[ memberIDs ] %>%
        filter( names %in% c(corr$Gene, corr$Enhancer) );

    # Author's rationale: gene starts are trimmed by 250 nt, so 750 nt upstream
    # corresponds to -500..+250 of the gene; for enhancers this step is redundant.
    upstream750 = promoters( members, upstream = 750, downstream = 0 );

    # keep 500 nt anchored at the 5' end, i.e. drop the excess 250 nt
    # (promoter of genes / center of enhancers, per the author's note)
    window500 = upstream750 %>%
        anchor_5p() %>%
        resize( width = 500 );

    proms = window500 %>% data.frame();
    return(proms);
}

In [None]:
# select resolution:
ggmodUSED = ggmods1
resName = "res1"

# only select modules with more than 10 members:
ggmodGroups = groups(ggmodUSED);
ggmodUSED = ggmodGroups[lengths(ggmodGroups) > 10];

# output directory for the per-module BED files; created only when missing so
# that re-running the cell does not raise an "already exists" warning
dir.name = paste0("scGROv2p8_ExGmodules_max10kbp_binary_corrModules_promoters_", resName)
out.dir = paste0("../data/", dir.name);
if (!dir.exists(out.dir)) {
    dir.create(out.dir);
}

# write one BED file per module.
# seq_along() (instead of 1:length()) makes the loop a no-op when no module
# passes the size filter; a plain for loop replaces foreach/%do%, which was only
# used for its side effects and returned a list of NULLs.
invisible(suppressMessages({
    for (submodule in seq_along(ggmodUSED)) {
        feat_promoters = bedModules(corrF, expressedFeatures, ggmodUSED[submodule]);
        export.bed(feat_promoters, con=paste0( out.dir, "/", "ExGmodules_", 
                                              resName, "submod_", submodule, ".bed"));
    }
}));

In [None]:
# Background regions: the same 500-nt window used for the module BED files,
# computed for every expressed feature.
bkg = promoters( expressedFeatures, upstream = 750, downstream = 0 );
bkg = bkg %>% anchor_5p() %>% resize( width = 500 );
# keep every gene ("GN-" prefix); of the remaining features keep only the "+"
# strand entries (author's note: the "-" strand copy is redundant, off in position)
bkg = filter( bkg, substr(name, 0, 3) == "GN-" | (substr(name, 0, 3) != "GN-" & strand == "+") );
bkg = data.frame(bkg);
bkg[1:4, ]

export.bed(bkg, con="../data/scGROv2p8_ExGmodules_background_promoters.bed");

In [None]:
# results = res %>%
#     filter( ID %in% c("GO:0043488", "GO:0010608", "GO:0003823", "GO:0016887", "GO:0030234", "GO:0010528", 
#                       "GO:1990837", "GO:0005884", "GO:0008106", "GO:0005740", "GO:0072687") ) %>%
#     # redundant with cluster 66 and GO:1990837
#     filter( !GeneRatio %in% c("66/481") );
#     # mutate(geneID=NULL);
# dim(results)
# results %>%
#     arrange(Cluster, pvalue) %>%
#     head( n=100 );

In [None]:
# dotplot(results,
#         x = "Cluster",
#         color = "p.adjust",
#         showCategory = 5,
#         split = NULL,
#         font.size = 12,
#         title = "",
#         by = "geneRatio",
#         size = NULL,
#         includeAll = TRUE,
#         label_format = 30,
# )
# ggsave(filename="../plots/scGROv2p8_ExGmodules_max10kbp_binary_corr_res15_dotplot.pdf", width=8, height=8, units="in")

In [None]:
# bplot = results %>%
#     arrange(Cluster, pvalue) %>%
#     data.frame() %>%
#     # separate values in a column by "/" and store the 1st and 2nd object in "Overlap", "Total":
#     separate( GeneRatio, c("Overlap", "Total"), "/") %>%
#     mutate( GeneRatio = as.numeric(Overlap) / as.numeric(Total) ) %>%
#     ggplot( aes(x = GeneRatio, y = fct_inorder(Description), fill=qvalue)) + 
#     geom_bar( stat = 'identity' ) +
#     ylab(NULL) +
#     scale_fill_viridis()
# ggsave(filename = "../plots/scGROv2p8_ExGmodules_max10kbp_binary_corr_res15_barplot.pdf", width=12, height=8, units="in")

In [None]:
# eplot = pairwise_termsim(results)
# emapplot(eplot, showCategory = 30)
# ggsave(filename = "../plots/scGROv2p8_ExGmodules_max10kbp_binary_corr_res15_emapplot.pdf", width=4, height=4, units="in")

In [None]:
# # to remove redundant terms:
# simplifiedResults = simplify(results, cutoff=0.6, by="p.adjust", select_fun=min)
# eplotS = pairwise_termsim(simplifiedResults)
# emapplot(eplotS, showCategory = 10)
# ggsave(filename = "../plots/scGROv2p8_ExGmodules_max10kbp_binary_corr_res15_emapplot_simplified.pdf", width=4, height=4, units="in")

In [None]:
# cplot = cnetplot(results,
#                  showCategory = 5,
#                  # foldChange = NULL,
#                  layout = "kk",
#                  colorEdge = FALSE,
#                  circular = FALSE,
#                  node_label = "all",
#                  cex.params = list(category_node = 1, gene_node = 1, category_label = 1, gene_label = 1),
#                  color_category = "#E5C494",
#                  color_gene = "#B3B3B3",
#                  shadowtext = "all"
#                  # color.params = list(foldChange = NULL, edge = FALSE, category = "#E5C494", gene ="#B3B3B3"),
#                  # cex.params = list(category_node = 1, gene_node = 1, category_label = 1, gene_label = 1),
#                  # hilight.params = list(category = NULL, alpha_hilight = 1, alpha_no_hilight = 0.3)
# )
# ggsave(filename = "../plots/scGROv2p8_ExGmodules_max10kbp_binary_corr_res15_cnetplot.pdf", width=4, height=4, units="in")

### network diagram using igraph:

In [None]:
# names of all objects in package:igraph starting with "layout_",
# with the first match dropped
layouts = ls("package:igraph")
layouts = layouts[startsWith(layouts, "layout_")][-1]
layouts

In [None]:
# Plot the co-expression network around the genes of one GO term.
#
# Args:
#   corrMatrix: correlation table with `corr`, `pAdj` and `Gene` columns.
#               graph_from_data_frame() uses its first two columns as the edge
#               endpoints - presumably Gene and Enhancer; TODO confirm.
#   GOresult:   enrichment result, forwarded to getGOgenes().
#   GOterm:     GO ID(s), forwarded to getGOgenes().
#   layout:     layout function or coordinate matrix handed to plot.igraph().
#               Defaults to layout_nicely, igraph's own default. The previous
#               code passed layout_on_sphere under the unrecognised argument
#               name `vlayout`, so it never took effect; the default therefore
#               keeps the rendering unchanged. Pass layout = layout_on_sphere
#               to actually use the sphere layout.
#
# Returns:
#   the igraph graph, invisibly. The plot itself is drawn with base graphics as
#   a side effect (plot() returns NULL, which is what this function used to
#   return), so use a graphics device such as pdf()/svg() to save it.
networkPlot = function(corrMatrix, GOresult, GOterm, layout = layout_nicely ){
    genesList = getGOgenes(GOresult, GOterm);

    # significant, positively correlated pairs whose gene belongs to the GO term
    net = corrMatrix %>%
        filter( corr > 0.075 & pAdj <= 0.05 ) %>%
        mutate( Gene = sub("GN-", "", Gene) ) %>%
        filter( Gene %in% rownames(genesList) );
    
    net = graph_from_data_frame(net, directed = FALSE)

    # conditional color based on pAdj:
    # E(net)$color = ifelse(E(net)$pAdj <= 0.05 & E(net)$corr >= 0.1, "#ce968b", "gray")
    # conditional on corr value: stronger correlations are highlighted
    E(net)$color = ifelse(E(net)$corr > 0.1, "tomato", "gray")
    # V(net)$label.color = ifelse(substr(V(net), 0, 3) == "GN-", "black", "white")

    plot(net, 
         layout = layout,
         vertex.label.family = "Helvetica",
         vertex.label.font = 1,
         edge.arrow.size = .1, 
         # edge.color = "gray", 
         edge.width = 3,
         vertex.color = "#cee2f4", 
         vertex.label.color = "black",
         vertex.frame.color = "white")
    
    return(invisible(net));
}

In [None]:
# One network plot per GO term of interest (this cell and the following ones).
# NOTE(review): in the visible part of this notebook `results` is only assigned
# inside a commented-out cell - confirm it is defined before running these cells.
mRNA_stability = networkPlot(corrF, results, "GO:0043488")
# ggsave(filename = "../plots/scGROv2p8_ExGmodules_max10kbp_binary_corr_GO-0048002_network_test.png", width=12, height=12, units="in")

# SAVE DOES NOT WORK. SAVE MANUALLY AS SVG BY RIGHT CLICKING.
# (networkPlot draws with plot() on an igraph object rather than building a
# ggplot, which is presumably why ggsave() has nothing to save here.)

In [None]:
postTranscriptional_gene_regulation = networkPlot(corrF, results, "GO:0010608")

In [None]:
antigen_binding = networkPlot(corrF, results, "GO:0003823")

In [None]:
ATP_hydrolysis_activity = networkPlot(corrF, results, "GO:0016887")

In [None]:
enzyme_regulator_activity = networkPlot(corrF, results, "GO:0030234")

In [None]:
regulation_transposition = networkPlot(corrF, results, "GO:0010528")

In [None]:
sequence_specific_DNA_binding = networkPlot(corrF, results, "GO:1990837")

In [None]:
actin_filament = networkPlot(corrF, results, "GO:0005884")

In [None]:
alcohol_dehydrogenase_NADP_activity = networkPlot(corrF, results, "GO:0008106")

In [None]:
mitochondrial_envelope = networkPlot(corrF, results, "GO:0005740")

In [None]:
meiotic_spindle = networkPlot(corrF, results, "GO:0072687")

### Ideogram

In [None]:
# input tables for the ideogram: mm10 karyotype and mm10 gene density
# (mm10_gene_density is only referenced by the commented-out `overlaid`
# argument of the ideogram() call)
mm10_karyotype = fread("../data/mm10_karyotype.csv");
mm10_gene_density = fread("../data/mm10_gene_density.csv");

In [None]:
# Collect the genes of the given GO term(s) together with the features they
# are significantly correlated with, formatted as an ideogram marker table.
#
# Args:
#   corrMatrix: correlation table with `pAdj`, `Gene` and `Enhancer` columns.
#   GOresult:   enrichment result; converted with data.frame() and expected to
#               have `ID` and `geneID` columns.
#   GOterm:     one or more GO IDs.
#
# Uses the global `features` ranges object for the coordinates.
#
# Returns a data.frame (Type, Shape, Chr, Start, End, color) with one row per
# feature; row names are the feature names without the "GN-" prefix.
getGOfeatures = function( corrMatrix, GOresult, GOterm ){
    goRows = data.frame(GOresult) %>%
        filter( ID %in% GOterm );

    # geneID holds the members of each term separated by "/" (a "," is treated
    # as a separator as well); flatten everything into one vector
    # Ids = mapIds(org.Mm.eg.db, Ids, 'SYMBOL', 'ENTREZID');
    geneIDs = unlist(strsplit(gsub("/", ',', goRows$geneID), ","));

    # significant correlation pairs whose gene belongs to the GO term(s):
    corrPairs = corrMatrix %>%
        filter( pAdj < 0.05, sub("GN-", "", Gene) %in% geneIDs ); #  corr > 0.075 &

    # coordinates of every feature taking part in one of those pairs
    selFeatures = features %>%
        mutate( name = names) %>%
        filter( names %in% c(corrPairs$Gene, corrPairs$Enhancer)) %>%
        data.frame();

    # classify by name prefix: "GN-" = gene, "chr" = enhancer, anything else = SE
    prefix = substr(selFeatures$name, 0, 3);
    isGene = prefix == "GN-";
    isEnhancer = prefix == "chr";

    markerGenes = data.frame(
        Type = ifelse(isGene, "Gene", ifelse(isEnhancer, "Enhancer", "SE")),
        Shape = ifelse(isGene, "circle", ifelse(isEnhancer, "triangle", "box")),
        Chr = selFeatures$seqnames,
        Start = selFeatures$start,
        End = selFeatures$end,
        color = ifelse(isGene, "3a4664", ifelse(isEnhancer, "cc8921", "cf4a49")))

    rownames(markerGenes) = sub("GN-", "", selFeatures$name);

    return(markerGenes);
}

In [None]:
# Marker tables (genes plus their correlated features) for individual GO terms;
# each table is followed by its dimensions and its full content.
# NOTE(review): in the visible part of this notebook `results` is only assigned
# inside a commented-out cell - confirm it is defined before running this cell.
GO_0010608 = getGOfeatures(corrF, results, "GO:0010608");
dim(GO_0010608)
GO_0010608#[1:4, ]

GO_0003823 = getGOfeatures(corrF, results, "GO:0003823");
dim(GO_0003823)
GO_0003823#[1:4, ]

GO_0010528 = getGOfeatures(corrF, results, "GO:0010528");
dim(GO_0010528)
GO_0010528#[1:4, ]

GO_1990837 = getGOfeatures(corrF, results, "GO:1990837");
dim(GO_1990837)
GO_1990837#[1:4, ]

GO_0008106 = getGOfeatures(corrF, results, "GO:0008106");
dim(GO_0008106)
GO_0008106#[1:4, ]

GO_0005740 = getGOfeatures(corrF, results, "GO:0005740");
dim(GO_0005740)
GO_0005740#[1:4, ]


# all six GO terms combined into one marker table (used as ideogram labels)
GO_00 = getGOfeatures(corrF, results, c("GO:0010608", "GO:0003823", "GO:0010528", "GO:1990837", "GO:0008106", "GO:0005740"));
dim(GO_00)
GO_00[1:4, ]

# number of markers per chromosome
summary(as.factor(GO_00$Chr))

In [None]:
# Draw the mm10 ideogram with the combined GO marker table (GO_00) as labels
# and write it to an SVG file.
# NOTE(review): the output name says "res15" while the cells above use
# resolution 1 (resName = "res1") - confirm the file name is intended.
ideogram(karyotype = mm10_karyotype, # mm10_karyotype[1:10,] # for chrs 1:10 only
         # overlaid = mm10_gene_density,  
         label = GO_00, 
         label_type = "marker", 
         # position of legend
         Lx = 60, Ly = 35,
         output = "../plots/Ideogram_GO_scGROv2p8_ExGmodules_max10kbp_binary_corr_res15.svg")
# svg2pdf("../plots/Ideogram_ggmodules_3xtimebins10kbp_ChrConstrain_res4.svg")
# convertSVG("../plots/Ideogram_GO_0002475_scGROv2p8_ExGmodules_max10kbp_binary_corr.svg", device = "png")

In [None]:
# BED-style intervals for the features of a GO term, restricted to correlated
# pairs in which BOTH partners belong to the feature set returned by
# getGOfeatures() for that term.
#
# Args:
#   corrMatrix: correlation table with `corr`, `pAdj`, `Gene`, `Enhancer` columns.
#   GOresult:   enrichment result, forwarded to getGOfeatures().
#   GOterm:     GO ID(s), forwarded to getGOfeatures().
#
# Uses the global `features` ranges object for the coordinates.
#
# Returns a data.frame of the selected features after promoters() with 750 nt
# upstream and 750 nt downstream.
bedGOfeatures = function( corrMatrix, GOresult, GOterm ){
    termFeatures = rownames(getGOfeatures(corrMatrix, GOresult, GOterm));

    # Even if the corr instead of corrF matrix is used, the thresholds ensure
    # that only relatively strongly co-expressed pairs are considered
    corrPairs = corrMatrix %>%
        filter( corr > 0.075, pAdj < 0.05,
                sub("GN-", "", Gene) %in% termFeatures,
                Enhancer %in% termFeatures );

    features %>%
        filter( names %in% c(corrPairs$Gene, corrPairs$Enhancer)) %>%
        promoters( upstream = 750, downstream = 750 ) %>%
        data.frame()
}

In [None]:
# Write one BED file of promoters per GO ID in `results`.
# NOTE(review): bedGOgenes() is not defined in the visible part of this
# notebook (the function defined here is bedGOfeatures) - confirm it is
# available, e.g. from scGRO_functions.r, and check its thresholds. The
# directory name also says "GxG"/"res2" while the cells above analyse ExG
# modules at resolution 1 - confirm it is intended.
dir.name = "scGROv2p8_GxGmodules_max10kbp_binary_corr_res2_GO_promoters"
out.dir = paste0("../data/", dir.name);
# create the directory only when missing, so re-running the cell does not warn
if (!dir.exists(out.dir)) {
    dir.create(out.dir);
}

# unique(): a GO ID enriched in several clusters would otherwise rewrite the
# same file repeatedly. A plain for loop replaces foreach/%do%, which was only
# used for its side effects.
invisible(suppressMessages({
    for (id in unique(as.character(data.frame(results)$ID))) {
        # use corr instead of corrF; per the original note the function itself
        # filters on corr >= 0.1 & pAdj <= 0.05
        GO_promoters = bedGOgenes(corr, results, id);
        export.bed(GO_promoters, con=paste0( out.dir, "/", sub(":", "_", id), ".bed"));
    }
}));