In [None]:
# Use the package library under the r422 conda environment path, then print
# the resulting library search path.
.libPaths(new = "/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(ggplot2)
    library(dplyr)
    library(matrixStats)
    library(foreach)
    library(doParallel)
    library(plyranges)
    library(viridis)
    library(ggpointdensity)
    library(Matrix)
    library(data.table)
    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    library(org.Mm.eg.db)
    library(rtracklayer)
});

In [None]:
# Notebook display defaults: 4 x 3 figures rendered as SVG.
options(repr.plot.width = 4, repr.plot.height = 3)
options(jupyter.plot_mimetypes = "image/svg+xml")

# Register the doParallel backend (argument 16) used by the %dopar% loop.
registerDoParallel(16)

# Global ggplot2 theme: classic theme with bold black axis text, a centered
# bold title, no axis lines, and a grey panel border instead.
theme_set(
    theme_classic() +
        theme(
            axis.title.x = element_text(color = "black", size = 14, face = "bold"),
            axis.title.y = element_text(color = "black", size = 14, face = "bold"),
            axis.text    = element_text(color = "black", size = 12, face = "bold"),
            plot.title   = element_text(face = "bold", size = 14, hjust = 0.5),
            axis.line    = element_blank(),
            # axis.ticks = element_blank()
            panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
        )
)

In [None]:
# Maximum gene length to be used for the gene-enhancer (G-E) correlation.
geneLength <- 1e4

In [None]:
# Genes with their dREG status; used later as an expression filter.
genesWithdREGstatus <- readRDS("../data/groHMM_dREG_refinedGenes_mES_mm10.rds")
genesWithdREGstatus

# Feature set read from BED. Alternative inputs used previously:
#   ../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed
#   ../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed
#   ../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_OSNenhancersPlusSEs_v2.bed
features <- read_bed("../data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")

# Index the ranges by feature name (the `name` column is kept) and drop the
# BED score column.
names(features) <- features$name
features$score <- NULL
features

In [None]:
# Gene counts by length cutoff, as recorded by the author:
#   shorter than 1500: 2931
#   shorter than 1000: 2209
#   shorter than  500:  484
#   shorter than  200:  147

# Short "Gm###" genes: name begins with "GN-Gm" and 200 < width < 1000.
shortGm <- features %>%
    filter(substr(names, 1, 5) == "GN-Gm" & width > 200 & width < 1000)
length(shortGm)
summary(width(shortGm))
names(shortGm)[1:5]

# Highly expressed "blacklist" genes to be removed from correlations:
# names matching "GN-Sn[oh]" first, followed by names matching "GN-Rp[ls]".
blacklist <- c(
    grep("GN-Sn[oh]", names(features), value = TRUE),
    grep("GN-Rp[ls]", names(features), value = TRUE)
)
length(blacklist)
summary(width(features[blacklist]))
blacklist[c(1:5, 110:115)]

In [None]:
# Select the genes used for the correlation analysis.
genes <- features %>%
    # gene features carry the "GN-" name prefix
    filter(substr(names, 1, 3) == "GN-") %>%
    # remove genes shorter than 1500 nt
    filter(width >= 1500) %>%
    # keep genes flagged dREG-positive, plus genes absent from the dREG table.
    # `%in% TRUE` treats an NA dREG flag as "not positive" instead of
    # producing an NA subscript, and avoids the reassignable shorthand `T`.
    filter(
        names %in% names(genesWithdREGstatus)[genesWithdREGstatus$dREG %in% TRUE] |
            !names %in% names(genesWithdREGstatus)
    ) %>%
    # remove the highly expressed blacklist genes
    filter(!names %in% blacklist) %>%
    # remove the short Gm genes collected in shortGm
    filter(!names %in% names(shortGm)) %>%
    # remove 500 nt at the 5' end (the 3' end stays fixed)
    anchor_3p() %>%
    mutate(width = width - 500)
summary(width(genes))
genes

In [None]:
# Enhancers: every feature without the "GN-" gene prefix that is at least
# 3000 wide.
enhancers <- features %>%
    filter(substr(names, 1, 3) != "GN-" & width >= 3000)
summary(width(enhancers))
# tally enhancers by the first three characters of their name
table(substr(enhancers$name, 1, 3))
enhancers

In [None]:
# Load the dREG peak table and name its six columns.
dREG <- read.table(
    "../data/PROseq_mES_BRsComb.dREG.peak.full.bed",
    header = FALSE,
    stringsAsFactors = FALSE
)
colnames(dREG) <- c("chr", "start", "end", "score", "pval", "center")
# alternative kept for reference -- use the "center" column as start:
# colnames(dREG) = c("chr", "leftEnd", "rightEnd", "score", "pval", "start");
# dREG$end = dREG$start;
dREG <- GRanges(dREG)
summary(dREG$score)
# optional filter kept for reference -- drop peaks scoring at or below the median:
# dREG = dREG %>% filter( score > median(score) );

# Mark each dREG peak with an overlapping enhancer (from features),
# ignoring strand.
hits <- findOverlaps(dREG, enhancers, ignore.strand = TRUE)
dREG$assignedEnhancer <- NA_character_
# Use the Hits accessors rather than the @from/@to slots. When a peak
# overlaps several enhancers, the last hit is the one that is kept.
dREG$assignedEnhancer[queryHits(hits)] <- names(enhancers)[subjectHits(hits)]

# Keep, per enhancer, only the dREG peak(s) with the highest score.
# Peaks tied for the maximum are all retained.
dREGfilt <- dREG %>%
    # drop dREG peaks that do not overlap an enhancer
    na.omit() %>%
    group_by(assignedEnhancer) %>%
    filter(score == max(score)) %>%
    ungroup() %>%
    # data.table form for the join that builds Ecenter
    as.data.table
dim(dREGfilt)
table(substr(dREGfilt$assignedEnhancer, 1, 3))
length(unique(dREGfilt$assignedEnhancer))
dREGfilt[1:4, ]

In [None]:
# Attach the best dREG peak (its center) to each enhancer. Columns of
# dREGfilt are selected by name rather than by position (formerly 6:9), so
# the join does not silently break if the column layout changes.
Ecenter <- left_join(
    as.data.table(enhancers),
    dREGfilt[, c("score", "pval", "center", "assignedEnhancer")],
    by = c("name" = "assignedEnhancer")
) %>%
    # enhancers without an assigned dREG peak have NA here and are dropped
    na.omit()
# (conversion to GRanges and naming by `name` happen later, per strand bin)
# only the peak center is needed downstream
Ecenter$pval <- NULL
Ecenter$score <- NULL
Ecenter[1:4, ]

In [None]:
# Sense-direction enhancer bin: "+" strand, starting at the dREG center and
# ending at the enhancer end or center + 1750, whichever is further.
Edown <- Ecenter %>%
    mutate(
        strand = "+",
        start  = center,
        end    = pmax(end, center + 1750)
    ) %>%
    GRanges() %>%
    # hold the 3' end fixed and shorten the bin by 250
    anchor_3p() %>%
    mutate(width = width - 250)
    # # no need for this because the genes will be filtered against enhancers:
    # subsetByOverlaps( genes, invert = TRUE );
names(Edown) <- Edown$name
Edown$center <- NULL
Edown

# Anti-sense enhancer bin: "-" strand, ending at the dREG center and
# starting at the enhancer start or center - 1750, whichever is further.
Eup <- Ecenter %>%
    mutate(
        strand = "-",
        start  = pmin(start, center - 1750),
        end    = center
    ) %>%
    GRanges() %>%
    anchor_3p() %>%
    mutate(width = width - 250)
names(Eup) <- Eup$name
Eup$center <- NULL
Eup

In [None]:
# Combine both enhancer bins with the filtered genes into one sorted set.
expressedFeatures <- sort(c(Eup, Edown, genes))
summary(width(expressedFeatures))
expressedFeatures

# Trim features of at least geneLength down to geneLength, anchored at
# their start (fix = "start"). The limit comes from the `geneLength`
# parameter instead of a second hard-coded 10000.
longf <- which(width(expressedFeatures) >= geneLength)
expressedFeatures[longf] <- resize(
    expressedFeatures[longf],
    width = geneLength,
    fix = "start"
)
summary(width(expressedFeatures))
expressedFeatures

In [None]:
# Exploratory check for features that overlap one another; the result
# (`test`) is only displayed here.
# NOTE(review): findOverlaps() is called on expressedFeatures alone, so
# self-hits are presumably included -- confirm whether drop.self=TRUE was
# intended.
hits = findOverlaps(expressedFeatures)
# this adds an `overlappingFeature` metadata column to expressedFeatures itself
expressedFeatures$overlappingFeature = NA;
# assign overlapping feature; when a range has several hits, the later
# assignment overwrites the earlier one
expressedFeatures$overlappingFeature[hits@from] = names(expressedFeatures)[hits@to];

# keep the groups (by overlappingFeature) that contain a single distinct name
test = expressedFeatures %>%
    na.omit() %>%
    group_by( overlappingFeature ) %>%
    filter( n_distinct(name) == 1 ) %>%
    ungroup();
test

In [None]:
# Load the consolidated scGRO object and the filtered count matrix.
scGRO  <- readRDS("../data/scGROv2p8_consolidated.rds")
counts <- readRDS("../data/scGROv2p8_mapq3qc_max10kbp_filtered_counts.rds")
# optional depth filter, disabled:
# counts = counts[,colSums(counts) >= 1000 ];
dim(counts)

# The column names of the count matrix are the cell IDs used below.
allcells <- colnames(counts)
# tally cells by the first six characters of their ID
table(substr(allcells, 1, 6))

In [None]:
# Quick look at the expression of the OSN genes: mean count per gene ...
test <- counts[
    rownames(counts) %in% c("GN-Pou5f1", "GN-Sox2", "GN-Nanog", "GN-Sox2long"),
]
t(rowMeans(test))

# ... and the distribution of per-cell counts for each gene.
Pou5f1 <- counts[rownames(counts) == "GN-Pou5f1", ]
table(Pou5f1)
Nanog <- counts[rownames(counts) == "GN-Nanog", ]
table(Nanog)
Sox2 <- counts[rownames(counts) == "GN-Sox2", ]
table(Sox2)
Sox2long <- counts[rownames(counts) == "GN-Sox2long", ]
table(Sox2long)

In [None]:
# Keep reads with mapq >= 3 that pass every QC flag.
scGRO <- filter(
    scGRO,
    mapq >= 3 & countQC & umiQC & plateQC & cellQC & miRQC
)
# Keep reads overlapping any expressed feature.
scGRO <- subsetByOverlaps(scGRO, expressedFeatures)
# Merge experiment, plate and cell barcode into a cell ID that is unique
# across experiments.
scGRO <- mutate(scGRO, cellID = factor(paste(Exp, Plate, Cell, sep = "-")))
# Keep only cells present in the count matrix.
scGRO <- filter(scGRO, cellID %in% allcells)
    # select( cellID );
scGRO

In [None]:
# Build the feature x cell count matrix, one sparse column per cell,
# with the columns bound together by cbind2.
counts <- foreach(id = allcells, .combine = "cbind2") %dopar% {
    # reads belonging to this cell
    cell_reads <- filter(scGRO, cellID == id)
    # number of this cell's reads overlapping each feature
    per_feature <- countOverlaps(expressedFeatures, cell_reads)
    # one-column sparse matrix for this cell
    Matrix(cbind(per_feature), sparse = TRUE)
}
# columns = cells (cell IDs), rows = features
colnames(counts) <- allcells
rownames(counts) <- names(expressedFeatures)
dim(counts)

In [None]:
# Collapse rows that share a row name by summing them: the sense and
# anti-sense bins of an enhancer carry the same name.
counts <- rowsum(counts, group = rownames(counts))
dim(counts)
# rowSums(counts)
# colSums(counts)

In [None]:
# Retain features with a non-zero count in at least 0.1% of cells.
observed <- counts[rowMeans(counts > 0) >= 0.001, ]
dim(observed)

# tally retained features by the first three characters of their name
test <- substr(rownames(observed), 1, 3)
summary(as.factor(test))

# Restrict the gene and enhancer sets to the retained features.
genes <- filter(genes, names %in% rownames(observed))
length(genes)
enhancers <- filter(enhancers, names %in% rownames(observed))
length(enhancers)

In [None]:
# Distribution of per-cell totals over the retained features.
observed %>% colSums() %>% summary()

In [None]:
# convert read counts into probability -- each cell column sums to 1
# (t(observed) puts cells in rows, so dividing by colSums(observed) scales
# every cell by its own total before transposing back)
pmatrix = t(t(observed) / colSums(observed));
# average across cells; used as the feature sampling weights in the simulation
pvector = rowMeans(pmatrix);

In [None]:
# Dimensions of the observed data and of the simulation.
Niters <- 1000                 # number of simulated datasets
Ncells <- ncol(observed)
Ngenes <- length(genes)
Nenh   <- length(enhancers)
Nreads <- colSums(observed)    # per-cell read totals
samplesize <- sum(Nreads)      # total reads drawn per simulation

# Repeat each cell index by its read count, so the i-th simulated read is
# assigned to cell_index[i]. seq_len() stays well-defined when Ncells is 0.
cell_index <- rep(seq_len(Ncells), times = Nreads)

In [None]:
# clear unused memory
gc()
# memory.size() ### check the current memory size
# memory.limit() ## check the set limit
# memory.limit(size=56000) ### raise the limit; 56000 is the value proposed for 64-bit

In [None]:
# Binarize observed counts: any count above 1 becomes 1, separately for the
# gene rows and the enhancer rows of `observed`.
obs_genes = observed[names(genes),];
obs_genes[ obs_genes>1 ] = 1;
obs_enh = observed[names(enhancers),];
obs_enh[ obs_enh>1 ] = 1;

# Multiplying the binary gene matrix by the transposed binary enhancer
# matrix counts, for every gene-enhancer pair, the cells in which both
# are 1. Earlier version, kept for reference:
#obsx = obs_genes %*% t(obs_enh) / Ncells;

# Signed chi-square-style statistic per gene-enhancer pair.
# expected co-occurrence = product of the two marginal detection frequencies
GxE = rowMeans(obs_genes) %*% t(rowMeans(obs_enh));
# observed co-occurrence frequency (Ngenes x Nenh)
obsGE = as.matrix(obs_genes %*% t(obs_enh)) / Ncells;
obs_chisq = (obsGE-GxE)^2/GxE;
# sign: positive when observed exceeds expected, negative otherwise
obs_chisq = obs_chisq * ifelse(obsGE>GxE, 1, -1);
dim(obsGE)

In [None]:
# Empirical p-values by simulation.
# emp_p_pos[g, e] counts the simulations whose signed chi-square is at least
# the observed one; emp_p_neg[g, e] counts those where it is at most the
# observed one. Both are divided by Niters later. Pairs whose statistic is
# undefined (expected frequency 0) become NA.
emp_p_pos <- matrix(0, nrow = Ngenes, ncol = Nenh)
emp_p_neg <- matrix(0, nrow = Ngenes, ncol = Nenh)
# emp_p     = matrix(0, nrow=Ngenes, ncol=Nenh);

# Row positions of genes and enhancers within `observed`. These do not change
# between iterations, so they are resolved once here instead of subsetting
# by name four times per iteration.
gene_rows <- match(names(genes), rownames(observed))
enh_rows  <- match(names(enhancers), rownames(observed))
stopifnot(
    !anyNA(gene_rows),
    !anyNA(enh_rows),
    length(pvector) == Ngenes + Nenh
)

for (n in seq_len(Niters)) {
    # report progress every 100 iterations
    if (n %% 100 == 0) {
        message(n)
    }

    # draw one feature (row index of `observed`) per read, with replacement,
    # weighted by the mean per-cell feature probability
    draws <- sample(Ngenes + Nenh, size = samplesize, replace = TRUE, prob = pvector)
    # pair each draw with its cell; unique() binarizes (feature, cell) pairs
    simx <- unique(cbind(draws, cell_index))
    simx <- sparseMatrix(
        i = simx[, 1], j = simx[, 2], x = 1,
        dims = c(Ngenes + Nenh, Ncells)
    )

    # older co-expression comparison, kept for reference:
    #simx = simx[names(genes),] %*% t(simx[names(enhancers),]) / Ncells;
    #emp_p = emp_p + (simx >= obsx);

    # Signed chi-square of the simulated data, compared with the observed one.
    # The simulated expectation gets its own name (simGxE) so the observed
    # GxE is not overwritten by the loop.
    suppressWarnings({
        sim_genes <- simx[gene_rows, ]
        sim_enh   <- simx[enh_rows, ]
        simGxE <- rowMeans(sim_genes) %*% t(rowMeans(sim_enh))
        simGE  <- as.matrix(sim_genes %*% t(sim_enh)) / Ncells
        sim_chisq <- (simGE - simGxE)^2 / simGxE
        sim_chisq <- sim_chisq * ifelse(simGE > simGxE, 1, -1)
        emp_p_pos <- emp_p_pos + !(obs_chisq > sim_chisq)
        emp_p_neg <- emp_p_neg + !(obs_chisq < sim_chisq)
        # emp_p_pos = emp_p_pos + (!(GxE > simGE & obs_chisq > sim_chisq));
        # emp_p_neg = emp_p_neg + (!(GxE < simGE & obs_chisq > sim_chisq));
        # old method: count how often the simulated value reaches the expectation
        # emp_p = emp_p + (simGE >= GxE);
    })
}

In [None]:
# Recompute the observed expectation, then flatten every Ngenes x Nenh
# matrix column-wise into one long table with a row per gene-enhancer pair.
# Genes cycle fastest; each enhancer is repeated Ngenes times.
GxE <- rowMeans(obs_genes) %*% t(rowMeans(obs_enh))
chr_corr <- data.frame(
    Gene       = as.factor(rep(names(genes), times = Nenh)),
    Enhancer   = as.factor(rep(names(enhancers), each = Ngenes)),
    pGene      = rep(rowMeans(obs_genes), Nenh),
    pEnh       = rep(rowMeans(obs_enh), each = Ngenes),
    expCotrans = as.vector(GxE),
    obsCotrans = as.vector(obsGE),
    chisq      = as.vector(obs_chisq),
    # correlated cotranscription
    emp_p_pos  = as.numeric(emp_p_pos) / Niters,
    # anticorrelated cotranscription
    emp_p_neg  = as.numeric(emp_p_neg) / Niters
    # emp_p      = as.numeric(emp_p) / Niters
)

In [None]:
# Write the full pair table to disk. Earlier output names kept for reference:
# fwrite(chr_corr, file="../data/scGROv2p8_Gene10kb_Enh_v1genes_1Kpermuted_chisq_empp.csv.gz");
# fwrite(chr_corr, file="../data/scGROv2p8_Gene10kb_Enh_genes_10Kpermuted_chisq_empp.csv.gz");
fwrite(chr_corr, file="../data/scGROv2p8_ExG_1Kpermuted_chisq_empp.csv.gz");

In [None]:
# number of gene-enhancer pairs (rows) and columns in the table
dim(chr_corr)

In [None]:
# Count the pairs whose positive empirical p-value is defined. is.na() is
# the correct missing-value test; the former comparison against the string
# "NA" only worked through implicit numeric-to-character coercion.
test <- chr_corr %>%
    filter(!is.na(emp_p_pos))
dim(test)

## Read back the results file written above for the downstream analyses:

In [None]:
# Read the saved pair table back in. Earlier inputs kept for reference:
# chr_corrInp = fread(file="../data/scGROv2p8_Gene10kb_Enh_v1genes_1Kpermuted_chisq_empp.csv.gz");
# chr_corrInp = fread(file="../data/scGROv2p8_Gene10kb_Enh_genes_10Kpermuted_chisq_empp.csv.gz");
chr_corrInp = fread(file="../data/scGROv2p8_ExG_1Kpermuted_chisq_empp.csv.gz");
dim(chr_corrInp)

In [None]:
# Keep pairs whose gene and enhancer lie on the same chromosome and that
# pass either empirical p-value cutoff.
chr_corr <- filter(
    chr_corrInp,
    as.character(seqnames(expressedFeatures[Gene])) ==
        as.character(seqnames(expressedFeatures[Enhancer])),
    emp_p_pos <= 0.2 | emp_p_neg <= 0.1
)

In [None]:
# Look up the range of each pair's gene, and record its length and strand.
feat1 <- expressedFeatures[chr_corr$Gene]
chr_corr$geneLength <- width(feat1)
chr_corr$geneStrand <- as.character(strand(feat1))

# Same for each pair's enhancer.
feat2 <- expressedFeatures[chr_corr$Enhancer]
chr_corr$enhsLength <- width(feat2)
chr_corr$enhsStrand <- as.character(strand(feat2))

In [None]:
# Distance between the two features of a pair: collapse each range to its
# 1-wide start (fix = "start"), clear the strand, and take the gap width.
pro1 <- resize(feat1, width = 1, fix = "start")
strand(pro1) <- "*"
pro2 <- resize(feat2, width = 1, fix = "start")
strand(pro2) <- "*"
chr_corr$distance <- width(pgap(pro1, pro2))

# Keep pairs closer than 2 Mb, then FDR-adjust the positive p-values.
# NOTE(review): chr_corr was already restricted to emp_p_pos <= 0.2 |
# emp_p_neg <= 0.1 when it was built, so this adjustment runs on a set
# selected by p-value -- confirm that is intended.
chr_corr <- filter(chr_corr, distance < 2e6)
chr_corr$fdr_pos <- p.adjust(chr_corr$emp_p_pos, method = "fdr")
nrow(chr_corr)

In [None]:
# preview the first rows of the annotated pair table
head(chr_corr)

In [None]:
# Histogram of pair distances in kb (50 kb bins, 0-2000 kb shown).
ggplot(chr_corr, aes(x = distance / 1000)) +
    geom_histogram(binwidth = 50) +
    xlim(0, 2000) +
    labs(
        title = "Correlated gene-enhancer pairs",
        x = "Distance between pair (kb)",
        y = "Number of feature pairs"
    )

In [None]:
# -log10 of the positive empirical p-value against distance (bp), for pairs
# with emp_p_pos <= 0.05 within 200 kb; the x axis is limited to 0-100 kb.
ggplot(
    filter(chr_corr, distance <= 200000 & emp_p_pos <= 0.05),
    aes(x = distance, y = -log10(emp_p_pos))
) +
    geom_point(size = 0.5) +
    xlim(0, 100000) +
    scale_color_viridis() +
    geom_smooth(method = "lm", color = "red", alpha = 0.5, formula = y ~ x) +
    labs(
        title = "Gene-enhancer pairs",
        x = "Distance between pair (bp)",
        y = "-log10 P"
    )

In [None]:
# Same view for the negative (anticorrelated) empirical p-value; the fitted
# line is drawn underneath the points.
ggplot(
    filter(chr_corr, distance <= 200000 & emp_p_neg <= 0.05),
    aes(x = distance, y = -log10(emp_p_neg))
) +
    geom_smooth(method = "lm", color = "red", alpha = 0.5, formula = y ~ x) +
    geom_point(size = 0.5) +
    xlim(0, 100000) +
    scale_color_viridis() +
    labs(
        title = "Gene-enhancer pairs Negatively co-transcription",
        x = "Distance between pair (bp)",
        y = "-log10 P"
    )

In [None]:
# Point-density view of -log10(emp_p_pos) against distance in kb (0-1000 kb),
# for pairs with emp_p_pos <= 0.05, with a linear fit.
ggplot(
    filter(chr_corr, emp_p_pos <= 0.05),
    aes(x = distance / 1000, y = -log10(emp_p_pos))
) +
    geom_pointdensity(size = 0.5) +
    geom_smooth(method = "lm", color = "red", alpha = 0.5, formula = y ~ x) +
    xlim(0, 1000) +
    scale_color_viridis() +
    labs(
        title = "Feature pairs",
        x = "Distance between pair (kb)",
        y = "-log10 P"
    )

In [None]:
# Point-density view of -log10(emp_p_neg) against distance in kb (0-1000 kb),
# for pairs with emp_p_neg <= 0.05, with a linear fit.
ggplot(
    filter(chr_corr, emp_p_neg <= 0.05),
    aes(x = distance / 1000, y = -log10(emp_p_neg))
) +
    geom_pointdensity(size = 0.5) +
    geom_smooth(method = "lm", color = "red", alpha = 0.5, formula = y ~ x) +
    xlim(0, 1000) +
    scale_color_viridis() +
    labs(
        title = "Feature pairs Negatively co-transcribed",
        x = "Distance between pair (kb)",
        y = "-log10 P"
    )

In [None]:
# Genes of interest.
query_genes <- c("GN-Nanog", "GN-Sox2", "GN-Sox2long", "GN-Pou5f1", "GN-Klf4")

# Candidate enhancer names, grouped by locus.
query_enh <- c(
    # Sox2
    "Sox2_200kbUp", "Sox2_2kbDn", "Sox2_SRR1_4kbUp", "Sox2_SRR2_2kbDn",
    "Sox2_18kbDn", "Sox2_85kbDn", "Sox2_95kbDn", "Sox2_106kbDn",
    "Sox2_107kbDn", "Sox2_109kbDn", "Sox2_111kbDn", "Sox2_105_109kbDn",
    "Sox2_109_112kbDn", "Sox2_SCR_Dn", "Sox2_105kbDn", "Sox2_1MbDn",
    # Klf4
    "Klf4_E1_67kbUp", "Klf4_E2_58kbUp", "Klf4_E3_56kbUp", "Klf4_E123_55kbUp",
    "Klf4_50kbUp",
    # Nanog
    "Nanog_5kbUp", "Nanog_45kbUp", "Nanog_60kbDn",
    # Pou5f1
    "Pou5f1_25kbUp", "Pou5f1_20kbUp", "Pou5f1_3kbUp", "Pou5f1_39kbDn",
    "Pou5f1_80kbDn"
)

# Expand to the bare names followed by their "_pl" and "_mn" variants
# (column-major order: all bare, then all "_pl", then all "_mn"), and keep
# only the names that exist in `features`.
query_enh <- as.vector(outer(query_enh, c("", "_pl", "_mn"), paste0))
query_enh <- query_enh[query_enh %in% names(features)]

query <- c(query_genes, query_enh)

In [None]:
# Same-chromosome pairs in which both the gene and the enhancer are in the
# query set, taken from the unfiltered table.
OSN_corr <- filter(
    chr_corrInp,
    as.character(seqnames(expressedFeatures[Gene])) ==
        as.character(seqnames(expressedFeatures[Enhancer])),
    Gene %in% query & Enhancer %in% query
)
dim(OSN_corr)
OSN_corr$fdr_pos <- p.adjust(OSN_corr$emp_p_pos, method = "fdr")

# Show the query pairs passing either empirical p-value cutoff.
filter(OSN_corr, emp_p_pos <= 0.2 | emp_p_neg <= 0.1)
    # alternative filters tried:
    # filter( fdr_pos >= 0.05 );
    # filter( Gene %in% query & Enhancer %in% query );
    # filter( Gene %in% query_genes );
    # filter( Enhancer %in% query ) %>%
    # filter( emp_p_pos <= 0.5 & distance < 1000000 );

In [None]:
# Super-enhancer (SE) ranges read from BED.
SEmm10 <- read_bed("~/group/genes_enhancer_list/mm10_mESC_OSN_SE_nonOverlappingWithGenes_v2.bed")
length(SEmm10)
SEmm10

# Genes within 1 Mb of an SE.
SEneighbors <- subsetByOverlaps(genes, SEmm10, maxgap = 1000000)
# Nearest SE for each neighbor gene, ignoring strand.
# NOTE(review): the subsetByOverlaps() call above does not set
# ignore.strand -- confirm the two steps are meant to treat strand differently.
hits <- distanceToNearest(SEneighbors, SEmm10, ignore.strand = TRUE)
#hits
# Fill by query index through the Hits accessors, so a gene without a
# nearest SE gets NA instead of shifting the remaining assignments.
SEneighbors$SE <- NA_character_
SEneighbors$distance <- NA_integer_
SEneighbors$SE[queryHits(hits)] <- as.character(SEmm10$name[subjectHits(hits)])
SEneighbors$distance[queryHits(hits)] <- mcols(hits)$distance
SEneighbors
write.csv(SEneighbors, file = "../data/SEs_neighbor_genes.csv")

# Names used to look up SE-gene pairs in the correlation table.
SEs <- SEneighbors$SE
SEgenes <- names(SEneighbors)
query_SE <- c(SEs, SEgenes)
length(unique(SEgenes))
length(unique(SEs))

In [None]:
# SE-gene pairs from the filtered table: both members in the SE query set,
# emp_p_pos < 0.05, at most 100 kb apart, and both detection frequencies
# at least 0.025.
SEcorr <- filter(
    chr_corr,
    Gene %in% query_SE & Enhancer %in% query_SE,
    emp_p_pos < 0.05 & distance <= 100000,
    pGene >= 0.025 & pEnh >= 0.025
)
    # optional: filter( substr(Gene, 0, 5) != "GN-Gm");

length(unique(SEcorr$Gene))
length(unique(SEcorr$Enhancer))
# % of SEs that have a protein coding gene within the distance limit:
length(unique(SEcorr$Enhancer)) / length(unique(SEs)) * 100

SEcorr

In [None]:
## SE-assigned genes from the original paper:
SEmm9 <- read.csv("~/group/mES_PROseq_RNAseq_ATACseq/SE_Whyte_Young_2013/SE_proximalGenes_mm9.csv")
# keep only the rows flagged as super-enhancers
SEmm9 <- filter(GRanges(SEmm9), isSuper == "YES")
SEmm9
length(unique(SEmm9$proximal_gene))

# Gene list table; the first row is dropped (presumably a header line --
# confirm) before the columns are named.
glist <- read.table("~/group/genes_enhancer_list/mm10RefFlat_glist.txt")
glist <- glist[2:nrow(glist), ]
colnames(glist) <- c("name", "NMname", "chr", "strand", "start", "end")
# write.table(glist[,c(3,5,6,1,2,4)], file="~/group/genes_enhancer_list/mm10RefFlat_glist.bed", row.names=F, col.names=F, sep='\t', quote=F)

# Genes whose NMname is listed as an SE proximal gene, renamed with the
# "GN-" prefix used for gene features.
SEglist <- GRanges(glist) %>%
    filter(NMname %in% SEmm9$proximal_gene) %>%
    mutate(name = paste0("GN-", name))

SEs <- SEmm9$ID
SEgenes <- SEglist$name
query_SE <- c(SEs, SEgenes)
length(unique(SEgenes))
length(unique(SEs))

In [None]:
# SE-gene pairs from the unfiltered table: same chromosome, both members in
# the SE query set, and both detection frequencies at least 0.01.
SEcorr <- filter(
    chr_corrInp,
    as.character(seqnames(expressedFeatures[Gene])) ==
        as.character(seqnames(expressedFeatures[Enhancer])),
    Gene %in% query_SE & Enhancer %in% query_SE,
    pGene >= 0.01 & pEnh >= 0.01
)
SEcorr$fdr_pos <- p.adjust(SEcorr$emp_p_pos, method = "fdr")

In [None]:
# Keep the significant pairs (emp_p_pos < 0.05). The query-set and
# detection-frequency conditions are applied here as well.
SEcorr <- filter(
    SEcorr,
    Gene %in% query_SE & Enhancer %in% query_SE,
    emp_p_pos < 0.05,
    pGene >= 0.01 & pEnh >= 0.01
)
    # optional: filter( substr(Gene, 0, 5) != "GN-Gm");

length(unique(SEcorr$Gene))
length(unique(SEcorr$Enhancer))
# % of SEs that have a protein coding gene within the distance limit:
length(unique(SEcorr$Enhancer)) / length(unique(SEs)) * 100

SEcorr