In [None]:
# Restrict the package search path to a single user library
# (the path suggests a conda environment named "r422").
# Alternative kept for reference, adding a shared lab library as well:
# .libPaths(c("/home/mahat/.conda/envs/r422/lib/R/library",
#             "/net/bmc-lab2/data/lab/scgroseq/group/software/R/x86_64-pc-linux-gnu-library/4.2"))
.libPaths(new = "/home/mahat/.conda/envs/r422/lib/R/library")
# Echo the library paths now in effect.
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(matrixStats)
    library(foreach)
    library(doParallel)
    library(plyranges)
    library(viridis)
    library(ggpointdensity)
    library(Matrix)
    library(data.table)
});

In [None]:
# Register the parallel backend used by foreach's %dopar% (16 requested).
registerDoParallel(16)

# Notebook display defaults: 4 x 4 plots rendered as SVG, 5 significant digits.
options(
    digits = 5,
    jupyter.plot_mimetypes = "image/svg+xml",
    repr.plot.height = 4,
    repr.plot.width = 4
)

# Global ggplot theme: theme_classic() with bold black text, a centred bold
# title, no axis lines and a grey frame around the panel instead.
theme_set(
    theme_classic() +
        theme(
            plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
            axis.title.x = element_text(colour = "black", size = 14, face = "bold"),
            axis.title.y = element_text(colour = "black", size = 14, face = "bold"),
            axis.text = element_text(colour = "black", size = 12, face = "bold"),
            # axis.ticks = element_blank()
            axis.line = element_blank(),
            panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
        )
)

In [None]:
# Genes annotated with a dREG status column; used later as an expression filter.
genesWithdREGstatus <- readRDS("../data/groHMM_dREG_refinedGenes_mES_mm10.rds")
genesWithdREGstatus

# Feature annotation (BED). Earlier input files, kept for reference:
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed");
# features = read_bed("../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed");
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_OSNenhancersPlusSEs_v2.bed");
features <- read_bed("../data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")

# Promote the BED name column to range names, then drop the now-redundant
# name column and the score column.
names(features) <- features$name
features$score <- NULL
features$name <- NULL
features

In [None]:
# Gene counts by length, recorded from an earlier run:
#   shorter than 1500: 2931
#   shorter than 1000: 2209
#   shorter than  500:  484
#   shorter than  200:  147

# Short Gm### genes: names starting with "GN-Gm" (which implies the "GN-"
# gene prefix) and a width strictly between 200 and 1000.
shortGm <- features %>%
    filter(
        startsWith(names, "GN-Gm") &
            width > 200 &
            width < 1000
    )
length(shortGm)
summary(width(shortGm))
names(shortGm)[1:5]

# "Blacklist" of highly expressed genes to be removed from correlations:
# names matching GN-Sn[oh] first, followed by names matching GN-Rp[ls].
blacklist <- names(features)[c(
    grep("GN-Sn[oh]", names(features)),
    grep("GN-Rp[ls]", names(features))
)]
length(blacklist)
summary(width(features[blacklist]))
blacklist[c(1:5, 110:115)]

In [None]:
# Total feature length to trim (half from the start, half from the end) so
# that paused Pol II at the TSS and TES does not affect the signal.
trimEndLength <- 1e3
# Maximum gene length to be used for G-E correlation.
maxLength <- 1e4

In [None]:
# Select the gene features used downstream, then trim them.
#
# A feature is kept when all of the following hold:
#   * its name starts with "GN-";
#   * it is at least 200 nt wide (shorter ones are mostly Gm genes);
#   * it is flagged dREG == TRUE in genesWithdREGstatus, or is absent from
#     that table altogether;
#   * it is not on the blacklist of highly expressed genes;
#   * it is not one of the short (< 1 kb) Gm genes in shortGm.
genes <- features %>%
    filter( startsWith(names, "GN-") ) %>%
    filter( width >= 200 ) %>%
    filter( names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == TRUE]) |
           !names %in% names(genesWithdREGstatus) ) %>%
    filter( !names %in% blacklist ) %>%
    filter( !names %in% names(shortGm) )

# Two centre-anchored trimming passes. Each pass shortens every gene that is
# currently >= 1000 nt by 500 nt in total, i.e. 250 nt from each end.
# Genes of 1000-1499 nt are trimmed once; genes that were originally
# >= 1500 nt are still >= 1000 nt after the first pass and are trimmed again
# (500 nt per end overall).
# NOTE(review): trimEndLength is declared for this purpose but the literals
# below are what the analysis has always used; they are left unchanged.
for (pass in seq_len(2)) {
    trimGenes <- which( width(genes) >= 1000 )
    genes[trimGenes] <- genes[trimGenes] %>%
        anchor_center() %>%
        mutate( width = width - 500 )
}

# Cap long genes at maxLength (10 kb), keeping the start of each gene.
longf <- which( width(genes) >= maxLength )
genes[longf] <- genes[longf] %>%
    resize( width = maxLength, fix = "start" )
length(genes)
# summary(width(genes %>% filter(width <1000)))
summary(width(genes))

In [None]:
# Previously filtered count matrix; its column names define the set of cells
# carried through the rest of the notebook.
counts <- readRDS("../data/scGROv2p8_mapq3qc_max30kbp_filtered_counts.rds")
# Consolidated scGRO-seq reads.
scGRO <- readRDS("../data/scGROv2p8_consolidated.rds")
# Optional read-depth filter, currently disabled:
# counts = counts[,colSums(counts) >= 1000 ];
dim(counts)
allcells <- colnames(counts)

In [None]:
# Number of cells per six-character cell-ID prefix.
table(substr(allcells, 1, 6))

In [None]:
# Keep reads with mapq >= 3 that pass every QC flag.
scGRO <- filter(
    scGRO,
    mapq >= 3 & countQC & umiQC & plateQC & cellQC & miRQC
)
# Keep only reads overlapping one of the selected genes.
scGRO <- subsetByOverlaps(scGRO, genes)
# Merge experiment, plate and cell barcode into a cell ID that is unique
# across experiments.
scGRO <- mutate(scGRO, cellID = factor(paste(Exp, Plate, Cell, sep = "-")))
# Restrict to the cells present in the count matrix and keep cellID as the
# only metadata column.
scGRO <- filter(scGRO, cellID %in% allcells)
scGRO <- select(scGRO, cellID)

scGRO

In [None]:
# Build the genes x cells count matrix, one sparse column per cell, computed
# in parallel and column-bound with cbind2.
counts <- foreach(
    thisCell = allcells,
    .combine = "cbind2"
) %dopar% {
    # reads belonging to this cell
    cellReads <- filter(scGRO, cellID == thisCell)
    # number of those reads overlapping each gene, as a one-column sparse matrix
    Matrix(cbind(countOverlaps(genes, cellReads)), sparse = TRUE)
}
# rows = genes, columns = cells (cellIDs)
rownames(counts) <- names(genes)
colnames(counts) <- allcells

In [None]:
# Histogram of per-gene detection rate: for every gene, the fraction of cells
# in which it has at least one read (rowMeans(counts > 0)), on a log10 x axis.
# The title and x label used to read "Reads per gene" / "Reads per cell",
# which did not describe the plotted quantity.
data.frame(
    x = rowMeans(counts > 0)
) %>%
    ggplot( aes(x = x) ) +
    geom_histogram(binwidth = 0.05) +
    scale_x_log10() +
    ggtitle("Gene detection rate") +
    xlab("Fraction of cells with reads") +
    ylab("Number of genes")

In [None]:
# Only retain features with at least one read in 0.1% or more of the cells.
# NOTE(review): this comment used to say 0.5% while the code has always used
# 0.001 (0.1%); the code value is kept -- confirm which one was intended.
minCellFraction <- 0.001
# drop = FALSE keeps a matrix even if a single feature passes the filter.
observed <- counts[ rowMeans(counts > 0) >= minCellFraction, , drop = FALSE ]
dim(observed)

In [None]:
# Number of permutation iterations.
Niters <- 1000
# Dimensions of the filtered count matrix (rows = genes, columns = cells).
Ngenes <- dim(observed)[1]
Ncells <- dim(observed)[2]
# Total retained read count per cell.
Nreads <- colSums(observed)

In [None]:
# Per-cell totals over the retained genes.
bin1total <- colSums( observed[, allcells] )
# Within each cell, a gene's sampling probability is its read count divided
# by that cell's total (the double transpose divides column-wise).
pmatrix1 <- t( t(observed[, allcells]) / bin1total )

# One probability per gene: the average across cells.
pvector1 <- rowMeans( pmatrix1 )

In [None]:
# Presence/absence version of the observed counts (anything above 1 becomes 1).
obsx <- observed
obsx[obsx > 1] <- 1

# Earlier co-occurrence statistic, kept for reference:
#obsx = obsx %*% t(obsx) / 3 / Ncells;

# Signed chi-square-style statistic for every gene pair.
# Fraction of cells in which each gene is detected:
obs_avg <- rowMeans(obsx)
# Expected co-detection of each pair (meanA * meanB):
AxB <- obs_avg %*% t(obs_avg)
# Observed co-detection: because the data are binary, the product of the
# matrix with its own transpose counts the cells in which both genes are 1.
obsAB <- as.matrix(tcrossprod(obsx)) / Ncells
# (observed - expected)^2 / expected, positive when observed exceeds expected
# and negative otherwise.
obs_chisq <- ifelse(obsAB > AxB, 1, -1) * (obsAB - AxB)^2 / AxB

dim(obsx)

dim(obsx)

In [None]:
# Permutation null for the signed chi-square statistic.
#
# Each iteration redraws every read: genes are sampled with replacement using
# pvector1, and the draws are handed out to cells so that each cell keeps its
# observed read total. The simulated matrix is binarized and scored exactly
# like the observed one. The two accumulators count, per gene pair, the
# iterations in which the observed statistic is NOT greater (emp_p_pos) or NOT
# smaller (emp_p_neg) than the simulated one.
#
# NOTE(review): a gene that receives no simulated read in an iteration has
# sim_avg == 0, so its entries of sim_chisq are 0/0 = NaN; the comparisons then
# return NA and the affected accumulator cells stay NA for all later
# iterations. Confirm whether that is the intended treatment of rare genes.

# cell index repeated once per read of that cell
index1 <- rep(seq_len(Ncells), times = bin1total)
emp_p_pos <- matrix(0, nrow = Ngenes, ncol = Ngenes)
emp_p_neg <- matrix(0, nrow = Ngenes, ncol = Ngenes)
#emp_p     = matrix(0, nrow=Ngenes, ncol=Ngenes);
# number of reads to redraw in every iteration (loop invariant)
totalReads <- sum(bin1total)

for( n in seq_len(Niters) ) {
    if( n %% 100 == 0 ) {
        message(n)
    }

    # randomly sample from genes with replacement
    simx <- sample.int( Ngenes, size = totalReads, replace = TRUE, prob = pvector1 )

    # assign sampled genes to cells based on read count
    simx <- cbind(simx, index1)

    # binarize: keep each (gene, cell) combination once
    simx <- unique(simx)
    simx <- sparseMatrix(i = simx[, 1], j = simx[, 2], x = 1, dims = c(Ngenes, Ncells))

    suppressWarnings({
        # compute coexpression
        #simx = simx %*% t(simx) / 3 / Ncells;
        # compare to observed coexpression
        #emp_p = emp_p + (2*simx >= obsx);

        # chisq = (AB - A*B)^2 / (A*B), signed by the direction of the deviation
        # average gene expression from this permutation
        sim_avg <- rowMeans(simx)
        # expected and simulated coexpression for every pair
        AxB <- sim_avg %*% t(sim_avg)
        simAB <- as.matrix(simx %*% t(simx)) / Ncells
        sim_chisq <- (simAB - AxB)^2 / AxB
        sim_chisq <- sim_chisq * ifelse(simAB > AxB, 1, -1)
        emp_p_pos <- emp_p_pos + !(obs_chisq > sim_chisq)
        emp_p_neg <- emp_p_neg + !(obs_chisq < sim_chisq)
        # also add the old method of p-value by just observing the incidence of obsx smaller than simx
        #emp_p = emp_p + (simAB >= AxB);
    })
}

In [None]:
# Long-format table with one row per ordered gene pair.
#
# AxB was overwritten inside the permutation loop, so the expected
# co-detection is recomputed from the observed averages first.
# Matrices are flattened column by column, so within each row of the table
# geneA is the column index and geneB the row index of the source matrices.
AxB <- obs_avg %*% t(obs_avg)
geneIndex <- seq_len(Ngenes)
chr_corr <- data.frame(
    geneA = rep(geneIndex, each = Ngenes),
    geneB = rep(geneIndex, times = Ngenes),
    pA    = rep(obs_avg, each = Ngenes),
    pB    = rep(obs_avg, times = Ngenes),
    expAB = as.vector(AxB),
    pBoth = as.vector(obsAB),
    chisq = as.vector(obs_chisq),
    emp_p_pos = as.numeric(emp_p_pos) / Niters, # correlated cotranscription
    emp_p_neg = as.numeric(emp_p_neg) / Niters  # anticorrelated cotranscription
    #emp_p     = as.numeric(emp_p) / Niters  # anticorrelated cotranscription
)
# chr_corr = chr_corr[ chr_corr$geneA < chr_corr$geneB, ];
# Replace the integer indices by gene names.
chr_corr$geneA <- as.factor(rownames(observed)[chr_corr$geneA])
chr_corr$geneB <- as.factor(rownames(observed)[chr_corr$geneB])

In [None]:
# fwrite(chr_corr, file="../data/scGROv2p8_mapq3qc_max10kbp_1Kpermuted_chisq_empp.csv.gz");

In [None]:
# Load the stored pairwise statistics table and report its dimensions
# (the dimensions were previously printed twice).
chr_corrInput <- fread(file = "../data/scGROv2p8_mapq3qc_max10kbp_1Kpermuted_chisq_empp.csv.gz")
dim(chr_corrInput)

In [None]:
# Reduce the table to the pairs of interest:
#   1. geneA < geneB keeps each unordered pair once and drops self-pairs;
#   2. neither member may be on the blacklist;
#   3. both members must lie on the same chromosome.
chr_corr <- chr_corrInput %>%
    filter( geneA < geneB ) %>%
    filter( !(geneA %in% blacklist | geneB %in% blacklist) ) %>%
    filter(
        as.character(seqnames(features[geneA])) ==
            as.character(seqnames(features[geneB]))
    )
# Optional significance filter, currently disabled:
#     %>% filter(emp_p_pos <= 0.05);

In [None]:
# lookup gene attributes
# Look up the feature record of each member of every pair.
# The two lookups used to be crossed (geneB was built from chr_corr$geneA and
# vice versa), which stored geneB's length and strand in lengthA/strA and
# geneA's in lengthB/strB.
geneA <- features[ chr_corr$geneA ]
geneB <- features[ chr_corr$geneB ]
chr_corr$lengthA <- width(geneA)
chr_corr$lengthB <- width(geneB)
chr_corr$strA    <- as.character(strand(geneA))
chr_corr$strB    <- as.character(strand(geneB))
# FDR adjustment of the positive-correlation p-values over the retained pairs.
chr_corr$fdr_pos <- p.adjust(chr_corr$emp_p_pos, method = "fdr")

In [None]:
# compute distance between gene promoters
# Reduce every gene to a 1-nt range at its start, to stand in for the promoter.
proA <- resize(geneA, width = 1, fix = "start")
proB <- resize(geneB, width = 1, fix = "start")
# Remove strand so that the gap is computed for opposite-strand pairs too.
strand(proA) <- "*"
strand(proB) <- "*"
# Pair distance = width of the gap between the two promoter positions.
chr_corr$distance <- width(pgap(proA, proB))
# Number of pairs whose promoters are at least 2 Mb apart.
sum(chr_corr$distance >= 2e6)

# Keep pairs whose promoters are less than 10 Mb apart.
chr_corr <- filter(chr_corr, distance < 1e7)
nrow(chr_corr)

In [None]:
# Distance histograms (50 kb bins, 0-10 Mb) for all retained gene pairs and
# for the pairs with emp_p_pos <= 0.05. Each plot is stored, shown, and handed
# to ggsave() explicitly instead of relying on the implicit last plot.

# All pairs. The title used to read "Correlated genes on either strand"
# although no significance filter is applied and the file is named allGenes.
allPairsHist <- chr_corr %>%
    ggplot( aes(x = distance / 1000) ) +
    geom_histogram(binwidth = 50, color = "#445577", fill = "#445577") +
    xlim(0, 10000) +
    ggtitle("All gene pairs on either strand") +
    xlab("Distance between pair (kb)") +
    ylab("Number of feature pairs")
print(allPairsHist)
ggsave(
    filename = "../plots/Distance_between_allGenes_.pdf",
    plot = allPairsHist, width = 4, height = 4, units = "in"
)

# Positively correlated pairs only.
corrPairsHist <- chr_corr %>%
    filter(emp_p_pos <= 0.05) %>%
    ggplot( aes(x = distance / 1000) ) +
    geom_histogram(binwidth = 50, color = "#445577", fill = "#445577") +
    xlim(0, 10000) +
    ggtitle("Correlated genes on either strand") +
    xlab("Distance between pair (kb)") +
    ylab("Number of feature pairs")
print(corrPairsHist)
ggsave(
    filename = "../plots/Distance_between_corrGenes_.pdf",
    plot = corrPairsHist, width = 4, height = 4, units = "in"
)

In [None]:
# Distance histogram (50 kb bins, 0-10 Mb) for pairs on opposite strands.
# NOTE(review): the title says "Correlated" but no emp_p_pos filter is
# applied in this cell -- confirm whether one is intended.
ggplot(
    filter(chr_corr, strA != strB),
    aes(x = distance / 1000)
) +
    geom_histogram(binwidth = 50, color = "#445577", fill = "#445577") +
    # scale_x_log10(c(0,2000)) +
    xlim(0, 10000) +
    labs(
        title = "Correlated genes on different strands",
        x = "Distance between pair (kb)",
        y = "Number of feature pairs"
    )

In [None]:
# Distance histogram (50 kb bins, 0-10 Mb) for pairs on the same strand.
# NOTE(review): the title says "Correlated" but no emp_p_pos filter is
# applied in this cell -- confirm whether one is intended.
ggplot(
    filter(chr_corr, strA == strB),
    aes(x = distance / 1000)
) +
    geom_histogram(binwidth = 50, color = "#445577", fill = "#445577") +
    xlim(0, 10000) +
    labs(
        title = "Correlated genes on same strand",
        x = "Distance between pair (kb)",
        y = "Number of feature pairs"
    )

In [None]:
# -log10 empirical p-value against TSS distance for opposite-strand pairs
# at most 50 kb apart.
ggplot(
    filter(chr_corr, strA != strB & distance <= 50000),
    aes(x = distance, y = -log10(emp_p_pos))
) +
    geom_point(size = 0.5) +
    xlim(0, 50000) +
    scale_color_viridis() +
    labs(
        title = "Gene pairs on different strands",
        x = "Distance between gene TSS (bp)",
        y = "-log10 P"
    )

In [None]:
# -log10 empirical p-value against TSS distance for same-strand pairs
# at most 50 kb apart.
ggplot(
    filter(chr_corr, strA == strB & distance <= 50000),
    aes(x = distance, y = -log10(emp_p_pos))
) +
    geom_point(size = 0.5) +
    xlim(0, 50000) +
    scale_color_viridis() +
    labs(
        title = "Gene pairs on same strands",
        x = "Distance between gene TSS (bp)",
        y = "-log10 P"
    )

In [None]:
# Point-density scatter of -log10 empirical p-value against pair distance
# (0-500 kb) for opposite-strand pairs.
ggplot(
    filter(chr_corr, strA != strB),
    aes(x = distance / 1000, y = -log10(emp_p_pos))
) +
    geom_pointdensity(size = 0.5) +
    xlim(0, 500) +
    scale_color_viridis() +
    labs(
        title = "Gene pairs on different strands",
        x = "Distance between pair (kb)",
        y = "-log10 P"
    )

In [None]:
# Point-density scatter of -log10 empirical p-value against pair distance
# (0-500 kb) for same-strand pairs.
ggplot(
    filter(chr_corr, strA == strB),
    aes(x = distance / 1000, y = -log10(emp_p_pos))
) +
    geom_pointdensity(size = 0.5) +
    xlim(0, 500) +
    scale_color_viridis() +
    labs(
        title = "Gene pairs on same strands",
        x = "Distance between pair (kb)",
        y = "-log10 P"
    )

In [None]:
# Rows (gene pairs) and columns of the pair table at this point in the notebook.
dim(chr_corr)

In [None]:
# Fraction of cells with at least one read for each gene in `query`, shown as
# a one-column matrix (the double transpose turns the named vector into a
# column).
# NOTE(review): `query` is not defined in the visible part of the notebook --
# presumably a vector of gene names or row indices; confirm.
# drop = FALSE keeps a matrix when `query` selects a single gene; otherwise
# the subset collapses to a vector and rowMeans() fails.
rowMeans( observed[query, , drop = FALSE] > 0 ) %>% t %>% t