In [None]:
# Point R at the conda environment's package library, then print the active
# library search path so the setting can be checked.
# NOTE(review): the path is hard-coded to one user's home directory.
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(dplyr)
    library(matrixStats)
    library(foreach)
    library(doParallel)
    library(plyranges)
    library(viridis)
    library(ggpointdensity)
    library(Matrix)
    library(data.table)
    library(Hmisc)
    library(gplots)
    library(rstatix)
    library(scattermore)
    library(igraph)
    library(ggraph)
    library(clusterProfiler)
    library(stringr)
    library(org.Mm.eg.db)
    library(circlize)
    library(gplots)
    library(RIdeogram)
    library(enrichplot)
    library(GOSemSim)
    library(dynamicTreeCut)
    library(WGCNA)
    library(pheatmap)
    library(seriation)
    library(dendextend)
    library(Seurat)
    library(DESeq2)
    library(lattice)
    # library(ComplexHeatmap)
});
source("./scGRO_functions.r");

In [None]:
# Register a 16-worker backend for the foreach / %dopar% loops in this notebook.
registerDoParallel(16)

# Notebook display defaults: 8 x 6 plots emitted as SVG, 5 significant digits.
options(
    repr.plot.width = 8,
    repr.plot.height = 6,
    jupyter.plot_mimetypes = "image/svg+xml",
    digits = 5
)

# Global ggplot2 theme: classic theme with bold black axis text, a centred bold
# title, and a grey panel border drawn in place of the axis lines.
theme_set(
    theme_classic() +
        theme(
            axis.title.x = element_text(color = "black", size = 14, face = "bold"),
            axis.title.y = element_text(color = "black", size = 14, face = "bold"),
            axis.text = element_text(color = "black", size = 12, face = "bold"),
            plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
            axis.line = element_blank(),
            # axis.ticks = element_blank()
            panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
        )
)

In [None]:
# Genes annotated with a dREG status; used further down as an expression filter.
genesWithdREGstatus <- readRDS("../data/groHMM_dREG_refinedGenes_mES_mm10.rds")
genesWithdREGstatus
# load groHMM-extended genes and enhancers
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed");

# Feature ranges read from BED and keyed by their BED name column.
features <- read_bed("../data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")
# features = read_bed("../data/mES_BRsComb_dREGfiltered_features_customized_OSNenhancersPlusSEs_v1.bed");
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_OSNenhancersPlusSEs_v2.bed");
names(features) <- features$name
# features$name=NULL;
features$score <- NULL
summary(width(features))
features

# Cap every feature longer than 20 kb at 20 kb, anchored at its start.
longf <- which(width(features) > 20000)
features[longf] <- features[longf] %>%
    # filter( substr(names, 0, 3) == "GN=" ) %>%
    resize(width = 20000, fix = "start")
summary(width(features))
features

In [None]:
# OSNK = read.table("~/group/genes_enhancer_list/Custom_mES_OSN_v4.bed");
# OSNK
# test = OSNK[OSNK[,3] - OSNK[,2] < 0, ]
# test

In [None]:
# Oct4, Sox2, Nanog, Klf4 loci as 5k bins; bin names end in "_<bin number>".
OSNK <- read_bed("~/group/genes_enhancer_list/Custom_mES_OSN_v4.bed") %>%
    # keep only bins whose numeric suffix is 4 or lower:
    filter(as.numeric(sub("^.*_", "", name)) <= 4) %>%
    sort()
names(OSNK) <- OSNK$name
# customOSN$score = NULL;

# Hand-curated SE -> gene assignment for the OSNK loci, with the SE position
# relative to its gene.
name <- c(
    "INT_STITCHED_7784", "INT_STITCHED_1482", "INT_STITCHED_1973",
    "INT_STITCHED_3347", "INT_STITCHED_3348", "INT_STITCHED_3349"
)
assignedGene <- c(
    "GN-Pou5f1", "GN-Sox2", "GN-Klf4",
    "GN-Nanog", "GN-Nanog", "GN-Nanog"
)
SElocation <- c(
    "Upstream", "Downstream", "Upstream",
    "Upstream", "Upstream", "Downstream"
)
OSNKassignment <- data.frame(name, assignedGene, SElocation)
OSNKassignment

## SEs that were CRISPRed in Moorthy 2017 Genome Research paper:
vSE <- read_bed("~/group/genes_enhancer_list/CRISPR_verified_SEs_Moorthy_2017.bed") %>%
    # keep only bins whose numeric suffix is 4 or lower:
    filter(as.numeric(sub("^.*_", "", name)) <= 4) %>%
    sort()
names(vSE) <- vSE$name
# verifiedSE$score = NULL;

# Hand-curated SE -> gene assignment for the CRISPR-verified SEs.
name <- c(
    "INT_STITCHED_4179", "INT_STITCHED_5044",
    "INT_STITCHED_2745", "INT_STITCHED_2175", "INT_STITCHED_5427",
    "INT_STITCHED_1210", "INT_STITCHED_6113", "INT_STITCHED_6112",
    "INT_STITCHED_4657", "INT_STITCHED_1626", "INT_STITCHED_746"
)
assignedGene <- c(
    "GN-Sall1", "GN-Tet1",
    "GN-Med13l", "GN-Macf1", "GN-Ranbp17",
    "GN-Cbfa2t2", "GN-Esrrb", "GN-Esrrb",
    "GN-Ooep", "GN-Mcl1", "GN-Etl4"
)
SElocation <- c(
    "Downstream", "Upstream",
    "Upstream", "Intron", "Upstream",
    "Upstream", "Intron", "Intron",
    "Upstream", "Upstream", "Intron"
)
vSEassignment <- data.frame(name, assignedGene, SElocation)
vSEassignment

# Stack the OSNK and vSE assignment tables.
SEassignment <- rbind(OSNKassignment, vSEassignment)
SEassignment

# SE names and gene names for every assignment where the SE is not intronic:
nonIntronSEs <- SEassignment$name[SEassignment$SElocation != "Intron"]
width(features[nonIntronSEs])
unique(nonIntronSEs)
nonIntronGenes <- SEassignment$assignedGene[SEassignment$SElocation != "Intron"]
width(features[nonIntronGenes])
unique(nonIntronGenes)

# Combine the OSNK and vSE bins and keep only those whose name, with the bin
# suffix stripped, is one of the non-intronic SEs or their assigned genes.
verifiedSE <- suppressWarnings(c(OSNK, vSE)) %>%
    filter(sub("_[^_]+$", "", names) %in% c(nonIntronSEs, nonIntronGenes))
summary(width(verifiedSE))
verifiedSE

In [None]:
# Chromosomes represented among the verified SE / gene bins:
unique(seqnames(verifiedSE))
# Distinct feature names once the trailing "_<bin>" suffix is stripped:
table(unique(sub("_[^_]+$", "", names(verifiedSE))))

In [None]:
# Keep only the super-enhancer (SE) bins, dropping the gene bins:
SEonly <- verifiedSE %>%
    filter(substr(names, 0, 3) == "INT")
length(SEonly)

# All features within 5 Mb of any SE bin.
# NOTE(review): subsetByOverlaps() is strand-aware by default, whereas the
# distance calculation below ignores strand -- confirm the SE ranges are
# unstranded, otherwise opposite-strand neighbours are excluded here.
SEneighborsAll <- features %>%
    # keep genes flagged dREG == TRUE, plus any feature absent from the dREG table
    filter(names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == TRUE]) |
           !names %in% names(genesWithdREGstatus)) %>%
    # ignoring the Gm genes:
    filter(!substr(names, 0, 5) == "GN-Gm") %>%
    subsetByOverlaps(SEonly, maxgap = 5000000)
length(SEneighborsAll)

# Nearest SE bin (strand ignored) for every neighbour, with its distance.
hits <- distanceToNearest(SEneighborsAll, SEonly, ignore.strand = TRUE)
# Align the annotations through the Hits accessors rather than raw S4 slots.
# Subsetting by queryHits() keeps each neighbour paired with its own hit even
# if some query had no nearest SE (it is a no-op when every query has a hit).
SEneighborsAll <- SEneighborsAll[queryHits(hits)]
# SE name with the trailing "_<bin>" suffix removed:
SEneighborsAll$SE <- sub("_[^_]+$", "", names(SEonly)[subjectHits(hits)])
SEneighborsAll$distance <- mcols(hits)$distance

SEneighbors <- SEneighborsAll %>%
    filter(substr(names, 0, 3) == "GN-") %>%
    # drop the SE-assigned genes so that the bins don't run into these genes
    filter(!names %in% nonIntronGenes) %>%
    # 20 kb is needed to divide a gene into four 5 kb bins
    filter(width >= 20000) %>%
    # keep near (< 1 Mb) and far (> 4 Mb) genes; discard the 1-4 Mb band
    filter(distance < 1000000 | distance > 4000000) %>%
    # label the gene as inside ("IN") or outside ("OUT") the TAD by distance:
    mutate(TADstatus = ifelse(distance <= 1000000, "IN", "OUT"))

# Keep only genes that overlap no other retained gene: reduce() merges
# overlapping ranges, so only ranges that were not merged still match a reduced
# range exactly (type = "equal").
SEneighbors <- SEneighbors %>%
    subsetByOverlaps(reduce(SEneighbors), type = "equal")

# Every other neighbouring feature (used later to mask overlapping bins):
SEneighborsOthers <- SEneighborsAll %>%
    filter(!names %in% names(SEneighbors))
length(SEneighborsOthers)

summary(as.factor(SEneighbors$TADstatus))
length(unique(names(SEneighbors)))
length(unique(SEneighbors$SE))
SEneighbors

In [None]:
# Feature-by-cell count matrix; its column names are the cell IDs.
counts <- readRDS("../data/scGROv2p8_mapq3qc_max20kbp_filtered_counts.rds")
#counts = counts[,colSums(counts) >= 1000 ];
dim(counts)
allcells <- colnames(counts)

In [None]:
# Expression of the genes assigned to verified, non-intronic SEs.
verifiedSEgenesON <- counts[nonIntronGenes, ]
dim(verifiedSEgenesON)
# fraction of cells with at least one read, per gene:
summary(rowMeans(verifiedSEgenesON > 0))
# number of these genes detected, per cell:
summary(as.factor(colSums(verifiedSEgenesON > 0)))
# percentage of cells with at least one read in any verified-SE gene:
verifiedSEgenesExpressingCells <- sum(colSums(verifiedSEgenesON) > 0)
verifiedSEgenesExpressingCells / ncol(counts) * 100

# Subset the SE-neighbour genes so their expression resembles the verified-SE genes.
SEneighborsON <- counts[names(SEneighbors), ]
dim(SEneighborsON)
summary(rowMeans(SEneighborsON > 0))
# Keep neighbours whose fraction of expressing cells lies within the range seen
# for the verified-SE genes. Both sides of the comparison use the same metric
# (fraction of cells with > 0 reads); the previous code compared mean counts
# against that fraction.
verifiedSEfractionRange <- range(rowMeans(verifiedSEgenesON > 0))
SEneighborsFraction <- rowMeans(SEneighborsON > 0)
SEneighborsON <- SEneighborsON[
    SEneighborsFraction >= verifiedSEfractionRange[1] &
        SEneighborsFraction <= verifiedSEfractionRange[2],
    ,
    drop = FALSE
]
dim(SEneighborsON)
summary(rowMeans(SEneighborsON > 0))

# how many SEneighbor genes inside and outside of TAD survive the expression filter?
test <- SEneighbors %>%
    filter(names %in% rownames(SEneighborsON))
summary(as.factor(test$TADstatus))

In [None]:
# Tile the expression-filtered SE-neighbour genes into four consecutive 5 kb
# bins. A bin is dropped when it overlaps any other neighbouring feature
# (SEneighborsOthers); bins 2-4 are additionally dropped when they overlap the
# preceding bin set.
# Bin 1: first 5 kb of each gene.
SEneighbors1 <- resize(SEneighbors[rownames(SEneighborsON)], width = 5000, fix = "start") %>%
    subsetByOverlaps(SEneighborsOthers, invert = TRUE)
names(SEneighbors1) <- paste0(SEneighbors1$name, "_1")
length(SEneighbors1)
# # 5 kb upstream region - divergent transcription
# SEneighborsDiv1 = shift_upstream(SEneighbors1, 5000) %>%
#     mutate( strand = ifelse(strand == "+", "-", "+") ) %>%
#     subsetByOverlaps( c(SEneighbors1), invert = TRUE );
# names(SEneighborsDiv1) = paste0(SEneighborsDiv1$name, "_1")

# Bin 2: bin 1 shifted 5 kb downstream.
SEneighbors2 <- shift_downstream(SEneighbors1, 5000) %>%
    subsetByOverlaps(c(SEneighborsOthers, SEneighbors1), invert = TRUE)
names(SEneighbors2) <- paste0(SEneighbors2$name, "_2")
length(SEneighbors2)
# Bin 3: bin 2 shifted 5 kb downstream.
SEneighbors3 <- shift_downstream(SEneighbors2, 5000) %>%
    subsetByOverlaps(c(SEneighborsOthers, SEneighbors2), invert = TRUE)
names(SEneighbors3) <- paste0(SEneighbors3$name, "_3")
length(SEneighbors3)
# Bin 4: bin 3 shifted 5 kb downstream.
SEneighbors4 <- shift_downstream(SEneighbors3, 5000) %>%
    subsetByOverlaps(c(SEneighborsOthers, SEneighbors3), invert = TRUE)
names(SEneighbors4) <- paste0(SEneighbors4$name, "_4")
length(SEneighbors4)

In [None]:
# Cell-cycle marker gene table (source: https://github.com/hbc/tinyatlas).
cc_markers <- fread("../data/Cell_cycle_marker_genes.csv")
# cc_markers[1:4, ]

# Features whose name, minus the "GN-" prefix, is listed as an S-phase marker.
s_genes <- features %>%
    filter(sub("GN-", "", names) %in% cc_markers[cc_markers$phase == "S"]$gene_name) %>%
    mutate(phase = "S")
length(s_genes)

# Same selection for the G2/M-phase markers.
g2m_genes <- features %>%
    filter(sub("GN-", "", names) %in% cc_markers[cc_markers$phase == "G2/M"]$gene_name) %>%
    mutate(phase = "G2/M")
length(g2m_genes)

# Both marker sets together, each range tagged with its phase.
ccMarkers <- c(s_genes, g2m_genes)
summary(width(ccMarkers))
summary(as.factor(ccMarkers$phase))
ccMarkers

# Dense count matrix of the S-phase marker genes across all cells.
s_genesON <- as.matrix(counts[names(s_genes), ])
summary(rowMeans(s_genesON > 0))
table(colSums(s_genesON > 0))
# keep cells in which at least 10% of the S-phase markers have a read:
s_genesON <- s_genesON[, colSums(s_genesON > 0) >= length(s_genes) / 10]
# percentage of all cells that pass this filter:
ncol(s_genesON) / ncol(counts) * 100

# # converting into a binary matrix:
# s_genesON[s_genesON > 0] = 1;

# set every entry below 2 reads to 0:
s_genesON[s_genesON < 2] <- 0

# Column-clustered heatmap written to PDF.
pdf("../plots/S_phase_genes_heatmap.pdf", width = 20, height = 20)
heatmap.2(
    s_genesON,
    dendrogram = "column", scale = "none",
    # Rowv=FALSE, Colv=FALSE,
    # col=gray.colors(1,0,n=2), col=cm.colors(2),
    col = colorRampPalette(c("white", "red"))(n = 2),
    trace = "none", keysize = 2, density.info = "none",
    labRow = FALSE, labCol = FALSE
)
dev.off()
# Row- and column-clustered pheatmap of the same matrix, shown in the notebook.
s_genesONhm <- pheatmap(
    s_genesON,
    cluster_cols = TRUE,
    cluster_rows = TRUE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(15),
    border_color = "blue",
    cellwidth = 0.1,
    cellheight = 7.5,
    # cutree_cols = 6,
    # fontsize_col = 8,
    fontsize_row = 8
)
# ggsave(s_genesONhm, filename = "../plots/S_phase_genes_heatmap.pdf", width=20, height=20);   

# Dense count matrix of the G2/M-phase marker genes across all cells.
g2m_genesON <- as.matrix(counts[names(g2m_genes), ])
summary(rowMeans(g2m_genesON > 0))
table(colSums(g2m_genesON > 0))
# keep cells in which at least 10% of the G2/M markers have a read:
g2m_genesON <- g2m_genesON[, colSums(g2m_genesON > 0) >= length(g2m_genes) / 10]

# # converting into a binary matrix:
# g2m_genesON[g2m_genesON > 0] = 1;

# set every entry below 2 reads to 0:
g2m_genesON[g2m_genesON < 2] <- 0

# Column-clustered heatmap written to PDF.
pdf("../plots/G2M_phase_genes_heatmap.pdf", width = 20, height = 20)
heatmap.2(
    g2m_genesON,
    dendrogram = "column", scale = "none",
    # Rowv=FALSE, Colv=FALSE,
    # col=gray.colors(1,0,n=2), col=cm.colors(2),
    col = colorRampPalette(c("white", "red"))(n = 10),
    trace = "none", keysize = 2, density.info = "none",
    labRow = FALSE, labCol = FALSE
)
dev.off()
# Row- and column-clustered pheatmap of the same matrix, with the column
# dendrogram cut into 6 groups.
g2m_genesONhm <- pheatmap(
    g2m_genesON,
    cluster_cols = TRUE,
    cluster_rows = TRUE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(15),
    border_color = "blue",
    cellwidth = 0.1,
    cellheight = 5,
    cutree_cols = 6,
    # fontsize_col = 8,
    fontsize_row = 8
)
# ggsave(g2m_genesONhm, filename = "../plots/G2M_phase_genes_heatmap.pdf", width=20, height=20);

# G2/M marker pair taken from the Long Cai intron seqFISH study:
# "cells were sorted by G2/M marker gene mRNA levels (Aurka and Plk1)"
g2mCai_genes <- features[c("GN-Aurka", "GN-Plk1")]

# # get all features in the neighborhood of ccMarker genes:
# ccNeighbors = features %>%
#     # filter genes with dREG peaks
#     filter(names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == T]) | 
#            !names %in% names(genesWithdREGstatus) ) %>%
#     # ignoring the Gm genes:
#     filter( !substr(names, 0, 5) == "GN-Gm" ) %>%
#     # only find neighbors within 20kb region:
#     subsetByOverlaps(ccMarkers, maxgap=20000) %>%
#     filter(substr(names, 0, 3) == "GN-") %>%
#     # filter SE assigned genes so that the bins don't run into these genes
#     filter( !names %in% names(ccMarkers) );
# ccNeighbors


# # create 5kbp time-bins of ccMarkers:
# # remove bins if they overlap with ccneighbors and the previous bins
# ccMarkers1 = resize( ccMarkers, width=5000, fix="start") %>%
#     subsetByOverlaps( ccNeighbors, invert = TRUE );
# names(ccMarkers1) = paste0(ccMarkers1$name, "_1");
# length(ccMarkers1)
# # # 5 kb upstream region - divergent transcription
# # SEneighborsDiv1 = shift_upstream(SEneighbors1, 5000) %>%
# #     mutate( strand = ifelse(strand == "+", "-", "+") ) %>%
# #     subsetByOverlaps( c(SEneighbors1), invert = TRUE );
# # names(SEneighborsDiv1) = paste0(SEneighborsDiv1$name, "_1")

# ccMarkers2 = shift_downstream(ccMarkers1, 5000) %>%
#     subsetByOverlaps( ccNeighbors, invert = TRUE );
# names(ccMarkers2) = paste0(ccMarkers2$name, "_2")
# length(ccMarkers2)
# ccMarkers3 = shift_downstream(ccMarkers2, 5000) %>%
#     subsetByOverlaps( ccNeighbors, invert = TRUE );
# names(ccMarkers3) = paste0(ccMarkers3$name, "_3")
# length(ccMarkers3)
# ccMarkers4 = shift_downstream(ccMarkers3, 5000) %>%
#     subsetByOverlaps( ccNeighbors, invert = TRUE );
# names(ccMarkers4) = paste0(ccMarkers4$name, "_4")
# length(ccMarkers4)

In [None]:
# Histone genes: features whose name starts with "GN-Hist".
histGenes <- features %>%
    filter(substr(names, 0, 7) == "GN-Hist") #%>%
    # filter( names %in% rownames(counts[rowSums(counts) > 0,]) );
    # filter( names %in% rownames(counts[rowMeans(counts) > 0.001,]) );
summary(as.factor(seqnames(histGenes)))
summary(width(histGenes))

# Dense count matrix of the histone genes across all cells.
histGenesON <- as.matrix(counts[names(histGenes), ])
dim(histGenesON)
summary(rowMeans(histGenesON > 0))
table(colSums(histGenesON > 0))
# percentage of cells with at least one histone gene detected:
histoneExpressingCells <- ncol(histGenesON[, colSums(histGenesON > 0) > 0])
histoneExpressingCells / ncol(counts) * 100

# cap counts at 1, so the matrix holds only 0 / 1:
histGenesON[histGenesON > 1] <- 1

# Column-clustered heatmap written to PDF.
pdf("../plots/histoneGenes_heatmap.pdf", width = 20, height = 20)
heatmap.2(
    histGenesON,
    dendrogram = "column", scale = "none",
    # Rowv=FALSE, Colv=FALSE,
    # col=gray.colors(1,0,n=2), col=cm.colors(2),
    col = colorRampPalette(c("white", "red"))(n = 2),
    trace = "none", keysize = 2, density.info = "none",
    labRow = FALSE, labCol = FALSE
)
dev.off()

# Cap histone genes longer than 5 kb at 5 kb (one bin), anchored at the start.
longf <- which(width(histGenes) > 5000)
histGenes[longf] <- histGenes[longf] %>%
    resize(width = 5000, fix = "start")
summary(width(histGenes))
sum(width(histGenes))
histGenes

In [None]:
# New cell-cycle marker gene sets, replacing the ccMarkers defined earlier.

# G1 markers: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2600614/#pone.0003943.s001
G1_Beyrouthy <- c("GN-Fos", "GN-Egr1", "GN-Atf3", "GN-Myadm") # "GN-Ctgf"
# G1/S and G2/M markers: https://www.nature.com/articles/s41467-022-30545-8
G1S_Riba <- c("GN-Orc1", "GN-Ccne1", "GN-Ccne2", "GN-Mcm6")
G2M_Riba <- c("GN-Wee1", "GN-Cdk1", "GN-Ccnf", "GN-Nusap1", "GN-Aurka", "GN-Ccna2") # "GN-Ccnb2"

# Feature ranges for all three marker sets, in the order listed above.
ccMarkers <- features[c(G1_Beyrouthy, G1S_Riba, G2M_Riba)]
width(ccMarkers)

# # https://www.molbiolcell.org/doi/epdf/10.1091/mbc.02-02-0030
# G1S_Whitfield = c("GN-Ccne1", "GN-Ccne2", "GN-Cdc25a", "GN-Cdc6", "GN-E2f1", "GN-Mcm2", "GN-Mcm6", "GN-Npat", "GN-Pcna", "GN-Slbp")
# S_Whitfield = c("GN-Brca1", "GN-Dhfr", "GN-Msh2", "GN-Rrm1", "GN-Rrm2", "GN-Tyms")
# # G2_Whitfield = c("GN-Ccna2", "GN-Ccnf", "GN-Top2a")
# G2M_Whitfield = c("GN-Birc5", "GN-Bub1", "GN-Bub1b", "GN-Ccnb1", "GN-Ccnb2", "GN-Cdc20", "GN-Cdc25b", "GN-Cenpa", "GN-Cks2", "GN-Plk1") #  "GN-Cdkn2d" - short and lost due to overlap,  "GN-Stk15" -- not present in mouse

# ccMarkers = features[ c(G1S_Whitfield, S_Whitfield, G2M_Whitfield) ]
# width(ccMarkers)

# Genes lying within 20 kb of any cell-cycle marker gene.
ccNeighbors <- features %>%
    # keep genes flagged dREG == T, plus any feature absent from the dREG table
    filter(names %in% names(genesWithdREGstatus[genesWithdREGstatus$dREG == T]) |
           !names %in% names(genesWithdREGstatus)) %>%
    # ignoring the Gm genes:
    filter(!substr(names, 0, 5) == "GN-Gm") %>%
    # only find neighbors within 20kb region:
    subsetByOverlaps(ccMarkers, maxgap = 20000) %>%
    filter(substr(names, 0, 3) == "GN-") %>%
    # exclude the marker genes themselves
    filter(!names %in% names(ccMarkers))

# Tile the marker genes into four consecutive 5 kb bins, dropping any bin that
# overlaps a neighbouring gene.
# Bin 1: first 5 kb; only overlaps of at least 1000 bp cause removal here.
ccMarkers1 <- resize(ccMarkers, width = 5000, fix = "start") %>%
    subsetByOverlaps(ccNeighbors, invert = TRUE, minoverlap = 1000)
names(ccMarkers1) <- paste0(ccMarkers1$name, "_1")
length(ccMarkers1)
names(ccMarkers1)
ccMarkers1
# # 5 kb upstream region - divergent transcription
# SEneighborsDiv1 = shift_upstream(SEneighbors1, 5000) %>%
#     mutate( strand = ifelse(strand == "+", "-", "+") ) %>%
#     subsetByOverlaps( c(SEneighbors1), invert = TRUE );
# names(SEneighborsDiv1) = paste0(SEneighborsDiv1$name, "_1")

# Bin 2: bin 1 shifted 5 kb downstream.
ccMarkers2 <- shift_downstream(ccMarkers1, 5000) %>%
    subsetByOverlaps(ccNeighbors, invert = TRUE)
names(ccMarkers2) <- paste0(ccMarkers2$name, "_2")
length(ccMarkers2)
# Bin 3: bin 2 shifted 5 kb downstream.
ccMarkers3 <- shift_downstream(ccMarkers2, 5000) %>%
    subsetByOverlaps(ccNeighbors, invert = TRUE)
names(ccMarkers3) <- paste0(ccMarkers3$name, "_3")
length(ccMarkers3)
# Bin 4: bin 3 shifted 5 kb downstream.
ccMarkers4 <- shift_downstream(ccMarkers3, 5000) %>%
    subsetByOverlaps(ccNeighbors, invert = TRUE)
names(ccMarkers4) <- paste0(ccMarkers4$name, "_4")
length(ccMarkers4)

In [None]:
# Total genomic length (bp) covered by each marker set, for comparison with
# the total length of the histone gene set.
sum(width(features[ G1_Beyrouthy ]))
sum(width(features[ G1S_Riba ]))
sum(width(features[ G2M_Riba ]))
sum(width(histGenes))

# sum(width(features[ G1S_Whitfield ]))
# sum(width(features[ S_Whitfield ]))
# sum(width(features[ G2M_Whitfield ]))

In [None]:
# Combine every binned feature set (histone genes are appended further down)
# and keep only ranges longer than 1 kb, so the 1 kb truncation below always
# leaves a positive width.
allfeatures <- c(
    verifiedSE,
    SEneighbors1,
    SEneighbors2,
    SEneighbors3,
    SEneighbors4,
    ccMarkers1,
    ccMarkers2,
    ccMarkers3,
    ccMarkers4
) %>%
    filter(width > 1000)
allfeatures

# Remove the first 1 kb from every bin-1 range (3' end held fixed).
# Bin membership is read from the range names, not the `name` metadata column:
# the SEneighbors / ccMarkers bins received their "_<bin>" suffix only in
# names(), so testing `$name` missed their bin-1 ranges and produced NA
# coercion warnings.
bin1 <- which(sub("^.*_", "", names(allfeatures)) == "1")
allfeatures[bin1] <- allfeatures[bin1] %>%
    anchor_3p() %>%
    mutate(width = width - 1000)
summary(width(allfeatures))

# add histone genes, without truncating
allfeatures <- c(allfeatures, histGenes)
summary(width(allfeatures))

In [None]:
# Consolidated scGRO read-level data.
scGRO <- readRDS("../data/scGROv2p8_consolidated.rds")
# counts = readRDS("../data/scGROv2p8_mapq3qc_max20kbp_filtered_counts.rds");
# #counts = counts[,colSums(counts) >= 1000 ];
# dim(counts)
# allcells = colnames(counts);

In [None]:
# Number of cells per 6-character cell-ID prefix.
table(substr(allcells, 0, 6))

In [None]:
# Keep reads that pass every QC flag and overlap one of the selected features,
# build a cell ID unique across experiments ("Exp-Plate-Cell"), and retain only
# reads from cells present in the count matrix.
scGRO <- scGRO %>%
    filter(mapq >= 3 & countQC & umiQC & plateQC & cellQC & miRQC) %>%
    # filter( !(Exp == "Exp236" & Plate %in% paste0("c0", 5:8)) ) %>%
    # filter( !(Exp == "Exp260b") ) %>%
    subsetByOverlaps(allfeatures) %>%
    # subsetByOverlaps( verifiedSE ) %>%
    mutate(cellID = factor(paste(Exp, Plate, Cell, sep = "-"))) %>%
    filter(cellID %in% allcells) #%>%
    # select( cellID );
scGRO

In [None]:
# Build a feature-by-cell count matrix: each parallel task handles one cell and
# returns a one-column sparse matrix; cbind2 joins the columns together.
counts <- foreach(
    id = allcells,
    .combine = "cbind2"
) %dopar% {
    # reads belonging to this cell
    cellReads <- scGRO %>% filter(cellID == id)
    # number of this cell's reads overlapping each feature
    perFeature <- countOverlaps(allfeatures, cellReads)
    # perFeature = countOverlaps( verifiedSE, cellReads );

    # value of the block: this cell's counts as a sparse single-column matrix
    Matrix(cbind(perFeature), sparse = TRUE)
}
# columns = cells = cellIDs
colnames(counts) <- allcells
rownames(counts) <- names(allfeatures)
# rownames(counts) = names(verifiedSE);
dim(counts)

In [None]:
# Sum rows that share a row name (the enhancers at various bins carry the
# same names).
counts <- rowsum(counts, row.names(counts))
dim(counts)
# rowSums(counts)
# colSums(counts)

In [None]:
# Gene-level counts: strip the trailing "_<bin>" suffix from the row names and
# sum the rows that then share a name.
newCounts <- counts
row.names(newCounts) <- sub("_[^_]+$", "", row.names(newCounts))
dim(newCounts)
newCounts <- rowsum(newCounts, row.names(newCounts))
dim(newCounts)

In [None]:
# Do cells expressing G1, G1/S, histone and G2/M genes form distinct groups?
# Rows are kept in that order: G1, G1/S, histone, G2/M.
G1S_Hist_G2M <- as.matrix(newCounts[c(G1_Beyrouthy, G1S_Riba, names(histGenes), G2M_Riba), ])
# cap counts at 1, so the matrix holds only 0 / 1:
G1S_Hist_G2M[G1S_Hist_G2M > 1] <- 1
dim(G1S_Hist_G2M)
# Column-clustered two-colour heatmap; rows are not clustered.
# https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/
hmap <- pheatmap(
    G1S_Hist_G2M,
    cluster_cols = TRUE,
    cluster_rows = FALSE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(2),
    border_color = NA,
    # cellwidth = 3,
    # cellheight = 3
    # fontsize_col = 8,
    fontsize_row = 3
)
ggsave(filename = "../plots/G1S_HistoneGenes_G2M_heatmap.pdf", plot = hmap, width = 20, height = 20)

In [None]:
# Cells with no reads in any marker / histone gene of G1S_Hist_G2M.
# drop = FALSE keeps the matrix shape (and the column names used below) even
# when only one cell qualifies.
emptyCells <- G1S_Hist_G2M[, colSums(G1S_Hist_G2M) == 0, drop = FALSE]
dim(emptyCells)

# The same cells, taken from the per-bin count matrix.
emptyCells <- counts[, colnames(emptyCells), drop = FALSE]
dim(emptyCells)

# Per-cell read totals for all cells followed by the empty cells, concatenated
# into one named vector (names prefixed "allCells." / "emptyCells.").
new <- c(allCells = colSums(counts), emptyCells = colSums(emptyCells))
# `new` is a plain vector, so report its length (dim() would return NULL)
length(new)
head(new)

summary(colSums(counts))
summary(colSums(emptyCells))

hist(colSums(counts))
hist(colSums(emptyCells))
# new %>%
#     # filter(N_bursts >= 10 & burst_size > 1) %>%
#     ggplot(aes(x=Type, y = burst_rate, fill = Type)) +
#     geom_violin(show.legend = F, draw_quantiles = c(0.25, 0.5, 0.75)) +
#     #geom_jitter(height = 0, width = 0.4, alpha=0.1, size=0.5) +
#     scale_y_log10(limits = c(0.05, 5)) +
#     ggtitle("Bursts from 10kb gene body") +
#     ylab("Bursts per hour") +
#     guides(fill=guide_legend(title=""))

In [None]:
# # trying to see if G1S and G2M genes expressing cells are distinct:
# G1S_SplusHist_G2M = as.matrix(newCounts[ c(G1S_Whitfield, S_Whitfield, names(histGenes), G2M_Whitfield), ])

# # provide rownames reflecting their origin for all rows
# rownames(G1S_SplusHist_G2M) = c(rep("G1S", each=length(G1S_Whitfield)), rep("S", each=(length(S_Whitfield) + length(names(histGenes)))), rep("G2M", each=length(G2M_Whitfield)));
# dim(G1S_SplusHist_G2M)

# # # converting into a binary matrix:
# # G1S_Hist_G2M[G1S_Hist_G2M > 1] = 1;

# # perform rowsums by rownames
# G1S_SplusHist_G2M = rowsum(G1S_SplusHist_G2M, row.names(G1S_SplusHist_G2M))
# dim(G1S_SplusHist_G2M)     

# # R script that filters columns in a matrix so that 
# # the only columns with a value of more than 1 in any rows are retained:
# G1S_SplusHist_G2M = G1S_SplusHist_G2M[, apply(G1S_SplusHist_G2M, 2, function(x) any(x > 1)) ]
                                    
# # change the row order of the matrix:
# G1S_SplusHist_G2M = G1S_SplusHist_G2M[c(1,3,2),]
# dim(G1S_SplusHist_G2M)
# G1S_SplusHist_G2M[1:3,1:10]
                                                    
# # # converting into a binary matrix:
# # G1S_Hist_G2M_binary = G1S_Hist_G2M[G1S_Hist_G2M > 1] = 1;
# # G1S_Hist_G2M_binary[1:4,1:10] 

# # # Convert cells with less than 2 counts in 3rd row [ HIS ], to 0:
# # # Can't perform this operation for 1 row, so did (3,3)
# # G1S_Hist_G2M[c(3,3), ] = apply(G1S_Hist_G2M[c(3,3), ], 2, function(x) ifelse(x < 3, 0, x))
# # dim(G1S_Hist_G2M)
# # G1S_Hist_G2M[1:3,1:10]
                           
# # Create the heatmap using pheatmap
# # https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/
# hm = pheatmap(G1S_SplusHist_G2M,
#          clustering_distance_cols="euclidean",
#          clustering_method="complete",
#          cluster_cols = TRUE, 
#          cluster_rows = FALSE,
#          scale = "none",
#          show_rownames = TRUE,
#          show_colnames = FALSE,
#          color = colorRampPalette(c("white", "red"))(10),
#          border_color = "blue",
#          cellwidth = 0.3,
#          cellheight = 20,
#          # cutree_cols = 6,
#          # fontsize_col = 8,
#          fontsize_row = 8)
# ggsave(hm, filename = "../plots/G1S_SplusHistoneGenes_G2M_combined_heatmap.pdf", width=20, height=20);   

In [None]:
# Do cells expressing G1, G1/S, S (histone) and G2/M genes form distinct groups?
G1_G1S_Hist_G2M <- as.matrix(newCounts[c(G1_Beyrouthy, G1S_Riba, names(histGenes), G2M_Riba), ])

# Replace each gene's row name by the phase label of the set it came from.
rownames(G1_G1S_Hist_G2M) <- c(
    rep("G1", length(G1_Beyrouthy)),
    rep("G1/S", length(G1S_Riba)),
    rep("S", length(names(histGenes))),
    rep("G2/M", length(G2M_Riba))
)
dim(G1_G1S_Hist_G2M)

# # converting into a binary matrix:
# G1S_Hist_G2M[G1S_Hist_G2M > 1] = 1;

# Sum the genes of each phase into one row per phase.
G1_G1S_Hist_G2M <- rowsum(G1_G1S_Hist_G2M, row.names(G1_G1S_Hist_G2M))
dim(G1_G1S_Hist_G2M)

# Retain only the cells (columns) where at least one phase row exceeds 1.
G1_G1S_Hist_G2M <- G1_G1S_Hist_G2M[, colSums(G1_G1S_Hist_G2M > 1) > 0]

# Swap the 3rd and 4th rows of the rowsum() output.
# NOTE(review): presumably this places "S" before "G2/M"; it relies on the row
# order rowsum() produced -- confirm from the printed matrix below.
G1_G1S_Hist_G2M <- G1_G1S_Hist_G2M[c(1, 2, 4, 3), ]
dim(G1_G1S_Hist_G2M)
G1_G1S_Hist_G2M[1:4, 1:10]

# # converting into a binary matrix:
# G1S_Hist_G2M_binary = G1S_Hist_G2M[G1S_Hist_G2M > 1] = 1;
# G1S_Hist_G2M_binary[1:4,1:10] 

# # Convert cells with less than 2 counts in 3rd row [ HIS ], to 0:
# # Can't perform this operation for 1 row, so did (3,3)
# G1S_Hist_G2M[c(3,3), ] = apply(G1S_Hist_G2M[c(3,3), ], 2, function(x) ifelse(x < 3, 0, x))
# dim(G1S_Hist_G2M)
# G1S_Hist_G2M[1:3,1:10]

# Heatmap with cells clustered by complete linkage on euclidean distance;
# the phase rows keep their fixed order.
# https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/
hm <- pheatmap(
    G1_G1S_Hist_G2M,
    clustering_distance_cols = "euclidean",
    clustering_method = "complete",
    cluster_cols = TRUE,
    cluster_rows = FALSE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(10),
    border_color = "blue",
    cellwidth = 0.5,
    cellheight = 20,
    # cutree_cols = 6,
    # fontsize_col = 8,
    fontsize_row = 8
)
ggsave(filename = "../plots/G1_G1S_HistoneGenes_G2M_combined_heatmap.pdf", plot = hm, width = 20, height = 20)

In [None]:
# Do cells expressing G1/S, S (histone) and G2/M genes form distinct groups?
G1S_Hist_G2M <- as.matrix(newCounts[c(G1S_Riba, names(histGenes), G2M_Riba), ])

# Replace each gene's row name by the phase label of the set it came from.
rownames(G1S_Hist_G2M) <- c(
    rep("G1/S", length(G1S_Riba)),
    rep("S", length(names(histGenes))),
    rep("G2/M", length(G2M_Riba))
)
dim(G1S_Hist_G2M)

# # converting into a binary matrix:
# G1S_Hist_G2M[G1S_Hist_G2M > 1] = 1;

# Sum the genes of each phase into one row per phase.
G1S_Hist_G2M <- rowsum(G1S_Hist_G2M, row.names(G1S_Hist_G2M))
dim(G1S_Hist_G2M)

# Retain only the cells (columns) where at least one phase row exceeds 1.
G1S_Hist_G2M <- G1S_Hist_G2M[, colSums(G1S_Hist_G2M > 1) > 0]

# Swap the 2nd and 3rd rows of the rowsum() output.
# NOTE(review): presumably this places "S" before "G2/M"; it relies on the row
# order rowsum() produced -- confirm from the printed matrix below.
G1S_Hist_G2M <- G1S_Hist_G2M[c(1, 3, 2), ]
dim(G1S_Hist_G2M)
G1S_Hist_G2M[1:3, 1:10]

# # converting into a binary matrix:
# G1S_Hist_G2M_binary = G1S_Hist_G2M[G1S_Hist_G2M > 1] = 1;
# G1S_Hist_G2M_binary[1:4,1:10] 

# # Convert cells with less than 2 counts in 3rd row [ HIS ], to 0:
# # Can't perform this operation for 1 row, so did (3,3)
# G1S_Hist_G2M[c(3,3), ] = apply(G1S_Hist_G2M[c(3,3), ], 2, function(x) ifelse(x < 3, 0, x))
# dim(G1S_Hist_G2M)
# G1S_Hist_G2M[1:3,1:10]

# Heatmap with cells clustered by complete linkage on euclidean distance;
# the phase rows keep their fixed order.
# https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/
hm <- pheatmap(
    G1S_Hist_G2M,
    clustering_distance_cols = "euclidean",
    clustering_method = "complete",
    cluster_cols = TRUE,
    cluster_rows = FALSE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(10),
    border_color = "blue",
    cellwidth = 0.5,
    cellheight = 20,
    # cutree_cols = 6,
    # fontsize_col = 8,
    fontsize_row = 8
)
ggsave(filename = "../plots/G1S_HistoneGenes_G2M_combined_heatmap.pdf", plot = hm, width = 20, height = 20)

In [None]:
# # circular heatmap:
# # ordering the column by the clustered hm, since circlize does not cluster using columns:
# hmCirc = G1S_Hist_G2M[ ,hm$tree_col$order];
# hmCirc[1:3, 1:4]
# col_fun1 = colorRamp2(c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), c('white','#FFFFFF','#FFECEC','#FFDADA','#FFC8C8','#FFB6B6','#FFA3A3','#FF9191','#FF7F7F','#FF6D6D','#FF5B5B','#FF4848','#FF3636','#FF2424','#FF1212','#FF0000'))
# circos.heatmap(hmCirc, col = col_fun1)
# circos.clear()

In [None]:
# Mirror the heatmap held in `hm`: reverse the left-to-right leaf order of its
# column dendrogram, then redraw with that rotated tree.
# NOTE: `hm` is whichever pheatmap was drawn last; this cell expects it to be
# the G1S_Hist_G2M heatmap, since that matrix's column names are used here.
# Take the column tree by name rather than by list position.
col_dend <- hm$tree_col
# col_dend = rotate(col_dend, order = rev(colnames(G1_G1S_Hist_G2M)[get_order(col_dend)]))
col_dend <- rotate(col_dend, order = rev(colnames(G1S_Hist_G2M)[get_order(col_dend)]))

# # change the row order of the matrix:
# G1_G1S_Hist_G2M = G1_G1S_Hist_G2M[c(2,3,4,1),]
# dim(G1_G1S_Hist_G2M)
# G1_G1S_Hist_G2M[1:4,1:10]

hmMirror <- pheatmap(
    G1S_Hist_G2M, #G1_G1S_Hist_G2M,
    cluster_cols = as.hclust(col_dend),
    cluster_rows = FALSE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(10),
    # NA is pheatmap's documented value for "draw no cell border"
    border_color = NA,
    cellwidth = 0.5,
    cellheight = 20,
    # cutree_cols is a number of clusters, not a cut height: the former 6.25
    # was truncated to 6 by cutree(), so 6 states the effective value.
    cutree_cols = 6,
    # fontsize_col = 8,
    fontsize_row = 8
)
ggsave(hmMirror, filename = "../plots/G1S_HistoneGenes_G2M_combined_mirror_heatmap.pdf", width = 8, height = 2)

In [None]:
# Draw the column dendrogram of the mirrored heatmap, with the cut height
# marked by a dashed red line.
plot(hmMirror$tree_col)
abline(h = 6.25, col = "red", lty = 2, lwd = 2)

# Cut the column dendrogram at that same height (6.25) and tabulate the
# resulting cluster sizes.
clusters <- cutree(hmMirror$tree_col, h = 6.25)
table(clusters)
# clust<k>: the entries of `clusters` (named by cell) that fall in cluster k.
clust1 <- clusters[clusters == 1]
str(clust1)
clust1[1:4]
length(clust1)
clust2 <- clusters[clusters == 2]
length(clust2)
clust3 <- clusters[clusters == 3]
length(clust3)
clust4 <- clusters[clusters == 4]
length(clust4)
# clust4
clust5 <- clusters[clusters == 5]
length(clust5)
# clust5
clust6 <- clusters[clusters == 6]
length(clust6)

In [None]:
# Compare total reads per cell between the G1/S, S and G2/M clusters,
# starting from the original count matrix.
fullLengthCounts <- readRDS("../data/scGROv2p8_mapq3qc_filtered_counts.rds")

# Cluster-to-phase mapping used in this cell:
#   cluster 1 -> G2/M, cluster 2 -> G1/S, cluster 3 -> S.
# Columns are picked by the cell IDs stored as names of clustN.
G2Mcells <- as.matrix(fullLengthCounts[, names(clust1)])
dim(G2Mcells)
G1Scells <- as.matrix(fullLengthCounts[, names(clust2)])
dim(G1Scells)
Scells <- as.matrix(fullLengthCounts[, names(clust3)])
dim(Scells)

# One row per cell: phase label plus the column sum of its counts.
CCreadCounts <- data.frame(
    class = rep(c("G2/M", "G1/S", "S"),
                times = c(ncol(G2Mcells), ncol(G1Scells), ncol(Scells))),
    ReadCounts = c(colSums(G2Mcells), colSums(G1Scells), colSums(Scells))
)
CCreadCounts[1:4, ]

# Pairwise Welch t-tests between phases.
t.test(ReadCounts ~ class, data = CCreadCounts[CCreadCounts$class %in% c("G1/S", "G2/M"), ])
t.test(ReadCounts ~ class, data = CCreadCounts[CCreadCounts$class %in% c("G1/S", "S"), ])
t.test(ReadCounts ~ class, data = CCreadCounts[CCreadCounts$class %in% c("G2/M", "S"), ])

# Same pairwise comparisons with the rank-based Wilcoxon test.
wilcox.test(ReadCounts ~ class, data = CCreadCounts[CCreadCounts$class %in% c("G1/S", "G2/M"), ])
wilcox.test(ReadCounts ~ class, data = CCreadCounts[CCreadCounts$class %in% c("G1/S", "S"), ])
wilcox.test(ReadCounts ~ class, data = CCreadCounts[CCreadCounts$class %in% c("G2/M", "S"), ])

# Fix the plotting order of the phases (done after the tests on purpose:
# the tests above ran on the plain character column).
CCreadCounts$class <- factor(CCreadCounts$class, levels = c("G1/S", "S", "G2/M"))

# Boxplot of reads per cell; outliers hidden and the y-axis zoomed to 0-10000
# without dropping data (coord_cartesian).
ggplot(CCreadCounts, aes(x = class, y = ReadCounts)) +
    geom_boxplot(
        outlier.shape = NA,
        color = c("#ebc14c", "#265369", "#545756"),
        lwd = 1.5
    ) +
    labs(x = "", y = "Reads per cell") +
    coord_cartesian(ylim = c(0, 10000))
ggsave(filename = "../plots/CellCycle_readCounts_boxplot.pdf", width = 4, height = 4, units = "in")

In [None]:
# Median reads per cell in each phase.
median(CCreadCounts$ReadCounts[CCreadCounts$class == "G1/S"])
median(CCreadCounts$ReadCounts[CCreadCounts$class == "S"])
median(CCreadCounts$ReadCounts[CCreadCounts$class == "G2/M"])

In [None]:
# Donut chart of the number of cells assigned to each cell-cycle phase
# (cluster 2 -> G1/S, cluster 3 -> S, cluster 1 -> G2/M).
CCnumbers <- data.frame(
    count = c(length(clust2), length(clust3), length(clust1)),
    category = c("G1/S", "S", "G2/M")
)

# Each phase becomes a ring segment spanning [ymin, ymax] on a 0-1 scale:
# fraction of cells, cumulative top edge, and the previous segment's top edge.
CCnumbers$fraction <- CCnumbers$count / sum(CCnumbers$count)
CCnumbers$ymax <- cumsum(CCnumbers$fraction)
CCnumbers$ymin <- c(0, head(CCnumbers$ymax, n = -1))
CCnumbers

# Keep the legend/fill order G1/S, S, G2/M instead of alphabetical.
CCnumbers$category <- factor(CCnumbers$category, levels = c("G1/S", "S", "G2/M"))

# Rectangles drawn in polar coordinates form the ring; xlim starting at 2
# (inside xmin = 3) leaves the hole in the middle, where the legend is placed.
ggplot(CCnumbers, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)) +
    geom_rect() +
    coord_polar(theta = "y") +
    xlim(c(2, 4)) +
    theme_void() +
    theme(
        legend.position = c(0.5, 0.5),
        legend.title = element_blank(),
        legend.text = element_text(size = 14, face = "bold"),
        legend.key.size = unit(1, 'cm')
    ) +
    scale_fill_manual(values = c("#ebc14c", "#265369", "#545756"))

ggsave(filename = "../plots/CellCycle_fraction_of_cells_in_different_phases.pdf", width = 4, height = 4, units = "in")

## Find differentially expressed genes in cell cycles:

In [None]:
# Build a gene x cell count matrix from the cells of clusters 3, 2 and 1,
# bound column-wise in exactly that order.
CCgeneExp <- as.matrix(cbind(
    fullLengthCounts[, names(clust3)],
    fullLengthCounts[, names(clust2)],
    fullLengthCounts[, names(clust1)]
))

# # binarize the data
# CCgeneExp[CCgeneExp>1] = 1;

# Keep gene rows only (row names prefixed "GN-"), then strip that prefix.
CCgeneExp <- CCgeneExp[startsWith(rownames(CCgeneExp), "GN-"), ]
rownames(CCgeneExp) <- sub("GN-", "", rownames(CCgeneExp))
rownames(CCgeneExp)[1:4]

# Relabel every row whose name contains "Hist" as "Histones" ...
rownames(CCgeneExp)[grepl("Hist", rownames(CCgeneExp))] <- "Histones"
dim(CCgeneExp)

# ... and collapse rows sharing a name by summing them, so all histone rows
# end up in a single "Histones" row.
CCgeneExp <- rowsum(CCgeneExp, rownames(CCgeneExp))
dim(CCgeneExp)

In [None]:
# Create a Seurat object from the gene x cell count matrix.
s_obj = CreateSeuratObject( counts = CCgeneExp)
s_obj@assays
s_obj

# Replace orig.ident ("SeuratObject" by default) with the cell-cycle phase.
# The columns of CCgeneExp were bound in the order clust3, clust2, clust1, and
# elsewhere in this notebook cluster 3 = S, cluster 2 = G1/S, cluster 1 = G2/M
# (see G2Mcells / G1Scells / Scells and CCnumbers). The labels must follow that
# same mapping; previously the clust3 cells were labelled "G1/S" and the clust2
# cells "S", i.e. the two phases were swapped.
stopifnot(ncol(s_obj) == length(clust3) + length(clust2) + length(clust1))
s_obj@meta.data$orig.ident = as.factor(c(rep("S", length(clust3)), rep("G1/S", length(clust2)), rep("G2/M", length(clust1))))
# Use the phase label as the active identity of each cell.
Idents(object = s_obj) = s_obj@meta.data$'orig.ident'
s_obj@meta.data[1:4, ]

# Scale the data.
# NOTE(review): the "poisson" test in FindAllMarkers is documented as a
# count-based test, so this scaling step may not affect it -- confirm.
s_obj_norm = ScaleData(s_obj)
    
# Markers of each phase versus all other cells; only positive markers are kept.
CCdiffGenes = FindAllMarkers(s_obj_norm, min.pct = 0.1, test.use = "poisson", min.diff.pct = 0.1, logfc.threshold = 0.2, only.pos = TRUE)

In [None]:
# Keep markers that pass the log2 fold-change cut-off, plus the genes that were
# used to classify the cell-cycle phases in the first place.
# The gene symbol is taken from the `gene` column: FindAllMarkers makes the row
# names unique (a gene reported for a second cluster gets a numeric suffix), so
# matching on rownames() could miss such entries.
CCdiffGenesF = CCdiffGenes %>%
    filter( avg_log2FC >= 0.33 | gene %in% c(G1S_Riba, G2M_Riba));
dim(CCdiffGenesF)
# preview the first four rows (CCdiffGenesF[1:4] would select columns instead)
CCdiffGenesF[1:4, ]
summary(CCdiffGenesF$cluster)
fwrite(CCdiffGenesF, file="../data/CellCycle_genes_discovered_by_Seurat_FindAllFeatures.csv")

In [None]:
# z-score heatmap of the differentially expressed genes.
# For every feature, compute the fraction of cells with at least one read in
# each phase (rowMeans of a logical matrix).
CCgeneMeanExp = data.frame(
    name = rownames(G2Mcells),
    G1Scells = rowMeans(G1Scells > 0),
    Scells = rowMeans(Scells > 0),
    G2Mcells = rowMeans(G2Mcells > 0));

# keep gene rows ("GN-" prefix) and use the bare gene symbol as row name
CCgeneMeanExp = CCgeneMeanExp %>%
    filter( substr(name, 0, 3) == "GN-" )
rownames(CCgeneMeanExp) = sub("GN-", "", CCgeneMeanExp$name)
CCgeneMeanExp$name = NULL
dim(CCgeneMeanExp)
CCgeneMeanExp[1:4, ]

# convert into a matrix for changing rownames and summing by rownames 
CCgeneMeanExp = as.matrix(CCgeneMeanExp)
# relabel every row whose name contains "Hist" as "Histones"
rownames(CCgeneMeanExp)[grepl("Hist", rownames(CCgeneMeanExp))] <- "Histones"
# sum rows that have the same name, in this case Histones
# (the "Histones" row is therefore a sum of per-gene fractions)
CCgeneMeanExp = rowsum(CCgeneMeanExp, row.names(CCgeneMeanExp))
dim(CCgeneMeanExp)

# Keep only the genes found differentially expressed across the cell cycle.
# Match on the `gene` column: the row names of the marker table are made unique
# by FindAllMarkers and may carry numeric suffixes.
CCgeneMeanExp = CCgeneMeanExp[rownames(CCgeneMeanExp) %in% CCdiffGenesF$gene, , drop = FALSE]
dim(CCgeneMeanExp)

# Row-wise z-score. apply() + scale() drops the column names, so they are
# saved first and restored afterwards (the earlier self-assignment of
# colnames() was a no-op and left the heatmap columns unlabelled).
phaseNames = colnames(CCgeneMeanExp)
CCgeneMeanExp = t(apply(CCgeneMeanExp, 1, scale))
colnames(CCgeneMeanExp) = phaseNames
CCgeneMeanExp

# 39 colours need 40 break points: 10 + 20 + 10.
mycol = colorRampPalette(c('#4098b5', '#72cbe8', 'white', '#e88f72', '#b55c40'))(39) #
mybreaks = c(seq(-2, -1.1, length=10), seq(-1, 1, length=20), seq(1.1, 2, length=10))

pdf("../plots/cell_cycle_FinadAllMarkers_MeanExpressingCells_zscore_hm2.pdf", width = 4, height = 16)
heatmap.2(CCgeneMeanExp, 
          dendrogram=c("row"), 
          Rowv=TRUE, 
          Colv=FALSE, 
          scale="none", 
          sepwidth=c(0.01,0.01), 
          sepcolor="grey", 
          rowsep=0:nrow(CCgeneMeanExp), 
          colsep=0:ncol(CCgeneMeanExp), 
          key=TRUE, 
          density.info="none", 
          trace=c("none"), 
          cexCol=0.9, 
          labRow=rownames(CCgeneMeanExp), 
          labCol=colnames(CCgeneMeanExp), 
          breaks = mybreaks, 
          col = mycol,  
          margins = c(12, 8))
dev.off() 

In [None]:
# Marker genes used to classify G1/S and G2/M cells.
G1S_Riba = c("Orc1", "Ccne1", "Ccne2", "Mcm6")
G2M_Riba = c("Wee1", "Cdk1", "Ccnf", "Nusap1", "Aurka", "Ccna2", "Ccnb2")
# Which of those markers are present in the z-score matrix?
# The row names are subset directly: rownames(m[mask, ]) returns NULL when
# exactly one row matches, because the subset collapses to a plain vector.
rownames(CCgeneMeanExp)[rownames(CCgeneMeanExp) %in% G1S_Riba]
rownames(CCgeneMeanExp)[rownames(CCgeneMeanExp) %in% G2M_Riba]

# Published cell-cycle marker list: https://github.com/hbc/tinyatlas
cc_markers = fread("../data/Cell_cycle_marker_genes.csv");
length(cc_markers$gene_name)
# markers from that list that are present in the z-score matrix
cc_markers[cc_markers$gene_name %in% rownames(CCgeneMeanExp)]$gene_name

In [None]:
#Load libraries needed for this script

# test = cbind(A1[,9], A2[,9], B1[,9], B2[,9])
# # Create a talbe with six columns which contains the gene body read counts.

# row.names(test) = A1[,5]
# # gene names are assigned as row names.

# colnames(test) = c("A1", "A2", "B1", "B2")
# # specifying column names.


# testDesign = data.frame(row.names = colnames(test), condition = as.factor(c(rep("untreated",2),rep("treated",2))))
# # make the metadata table (this is well explained in the PDF)

# print(testDesign)
# # provides information about the metadata.

# print("this is how the table looks before normalization")
# print(head(test))
# # shows how the test file looks like.

# cds = DESeqDataSetFromMatrix(countData = test, colData = testDesign, design = ~ condition)
# # since test is a matrix of counts and the column information, we can construct a DESeqDataSet using DESeqDataSetFromMatrix function

# print(cds)
# # gives info about cds

# cds$condition = relevel(cds$condition, "untreated")
# # It is important to supply levels (otherwise the levels are chosen inalphabetical order) and to put the “control” or “untreated” level as the first element (”base level”), so that the log2 fold changes produced by default will be the expected comparison against the base level

# sizeFactors(cds) = c(nf1, nf2, nf3, nf4)
# # instead of using the estimateSizeFactors to allow DESeq to normalize the data, I input my normalization factor calculated from the reads that map to the drosophila genome.

# print("the normalization factors are:")
# print(sizeFactors(cds))

# print("this is how the table looks after normalization")
# print(head(counts(cds,normalized=TRUE)))
# # to visualize the normalized read counts.


# DESeq_cds = DESeq(cds)
# # this is the function that performs DESeq. Using the supplied sizeFactors, it estimates dispersions, gene-wise dispersion estimates, mean-dispersion relationship, final dispersion estimates, and performs fitting model and testing

# res = results(DESeq_cds)
# # table with the necessary informations like: baseMean log2FoldChange     lfcSE       stat      pvalue      padj

# sigGenes = res[(res$padj < padjusted) & !is.na(res$padj),]
# # this subsets the rows based on the supplied adjusted pvalue and the absence of NA values

# print(" the adjuseted p value to be used to filter significantly changed genes from DESeq2 is:")
# print(padjusted)

# print("# of significantly changed genes with given adj-pValue:")
# print(nrow(sigGenes))

# pdf(file = paste("PC_KPC001S_inVitro_vs_PC_HY19636_inVitro_Volcanoplot", ".pdf", sep = ''), width = 8, height = 12, bg = 'white')
# par(font.axis = 2)
# par(font.lab=2)
# par(cex.lab=1.3)
# par(cex.axis=1.3)
# par(cex=2)
# par(cex.main=1.3)
# par(mar=c(5,5,4,2))
# plot(res[,2], -log10(res[,6]), pch=20, main="PC_KPC001S_inVitro_vs_PC_HY19636_inVitro", xlab="Log2 [ Fold Change ]", ylab="-Log10 [ p_adj ]")
# points(sigGenes[,2], -log10(sigGenes[,6]), pch=20, col="red")
# dev.off()
# # making volcano plot

# write.table(sigGenes, file = "DESeq2-output.txt", row.names=T, sep='\t', quote=F, na="")
# # write the data

In [None]:
# Are the verified super-enhancers (SEs) and their genes transcribed in the
# G1/S, S or G2/M cells? Pull 7 SE rows followed by 7 gene rows from newCounts.
SEsCollapsed <- as.matrix(newCounts[
    c(
        "INT_STITCHED_4179", "INT_STITCHED_5044", "INT_STITCHED_2745",
        "INT_STITCHED_5427", "INT_STITCHED_1210", "INT_STITCHED_4657",
        "INT_STITCHED_1626",
        "GN-Sall1", "GN-Tet1", "GN-Med13l",
        "GN-Ranbp17", "GN-Cbfa2t2", "GN-Ooep",
        "GN-Mcl1"
    ),
])
# SEsCollapsed = as.matrix(newCounts[ c(vSEassignment$name, vSEassignment$assignedGene), ])

# Label the rows by origin: the first 7 are SEs, the last 7 their genes.
rownames(SEsCollapsed) <- rep(c("SEs", "SEsgenes"), each = 7)
dim(SEsCollapsed)
# # converting into a binary matrix:
# SEsCollapsed[SEsCollapsed > 1] = 1;

# Collapse to one row per label by summing the rows that share a name.
SEsCollapsed <- rowsum(SEsCollapsed, rownames(SEsCollapsed))
dim(SEsCollapsed)

# Reorder the columns positionally by the column order of the mirrored heatmap.
# NOTE(review): this assumes newCounts has the same columns, in the same
# order, as the matrix that hmMirror was drawn from -- confirm.
SEsCollapsed <- SEsCollapsed[, hmMirror$tree_col$order]
dim(SEsCollapsed)
SEsCollapsed[1:2, 1:10]

# Zero out every summed count below 2.
SEsCollapsed[SEsCollapsed < 2] <- 0

# Heatmap with the given row/column order (no clustering).
# https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/
hm2 <- pheatmap(
    SEsCollapsed,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(15),
    border_color = NA,
    cellwidth = 0.5,
    cellheight = 20,
    fontsize_row = 8
)
ggsave(hm2, filename = "../plots/verifiedSEs_heatmap.pdf", width = 20, height = 20)

In [None]:
# Same view for the Oct4/Sox2/Nanog/Klf4 (OSNK) loci: 6 SE rows followed by
# the 4 gene rows, taken from newCounts.
OSNKCollapsed <- as.matrix(newCounts[
    c(
        "INT_STITCHED_7784", "INT_STITCHED_1482", "INT_STITCHED_3347",
        "INT_STITCHED_3348", "INT_STITCHED_3349", "INT_STITCHED_1973",
        "GN-Pou5f1", "GN-Sox2", "GN-Nanog", "GN-Klf4"
    ),
])

# Label the rows by origin: the first 6 are SEs, the last 4 are genes.
rownames(OSNKCollapsed) <- rep(c("OSNKSEs", "OSNKsgenes"), times = c(6, 4))
dim(OSNKCollapsed)
# # converting into a binary matrix:
# OSNKCollapsed[OSNKCollapsed > 1] = 1;

# Collapse to one row per label by summing the rows that share a name.
OSNKCollapsed <- rowsum(OSNKCollapsed, rownames(OSNKCollapsed))
dim(OSNKCollapsed)

# Reorder the columns positionally by the column order of the mirrored heatmap.
# NOTE(review): this assumes newCounts has the same columns, in the same
# order, as the matrix that hmMirror was drawn from -- confirm.
OSNKCollapsed <- OSNKCollapsed[, hmMirror$tree_col$order]
dim(OSNKCollapsed)
OSNKCollapsed[1:2, 1:10]

# Zero out every summed count below 2.
OSNKCollapsed[OSNKCollapsed < 2] <- 0

# Heatmap with the given row/column order (no clustering); note that this
# overwrites the hm2 object of the previous cell.
# https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/
hm2 <- pheatmap(
    OSNKCollapsed,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    scale = "none",
    show_rownames = TRUE,
    show_colnames = FALSE,
    color = colorRampPalette(c("white", "red"))(15),
    border_color = NA,
    cellwidth = 0.5,
    cellheight = 20,
    fontsize_row = 8
)
ggsave(hm2, filename = "../plots/OSNKSEs_heatmap.pdf", width = 20, height = 20)

In [None]:
# Histogram, over features, of the fraction of cells in which the feature has
# at least one read (rowMeans of counts > 0), on a log10 x-axis.
# NOTE(review): the title and x label speak of "reads", but the plotted value
# is a detection fraction -- confirm which one is intended.
ggplot(data.frame(x = rowMeans(counts > 0)), aes(x = x)) +
    geom_histogram(binwidth = 0.05) +
    scale_x_log10() +
    labs(
        title = "Reads per Feature",
        x = "Reads per cell",
        y = "Number of features"
    )

In [None]:
# Keep the features detected (count > 0) in at least 0.5% of the cells.
observed <- counts[rowMeans(counts > 0) >= 0.005, ]
dim(observed)

# Tally the retained features by the first three characters of their name.
test <- substr(rownames(observed), 1, 3)
summary(as.factor(test))

## ExG Correlation:

In [None]:
# Hmisc::rcorr correlates columns, so transpose to cells (rows) x features
# (columns) before computing feature-feature correlations.
obsx <- as.matrix(t(observed))
dim(obsx)
obsx[1:5, 1:5]

In [None]:
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# Flatten the upper triangle of a correlation matrix into a long data frame
# with one row per feature pair.
#
# cormat : matrix of the correlation coefficients
# pmat   : matrix of the correlation p-values (same dimensions as cormat)
#
# Returns a data.frame with columns
#   Gene     : name of the ROW feature of the pair
#   Enhancer : name of the COLUMN feature of the pair
#   corr     : correlation coefficient
#   pVal     : p-value
#
# Only the strict upper triangle is used, so each pair appears once, with the
# feature that comes first in the matrix order in the "Gene" column. The
# column names are positional labels only; the function does not check the
# feature type.
flattenCorrMatrix = function(cormat, pmat) {
  # the two matrices are indexed with the same mask and must line up
  stopifnot(identical(dim(cormat), dim(pmat)))
  ut = upper.tri(cormat)
  # label columns with the column names; fall back to the row names when the
  # matrix carries none (for a symmetric correlation matrix both are identical)
  colLabels = colnames(cormat)
  if (is.null(colLabels)) {
    colLabels = rownames(cormat)
  }
  data.frame(
    Gene = rownames(cormat)[row(cormat)[ut]],
    Enhancer = colLabels[col(cormat)[ut]],
    corr = (cormat)[ut],
    pVal = pmat[ut]
    )
}

#### Pearson correlation on non-binarized data:

In [None]:
# Pearson correlation between all feature pairs (columns of obsx), computed
# with Hmisc::rcorr; corrP$r and corrP$P (coefficients and p-values) are used
# further down.
corrP <- rcorr(obsx, type = "pearson")

# corrPflat = flattenCorrMatrix( corrP$r, corrP$P ) %>%
#     filter( substr(Gene, 0, 3) == "GN-" & substr(Enhancer, 0, 3) != "GN-" ) %>%
#     # filter rows such that both Gene and Enhancer were assigned pairs (based on same row in SEcenter):
#     filter( paste0(sub("_[^_]+$", "", Gene), sub("_[^_]+$", "", Enhancer)) %in% paste0(SEassignment$assignedGene, SEassignment$name) ) %>%
#     # filter(as.character(seqnames(SEcenter[Gene])) == as.character(seqnames(SEcenter[Enhancer])));
#     mutate( pAdj = p.adjust(pVal, method = "fdr") ) %>%
#     mutate( corrRank = rank(corr) ) %>%
#     mutate( pValRank = rank(pVal) );

# fwrite(corrPflat, file="../data/scGROv2p8_SE_CRISPRverified_5Kbins_pearson_correlation.csv.gz")
# dim(corrPflat)
# corrPflat[1:4, ]

# corrPlot = corrPflat %>%
#     ggplot(aes(x = corrRank, y = corr)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     ylim(-0.1, 0.3) +
#     scale_color_viridis() +
#     ggtitle("ExG Pearson Corr") +
#     xlab("Rank") +
#     ylab("Pearson corr. coeff.");
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_pearson_corrCoeff_rank.png", width=4, height=4);

# corrPlot = corrPflat %>%
#     ggplot(aes(x = pValRank, y = pVal)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     scale_color_viridis() +
#     ggtitle("ExG Pearson Corr") +
#     xlab("Rank") +
#     ylab("Pearson corr. p-value");
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_pearson_corrPvalue_rank.png", width=4, height=4);

# corrPlot = corrPflat %>%
#     ggplot(aes( x = corr, y = pVal)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     scale_color_viridis() +
#     ggtitle("ExG Pearson Corr") +
#     xlab("Pearson corr. coeff.") +
#     ylab("Pearson corr. p-value") ;
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_pearson_corrCoeff_vs_pvalue.png", width=4, height=4)

#### Spearman correlation on non-binarized data:

In [None]:
# Spearman (rank) correlation between all feature pairs (columns of obsx),
# computed with Hmisc::rcorr; corrS$r and corrS$P are used further down.
corrS <- rcorr(obsx, type = "spearman")

# corrSflat = flattenCorrMatrix( corrS$r, corrS$P ) %>%
#   filter( substr(Gene, 0, 3) == "GN-" & substr(Enhancer, 0, 3) != "GN-" ) %>%
#     # filter rows such that both Gene and Enhancer were assigned pairs (based on same row in SEcenter):
#     filter( paste0(sub("_[^_]+$", "", Gene), sub("_[^_]+$", "", Enhancer)) %in% paste0(SEassignment$assignedGene, SEassignment$name) ) %>%
#     # filter(as.character(seqnames(SEcenter[Gene])) == as.character(seqnames(SEcenter[Enhancer])));
#     mutate( pAdj = p.adjust(pVal, method = "fdr") ) %>%
#     mutate( corrRank = rank(corr) ) %>%
#     mutate( pValRank = rank(pVal) );

# fwrite(corrSflat, file="../data/scGROv2p8_SE_CRISPRverified_5Kbins_spearman_correlation.csv.gz")
# dim(corrSflat)
# corrSflat[1:4, ]

# corrPlot = corrSflat %>%
#     ggplot(aes(x = corrRank, y = corr)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     ylim(-0.1, 0.3) +
#     scale_color_viridis() +
#     ggtitle("ExG spearman Corr") +
#     xlab("Rank") +
#     ylab("Spearman corr. coeff.");
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_spearman_corrCoeff_rank.png", width=4, height=4);

# corrPlot = corrSflat %>%
#     ggplot(aes(x = pValRank, y = pVal)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     scale_color_viridis() +
#     ggtitle("ExG spearman Corr") +
#     xlab("Rank") +
#     ylab("Spearman corr. p-value");
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_spearman_corrPvalue_rank.png", width=4, height=4);

# corrPlot = corrSflat %>%
#     ggplot(aes( x = corr, y = pVal)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     scale_color_viridis() +
#     ggtitle("ExG spearman Corr") +
#     xlab("Spearman corr. coeff.") +
#     ylab("Spearman corr. p-value") ;
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_spearman_corrCoeff_vs_pvalue.png", width=4, height=4)

#### Pearson/Spearman on binary matrix
##### Pearson and Spearman correlations on binary data give the same result

In [None]:
# Binarize the filtered counts: every value above 1 is capped at 1.
obsb <- observed
obsb[obsb > 1] <- 1
# Hmisc::rcorr correlates columns, so transpose to cells (rows) x features
# (columns).
obsb <- as.matrix(t(obsb))
dim(obsb)
obsb[1:5, 1:5]

In [None]:
# Pearson correlation on the binarized matrix, computed with Hmisc::rcorr;
# corrB$r and corrB$P are used further down.
corrB <- rcorr(obsb, type = "pearson")

# corrBflat = flattenCorrMatrix( corrB$r, corrB$P ) %>%
#     filter( substr(Gene, 0, 3) == "GN-" & substr(Enhancer, 0, 3) != "GN-" ) %>%
#     # filter rows such that both Gene and Enhancer were assigned pairs (based on same row in SEcenter):
#     filter( paste0(sub("_[^_]+$", "", Gene), sub("_[^_]+$", "", Enhancer)) %in% paste0(SEassignment$assignedGene, SEassignment$name) ) %>%
#     # filter(as.character(seqnames(SEcenter[Gene])) == as.character(seqnames(SEcenter[Enhancer])));
#     # filter genes that have SE in intorn:
#     mutate( pAdj = p.adjust(pVal, method = "fdr") ) %>%
#     mutate( corrRank = rank(corr) ) %>%
#     mutate( pValRank = rank(pVal) );

# fwrite(corrBflat, file="../data/scGROv2p8_SE_CRISPRverified_5Kbins_binary_correlation.csv.gz")
# dim(corrBflat)
# corrBflat[1:4, ]

# corrPlot = corrBflat %>%
#     ggplot(aes(x = corrRank, y = corr)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     ylim(-0.1, 0.3) +
#     scale_color_viridis() +
#     ggtitle("ExG Binary Corr") +
#     xlab("Rank") +
#     ylab("Binary corr. coeff.");
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_binary_corrCoeff_rank.png", width=4, height=4);

# corrPlot = corrBflat %>%
#     ggplot(aes(x = pValRank, y = pVal)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     scale_color_viridis() +
#     ggtitle("ExG Binary Corr") +
#     xlab("Rank") +
#     ylab("Binary corr. p-value");
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_binary_corrPvalue_rank.png", width=4, height=4);

# corrPlot = corrBflat %>%
#     ggplot(aes( x = corr, y = pVal)) +
#     geom_point(pch = 16, aes(color = pAdj)) +
#     scale_color_viridis() +
#     ggtitle("ExG Binary Corr") +
#     xlab("Binary corr. coeff.") +
#     ylab("Binary corr. p-value") ;
# corrPlot
# # ggsave(corrPlot, filename = "../plots/scGRO_SE_CRISPRverified_5Kbins_binary_corrCoeff_vs_pvalue.png", width=4, height=4)

### Without filtering for assigned SE to gene

In [None]:
# Flatten the three correlation results (Pearson, Spearman, binary) into long
# pair tables.
corrPflatRaw <- flattenCorrMatrix(corrP$r, corrP$P)
corrSflatRaw <- flattenCorrMatrix(corrS$r, corrS$P)
corrBflatRaw <- flattenCorrMatrix(corrB$r, corrB$P)

# Join them on the pair key (Gene, Enhancer). The Pearson/Spearman columns get
# the suffixes .P/.S; the binary columns keep the plain names `corr` and
# `pVal`, so every derived column below refers to the BINARY correlation.
corrRaw <- corrPflatRaw %>%
    left_join(corrSflatRaw, by = c("Gene", "Enhancer"), suffix = c(".P", ".S")) %>%
    left_join(corrBflatRaw, by = c("Gene", "Enhancer")) %>%
    mutate(
        corrRank = rank(corr),
        pValRank = rank(pVal),
        # text after the last underscore of the feature name (the bin index)
        GeneBin = sub("^.*_", "", Gene),
        EnhancerBin = sub("^.*_", "", Enhancer),
        # enhancer bin minus gene bin
        Enh_Gene = as.numeric(EnhancerBin) - as.numeric(GeneBin),
        # FDR correction across all pairs in the table
        pAdj = p.adjust(pVal, method = "fdr")
    ) %>%
    arrange(Gene)

dim(corrRaw)
table(substr(corrRaw$Gene, 1, 3))
table(substr(corrRaw$Enhancer, 1, 3))
corrRaw[1:4, ]

fwrite(corrRaw, file = "../data/scGROv2p8_ExG_CRISPRverified_5Kbins_ALL_correlation.csv.gz")

# # lookup gene attributes
# Gene = allfeatures[ corrRaw$Gene ];
# Enhancer = allfeatures[ corrRaw$Enhancer ];
# corrRaw$lengthG = width(Gene);
# corrRaw$lengthE = width(Enhancer);
# corrRaw$strG    = as.character(strand(Gene));
# corrRaw$strE    = as.character(strand(Enhancer));

# # compute distance between gene promoters
# proG = resize(Gene, width=1, fix="start");
# proE = resize(Enhancer, width=1, fix="center");
# strand(proG) = "*";
# strand(proE) = "*";
# # corrRaw$distance = width(pgap( proG, proE ));
# corrRaw$distance = ifelse(as.character(seqnames(proG)) == as.character(seqnames(proE)), width(pgap( proG, proE )), "NA");
# corrRaw[1:4, ]

In [None]:
# CRISPR-verified gene / SE pairs: significantly correlated bins only.
corrRawF <- corrRaw %>%
    filter(
        # gene bins ("GN-") against SE bins ("INT")
        substr(Gene, 1, 3) == "GN-",
        substr(Enhancer, 1, 3) == "INT",
        # keep a pair only when gene and SE (names without the trailing
        # "_<bin>") are listed together in SEassignment
        paste0(sub("_[^_]+$", "", Gene), sub("_[^_]+$", "", Enhancer)) %in%
            paste0(SEassignment$assignedGene, SEassignment$name),
        pAdj < 0.05
    ) %>%
    arrange(Gene)
dim(corrRawF)
corrRawF
corrRawF$Enh_Gene

# Histogram of the bin offset (enhancer bin - gene bin) of those pairs.
# NOTE(review): two histogram layers are stacked (default bins, then 13 bins)
# and scale_fill_manual has no fill aesthetic to act on -- possibly leftovers.
SE_gene_bin_hist <- ggplot(corrRawF, aes(x = Enh_Gene)) +
    geom_histogram() +
    geom_histogram(bins = 13, position = "identity", color = "white") +
    geom_density() +
    xlim(-6, 6) +
    labs(
        title = "Binary (SE-Gene)",
        x = "Bin difference",
        y = "Number of pairs"
    ) +
    scale_fill_manual(values = c("#39568CFF")) +
    theme(
        legend.position = "none",
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    )
SE_gene_bin_hist

# Fraction of pairs with a negative, zero or positive bin offset.
summary_bin_hist <- data.frame(
    Neg = sum(corrRawF$Enh_Gene < 0) / nrow(corrRawF),
    Zero = sum(corrRawF$Enh_Gene == 0) / nrow(corrRawF),
    Pos = sum(corrRawF$Enh_Gene > 0) / nrow(corrRawF)
) %>%
    pivot_longer(cols = 1:3, names_to = 'Category', values_to = 'Incidence') %>%
    # fct_inorder keeps the bars in the order Neg, Zero, Pos
    ggplot(aes(x = fct_inorder(Category), y = Incidence)) +
    geom_col(fill = c("#9e9a75", "gray", "#41533b")) +
    labs(
        title = "Binary (SE-Gene)",
        x = "Class",
        y = "Fraction of pairs"
    ) +
    theme(
        legend.position = "none",
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    )
summary_bin_hist

In [None]:
# Control set: verified genes paired with verified SEs that are NOT their own
# assigned partner ("cross" pairs), significantly correlated bins only.
corrRawCross <- corrRaw %>%
    filter(
        # gene bins ("GN-") against SE bins ("INT")
        substr(Gene, 1, 3) == "GN-",
        substr(Enhancer, 1, 3) == "INT",
        # both members must be listed in SEassignment ...
        sub("_[^_]+$", "", Gene) %in% SEassignment$assignedGene,
        sub("_[^_]+$", "", Enhancer) %in% SEassignment$name,
        # ... but not together as an assigned pair
        !paste0(sub("_[^_]+$", "", Gene), sub("_[^_]+$", "", Enhancer)) %in%
            paste0(SEassignment$assignedGene, SEassignment$name),
        pAdj < 0.05
    ) %>%
    arrange(Gene)
dim(corrRawCross)
corrRawCross[1:4, ]
corrRawCross$Enh_Gene

# Histogram of the bin offset (enhancer bin - gene bin) of the cross pairs.
# NOTE(review): two histogram layers are stacked (default bins, then 13 bins)
# and scale_fill_manual has no fill aesthetic to act on -- possibly leftovers.
SE_gene_bin_hist <- ggplot(corrRawCross, aes(x = Enh_Gene)) +
    geom_histogram() +
    geom_histogram(bins = 13, position = "identity", color = "white") +
    geom_density() +
    xlim(-6, 6) +
    labs(
        title = "Binary (SE-Gene)",
        x = "Bin difference",
        y = "Number of pairs"
    ) +
    scale_fill_manual(values = c("#39568CFF")) +
    theme(
        legend.position = "none",
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    )
SE_gene_bin_hist

# Fraction of cross pairs with a negative, zero or positive bin offset.
summary_bin_hist <- data.frame(
    Neg = sum(corrRawCross$Enh_Gene < 0) / nrow(corrRawCross),
    Zero = sum(corrRawCross$Enh_Gene == 0) / nrow(corrRawCross),
    Pos = sum(corrRawCross$Enh_Gene > 0) / nrow(corrRawCross)
) %>%
    pivot_longer(cols = 1:3, names_to = 'Category', values_to = 'Incidence') %>%
    # fct_inorder keeps the bars in the order Neg, Zero, Pos
    ggplot(aes(x = fct_inorder(Category), y = Incidence)) +
    geom_col(fill = c("#9e9a75", "gray", "#41533b")) +
    labs(
        title = "Binary (SE-Gene)",
        x = "Class",
        y = "Fraction of pairs"
    ) +
    theme(
        legend.position = "none",
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    )
summary_bin_hist

In [None]:
# Two-sided Welch t-test: bin offsets of verified pairs vs. cross pairs.
t.test(
    x = corrRawF$Enh_Gene,
    y = corrRawCross$Enh_Gene,
    alternative = "two.sided"
)
# wilcox.test(corrRawF$Enh_Gene, corrRawCross$Enh_Gene)
# ks.test(corrRawF$Enh_Gene, corrRawCross$Enh_Gene, alternative = "less")

In [None]:
# Overlapping histograms: verified pairs vs. scrambled pairs.
# NOTE: corrRawF is redefined here. It now holds every significant pair whose
# gene AND SE are listed in SEassignment, whether or not they are assigned to
# each other.
corrRawF <- corrRaw %>%
    filter(
        substr(Gene, 1, 3) == "GN-",
        substr(Enhancer, 1, 3) == "INT",
        sub("_[^_]+$", "", Gene) %in% SEassignment$assignedGene,
        sub("_[^_]+$", "", Enhancer) %in% SEassignment$name,
        pAdj < 0.05
    )
dim(corrRawF)
corrRawF[1:4, ]

# Class label: assigned gene/SE pairs are "CRISPR verified pairs", all other
# combinations are "aScrambled random pairs".
corrRawF$class <- ifelse(
    paste0(sub("_[^_]+$", "", corrRawF$Gene), sub("_[^_]+$", "", corrRawF$Enhancer)) %in%
        paste0(SEassignment$assignedGene, SEassignment$name),
    "CRISPR verified pairs",
    "aScrambled random pairs"
)
summary(as.factor(corrRawF$class))
corrRawF[1:4, ]

# 1) both classes overlaid, density-scaled
ggplot(corrRawF, aes(x = Enh_Gene, fill = class)) +
    geom_histogram(
        mapping = aes(y = after_stat(density)),
        binwidth = 1,
        position = "identity",
        alpha = 0.6
    ) +
    scale_fill_manual(values = c("#7c6a77", "#011a0c")) +
    xlim(-4, 4) +
    theme(
        legend.position = c(0.8, 0.875),
        legend.text = element_text(size = 10, face = "bold"),
        legend.key.size = unit(0.5, 'cm')
    ) +
    guides(fill = guide_legend(title = "")) +
    labs(
        x = "Difference in correlated bins (Enhancer - Gene)",
        y = "Density"
    )
ggsave(filename = "../plots/ExG_Difference_in_correlated_bins_E-G.pdf", width = 4, height = 4, units = "in")

# 2) one facet per class, raw counts, free axes
ggplot(corrRawF, aes(x = Enh_Gene, fill = class)) +
    geom_histogram(binwidth = 1, position = "identity", alpha = 0.6) +
    xlim(-4, 4) +
    theme(legend.position = "none") +
    guides(fill = guide_legend(title = "")) +
    facet_wrap(~class, scales = "free") +
    scale_fill_manual(values = c("#7c6a77", "#011a0c")) +
    theme(
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    ) +
    labs(
        x = "Difference in correlated bin position \n from TSS (Enhancer - Gene)",
        y = "Number of pairs"
    )
ggsave(filename = "../plots/ExG_Difference_in_correlated_bins_E-G_facets.pdf", width = 4, height = 4, units = "in")

# 3) one facet per class, density-scaled, shared axes
ggplot(corrRawF, aes(x = Enh_Gene, fill = class)) +
    geom_histogram(
        mapping = aes(y = after_stat(density)),
        binwidth = 1,
        position = "identity",
        alpha = 0.6
    ) +
    xlim(-4, 4) +
    theme(legend.position = "none") +
    facet_wrap(~class) +
    scale_fill_manual(values = c("#7c6a77", "#011a0c")) +
    theme(
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    ) +
    labs(
        x = "Difference in correlated bin position from TSS \n (Enhancer - Gene)",
        y = "Density"
    )
ggsave(filename = "../plots/ExG_Difference_in_correlated_bins_E-G_facets_density.pdf", width = 6, height = 4, units = "in")

# Welch t-test of the bin offset between the two classes.
t.test(Enh_Gene ~ class, data = corrRawF)

In [None]:
# Gene bins of SE-neighbouring genes flagged TADstatus == "IN", paired with SE
# bins on the same chromosome.
corrSEneighbors <- corrRaw %>%
    filter(
        substr(Gene, 1, 3) == "GN-",
        substr(Enhancer, 1, 3) == "INT",
        # gene name without the trailing "_<bin>" must be an "IN" neighbour
        sub("_[^_]+$", "", Gene) %in% names(SEneighbors[SEneighbors$TADstatus == "IN", ]),
        # same sequence name (chromosome) for both features
        as.character(seqnames(allfeatures[Gene])) == as.character(seqnames(allfeatures[Enhancer]))
    )
nrow(corrSEneighbors)

# Significant pairs only.
corrSEneighborsSig <- corrSEneighbors %>%
    filter(pAdj < 0.05)
nrow(corrSEneighborsSig)

# Histogram of the bin offset (enhancer bin - gene bin) of the significant pairs.
# NOTE(review): two histogram layers are stacked (default bins, then binwidth 1)
# and scale_fill_manual has no fill aesthetic to act on -- possibly leftovers.
SE_gene_bin_hist <- ggplot(corrSEneighborsSig, aes(x = Enh_Gene)) +
    geom_histogram() +
    geom_histogram(binwidth = 1, position = "identity", color = "white") +
    geom_density() +
    labs(
        title = "Binary (SE-Gene)",
        x = "Bin difference",
        y = "Number of pairs"
    ) +
    scale_fill_manual(values = c("#39568CFF")) +
    theme(
        legend.position = "none",
        strip.background = element_blank(),
        strip.text = element_text(size = 14, face = "bold")
    )
SE_gene_bin_hist

In [None]:
# SE bins vs. genes whose SEneighbors entry has TADstatus == "OUT":
# "GN-" gene IDs paired with "INT" enhancer IDs on the same seqname.
corrSEout <- corrRaw %>%
    filter(
        substr(Gene, 1, 3) == "GN-",
        substr(Enhancer, 1, 3) == "INT"
    ) %>%
    filter(sub("_[^_]+$", "", Gene) %in% names(SEneighbors[SEneighbors$TADstatus == "OUT", ])) %>%
    filter(as.character(seqnames(allfeatures[Gene])) == as.character(seqnames(allfeatures[Enhancer])))
nrow(corrSEout)

# Pairs with an adjusted p-value below 0.05.
corrSEoutSig <- filter(corrSEout, pAdj < 0.05)
nrow(corrSEoutSig)

In [None]:
# Histone genes (IDs starting with "GN-Hist") paired with SE bins
# (enhancer IDs starting with "INT").
corrHistones <- corrRaw %>%
    filter(
        substr(Gene, 1, 7) == "GN-Hist",
        substr(Enhancer, 1, 3) == "INT"
    )
nrow(corrHistones)

# Pairs with an adjusted p-value below 0.05.
corrHistonesSig <- filter(corrHistones, pAdj < 0.05)
nrow(corrHistonesSig)

In [None]:
# G1 genes paired with SE bins: the Gene ID, minus its last "_<suffix>"
# component, must be listed in G1_Beyrouthy; enhancer IDs must start with "INT".
corrG1 <- corrRaw %>%
    filter(
        sub("_[^_]+$", "", Gene) %in% G1_Beyrouthy,
        substr(Enhancer, 1, 3) == "INT"
    )
nrow(corrG1)

# Pairs with an adjusted p-value below 0.05.
corrG1Sig <- filter(corrG1, pAdj < 0.05)
nrow(corrG1Sig)

In [None]:
# G1/S genes paired with SE bins: the Gene ID, minus its last "_<suffix>"
# component, must be listed in G1S_Riba; enhancer IDs must start with "INT".
corrG1S <- corrRaw %>%
    filter(
        sub("_[^_]+$", "", Gene) %in% G1S_Riba,
        substr(Enhancer, 1, 3) == "INT"
    )
nrow(corrG1S)

# Pairs with an adjusted p-value below 0.05.
corrG1SSig <- filter(corrG1S, pAdj < 0.05)
nrow(corrG1SSig)

In [None]:
# G2/M genes paired with SE bins: the Gene ID, minus its last "_<suffix>"
# component, must be listed in G2M_Riba; enhancer IDs must start with "INT".
corrG2M <- corrRaw %>%
    filter(
        sub("_[^_]+$", "", Gene) %in% G2M_Riba,
        substr(Enhancer, 1, 3) == "INT"
    )
nrow(corrG2M)

# Pairs with an adjusted p-value below 0.05.
corrG2MSig <- filter(corrG2M, pAdj < 0.05)
nrow(corrG2MSig)

In [None]:
# Bar plot: percentage of gene-SE bin pairs that are significant (the *Sig
# tables) within each cell-cycle gene set. The "S" bar is computed from the
# histone-gene pairs.
SEcellcycle <- data.frame(
    class = c("G1S", "S", "G2M"),
    percentage = c(
        nrow(corrG1SSig) / nrow(corrG1S) * 100,
        nrow(corrHistonesSig) / nrow(corrHistones) * 100,
        nrow(corrG2MSig) / nrow(corrG2M) * 100
    )
)
SEcellcycle

# fct_inorder() keeps the bars in the row order of the data frame; the fill
# vector supplies one colour per row, in that same order.
ggplot(SEcellcycle, aes(x = fct_inorder(class), y = percentage)) +
    geom_col(fill = c("#ebc14c", "#265369", "#545756")) +
    xlab("Cell Cycle Phase") +
    ylab("Percentage") +
    ylim(0, 15)
# ggsave() without `plot =` writes the most recently created ggplot.
ggsave(
    filename = "../plots/ExG_crisprVerifiedSEs_expression_in_cellCycle.pdf",
    width = 4,
    height = 4,
    units = "in"
)

In [None]:
# Bar plot: percentage of gene-SE bin pairs that are significant (the *Sig
# tables) for the three cell-cycle gene sets plus the two SE-neighbor groups
# (SEneighbors TADstatus "IN" and "OUT", labelled "Inside 1 Mb" and
# "Between 4-5 Mb"). The "S" bar is computed from the histone-gene pairs.
SEcellcycle <- data.frame(
    class = c("G1S", "S", "G2M", "Inside 1 Mb", "Between 4-5 Mb"),
    percentage = c(
        nrow(corrG1SSig) / nrow(corrG1S) * 100,
        nrow(corrHistonesSig) / nrow(corrHistones) * 100,
        nrow(corrG2MSig) / nrow(corrG2M) * 100,
        nrow(corrSEneighborsSig) / nrow(corrSEneighbors) * 100,
        nrow(corrSEoutSig) / nrow(corrSEout) * 100
    )
)
SEcellcycle

# fct_inorder() keeps the bars in the row order of the data frame; the fill
# vector supplies one colour per row (the two distance groups share a colour).
# NOTE(review): the x-axis title says "Cell Cycle Phase" although the last two
# bars are distance groups - confirm whether that label is intended.
ggplot(SEcellcycle, aes(x = fct_inorder(class), y = percentage)) +
    geom_col(fill = c("#ebc14c", "#265369", "#545756", rep("#818664", 2))) +
    xlab("Cell Cycle Phase") +
    ylab("Percentage") +
    ylim(0, 15)
# ggsave() without `plot =` writes the most recently created ggplot.
ggsave(
    filename = "../plots/ExG_crisprVerifiedSEs_expression_in_cellCycle_plus_TADs.pdf",
    width = 4,
    height = 4,
    units = "in"
)

In [None]:
# NOTE(review): bare reference to a symbol that is not defined in the visible
# code; evaluating it raises an error. Presumably a deliberate sentinel so that
# "Run All" halts at this cell - confirm before removing.
FAIL