In [None]:
# Use the package library under the hard-coded (machine-specific) conda
# environment path, then print the resulting library search path.
.libPaths("/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(scales)
    library(foreach)
    library(doParallel)
    library(rtracklayer)
    library(plyranges)
    library(Matrix)
    library(ggcorrplot)
    library(data.table)
});

In [None]:
# notebook display defaults: 4 x 4 figures rendered as SVG
options(
    jupyter.plot_mimetypes = "image/svg+xml",
    repr.plot.height = 4,
    repr.plot.width = 4
);

# workers for foreach (%dopar%) and threads for data.table
registerDoParallel(20);
setDTthreads(threads = 15);

# default ggplot theme: classic theme with bold black axis text, a centred
# bold title, and a grey panel border drawn in place of the axis lines
theme_set(
    theme_classic() +
    theme(
        plot.title   = element_text(hjust = 0.5, size = 14, face = "bold"),
        axis.title.x = element_text(face = "bold", size = 14, color = "black"),
        axis.title.y = element_text(face = "bold", size = 14, color = "black"),
        axis.text    = element_text(face = "bold", size = 12, color = "black"),
        axis.line    = element_blank(),
        # axis.ticks = element_blank()
        panel.border = element_rect(fill = NA, colour = "grey", linewidth = 1)
    )
);

## 1. Load scGRO-seq reads and custom feature annotations

In [None]:
# load groHMM-extended genes and enhancers
# (earlier annotation files, kept for reference)
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_v1.bed");
# features = read_bed("../data/groHMM_mES_BRsComb_LP-50_UTS10_features_customized_v2.bed");
features = read_bed("../data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed");

# promote the BED name column to range names, then drop the name and
# score metadata columns
names(features) = mcols(features)$name;
mcols(features)$name  = NULL;
mcols(features)$score = NULL;

In [None]:
# load desired scGRO dataset (serialized R object) and print it
scGRO = readRDS("../data/scGROv2p8_consolidated.rds");
scGRO

In [None]:
# keep reads that pass every QC flag and have mapq >= 3
# (alternative tried: filter on umiQC only)
scGRO = filter( scGRO, plateQC & cellQC & umiQC & countQC & miRQC & mapq >= 3 );

# Exp + Plate + Cell joined by "-" gives a cell ID that is unique
# across experiments
scGRO = mutate( scGRO, cellID = factor( paste( Exp, Plate, Cell, sep="-" ) ) );

# shrink each read to width 1 anchored at its end, keep only the cellID column
scGRO = select( resize( scGRO, width=1, fix="end" ), cellID );
scGRO

# identify all cell IDs
allID = levels(scGRO$cellID);

## 2. Count scGRO-seq reads within each feature and output matrix

In [None]:
# Build the feature x cell count matrix from a single overlap query.
# The previous version filtered the complete read set once per cell ID
# (cells x reads work); one findOverlaps() call yields every
# (feature, read) pair, and sparseMatrix() sums pairs that share the same
# (feature, cell) coordinates, which is exactly the per-cell countOverlaps()
# result.
hits = findOverlaps( features, scGRO );

# column of each hit = position of the read's cell ID within allID
hit_cell = match( scGRO$cellID[ subjectHits(hits) ], allID );

# rows = features, columns = cells = cellIDs
counts = sparseMatrix(
    i = queryHits(hits),
    j = hit_cell,
    x = rep( 1, length(hits) ),
    dims = c( length(features), length(allID) ),
    dimnames = list( names(features), allID )
);

In [None]:
# write the full feature x cell matrix to disk, then show its dimensions
# and the top-left 5 x 5 corner
saveRDS( counts, file="../data/scGROv2p8_mapq3qc_feature_counts.rds" );
dim(counts);
counts[1:5,1:5];

In [None]:
# reload the feature x cell count matrix from its RDS file
counts = readRDS("../data/scGROv2p8_mapq3qc_feature_counts.rds" );

In [None]:
# create "stranded" feature coverage set:
# features whose name does not start with "GN" end up in ftcov twice,
# once per strand; all other features keep their strand
ftcov = features;
# indices of the features whose name does not start with "GN"
enhf = which(substr(names(features), 0, 2) != "GN");
# mark them "-" first, so the copies appended next carry the minus strand
strand(ftcov)[enhf] = "-";
ftcov = append(ftcov, ftcov[enhf]);
# the originals still sit at positions enhf; switch those to "+"
strand(ftcov)[enhf] = "+";

In [None]:
# sum of mm10 chr1-19 and XYM is 2,725,537,669,
# or approx 2.73E9.
# NOTE(review): the 5.46E9 used below is twice that figure, presumably the
# genome counted on both strands to match the stranded ftcov set -- confirm.

# quantify enrichment of reads in features
#sum(counts)/length(scGRO);
#sum(width(ftcov))/5.46E9;

# read density in features:
# (total counts divided by the summed width of the stranded feature set)
sum(counts)/sum(width(ftcov));

# read density outside features:
# RNIF = number of reads in scGRO minus the total of the count matrix
RNIF=length(scGRO)-sum(counts);
RNIF/(5.46E9 - sum(width(ftcov)));

In [None]:
# grand total of the count matrix (sum of the per-cell column sums)
sum(colSums(counts))

In [None]:
# tabulate reads per cell ID into a two-column data frame (ID, count)
reads_per_cell = as.data.frame( table(ID = scGRO$cellID), responseName = "count" );

# index rows by cell ID and put them in the column order of the count matrix
rownames(reads_per_cell) = reads_per_cell$ID;
reads_per_cell = reads_per_cell[ colnames(counts), ];

write.csv( reads_per_cell, file="../data/scGROv2p8_mapq3qc_ReadsPerCell.csv" );

In [None]:
# parse each cell ID: plate label (first capture group of the plate pattern)
# and the trailing word after the last "-" (cell barcode)
Plate  = str_match( colnames(counts), "(\\w+-c\\d+)" )[, 2];
cellBC = str_match( colnames(counts), "-(\\w+)$" )[, 2];

# per-cell totals: number of features with at least one read, and
# number of reads counted in features
features_per_cell = colSums(counts > 0);
reads_in_features = colSums(counts);

# fraction of each cell's reads counted in features, as a plain numeric vector
pct_in_features = as.numeric( reads_in_features / reads_per_cell$count );

In [None]:
# scatter of total reads per cell vs detected features per cell,
# coloured by experiment (first 7 characters of the plate label, "-" removed)
out = data.frame( rpc=reads_per_cell$count, fpc=features_per_cell, Plate ) %>%
    mutate( exp=sub("-", "", substr(Plate, 0, 7), fixed=TRUE) ) %>%
    ggplot( aes(x=rpc, y=fpc, col=exp) ) +
    geom_point(alpha=0.5) +
    lims( x=c(0, 30000), y=c(0, 8000) ) +
    # labs( title="Reads vs features" ) +
    labs( x="Reads per cell", y="Features per cell" ) +
    theme(
        legend.key.size = unit(5, 'mm'),
        legend.background = element_blank(),
        legend.title = element_text(size=0),
        legend.text = element_text(size = 0)
    )

ggsave( filename="../plots/scGROv2p8_mapq3qc_ReadsVsFeatures.pdf", plot=out, width=4, height=4 );
out

In [None]:
# scatter of reads in features per cell vs detected features per cell,
# coloured by experiment (first 7 characters of the plate label, "-" removed)
out = data.frame( rpc=reads_in_features, fpc=features_per_cell, Plate ) %>%
    mutate( exp=sub("-", "", substr(Plate, 0, 7), fixed=TRUE) ) %>%
    ggplot( aes(x=rpc, y=fpc, col=exp) ) +
    geom_point(alpha=0.5) +
    lims( x=c(0, 16000), y=c(0, 8000) ) +
    # scale_x_log10() +
    # scale_y_log10() +
    # labs( title="Reads in features vs features" ) +
    labs( x="Reads in features per cell", y="Features per cell" ) +
    theme(
        legend.key.size = unit(5, 'mm'),
        legend.background = element_blank(),
        legend.title = element_text(size=0),
        legend.text = element_text(size = 0)
    )

ggsave( filename="../plots/scGROv2p8_mapq3qc_RIFvsFeatures.pdf", plot=out, width=4, height=4 );
out

In [None]:
# distribution of the in-feature read fraction among cells with >= 1000 reads
summary(pct_in_features[reads_per_cell$count >= 1000])

In [None]:
# cell-level QC mask; a cell is kept when it
#   - has between 750 and 15000 reads in features (inclusive),
#   - does not carry the barcode TTCTTCTTCC, and
#   - has between 30% and 90% of its reads in features (inclusive)
filtered_cells = reads_in_features >= 750 &
    reads_in_features <= 15000 &
    cellBC != "TTCTTCTTCC" &
    pct_in_features >= 0.30 &
    pct_in_features <= 0.90;

In [None]:
# keep only the cells that passed the QC mask, then re-derive the plate
# label of every remaining column
counts = counts[, filtered_cells];
Plate = str_match( colnames(counts), "(\\w+-c\\d+)" )[,2];

# number of remaining cells per plate
Plate_size = table(Plate);
#Plate_size
#which(Plate_size < 40);

In [None]:
# display the per-plate cell counts
Plate_size

In [None]:
# drop cells from plates with fewer than 24 remaining cells,
# re-derive the plate labels, and report how many plates are left
counts = counts[, Plate %in% names(Plate_size)[Plate_size >= 24]];
Plate = str_match( colnames(counts), "(\\w+-c\\d+)" )[, 2];
#unique(Plate)
length( unique(Plate) )

In [None]:
# get average counts for each feature in each plate
plates = unique(Plate);

# only use features whose mean count per cell exceeds 0.01 for QC.
# This row filter is identical for every plate, so it is computed once here
# rather than once per loop iteration.
expressed = rowMeans(counts) > 0.01;

plate_counts = foreach(
    p = plates,
    .combine='cbind'
) %dopar% {
    # drop=FALSE keeps a matrix even if a plate holds a single cell
    rowMeans( counts[expressed, Plate == p, drop=FALSE] );
}
# columns = plates, in order of first appearance in Plate
colnames(plate_counts) = plates;

In [None]:
# squared Pearson correlation between every pair of plate profiles
plate_corr = cor(plate_counts)^2;

In [None]:
# heatmap of pairwise plate r^2 for all plates, in their given order
out = ggcorrplot( plate_corr, hc.order = FALSE, outline.col = "white" ) +
    scale_fill_viridis_c() +
    theme( axis.text.x = element_text(angle = 90) );

ggsave(
    filename = "../plots/scGROv2p8_mapq3qc_rpc750_AllPlateCorr.pdf",
    plot = out,
    width = 12,
    height = 12
);
out

In [None]:
# retain plates whose r^2 is >= 0.6 against at least 60% of all plates
# (each plate's correlation with itself is part of that fraction)
good_plates = names( which( colMeans(plate_corr >= 0.6) >= 0.6 ) );

# manually excluded plates
good_plates = setdiff( good_plates, c("Exp263b-c07", "Exp256-c09", "Exp160-c04") )

#good_plates %>% sort
length(good_plates)

In [None]:
# heatmap of pairwise r^2 restricted to the retained plates, given order
out = ggcorrplot(
        plate_corr[good_plates, good_plates],
        hc.order = FALSE,
        outline.col = "white"
    ) +
    scale_fill_viridis_c() +
    theme( axis.text.x = element_text(angle = 90) );

ggsave(
    filename = "../plots/scGROv2p8_mapq3qc_rpc750_GoodPlateCorr.pdf",
    plot = out,
    width = 12,
    height = 12
);
out

In [None]:
# same heatmap for the retained plates, with hc.order switched on
out = ggcorrplot(
        plate_corr[good_plates, good_plates],
        hc.order = TRUE,
        outline.col = "white"
    ) +
    scale_fill_viridis_c() +
    theme( axis.text.x = element_text(angle = 90) );

ggsave(
    filename = "../plots/scGROv2p8_mapq3qc_rpc750_GoodPlateCorr_Clustered.pdf",
    plot = out,
    width = 12,
    height = 12
);
out

In [None]:
# save filtered count matrix:
# keep only cells whose plate is in good_plates, then write to disk
counts = counts[, Plate %in% good_plates];
saveRDS( counts, file="../data/scGROv2p8_mapq3qc_filtered_counts.rds" );

In [None]:
# grand total, dimensions, and top-left 5 x 5 corner of the filtered matrix
sum(colSums(counts))
dim(counts)
head(counts)[1:5,1:5]

## 3. Display per-cell summary statistics

In [None]:
# total reads per retained cell: restrict the reads to cells present in the
# filtered count matrix, then count reads within each cellID
out = filter( scGRO, cellID %in% colnames(counts) );
out = group_by( out, cellID );
out = summarise( out, count = n() );

summary(out$count);
out

In [None]:
# histogram of total reads per cell (bins of 200 from 0 to 10000),
# filled by experiment (first 7 characters of the cell ID, "-" removed)
test = as.data.frame(out) %>%
    mutate( exp=sub("-", "", substr(cellID, 0, 7), fixed=TRUE) ) %>%
    ggplot( aes(x=count, fill=exp) ) +
    geom_histogram(breaks=(0:50)*200) +
    # labs( title="Total reads per cell" ) +
    labs( x="Reads per cell", y="Number of cells" ) +
    theme(
        legend.key.size = unit(6, 'mm'),
        legend.background = element_blank(),
        legend.position = c(0.8, 0.6),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10)
    )
ggsave( filename = "../plots/scGROv2p8_ReadsPerCell_histogram.pdf", plot = test, width=4, height=4 );
test

In [None]:
# distribution of per-cell column sums of the filtered count matrix
summary(colSums(counts))

In [None]:
# one row per cell:
#   reads     - total counts in features
#   feats     - number of features with at least one read
#   Genes     - detected features whose name starts with "GN-"
#   Enhancers - detected features whose name does not start with "GN-"
#   exp       - first 7 characters of the cell ID with "-" removed
out = data.frame(
    reads     = colSums(counts),
    feats     = colSums(counts > 0),
    Genes     = colSums( counts[  startsWith(rownames(counts), "GN-"), ] > 0 ),
    Enhancers = colSums( counts[ !startsWith(rownames(counts), "GN-"), ] > 0 ),
    exp       = sub("-", "", substr(colnames(counts), 0, 7), fixed=TRUE)
);
head(out)
# mean and median of features per cell:
mean(out$feats)
median(out$feats)

# legend styling shared by the four per-cell histograms below
hist_legend = theme(
    legend.key.size = unit(6, 'mm'),
    legend.background = element_blank(),
    legend.position = c(0.8, 0.6),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10)
);

# reads in features per cell (bins of 100 from 0 to 5000)
test = ggplot( out, aes(x=reads, fill=exp) ) +
    geom_histogram(breaks=(0:50)*100) +
    # labs( title="Reads in features per cell" ) +
    labs( x="Reads in features per cell", y="Number of cells" ) +
    hist_legend
ggsave( filename = "../plots/scGROv2p8_ReadsInFeaturesPerCell_histogram.pdf", plot = test, width=4, height=4 );
test

# detected features per cell (bins of 100 from 0 to 5000)
test = ggplot( out, aes(x=feats, fill=exp) ) +
    geom_histogram(breaks=(0:50)*100) +
    # labs( title="Features per cell" ) +
    labs( x="Features per cell", y="Number of cells" ) +
    hist_legend
ggsave( filename = "../plots/scGROv2p8_FeaturesPerCell_histogram.pdf", plot = test, width=4, height=4 );
test

# detected genes per cell (bins of 100 from 0 to 5000)
test = ggplot( out, aes(x=Genes, fill=exp) ) +
    geom_histogram(breaks=(0:50)*100) +
    # labs( title="Genes per cell" ) +
    labs( x="Genes per cell", y="Number of cells" ) +
    hist_legend
ggsave( filename = "../plots/scGROv2p8_GenessPerCell_histogram.pdf", plot = test, width=4, height=4 );
test

# detected enhancers per cell (bins of 10 from 0 to 500)
test = ggplot( out, aes(x=Enhancers, fill=exp) ) +
    geom_histogram(breaks=(0:50)*10) +
    # labs( title="Enhancers per cell" ) +
    labs( x="Enhancers per cell", y="Number of cells" ) +
    hist_legend
ggsave( filename = "../plots/scGROv2p8_EnhancersPerCell_histogram.pdf", plot = test, width=4, height=4 );
test

In [None]:
# histogram of each feature's mean count per cell, on a log10 x axis
ggplot( data.frame( x=rowMeans(counts) ), aes(x=x) ) +
    geom_histogram(binwidth=0.05, fill="steelblue4") +
    scale_x_log10() +
    labs(
        title = "Reads per feature",
        x = "Average reads per cell",
        y = "Number of features"
    );

In [None]:
# reads that overlap no feature, and the distance from each to its
# nearest feature
missed = subsetByOverlaps( scGRO, features, invert=TRUE );
rddist = as.data.frame( distanceToNearest( missed, features ) );

# distance histogram in kb (2 kb bins, x axis limited to 0-150 kb)
ggplot( rddist, aes(x=distance/1000) ) +
    geom_histogram(binwidth=2) +
    xlim(0, 150) +
    labs(
        title = "Reads outside features",
        x = "Distance to feature (kb)",
        y = "Number of reads"
    );

In [None]:
# number of retained cells per 7-character cell-ID prefix
table(substr(colnames(counts), 0, 7))

## Truncate features to minimize length biases

In [None]:
# cap every feature at 10 kb: features wider than that are resized to
# 10000 bp anchored at their start, all others are left untouched
truncated = features;
longfeats = width(features) > 10000;
truncated[longfeats] = resize( truncated[longfeats], width = 10000, fix="start" );

# percent features >10kbp
100*mean(longfeats)

In [None]:
# recount reads per cell against the truncated features;
# one sparse column per cell, combined with cbind
ucounts = foreach(
    id = colnames(counts),
    .combine = "cbind",
    .multicombine = TRUE
) %dopar% {
    # reads belonging to this cell
    cell_reads = scGRO %>% filter(cellID == id);
    # reads per truncated feature, stored sparsely
    Matrix( countOverlaps( truncated, cell_reads ), sparse=TRUE );
}
# columns = cells, rows = features
colnames(ucounts) = colnames(counts);
rownames(ucounts) = names(truncated);
saveRDS( ucounts, file="../data/scGROv2p8_mapq3qc_max10kbp_filtered_counts.rds" );

In [None]:
# % of reads in truncated features compared to full features:
sum(colSums(ucounts))/sum(colSums(counts))*100

In [None]:
# cap every feature at 20 kb: features wider than that are resized to
# 20000 bp anchored at their start, all others are left untouched
truncated = features;
longfeats = width(features) > 20000;
truncated[longfeats] = resize( truncated[longfeats], width = 20000, fix="start" );

# percent features >20kbp
100*mean(longfeats)

In [None]:
# recount reads per cell against the truncated features;
# one sparse column per cell, combined with cbind
ucounts = foreach(
    id = colnames(counts),
    .combine = "cbind",
    .multicombine = TRUE
) %dopar% {
    # reads belonging to this cell
    cell_reads = scGRO %>% filter(cellID == id);
    # reads per truncated feature, stored sparsely
    Matrix( countOverlaps( truncated, cell_reads ), sparse=TRUE );
}
# columns = cells, rows = features
colnames(ucounts) = colnames(counts);
rownames(ucounts) = names(truncated);
saveRDS( ucounts, file="../data/scGROv2p8_mapq3qc_max20kbp_filtered_counts.rds" );

In [None]:
# % of reads in truncated features compared to full features:
sum(colSums(ucounts))/sum(colSums(counts))*100

In [None]:
# cap every feature at 30 kb: features wider than that are resized to
# 30000 bp anchored at their start, all others are left untouched
truncated = features;
longfeats = width(features) > 30000;
truncated[longfeats] = resize( truncated[longfeats], width = 30000, fix="start" );

# percent features >30kbp
100*mean(longfeats)

In [None]:
# recount reads per cell against the truncated features;
# one sparse column per cell, combined with cbind
ucounts = foreach(
    id = colnames(counts),
    .combine = "cbind",
    .multicombine = TRUE
) %dopar% {
    # reads belonging to this cell
    cell_reads = scGRO %>% filter(cellID == id);
    # reads per truncated feature, stored sparsely
    Matrix( countOverlaps( truncated, cell_reads ), sparse=TRUE );
}
# columns = cells = cellIDs, rows = features
colnames(ucounts) = colnames(counts);
rownames(ucounts) = names(truncated);
saveRDS( ucounts, file="../data/scGROv2p8_mapq3qc_max30kbp_filtered_counts.rds" );

In [None]:
# % of reads in truncated features compared to full features:
sum(colSums(ucounts))/sum(colSums(counts))*100

In [None]:
# number of features, distribution of their widths, and how many are
# at most 3000 bp wide
length(features)
summary(width(features))
summary(width(features) <= 3000)

In [None]:
# gene-body regions: features named "GN..." that are at least 3000 bp wide,
# narrowed by 1000 bp in total with the centre held fixed
genebody = features %>%
    filter( substr(names, 0, 2) == "GN" ) %>%
    filter( width >= 3000 ) %>%
    anchor_center() %>%
    mutate( width = width - 1000 );

# enhancer regions: all other features that are at least 2000 bp wide,
# narrowed by 500 bp in total with the centre held fixed
enhs = features %>%
    filter( substr(names, 0, 2) != "GN" )%>%
    filter( width >= 2000 ) %>%
    anchor_center() %>%
    mutate( width = width - 500 );

# combined set: trimmed genes followed by trimmed enhancers
gbfeatures = c( genebody, enhs )

In [None]:
# recount reads per cell against the centre-trimmed feature set;
# one sparse column per cell, combined with cbind
gcounts = foreach(
    id = colnames(counts),
    .combine = "cbind",
    .multicombine = TRUE
) %dopar% {
    # reads belonging to this cell
    cell_reads = scGRO %>% filter(cellID == id);
    # reads per trimmed feature, stored sparsely
    Matrix( countOverlaps( gbfeatures, cell_reads ), sparse=TRUE );
}
# columns = cells = cellIDs, rows = features
colnames(gcounts) = colnames(counts);
rownames(gcounts) = names(gbfeatures);
saveRDS( gcounts, file="../data/scGROv2p8_mapq3qc_TSSPAStruncated_filtered_counts.rds" );

## Subsample cells to a preset number of reads

In [None]:
# Subsample a count matrix so that every retained column sums to exactly N.
#
# Arguments:
#   counts  feature x cell matrix of non-negative whole-number counts
#           (base matrix or Matrix-package matrix)
#   N       reads to keep per cell; a single non-negative whole number
#
# Value:
#   Sparse matrix with the rows of `counts` and one column for every input
#   column whose total is >= N (columns with fewer reads are dropped). Each
#   column is a draw of N reads, without replacement, from that cell's
#   reads. Row and column names are carried over from `counts`.
#
# Sampling runs serially in the calling R session, so results are
# reproducible after set.seed().
subsample_columns = function( counts, N ) {
    stopifnot( is.numeric(N), length(N) == 1, !is.na(N), N >= 0, N == floor(N) );

    # remove any columns with sum < N; drop=FALSE keeps a matrix even when
    # exactly one (or no) column survives
    counts = counts[ , colSums(counts) >= N, drop=FALSE ];

    nfeat = nrow(counts);
    ncell = ncol(counts);

    # for each cell: the features that received sampled reads, and how many
    sampled = lapply( seq_len(ncell), function(i) {
        cell = counts[, i];
        present = which(cell > 0);

        # one entry per read, holding the index of the feature it came from
        read_vec = rep( present, times=cell[present] );

        # draw N reads without replacement. Positions are sampled instead of
        # calling sample(read_vec, ...), which would interpret a length-one
        # read_vec as "sample from 1:read_vec".
        picked = read_vec[ sample.int( length(read_vec), size=N, replace=FALSE ) ];

        # count samplings of each feature
        tally = tabulate( picked, nbins=nfeat );
        hit = which(tally > 0);
        list( row=hit, n=tally[hit] );
    });

    # assemble all cells at once from (row, column, value) triplets instead of
    # growing the result with one cbind per cell
    rows = as.integer( unlist( lapply(sampled, `[[`, "row") ) );
    vals = as.numeric( unlist( lapply(sampled, `[[`, "n") ) );
    cols = rep( seq_len(ncell), times=lengths( lapply(sampled, `[[`, "row") ) );

    sscounts = sparseMatrix( i=rows, j=cols, x=vals, dims=c(nfeat, ncell) );
    colnames(sscounts) = colnames(counts);
    rownames(sscounts) = rownames(counts);

    return(sscounts);
}

In [None]:
# subsample every cell of ucounts to 1000 reads, save the result, and
# report how many cells had enough reads to be kept.
# NOTE(review): ucounts is whichever truncated matrix was assigned last;
# the output file name assumes the 30 kbp version -- confirm before running.
ss_counts = subsample_columns( ucounts, 1000 );
saveRDS( ss_counts, file="../data/scGROv2p8_mapq3qc_max30kbp_ss1000.rds" );
ncol(ss_counts)

In [None]:
# subsample every cell of ucounts to 2000 reads, save the result, and
# report how many cells had enough reads to be kept.
# NOTE(review): ucounts is whichever truncated matrix was assigned last;
# the output file name assumes the 30 kbp version -- confirm before running.
ss_counts = subsample_columns( ucounts, 2000 );
saveRDS( ss_counts, file="../data/scGROv2p8_mapq3qc_max30kbp_ss2000.rds" );
ncol(ss_counts)

In [None]:
# subsample every cell of ucounts to 3000 reads, save the result, and
# report how many cells had enough reads to be kept.
# NOTE(review): ucounts is whichever truncated matrix was assigned last;
# the output file name assumes the 30 kbp version -- confirm before running.
ss_counts = subsample_columns( ucounts, 3000 );
saveRDS( ss_counts, file="../data/scGROv2p8_mapq3qc_max30kbp_ss3000.rds" );
ncol(ss_counts)

### TSS PAS truncated counts for dREG genes (no groHMM) for scRNAseq/RNAseq comparison:

In [None]:
# NOTE: this cell overwrites the globals `features`, `counts` and
# `gbfeatures` used by the earlier cells.

# load the dREG-only feature annotation; use the BED name column as range
# names and drop the name and score columns
features = read_bed("../data/dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed");
names(features) = features$name;
features$name=NULL;
features$score=NULL;

# reload the filtered count matrix; below it only supplies the cell IDs
counts = readRDS(file="../data/scGROv2p8_mapq3qc_filtered_counts.rds" );

# gene features ("GN...") at least 3000 bp wide, narrowed by 1000 bp in
# total with the centre held fixed
gbfeatures = features %>%
    filter( substr(names, 0, 2) == "GN" ) %>%
    filter( width >= 3000 ) %>%
    anchor_center() %>%
    mutate( width = width - 1000 );

# recount reads per cell against the trimmed dREG gene bodies;
# one sparse column per cell, combined with cbind
gcounts = foreach(
    id = colnames(counts),
    .combine = "cbind",
    .multicombine = TRUE
) %dopar% {
    # reads belonging to this cell
    cell_reads = scGRO %>% filter(cellID == id);
    # reads per trimmed gene body, stored sparsely
    Matrix( countOverlaps( gbfeatures, cell_reads ), sparse=TRUE );
}
# columns = cells = cellIDs, rows = features
colnames(gcounts) = colnames(counts);
rownames(gcounts) = names(gbfeatures);
saveRDS( gcounts, file="../data/scGROv2p8_dREGgenesForRNAseqComp_TSSPAStruncated_filtered_counts.rds" );