# Filter reads from pre-processed scGRO-seq datasets
Some PCR duplicates and low-quality UMIs may still be present in our data, so we will perform detailed checks throughout this notebook to identify and mark problematic reads.

In [None]:
# Point R at the conda environment's package library, then echo the
# resulting library search path.
.libPaths(new = "/home/mahat/.conda/envs/r422/lib/R/library")
.libPaths()

In [None]:
suppressMessages({
    library(tidyverse)
    library(plyranges)
    library(GenomicFiles)
    library(GenomicAlignments)
    library(rtracklayer)
    library(dtplyr)
    library(data.table)
    library(parallel)
    library(doParallel)
    library(ggridges)
});

In [None]:
# Parallelism: data.table gets 15 threads and doParallel is registered with 15.
setDTthreads(threads = 15)
registerDoParallel(15)

# Notebook display defaults: plot width 6, height 8, rendered as SVG.
options(
    repr.plot.width = 6,
    repr.plot.height = 8,
    jupyter.plot_mimetypes = "image/svg+xml"
)

# Global ggplot2 theme: theme_classic() with bold black text, a centred title,
# and a grey panel border drawn in place of the axis lines.
theme_set(
    theme_classic() +
        theme(
            plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
            axis.title.x = element_text(color = "black", size = 14, face = "bold"),
            axis.title.y = element_text(color = "black", size = 14, face = "bold"),
            axis.text = element_text(color = "black", size = 12, face = "bold"),
            # axis.ticks = element_blank()
            axis.line = element_blank(),
            panel.border = element_rect(colour = "grey", fill = NA, linewidth = 1)
        )
)

In [None]:
# Load the consolidated scGRO-seq reads and report how many there are.
scGRO <- readRDS("/net/bmc-lab2/data/lab/scgroseq/group/notebooks/data/scGROv2p8_consolidated.rds")
prettyNum(length(scGRO), big.mark = ",")
# v2p8: 26,453,177

In [None]:
# Display the freshly loaded object for inspection.
scGRO

In [None]:
# Tabulate the number of reads for every Exp / Plate / Cell combination
# and display the table.
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = n()) %>%
    as.data.frame()
out

In [None]:
# Load the groHMM-extended genes and enhancers and index them by BED name.
features <- read_bed("/net/bmc-lab2/data/lab/scgroseq/group/notebooks/data/groHMM_dREG_refinedFeatures_mES_mm10_OSNcustomEnhancers_SEs.bed")
names(features) <- features$name

# Gene features are the ones whose name begins with "GN". The antisense set
# is the same ranges with the strand flipped ("+" becomes "-", anything
# else becomes "+").
genef <- features[substr(names(features), 1, 2) == "GN"]
antif <- mutate(genef, strand = ifelse(strand == "+", "-", "+"))

In [None]:
# We noticed significant over-representation of small non-coding RNA,
# presumably from non-specific RT/amplification. Transcripts whose name does
# not start with "NM" and whose width is at most 200 are used as a blacklist:
# any read within 200 bp of one (strand ignored) gets miRQC = FALSE.
# The .Rdata file is presumably what provides `ref_transcripts` -- confirm.
load("/net/bmc-lab2/data/lab/scgroseq/group/notebooks/data/UCSC_mm10_ncbiRefSeq.Rdata")
miniRNA <- ref_transcripts %>%
    filter(substr(tx_name, 1, 2) != "NM") %>%
    filter(width <= 200)

# Every read starts out as passing.
scGRO$miRQC <- TRUE

# Use the queryHits() accessor rather than reading the Hits slot directly,
# and spell out TRUE (T is an ordinary, reassignable variable).
hits <- findOverlaps(scGRO, miniRNA, ignore.strand = TRUE, maxgap = 200)
scGRO$miRQC[queryHits(hits)] <- FALSE

In [None]:
# Flag (miRQC = FALSE) reads overlapping snor..[digit] and Snhg..[digit]
# genes, which are snoRNA and snoRNA-harboring genes. Reads are only marked
# here, not dropped.
snoRNA <- features[grepl(pattern = "GN-Sn[oh][rg]*\\d", names(features))]
names(snoRNA)

# Use the queryHits() accessor rather than reading the Hits slot directly,
# and spell out TRUE (T is an ordinary, reassignable variable).
hits <- findOverlaps(scGRO, snoRNA, ignore.strand = TRUE)
scGRO$miRQC[queryHits(hits)] <- FALSE

# Running total of reads flagged by the miniRNA / snoRNA blacklists.
sum(!scGRO$miRQC) %>% prettyNum(big.mark = ",")

In [None]:
# Order reads so that those inside a feature come first and, within that,
# the highest mapq comes first. The later duplicated() calls keep the first
# row of each group, so this ordering decides which read survives.
scGRO$inFeature <- overlapsAny(scGRO, features)
scGRO <- scGRO[order(-scGRO$inFeature, -scGRO$mapq)] %>%
    as.data.frame() %>%
    mutate(
        # 50 bp "bin position" to account for soft clipping during alignment
        PosBinOne = round(PolPos / 50),
        # Second set of bins, offset by 25 bp, so neighboring reads that
        # straddle a bin edge in the first set still share a bin here
        PosBinTwo = round((PolPos + 25) / 50)
    ) %>%
    # data.table is used from here on for faster manipulation
    data.table()

1. umitools dedup can miss some duplicated UMIs at the same 3' end position, so we will re-apply a UMI dedup step.
2. Cell and plate barcodes can be mis-assigned during sequencing, creating duplicates between samples. We mark them here.

In [None]:
# Keep at most one polymerase per 50 bp bin within a cell (extra reads are
# likely UMI sequencing errors). A row is dropped if it is a repeat in either
# of the two bin sets.
scGRO <- scGRO[
    !(
        duplicated(scGRO, by = c("Exp", "Plate", "Cell", "seqnames", "PosBinOne")) |
            duplicated(scGRO, by = c("Exp", "Plate", "Cell", "seqnames", "PosBinTwo"))
    )
]

prettyNum(nrow(scGRO), big.mark = ",")
# v2p8 20bp stranded:   21,234,867
# v2p8 20bp unstranded: 21,206,317
# v2p8 50bp unstranded: 21,114,657

### Check unique molecular identifiers (UMIs)
Much of our analysis relies on de-duplicating reads to accurately quantify the small number of molecules captured from an individual cell.
Thus, we rely on UMIs to distinguish similar molecules from duplicates created during PCR amplification.
Here we will analyze the randomness of UMI sequences and filter out any unexpected artifactual sequences.

In [None]:
# Frequency of every UMI sequence among the remaining reads, most frequent
# first, with the UMI also stored as a character column; written out as CSV.
UMIfreq <- as.data.frame(table(scGRO$UMI)) %>%
    mutate(UMI = as.character(Var1)) %>%
    arrange(desc(Freq))
fwrite(UMIfreq, file = "../data/scGROv2p8_dedup_UMIfrequencies.csv")

In [None]:
# Bar chart of the 40 most frequent 5-character UMIs, ordered by frequency.
UMIfreq %>%
    filter(nchar(UMI) == 5) %>%
    head(n = 40) %>%
    ggplot(aes(x = reorder(UMI, -Freq), y = Freq)) +
    geom_bar(stat = "identity") +
    xlab("UMI") +
    theme(axis.text.x = element_text(angle = 90))

In [None]:
# The three most frequent 5-character UMIs are treated as over-represented
# outliers (still a one-column data frame at this point).
outlier_UMIs <- UMIfreq %>%
    filter(nchar(UMI) == 5) %>%
    head(n = 3) %>%
    select(UMI)

In [None]:
# Reduce the one-column table to a plain vector and show it.
outlier_UMIs <- outlier_UMIs$UMI
outlier_UMIs

# Mark degenerate UMI sequences over-represented in the library
scGRO <- scGRO[, umiQC := !(UMI %in% outlier_UMIs)]

# also mark alignments that have the same Cell barcode, position, and UMI (mispriming?)
# Only rows still passing umiQC are regrouped; a row stays TRUE only when it
# is alone in its (Cell, seqnames, strand, PolPos, UMI) group.
scGRO <- scGRO[
    umiQC == TRUE,
    umiQC := (.N == 1),
    by = c("Cell", "seqnames", "strand", "PolPos", "UMI")
]

## Mark duplicate cell and plate barcodes:
If the same position (50 bp bin) and UMI are found under more than one cell barcode or plate barcode, the reads are marked as duplicates.

(`.N` is a special data.table variable that holds the number of rows in the current group.)


In [None]:
# plateQC: starts FALSE for every read. Among reads passing umiQC it becomes
# TRUE only when no other read with the same Exp, Cell barcode, seqnames,
# strand, position bin and UMI exists (Plate is left out of the key). The
# check runs once per bin set; the second pass only revisits reads that
# passed the first.
scGRO[, plateQC := FALSE]
scGRO[
    umiQC == TRUE,
    plateQC := (.N == 1),
    by = c("Exp", "Cell", "seqnames", "strand", "PosBinOne", "UMI")
]
scGRO[
    umiQC & plateQC,
    plateQC := (.N == 1),
    by = c("Exp", "Cell", "seqnames", "strand", "PosBinTwo", "UMI")
]

# cellQC: the same procedure with Plate in the key and Cell left out, so it
# flags reads shared between cell barcodes of one plate.
scGRO[, cellQC := FALSE]
scGRO[
    umiQC == TRUE,
    cellQC := (.N == 1),
    by = c("Exp", "Plate", "seqnames", "strand", "PosBinOne", "UMI")
]
scGRO[
    umiQC & cellQC,
    cellQC := (.N == 1),
    by = c("Exp", "Plate", "seqnames", "strand", "PosBinTwo", "UMI")
]

In [None]:
# Check final (UMI dedup'd) counts within cells -- we know there cannot be more than 2 polymerases
# within 20bp footprint of Pol2. Mark reads violating this expectation.
# countQC starts FALSE and is only evaluated for reads passing umiQC,
# plateQC and cellQC.
# NOTE(review): if the earlier per-cell bin de-duplication has run, each
# (Exp, Plate, Cell, seqnames, bin) group holds a single read, so the two
# `.N <= 2` checks below cannot fail -- confirm whether that is intended.
scGRO[, countQC := FALSE]
scGRO[
    umiQC & plateQC & cellQC,
    countQC := (.N <= 2),
    by = c("Exp", "Plate", "Cell", "seqnames", "PosBinOne")
]
scGRO[
    countQC == TRUE,
    countQC := (.N <= 2),
    by = c("Exp", "Plate", "Cell", "seqnames", "PosBinTwo")
]

# Finally, check counts per plate per bp and discard positions in >25% of cells
# NOTE(review): the cutoff of 12 presumably corresponds to 25% of the cells
# on a plate -- confirm against the plate layout.
scGRO[
    countQC == TRUE,
    countQC := (.N < 12),
    by = c("Exp", "Plate", "seqnames", "strand", "PolPos")
]

### Show counts from each QC metric we've developed so far, alone and in combination.

In [None]:
# Total number of reads, followed by the number passing each QC flag alone.
nrow(scGRO)
scGRO[, sum(umiQC)]
scGRO[, sum(plateQC)]
scGRO[, sum(cellQC)]
scGRO[, sum(countQC)]

In [None]:
# Reads remaining as the QC flags are combined step by step.
scGRO[, sum(plateQC & umiQC)]
scGRO[, sum(plateQC & umiQC & countQC)]
scGRO[, sum(plateQC & umiQC & countQC & mapq >= 3)]
scGRO[, sum(plateQC & cellQC & umiQC & countQC & mapq >= 3)]

# Values recorded from earlier dataset versions, in the order printed above.
# v2p5:
# 15966096
# 15749017
# 13907454
# 13708770

# v3p1:
# 5877226
# 5695598
# 4832675
# 4765585

# v2p7
# 20296615
# 20296615
# 17965272
# 17708488

# v2p8 - with Exp236 c05-c08
# 21481792
# 21481792
# 21481792
# 21079904

In [None]:
# Count reads per cell (Exp / Plate / Cell) among reads passing all QC flags
# with mapq >= 3, keep cells with at least 10,000 reads, then count how many
# such cells carry each cell barcode. The 15 most frequent barcodes are shown.
scGRO[plateQC & cellQC & umiQC & countQC & mapq >= 3, .N, by = c("Exp", "Plate", "Cell")] %>%
    .[N >= 10000, .N, by = "Cell"] %>%
    as.data.frame() %>%
    arrange(desc(N)) %>%
    head(n = 15)

# based on the table printed by this cell, we blacklist TTCTTCTTCC

## Plot summary statistics from individual cells across different batches.

In [None]:
# Per-cell read counts (all QC flags, mapq >= 3) for cells with at least
# 10,000 reads, drawn as one binned ridge per cell barcode.
out <- scGRO[
    plateQC & cellQC & umiQC & countQC & mapq >= 3,
    .N,
    by = c("Exp", "Plate", "Cell")
][N >= 10000]
out <- ggplot(out, aes(x = N, y = Cell, fill = Cell)) +
    geom_density_ridges(breaks = (0:20) * 2500, stat = "binline", alpha = 0.5) +
    xlab("Read count") +
    ylab("Cell barcode") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_mapq3qc_CellBC_counts.pdf", plot = out, width = 6, height = 12)
out

In [None]:
# Read-length (width) distribution per Exp / Plate for reads passing plateQC,
# umiQC and countQC with mapq >= 3.
out <- scGRO[plateQC & umiQC & countQC & mapq >= 3, width, by = c("Exp", "Plate")]
out <- ggplot(out, aes(x = width, y = paste(Exp, Plate), fill = paste(Exp, Plate))) +
    geom_density_ridges(breaks = 0:100, stat = "binline", alpha = 0.5) +
    xlab("Read length") +
    ylab("Plate") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_mapq3qc_ReadLen.pdf", plot = out, width = 7, height = 10)
out

In [None]:
# Histogram of reads per cell that pass all four QC flags, filled by experiment.
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = sum(plateQC & cellQC & umiQC & countQC)) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Count, fill = Exp)) +
    geom_histogram(breaks = (0:30) * 500) +
    xlab("Reads per cell") +
    ylab("Number of cells") +
    ylim(0, 300) +
    theme(
        legend.position = c(0.8, 0.6),
        legend.key.size = unit(6, "mm"),
        legend.background = element_blank(),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10)
    )

ggsave(filename = "../plots/scGROv2p8_qc_ReadsPerCell_Stacked.pdf", plot = out, width = 4, height = 4)
out

In [None]:
# Reads per cell passing plateQC, umiQC and countQC, one binned ridge per
# Exp / Plate.
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = sum(plateQC & umiQC & countQC)) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:30) * 500, stat = "binline", alpha = 0.5) +
    xlab("Total reads per cell") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_qc_ReadsPerCell.pdf", plot = out, width = 7, height = 10)
out

In [None]:
# Percentage of each cell's QC-passing reads (mapq >= 3) that map to chrM,
# one binned ridge per Exp / Plate; bins cover 0-5% in 0.1% steps.
out <- scGRO %>%
    filter(plateQC & umiQC & countQC & mapq >= 3) %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = 100 * mean(seqnames == "chrM")) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:50) / 10, stat = "binline", alpha = 0.5) +
    xlab("Percent chrM reads per cell") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_mapq3qc_chrMReadsPerCell.pdf", plot = out, width = 7, height = 10)
out

In [None]:
# Fraction of each cell's reads with countQC == FALSE, one binned ridge per
# Exp / Plate. The axis label now states the 50 bp bin width used when the
# position bins were computed (it previously said 20 bp).
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = mean(!countQC)) %>%
    as.data.frame() %>%
    ggplot(aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(stat = "binline", alpha = 0.5) +
    xlab(">2 UMIs per 50 bp per cell") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(out, filename = "../plots/scGROv2p8_UMI_Duplicates.pdf", width = 7, height = 10)
out

In [None]:
# Fraction of each cell's reads with cellQC == FALSE, one binned ridge per
# Exp / Plate. The axis label now states the 50 bp bin width used when the
# position bins were computed (it previously said 20 bp).
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = mean(!cellQC)) %>%
    as.data.frame() %>%
    ggplot(aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(stat = "binline", alpha = 0.5) +
    xlab("Duplicate cell barcodes per 50 bp bin & UMI") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(out, filename = "../plots/scGROv2p8_CellBC_Duplicates.pdf", width = 7, height = 10)
out

In [None]:
# Fraction of each cell's reads with plateQC == FALSE, one binned ridge per
# Exp / Plate. The axis label now states the 50 bp bin width used when the
# position bins were computed (it previously said 20 bp).
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = mean(!plateQC)) %>%
    as.data.frame() %>%
    ggplot(aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(stat = "binline", alpha = 0.5) +
    xlab("Duplicate plate barcodes (same 50 bp bin & UMI)") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(out, filename = "../plots/scGROv2p8_PlateBC_Duplicates.pdf", width = 7, height = 10)
out

In [None]:
# Fraction of each cell's reads with umiQC == FALSE, one binned ridge per
# Exp / Plate.
out <- scGRO %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = mean(!umiQC)) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(stat = "binline", alpha = 0.5) +
    xlab("GGGGG UMIs") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_DegenerateUMIs.pdf", plot = out, width = 7, height = 10)
out

In [None]:
# check final QC'd UMI counts
# Recompute the UMI frequency table from reads with countQC == TRUE only,
# then redraw the bar chart of the 40 most frequent 5-character UMIs.
UMIfreq <- as.data.frame(table(scGRO[countQC == TRUE, UMI])) %>%
    mutate(UMI = as.character(Var1)) %>%
    arrange(desc(Freq))
UMIfreq %>%
    filter(nchar(UMI) == 5) %>%
    head(n = 40) %>%
    ggplot(aes(x = reorder(UMI, -Freq), y = Freq)) +
    geom_bar(stat = "identity") +
    xlab("UMI") +
    theme(axis.text.x = element_text(angle = 90))

In [None]:
# Drop the helper columns (both bin columns and inFeature), then turn the
# table back into a sorted GRanges.
scGRO <- scGRO %>%
    mutate(PosBinOne = NULL, PosBinTwo = NULL, inFeature = NULL) %>%
    as.data.frame() %>%
    GRanges() %>%
    sort()


In [None]:
# Display the final annotated object, then write it to disk.
# NOTE(review): this relative path looks like it may resolve to the same file
# that was read at the start of the notebook -- confirm that overwriting the
# input with the de-duplicated, QC-annotated reads is intended.
scGRO
saveRDS(scGRO, file="../data/scGROv2p8_consolidated.rds");

In [None]:
# Flag reads overlapping any annotated feature, and reads overlapping the
# strand-flipped gene set (antisense to a gene).
scGRO$inFeature <- overlapsAny(scGRO, features)
scGRO$inAntisense <- overlapsAny(scGRO, antif)

In [None]:
# Number of QC-passing reads (all four QC flags, mapq >= 3) per cell that
# fall inside an annotated feature, one binned ridge per Exp / Plate.
# The inFeature condition is required here: without it the plot counts every
# QC-passing read, contradicting its label and file name.
out <- scGRO %>%
    filter(inFeature & cellQC & plateQC & umiQC & countQC & mapq >= 3) %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = n()) %>%
    as.data.frame() %>%
    ggplot(aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:50) * 100, stat = "binline", alpha = 0.5) +
    xlab("Reads in features") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(out, filename = "../plots/scGROv2p8_mapq3qc_ReadsInFeatures.pdf", width = 7, height = 10)
out

In [None]:
# Percentage of each cell's QC-passing reads (mapq >= 3) that overlap a
# feature, one binned ridge per Exp / Plate; bins are 5 percentage points wide.
out <- scGRO %>%
    filter(plateQC & umiQC & countQC & mapq >= 3) %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Percent = 100 * mean(inFeature)) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Percent, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:20) * 5, stat = "binline", alpha = 0.5) +
    xlab("Percent reads in features") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_mapq3qc_PercentReadsInFeatures.pdf", plot = out, width = 7, height = 10)
out

In [None]:
# Number of QC-passing reads (mapq >= 3) per cell that lie antisense to a
# gene feature, one binned ridge per Exp / Plate.
out <- scGRO %>%
    filter(plateQC & umiQC & countQC & mapq >= 3) %>%
    filter(inAntisense) %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = n()) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:50) * 200, stat = "binline", alpha = 0.5) +
    xlab("Reads in non-genic strand") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_mapq3qc_ReadsInAntisense.pdf", plot = out, width = 7, height = 10)
out

In [None]:
# Percentage of each cell's QC-passing reads (mapq >= 3) that lie antisense
# to a gene feature, one binned ridge per Exp / Plate.
# mean(inAntisense) is a 0-1 fraction, so it is scaled by 100 to match the
# "Percent" axis label, and the bins span 0-100% in 5-point steps like the
# percent-in-features plot (the previous 0-5 range fit neither scale).
out <- scGRO %>%
    filter(plateQC & umiQC & countQC & mapq >= 3) %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = 100 * mean(inAntisense)) %>%
    as.data.frame() %>%
    ggplot(aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:20) * 5, stat = "binline", alpha = 0.5) +
    xlab("Percent reads antisense to gene features") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(out, filename = "../plots/scGROv2p8_mapq3qc_PercentReadsInAntisense.pdf", width = 7, height = 10)
out

In [None]:
# Number of QC-passing reads (mapq >= 3) per cell that fall outside every
# annotated feature, one binned ridge per Exp / Plate.
out <- scGRO %>%
    filter((!inFeature) & plateQC & umiQC & countQC & mapq >= 3) %>%
    group_by(Exp, Plate, Cell) %>%
    summarise(Count = n()) %>%
    as.data.frame()
out <- ggplot(out, aes(x = Count, y = paste(Exp, Plate), fill = Exp)) +
    geom_density_ridges(breaks = (0:40) * 100, stat = "binline", alpha = 0.5) +
    xlab("Reads not in features") +
    ylab("Density") +
    theme(
        legend.position = "none",
        panel.spacing = unit(0.1, "lines")
    )

ggsave(filename = "../plots/scGROv2p8_mapq3qc_ReadsNotInFeatures.pdf", plot = out, width = 7, height = 10)
out