In [1]:
from pathlib import Path
import pandas as pd
from pymongo import MongoClient
from IPython.display import display, Markdown

In [2]:
# Open the default MongoDB connection (MongoClient() with no arguments) and
# keep handles to the "sramongo" database and its "ncbi" collection, which
# the cells below query.
client = MongoClient()
db = client.get_database("sramongo")
ncbi = db.get_collection("ncbi")

# Pre-Alignment Workflow

In [3]:
display(Markdown("## Number of Samples Downloaded"))

# Path.read_text() opens, reads and closes the file in one call, so no file
# handle is left dangling the way a bare open(...).read() leaves one.
download_date = Path("../output/db_download.date").read_text()
print("Downloaded on:", download_date, sep="\t")

# Number of documents in the collection (reported below as SRXs).
# Collection.count() is deprecated (removed in PyMongo 4); count_documents({})
# is its replacement for counting every document.
n_srxs = ncbi.count_documents({})

# Number of entries across all "runs" arrays (reported below as SRRs).
# Counting happens server-side with $count instead of shipping one document
# per run to the client only to take len() of the list.
run_count = list(ncbi.aggregate([{"$unwind": {"path": "$runs"}}, {"$count": "n"}]))
# $count emits no document at all when the pipeline input is empty.
n_srrs = run_count[0]["n"] if run_count else 0

print(f"SRXs = {n_srxs:,}", f"SRRs = {n_srrs:,}", sep="\n")

## Number of Samples Downloaded

Downloaded on:	Fri Mar 13 16:21:26 EDT 2020

SRXs = 62,927
SRRs = 73,539


In [4]:
# Count the directory entries under each fastq-wf flag directory; every entry
# returned by iterdir() contributes one to the tally.
download_bad_dir = Path("../output/fastq-wf/download_bad/")
n_download_bad = sum(1 for _ in download_bad_dir.iterdir())
print(f"Download bad: {n_download_bad:,}")

abi_solid_dir = Path("../output/fastq-wf/abi_solid/")
n_solid = sum(1 for _ in abi_solid_dir.iterdir())
print(f"ABI Solid: {n_solid:,}")

Download bad: 107
ABI Solid: 4,923


In [5]:
display(Markdown("## Library Size (avg)"))
libsize = pd.read_parquet("../output/fastq-wf/libsize")

# Summarise the row-wise maximum of the R1/R2 library sizes, then format each
# summary statistic with thousands separators and two decimals for display.
(
    libsize[["libsize_R1", "libsize_R2"]]
    .max(axis=1)
    .describe()
    .map(lambda x: f"{x:,.2f}")
    .rename("library_size")
    .to_frame()
)

## Library Size (avg)

Unnamed: 0,library_size
count,53781.0
mean,14249560.28
std,22887570.28
min,1001.0
25%,731534.0
50%,6622368.0
75%,19525308.0
max,745596549.0


In [6]:
display(Markdown("## Library Layout"))
srx2srr = pd.read_csv("../output/srx2srr.csv")
# Bracket access selects the column explicitly rather than relying on
# attribute lookup on the DataFrame.
layout = pd.read_parquet("../output/fastq-wf/layout")["layout"]

# Attach a layout to each SRR, keep the first layout seen for each SRX, and
# tally how many SRXs fall under each layout label.
layout_cnts = (
    srx2srr.merge(layout, on="srr")
    .groupby("srx")["layout"].first()
    .value_counts()
    .rename_axis("Library Layout")
    .rename("# of SRXs")
)
display(layout_cnts.map(lambda x: f"{x:,}"))

# Everything that is not "PE" is reported as single-ended. errors="ignore"
# keeps the total from raising KeyError when no "PE" category is present.
n_single_ended = layout_cnts.drop("PE", errors="ignore").sum()
print(f"Single-Ended: {n_single_ended:,}")

## Library Layout

Library Layout
SE         22,466
PE         13,242
keep_R1     8,763
keep_R2       172
Name: # of SRXs, dtype: object

Single-Ended: 31,401


In [8]:
display(Markdown("## Top 10 Library Strategy Table"))

# value_counts() sorts by frequency (descending), so head(10) keeps the ten
# most common strategies -- matching the "Top 10" heading instead of listing
# every strategy present in the table.
(
    pd.read_parquet("../output/library_strategy-wf/sra_strategy_selection.parquet")
    .library_strategy.value_counts()
    .head(10)
    .rename("Number of Samples")
    .rename_axis("Library Strategy")
    .map(lambda x: f"{x:,}")
    .to_frame()
)

## Top 10 Library Strategy Table

Unnamed: 0_level_0,Number of Samples
Library Strategy,Unnamed: 1_level_1
RNA-Seq,24270
OTHER,12111
WGS,8472
EST,7324
ChIP-Seq,6556
miRNA-Seq,756
ncRNA-Seq,733
AMPLICON,713
RIP-Seq,438
ATAC-seq,405
