In [1]:
# %load ../start2.py
# Imports
import os
import sys

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../../lib/python')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Setup notebook
# `Nb.setup_notebook()` returns the notebook configuration object; its
# `.cache` attribute is used below as the joblib cache directory.
nbconfig = Nb.setup_notebook()

# Turn on cache
# `memory` is a joblib Memory object backed by `nbconfig.cache`; verbose=0
# silences joblib's logging.
# NOTE(review): newer joblib releases deprecate the `cachedir` keyword in
# favour of `location` -- confirm against the installed joblib version.
from joblib import Memory
memory = Memory(cachedir=nbconfig.cache, verbose=0)

# Connect to data store
# Opened with mode='r', i.e. read-only; later cells read tables from `store`
# by key.
store = pd.HDFStore('../../output/sra.h5', mode='r')


last updated: 2017-11-12 
Git hash: 9f6e655489b6172aa3ba3b86f86205f4af524129


In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient

# File holding the MongoDB host name. Expressed relative to the notebook,
# like the other project paths used here ('../../output/sra.h5',
# '../../output/modENCODE_sampletable.tsv'), instead of an absolute path inside
# one user's home directory, so the notebook also runs from other checkouts.
# NOTE(review): assumes '../../output' resolves to the same project output
# directory as the previous absolute path -- confirm.
MONGODB_HOST_FILE = '../../output/.mongodb_host'
MONGODB_PORT = 27022

# The host file contains just the host name; strip the trailing newline.
with open(MONGODB_HOST_FILE, 'r') as fh:
    host = fh.read().strip()

# Handles used by the rest of the notebook: the 'sra2' database and its
# 'remap' and 'ncbi' collections.
client = MongoClient(host=host, port=MONGODB_PORT)
db = client['sra2']
remap = db['remap']
ncbi = db['ncbi']

## How much data have I processed?

In [3]:
# Load the FASTQ summary table from the HDF5 store and work on a copy.
df_store = store['prealn/workflow/fastq'].copy()

# Per-row product of library size and average length for each mate
# (presumably bases per run -- the column names suggest reads x read length).
r1 = df_store['libsize_R1'].mul(df_store['avgLen_R1'])
r2 = df_store['libsize_R2'].mul(df_store['avgLen_R2'])

# Row-wise maximum of the two mates' library sizes.
reads = df_store.loc[:, ['libsize_R1', 'libsize_R2']].max(axis='columns')

In [4]:
# Grand totals over both mates (bases) and over all rows (reads).
total_bases = r1.sum() + r2.sum()
total_reads = reads.sum()

# Report in trillions of bases and billions of reads.
print('I have completed processing of {:,.0f} trillion bases.'.format(total_bases / 10**12))
print('I have completed processing of {:,.0f} billion reads.'.format(total_reads / 10**9))

I have completed processing of 34 trillion bases.
I have completed processing of 336 billion reads.


## How much data does the database report?

In [5]:
# Aggregation pipeline producing one record per run that has a base count.
run_size_pipeline = [
    # Emit one document per element of the `sra.run` array.
    {'$unwind': '$sra.run'},
    # Keep only runs that carry an `nbases` field.
    {'$match': {'sra.run.nbases': {'$exists': 1}}},
    # Flatten to srx / srr / nbases / nreads; `nreads` is the larger of the
    # two per-mate read counts.
    {'$project': {
        '_id': 0,
        'srx': '$_id',
        'srr': '$sra.run.run_id',
        'nbases': '$sra.run.nbases',
        'nreads': {'$max': ['$sra.run.read_count_r1', '$sra.run.read_count_r2']},
    }},
]

# Materialise the cursor and index the frame by (srx, srr).
df = pd.DataFrame(list(ncbi.aggregate(run_size_pipeline))).set_index(['srx', 'srr'])

In [6]:
# Totals over every run returned by the database query, in trillions of bases
# and billions of reads.
print('There are {:,.0f} trillion bases in the database.'.format(df.nbases.sum() / 10**12))
# This figure is the sum of `nreads`, so it is labelled as reads (the message
# previously said "bases").
print('There are {:,.0f} billion reads in the database'.format(df.nreads.sum() / 10**9))

There are 52 trillion bases in the database.
There are 483 billion reads in the database


## How much modENCODE data was there?

In [7]:
# modENCODE sample table; the values of its `srx` column are matched against
# the collection's `_id` below.
mod = pd.read_csv('../../output/modENCODE_sampletable.tsv', sep='\t')
mod_srx = mod.srx.tolist()

df_mod = pd.DataFrame(list(ncbi.aggregate([
    # Restrict to the modENCODE documents *before* unwinding. `$unwind` leaves
    # `_id` untouched, so the result is the same as filtering afterwards, but
    # a leading `$match` on `_id` can use the index and only the selected
    # documents get unwound.
    {
        '$match': {
            '_id': {'$in': mod_srx}
        }
    },
    # One document per element of the `sra.run` array.
    {'$unwind': '$sra.run'},
    # This filter must stay after the unwind so it is applied per run.
    {
        '$match': {
            'sra.run.nbases': {'$exists': 1},
        }
    },
    # Flatten to srx / srr / nbases / nreads; `nreads` is the larger of the
    # two per-mate read counts.
    {
        '$project': {
            '_id': 0,
            'srx': '$_id',
            'srr': '$sra.run.run_id',
            'nbases': '$sra.run.nbases',
            'nreads': {
                '$max': ['$sra.run.read_count_r1', '$sra.run.read_count_r2']
            }
        }
    }
]))).set_index(['srx', 'srr'])

In [8]:
# Ratio of the database-wide totals to the modENCODE-only totals.
fold_bases = df.nbases.sum() / df_mod.nbases.sum()
fold_reads = df.nreads.sum() / df_mod.nreads.sum()

print('There is {:.0f} times more bases than modEncode'.format(fold_bases))
print('There is {:.0f} times more reads than modEncode'.format(fold_reads))

There is 21 times more bases than modEncode
There is 16 times more reads than modEncode


## What is the technology breakdown?

In [17]:
# Single-stage pipeline: project each document down to its id (as `srx`) and
# the experiment's library strategy.
tech_pipeline = [
    {'$project': {
        '_id': 0,
        'srx': '$_id',
        'library_strategy': '$sra.experiment.library_strategy',
    }},
]

# Materialise the cursor and index the frame by srx.
df_tech = pd.DataFrame(list(ncbi.aggregate(tech_pipeline))).set_index('srx')

In [24]:
# Count rows per library strategy, then render each count with thousands
# separators and show the result as a one-column frame.
strategy_counts = df_tech['library_strategy'].value_counts()
strategy_counts.map('{:,}'.format).to_frame()

Unnamed: 0,library_strategy
RNA-Seq,11804
OTHER,6822
WGS,3737
ChIP-Seq,3474
AMPLICON,441
EST,408
miRNA-Seq,275
ncRNA-Seq,231
MNase-Seq,151
RIP-Seq,121
