In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark (prints the last-updated date and git hash below the cell)
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension and point its cache at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to the front of the import path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline and use seaborn's large 'poster' context
%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation in NumPy output (5 digits of precision)
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-07-18 
Git hash: 300eb006e78f07a2020c11b74cd86d944a6f38d8


In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient

# File whose stripped contents are used as the MongoDB host name. The default
# is the original project location; set NCBI_REMAP_MONGODB_HOST_FILE to point
# somewhere else so the notebook is not tied to one user's home directory.
# (`os` is imported in the setup cell at the top of the notebook.)
host_file = os.environ.get(
    'NCBI_REMAP_MONGODB_HOST_FILE',
    '/home/fearjm/Projects/ncbi_remap/output/.mongodb_host',
)
with open(host_file, 'r') as fh:
    host = fh.read().strip()

# Handles used by the cells below: client -> `sra2` database -> `remap` collection.
client = MongoClient(host=host, port=27022)
db = client['sra2']
remap = db['remap']

In [10]:
# Number of SRRs: unwind `runs` so every run becomes its own document, then
# count the resulting documents.
# `$count` emits no document at all when nothing reaches it, so fall back to
# an explicit zero instead of letting `.next()` raise StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} SRRs in the database.'.format(**cnt))

35,382 SRRs in the database.


In [3]:
# Number of SRRs not yet downloaded: runs whose `pre_aln_flags` is exactly
# the empty list.
# `$count` emits no document at all when nothing matches, so fall back to an
# explicit zero instead of letting `.next()` raise StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': []}},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} SRRs have not been downloaded.'.format(**cnt))

15,677 SRRs have not been downloaded.


In [4]:
# Number of SRRs downloaded but not done with pre-alignment: runs whose
# `pre_aln_flags` is neither empty nor contains 'complete'.
# `$count` emits no document at all when nothing matches, so fall back to an
# explicit zero instead of letting `.next()` raise StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                '$and': [
                    {'runs.pre_aln_flags': {'$ne': 'complete'}},
                    {'runs.pre_aln_flags': {'$ne': []}},
                ],
            }
        },
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} SRRs have been downloaded, but have not finished pre-alignment.'.format(**cnt))

3,762 SRRs have been downloaded, but have not finished pre-alignment.


In [5]:
# Number of SRRs that passed pre-alignment: runs whose `pre_aln_flags`
# contains 'complete'.
# `$count` emits no document at all when nothing matches, so fall back to an
# explicit zero instead of letting `.next()` raise StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'complete'}},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

# Message fixed: the original printed a duplicated "have have".
print('{cnt:,} SRRs have finished pre-alignment.'.format(**cnt))

15,943 SRRs have have finished pre-alignment.


In [6]:
# Number of SRRs that passed alignment: runs whose `aln_flags` contains
# 'complete'.
# `$count` emits no document at all when nothing matches, so fall back to an
# explicit zero instead of letting `.next()` raise StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.aln_flags': 'complete'}},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

# Message fixed: the original printed a duplicated "have have".
print('{cnt:,} SRRs have finished alignment.'.format(**cnt))

10,761 SRRs have have finished alignment.


In [7]:
# Same-strand runs that passed pre-alignment, counted two ways: per run
# (SRR) and per parent document `_id` (reported as SRX).
# `$count` emits no document at all when nothing matches, so both queries
# fall back to an explicit zero instead of letting `.next()` raise
# StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'complete'}},
        {'$match': {'runs.pre_aln_flags': 'same_strand'}},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} same stranded SRRs that have finished pre-alignment.'.format(**cnt))

cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                '$and': [
                    {'runs.pre_aln_flags': 'complete'},
                    {'runs.pre_aln_flags': 'same_strand'},
                ]
            }
        },
        # Collapse matching runs back to one row per document before counting.
        {
            '$group': {
                '_id': '$_id',
                'cnt': {'$sum': 1}
            }
        },
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} same stranded SRXs that have finished pre-alignment.'.format(**cnt))

2,546 same stranded SRRs that have finished pre-alignment.
2,342 same stranded SRXs that have finished pre-alignment.


In [8]:
# Opposite-strand runs that passed pre-alignment, counted two ways: per run
# (SRR) and per parent document `_id` (reported as SRX).
# `$count` emits no document at all when nothing matches, so both queries
# fall back to an explicit zero instead of letting `.next()` raise
# StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'complete'}},
        {'$match': {'runs.pre_aln_flags': 'opposite_strand'}},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} opposite stranded SRRs that have finished pre-alignment.'.format(**cnt))

cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                '$and': [
                    {'runs.pre_aln_flags': 'complete'},
                    {'runs.pre_aln_flags': 'opposite_strand'},
                ]
            }
        },
        # Collapse matching runs back to one row per document before counting.
        {
            '$group': {
                '_id': '$_id',
                'cnt': {'$sum': 1}
            }
        },
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} opposite stranded SRXs that have finished pre-alignment.'.format(**cnt))

2,802 opposite stranded SRRs that have finished pre-alignment.
2,573 opposite stranded SRXs that have finished pre-alignment.


In [9]:
# Unstranded runs that passed pre-alignment, counted two ways: per run (SRR)
# and per parent document `_id` (reported as SRX).
# `$count` emits no document at all when nothing matches, so both queries
# fall back to an explicit zero instead of letting `.next()` raise
# StopIteration.
cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'complete'}},
        {'$match': {'runs.pre_aln_flags': 'unstranded'}},
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} unstranded SRRs that have finished pre-alignment.'.format(**cnt))

cnt = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                '$and': [
                    {'runs.pre_aln_flags': 'complete'},
                    {'runs.pre_aln_flags': 'unstranded'},
                ]
            }
        },
        # Collapse matching runs back to one row per document before counting.
        {
            '$group': {
                '_id': '$_id',
                'cnt': {'$sum': 1}
            }
        },
        {'$count': 'cnt'},
    ]),
    {'cnt': 0},
)

print('{cnt:,} unstranded SRXs that have finished pre-alignment.'.format(**cnt))

10,595 unstranded SRRs that have finished pre-alignment.
6,848 unstranded SRXs that have finished pre-alignment.


In [15]:
# SRRs that finished pre-alignment (15,943) minus SRRs that finished
# alignment (10,761). The literals match the counts printed earlier in this
# notebook and are not recomputed here, so they go stale if the database changes.
15943 - 10761

5182