# Counts Table pre-pre_aln

In [1]:
# %load ../start.py
# Notebook bootstrap: load IPython extensions, put the project library on the
# import path, import the common analysis stack, and set display defaults.

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark (stamps the notebook with the last-updated date and git hash)
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension and point its cache directory at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path (inserted at position 0 so it is searched first)
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline and use seaborn's 'poster' plotting context
%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation when printing numpy arrays (5 digits of precision)
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-07-21 
Git hash: 667797bb512f88c9803b2342738a55cbf0c9297a


I envision Table S1 being a summary of the pre-pre-alignment workflow.

Workflow:
* Download FASTQ file
* Determine library layout (single-end, SE, vs. paired-end, PE)
* Calculate md5sum of FASTQ(s)
* Count the total number of reads for FASTQ(s)
* Calculate the average read length for FASTQ(s)

In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient

# The MongoDB host is read from a file outside the notebook; surrounding
# whitespace (e.g. a trailing newline) is stripped before use.
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as host_file:
    host = host_file.read().strip()

# Connect, then drill down to the `remap` collection of the `sra2` database.
client = MongoClient(host=host, port=27022)
db = client['sra2']
remap = db['remap']

In [7]:
# Total number of runs.
# `$unwind` emits one document per entry of `runs`, so the count below is a
# count of runs rather than of top-level documents.
tot = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {'$count': 'cnt'},
    ]),
    # `$count` emits no document when nothing reaches it; fall back to 0
    # instead of letting the cursor raise StopIteration.
    {'cnt': 0},
)['cnt']

print('There are {:,} runs in the current database.'.format(tot))

There are 35,382 runs in the current database.


In [4]:
# Number of runs with pre_aln_flags.
# `$exists` keeps every unwound run that has a `pre_aln_flags` field,
# whatever value it holds.
preDone = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                'runs.pre_aln_flags': {'$exists': 1},
            }
        },
        {'$count': 'cnt'},
    ]),
    # `$count` emits no document when no run matches; fall back to 0
    # instead of letting the cursor raise StopIteration.
    {'cnt': 0},
)['cnt']

print('There are {:,} runs that have been downloaded and examine.'.format(preDone))

There are 35,382 runs that have been downloaded and examine.


In [8]:
# Number of runs with pre_aln_flags = download_bad, i.e. runs that could not
# be downloaded.
downloadFail = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                'runs.pre_aln_flags': 'download_bad'
            }
        },
        {'$count': 'cnt'},
    ]),
    # A clean database has no failed downloads, in which case `$count` emits
    # no document at all; report 0 rather than raising StopIteration.
    {'cnt': 0},
)['cnt']

print('There are {:,} runs I could not download.'.format(downloadFail))

There are 49 runs I could not download.


In [11]:
# Number of runs with pre_aln_flags = SE
se = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                'runs.pre_aln_flags': 'SE'
            }
        },
        {'$count': 'cnt'},
    ]),
    # `$count` emits no document when no run matches; fall back to 0
    # instead of letting the cursor raise StopIteration.
    {'cnt': 0},
)['cnt']

print('There are {:,} SE runs.'.format(se))

There are 13,935 SE runs.


In [12]:
# Number of runs with pre_aln_flags = PE
pe = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                'runs.pre_aln_flags': 'PE',
            }
        },
        {'$count': 'cnt'},
    ]),
    # `$count` emits no document when no run matches; fall back to 0
    # instead of letting the cursor raise StopIteration.
    {'cnt': 0},
)['cnt']

print('There are {:,} PE runs.'.format(pe))

There are 5,214 PE runs.


In [13]:
# Number of runs flagged keep_R1 or keep_R2, i.e. runs that looked PE but had
# a problem mate. `$in` matches a run once even if it carries both flags.
keep = next(
    remap.aggregate([
        {'$unwind': '$runs'},
        {
            '$match': {
                'runs.pre_aln_flags': {'$in': ['keep_R1', 'keep_R2']},
            }
        },
        {'$count': 'cnt'},
    ]),
    # `$count` emits no document when no run matches; fall back to 0
    # instead of letting the cursor raise StopIteration.
    {'cnt': 0},
)['cnt']

print('There are {:,} runs that looked PE, but had a problem mate.'.format(keep))

There are 432 runs that looked PE, but had a problem mate.


In [None]:
# Tally how often each pre_aln_flags value occurs across all runs.
# The second `$unwind` splits `runs.pre_aln_flags` (presumably a list of
# flags per run -- TODO confirm) so a run contributes once to every flag it
# carries; the totals can therefore exceed the number of runs.
list(remap.aggregate([
    {'$unwind': '$runs'},
    {'$unwind': '$runs.pre_aln_flags'},
    {
        # Group directly on the flag value; no intermediate `$project` is needed.
        '$group': {
            '_id': '$runs.pre_aln_flags',
            'cnt': {'$sum': 1}
        }
    },
    # `$group` does not order its output, so sort (largest count first, ties
    # broken by flag name) to keep the table stable across re-runs.
    {'$sort': {'cnt': -1, '_id': 1}}
]))