In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-10-09 
Git hash: 6981e4093c2e6b3d7a4e4ca51337c4f0c41a767e


In [2]:
# %load ../../bin/load.py
import os

from pymongo import MongoClient

# File that stores the MongoDB host name. Its location can be overridden with
# the MONGODB_HOST_FILE environment variable so the notebook is not tied to a
# single user's home directory; the original path is kept as the default.
host_file = os.environ.get(
    'MONGODB_HOST_FILE',
    '/home/fearjm/Projects/ncbi_remap/output/.mongodb_host',
)
with open(host_file, 'r') as fh:
    # Drop surrounding whitespace (e.g. a trailing newline) from the host name.
    host = fh.read().strip()

# Connect and expose the two collections of the 'sra2' database that the
# following cells query.
client = MongoClient(host=host, port=27022)
db = client['sra2']
remap = db['remap']
ncbi = db['ncbi']


In [3]:
# Collect the ids (later used as 'srx') of every ncbi document whose
# library strategy is WGS; only the '_id' field is fetched.
wgs_filter = {'sra.experiment.library_strategy': 'WGS'}
id_only = {'_id': 1}
srxs = [doc['_id'] for doc in ncbi.find(wgs_filter, id_only)]

In [4]:
# WGS experiments in remap matching 'runs.aln_flags' == 'complete',
# returning only the experiment id and the run accessions.
aligned_filter = {'_id': {'$in': srxs}, 'runs.aln_flags': 'complete'}
aligned_fields = {'_id': 1, 'runs.srr': 1}
aligned_cursor = remap.find(aligned_filter, aligned_fields)
list(aligned_cursor)

[{'_id': 'DRX000998', 'runs': [{'srr': 'DRR001444'}]},
 {'_id': 'DRX000999', 'runs': [{'srr': 'DRR001445'}]},
 {'_id': 'DRX001000', 'runs': [{'srr': 'DRR001446'}]},
 {'_id': 'DRX001001', 'runs': [{'srr': 'DRR001447'}]},
 {'_id': 'DRX015073', 'runs': [{'srr': 'DRR016720'}]},
 {'_id': 'DRX015074', 'runs': [{'srr': 'DRR016721'}]},
 {'_id': 'DRX015075', 'runs': [{'srr': 'DRR016722'}]},
 {'_id': 'DRX015076', 'runs': [{'srr': 'DRR016723'}]},
 {'_id': 'DRX042143', 'runs': [{'srr': 'DRR046855'}]},
 {'_id': 'DRX042144', 'runs': [{'srr': 'DRR046856'}]},
 {'_id': 'DRX042145', 'runs': [{'srr': 'DRR046857'}]},
 {'_id': 'DRX042146', 'runs': [{'srr': 'DRR046858'}]},
 {'_id': 'DRX042147', 'runs': [{'srr': 'DRR046859'}]},
 {'_id': 'DRX042148', 'runs': [{'srr': 'DRR046860'}]},
 {'_id': 'DRX042149', 'runs': [{'srr': 'DRR046861'}]},
 {'_id': 'DRX042150', 'runs': [{'srr': 'DRR046862'}]},
 {'_id': 'DRX042151', 'runs': [{'srr': 'DRR046863'}]},
 {'_id': 'DRX042152', 'runs': [{'srr': 'DRR046864'}]},
 {'_id': '

In [8]:
# Look at a single experiment: run accession and R1 library size for each run.
example_filter = {'_id': 'SRX010956'}
example_fields = {'runs.srr': 1, 'runs.libsize.R1': 1}
remap.find_one(example_filter, example_fields)

{'_id': 'SRX010956',
 'runs': [{'libsize': {'R1': 6871689}, 'srr': 'SRR026817'},
  {'libsize': {'R1': 4765078}, 'srr': 'SRR026887'},
  {'libsize': {'R1': 6938289}, 'srr': 'SRR026888'},
  {'libsize': {'R1': 9818048}, 'srr': 'SRR026891'},
  {'libsize': {'R1': 9740412}, 'srr': 'SRR026892'},
  {'libsize': {'R1': 9666667}, 'srr': 'SRR026918'},
  {'libsize': {'R1': 8313954}, 'srr': 'SRR026944'},
  {'libsize': {'R1': 8158990}, 'srr': 'SRR026945'},
  {'libsize': {'R1': 6115223}, 'srr': 'SRR026965'},
  {'libsize': {'R1': 5994191}, 'srr': 'SRR026966'}]}

In [15]:
# Build a one-row-per-run table for the WGS experiments.
# Stage 1: restrict to the WGS experiment ids collected in `srxs`.
only_wgs = {'$match': {'_id': {'$in': srxs}}}
# Stage 2: emit one document per element of the 'runs' array.
one_per_run = {'$unwind': '$runs'}
# Stage 3: keep runs whose pre_aln_flags match 'complete'.
pre_aln_complete = {'$match': {'runs.pre_aln_flags': 'complete'}}
# Stage 4: flatten to the columns srx, srr, libsize (R1) and readLen (R1).
flat_columns = {
    '$project': {
        '_id': 0,
        'srx': '$_id',
        'srr': '$runs.srr',
        'libsize': '$runs.libsize.R1',
        'readLen': '$runs.avgReadLen.R1',
    }
}
run_records = remap.aggregate([only_wgs, one_per_run, pre_aln_complete, flat_columns])
df = pd.DataFrame(list(run_records))

In [13]:
# Grand total of the libsize column, shown with thousands separators.
format(df['libsize'].sum(), ',')

'13,707,559,102'

In [16]:
# Summary statistics of the readLen column.
df['readLen'].describe()

count    561.000000
mean      84.721801
std       35.780052
min        4.000000
25%       45.000000
50%       98.439799
75%      101.000000
max      151.000000
Name: readLen, dtype: float64

In [21]:
# Write the srx/srr pairs as a tab-separated file, without the row index.
df.to_csv(
    '../../output/wgs_samples.tsv',
    sep='\t',
    index=False,
    columns=['srx', 'srr'],
)