# Dump IDs

I have found that performance of the MongoDB on the cluster was poor. I am trying to refactor things to use the database less. I think switching to a set of HDF5 files seems to make the most sense right now. Here I migrate essential things from the DB and into the HDF5 format. I also use this for a test case for developing easy functions for adding and removing from the data store.

In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-10-20 
Git hash: 72b477f885a1a67866e59e27125542ce213bb2f4


In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as fh:
    host = fh.read().strip()
client = MongoClient(host=host, port=27022)
db = client['sra2']
remap = db['remap']
ncbi = db['ncbi']

In [3]:
from ncbi_remap.io import add_table, add_id, remove_id, remove_chunk

## Create HDF5 data table with all IDs

In [4]:
# Open hdf5 store
store = pd.HDFStore('../../output/sra.h5')

In [5]:
# Get all IDs in the database
agg = [(x['srx'], x['srr']) for x in remap.aggregate([
    {"$unwind": '$runs'},
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr'
        }
    },
])]

# Create data store table of ids
ids = pd.DataFrame(agg, columns=['srx', 'srr'])
add_table(store, 'ids', ids, force=True, columns='all')

In [6]:
store['ids'].shape[0]

35363

## Pre-Alignment Workflow

### Create the pre-alignment queue

Here I take all of the IDs and create the pre-alignment queue.

In [7]:
# Go ahead and add these to the queue
add_table(store, 'prealn/queue', ids, force=True, columns='all')

store['prealn/queue'].shape[0]

35363

### Create a list of completed IDs

Now I will pull out the completed IDs from the database and remove them from the pre-alignment queue and add them to the alignment queue.

In [8]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'complete'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

# Create the pre alignment completed list
add_table(store, 'prealn/complete', df, force=True, columns='all')
print(store['prealn/complete'].shape[0])

# Remove items from the pre-alignment queue
remove_chunk(store, 'prealn/queue', df.srr)
print(store['prealn/queue'].shape[0])

# Create the alignment queue
add_table(store, 'aln/queue', df, force=True, columns='all')
print(store['aln/queue'].shape[0])

22191
13172
22191


### Create a list of IDs that could not be downloaded

In [9]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'download_bad'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'prealn/download_bad', df, force=True, columns='all')
print(store['prealn/download_bad'].shape[0])

remove_chunk(store, 'prealn/queue', df.srr)
print(store['prealn/queue'].shape[0])

72
13100


### Create a list of IDs that were malformed with bad quality scores

In [10]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'quality_scores_bad'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'prealn/quality_scores_bad', df, force=True, columns='all')
print(store['prealn/quality_scores_bad'].shape[0])

remove_chunk(store, 'prealn/queue', df.srr)
print(store['prealn/queue'].shape[0])

4
13096


### Create a list of IDs that had bad alignments

In [11]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'alignment_bad'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'prealn/alignment_bad', df, force=True, columns='all')
print(store['prealn/alignment_bad'].shape[0])

remove_chunk(store, 'prealn/queue', df.srr)
print(store['prealn/queue'].shape[0])


2242
10856


### Create a list of IDs that were from abi solid

In [12]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'abi_solid'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'prealn/abi_solid', df, force=True, columns='all')
print(store['prealn/abi_solid'].shape[0])

remove_chunk(store, 'prealn/queue', df.srr)
print(store['prealn/queue'].shape[0])


224
10632


### Add layout information

In [13]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'PE'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'layout/PE', df, force=True, columns='all')
print(store['layout/PE'].shape[0])

6881


In [14]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'keep_R1'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'layout/keep_R1', df, force=True, columns='all')
print(store['layout/keep_R1'].shape[0])

400


In [15]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'keep_R2'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'layout/keep_R2', df, force=True, columns='all')
print(store['layout/keep_R2'].shape[0])

327


In [16]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            '$and': [
                {'runs.pre_aln_flags': 'SE'},
                {'runs.pre_aln_flags': {'$ne': 'keep_R1'}},
                {'runs.pre_aln_flags': {'$ne': 'keep_R2'}},
            ]
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'layout/SE', df, force=True, columns='all')
print(store['layout/SE'].shape[0])

19911


### Add Strand information

In [17]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            '$or': [
                {'runs.pre_aln_flags': 'same_strand'},
                {'runs.pre_aln_flags': 'first_strand'},
            ]
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'strand/first', df, force=True, columns='all')
print(store['strand/first'].shape[0])

3037


In [18]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            '$or': [
                {'runs.pre_aln_flags': 'opposite_strand'},
                {'runs.pre_aln_flags': 'second_strand'},
            ]
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'strand/second', df, force=True, columns='all')
print(store['strand/second'].shape[0])

5435


In [19]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.pre_aln_flags': 'unstranded'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'strand/unstranded', df, force=True, columns='all')
print(store['strand/unstranded'].shape[0])

14556


### Summary counts

In [20]:
store['ids'].shape[0]

35363

In [21]:
store['prealn/queue'].shape[0]

10632

In [22]:
store['prealn/complete'].shape[0]

22191

In [23]:
store['prealn/download_bad'].shape[0]

72

In [24]:
store['prealn/quality_scores_bad'].shape[0]

4

In [25]:
store['prealn/alignment_bad'].shape[0]

2242

In [26]:
store['prealn/abi_solid'].shape[0]

224

In [27]:
store['layout/SE'].shape[0]

19911

In [28]:
store['layout/PE'].shape[0]

6881

In [29]:
store['layout/keep_R1'].shape[0]

400

In [30]:
store['layout/keep_R2'].shape[0]

327

In [31]:
store['strand/first'].shape[0]

3037

In [32]:
store['strand/second'].shape[0]

5435

In [33]:
store['strand/unstranded'].shape[0]

14556

## Alignment Workflow

### Create list of IDs were the alignments are done

In [34]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.aln_flags': 'complete'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'aln/complete', df, force=True, columns='all')
print(store['aln/complete'].shape[0])

remove_chunk(store, 'aln/queue', df.srr)
print(store['aln/queue'].shape[0])

15157
7034


In [35]:
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {
        '$match': {
            'runs.aln_flags': 'alignment_bad'
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
        }
        
    },
])))

add_table(store, 'aln/alignment_bad', df, force=True, columns='all')
print(store['aln/alignment_bad'].shape[0])

remove_chunk(store, 'aln/queue', df.srr)
print(store['aln/complete'].shape[0])

84
15157


### Summary Counts

In [36]:
print(store['aln/queue'].shape[0])

7033


In [37]:
print(store['aln/complete'].shape[0])

15157


In [38]:
print(store['aln/alignment_bad'].shape[0])

84


## Clean up

In [39]:
store.close()