# Dump IDs

I have found that the performance of MongoDB on the cluster is poor, so I am refactoring things to use the database less. Switching to a set of HDF5 files makes the most sense right now. Here I migrate the essential information from the DB into the HDF5 format. I also use this as a test case for developing easy functions for adding to and removing from the data store.

In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-10-18 
Git hash: 000e0a60bc59f9d36c57202cd127e293564e74f9


In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient

# The MongoDB host name is kept in a dot-file inside the project output folder
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host') as fh:
    host = fh.read().strip()

# Connect and grab handles to the two collections of the 'sra2' database
client = MongoClient(host, 27022)
db = client['sra2']
remap, ncbi = db['remap'], db['ncbi']

In [3]:
from ncbi_remap.io import create_table, add_id, remove_id, remove_chunk

## Create HDF5 data table with all IDs

In [4]:
# Open the HDF5 data store that all tables below are written to.
# NOTE(review): no matching store.close() is visible in this notebook — confirm
# the handle is closed once the migration is done.
store = pd.HDFStore(path='../../sra.h5')

In [5]:
# Collect one (srx, srr) tuple per run: unwind the `runs` array of each
# document and keep only the experiment id (`_id`) and the run's `srr`.
agg = [
    (doc['srx'], doc['srr'])
    for doc in remap.aggregate([
        {'$unwind': '$runs'},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])
]

# Put the pairs into a two-column frame and write it to the store as 'ids'
ids = pd.DataFrame(data=agg, columns=['srx', 'srr'])
create_table(store, 'ids', ids, force=True, columns='all')

## Create the pre-alignment queue

Here I take all of the IDs and create the pre-alignment queue.

In [6]:
# Seed the pre-alignment queue with every ID; later cells remove the runs
# that already have a pre-alignment outcome recorded in the database.
create_table(store, 'prealn/queue', ids, force=True, columns='all')

## Create a list of completed IDs

Now I will pull out the completed IDs from the database and remove them from the pre-alignment queue and add them to the alignment queue.

In [7]:
# Pull the (srx, srr) pair of every run whose `pre_aln_flags` matches 'complete'.
# `columns` is given explicitly so the column order agrees with the `ids`
# table and so that `srx`/`srr` exist even when the query matches nothing
# (otherwise `df.srr` in the next cell raises on an empty result).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'complete'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [8]:
# Store the runs that finished pre-alignment
create_table(store, 'prealn/complete', df, force=True, columns='all')

# They no longer belong in the pre-alignment queue
remove_chunk(store, 'prealn/queue', df['srr'])

# The finished pre-alignment runs are what the alignment queue starts from
create_table(store, 'aln/queue', df, force=True, columns='all')

## Create a list of IDs that could not be downloaded

In [9]:
# Pull the (srx, srr) pair of every run whose `pre_aln_flags` matches
# 'download_bad'. `columns` is given explicitly so the column order agrees
# with the `ids` table and so that `srx`/`srr` exist even when the query
# matches nothing (otherwise `df.srr` in the next cell raises).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'download_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [10]:
# Record the runs that could not be downloaded ...
create_table(store, 'prealn/download_bad', df, force=True, columns='all')
# ... and take them out of the pre-alignment queue
remove_chunk(store, 'prealn/queue', df['srr'])

## Create a list of IDs that were malformed with bad quality scores

In [11]:
# Pull the (srx, srr) pair of every run whose `pre_aln_flags` matches
# 'quality_scores_bad'. `columns` is given explicitly so the column order
# agrees with the `ids` table and so that `srx`/`srr` exist even when the
# query matches nothing (otherwise `df.srr` in the next cell raises).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'quality_scores_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [12]:
# Record the runs flagged with bad quality scores ...
create_table(store, 'prealn/quality_scores_bad', df, force=True, columns='all')
# ... and take them out of the pre-alignment queue
remove_chunk(store, 'prealn/queue', df['srr'])

## Create a list of IDs that had bad alignments

In [13]:
# Pull the (srx, srr) pair of every run whose `pre_aln_flags` matches
# 'alignment_bad'. `columns` is given explicitly so the column order agrees
# with the `ids` table and so that `srx`/`srr` exist even when the query
# matches nothing (otherwise `df.srr` in the next cell raises).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'alignment_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [14]:
# Record the runs whose pre-alignment was flagged as a bad alignment ...
create_table(store, 'prealn/alignment_bad', df, force=True, columns='all')
# ... and take them out of the pre-alignment queue
remove_chunk(store, 'prealn/queue', df['srr'])

## Create a list of IDs that were from ABI SOLiD

In [15]:
# Pull the (srx, srr) pair of every run whose `pre_aln_flags` matches
# 'abi_solid'. `columns` is given explicitly so the column order agrees with
# the `ids` table and so that `srx`/`srr` exist even when the query matches
# nothing (otherwise `df.srr` in the next cell raises).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'abi_solid'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [16]:
# Record the runs flagged as 'abi_solid' ...
create_table(store, 'prealn/abi_solid', df, force=True, columns='all')
# ... and take them out of the pre-alignment queue
remove_chunk(store, 'prealn/queue', df['srr'])

In [17]:
# (rows, columns) of the table holding all IDs
store.get('ids').shape

(35363, 2)

In [18]:
# (rows, columns) of what is left in the pre-alignment queue
store.get('prealn/queue').shape

(10632, 2)

In [19]:
# (rows, columns) of the completed pre-alignment table
store.get('prealn/complete').shape

(22191, 2)

In [20]:
# (rows, columns) of the download_bad table
store.get('prealn/download_bad').shape

(72, 2)

In [21]:
# (rows, columns) of the quality_scores_bad table
store.get('prealn/quality_scores_bad').shape

(4, 2)

In [22]:
# (rows, columns) of the pre-alignment alignment_bad table
store.get('prealn/alignment_bad').shape

(2242, 2)

In [23]:
# (rows, columns) of the abi_solid table
store.get('prealn/abi_solid').shape

(224, 2)

## Create a list of IDs where the alignments are done

In [24]:
# Pull the (srx, srr) pair of every run whose `aln_flags` matches 'complete'.
# `columns` is given explicitly so the column order agrees with the `ids`
# table and so that `srx`/`srr` exist even when the query matches nothing
# (otherwise `df.srr` in the next cell raises on an empty result).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.aln_flags': 'complete'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [26]:
# Record the runs whose alignment is complete ...
create_table(store, 'aln/complete', df, force=True, columns='all')
# ... and take them out of the alignment queue
remove_chunk(store, 'aln/queue', df['srr'])

In [34]:
# Pull the (srx, srr) pair of every run whose `aln_flags` matches
# 'alignment_bad'. `columns` is given explicitly so the column order agrees
# with the `ids` table and so that `srx`/`srr` exist even when the query
# matches nothing (otherwise `df.srr` in the next cell raises).
df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.aln_flags': 'alignment_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    columns=['srx', 'srr'],
)

In [35]:
# Record the runs whose alignment was flagged as bad ...
create_table(store, 'aln/alignment_bad', df, force=True, columns='all')
# ... and take them out of the alignment queue
remove_chunk(store, 'aln/queue', df['srr'])

In [36]:
# (rows, columns) of what is left in the alignment queue
store.get('aln/queue').shape

(7033, 2)

In [37]:
# (rows, columns) of the completed alignment table
store.get('aln/complete').shape

(15157, 2)

In [38]:
# (rows, columns) of the alignment-stage alignment_bad table
store.get('aln/alignment_bad').shape

(84, 2)