# Rebuild SRA Store

In [1]:
import os
import sys
from pathlib import Path
import gzip

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd
from dask.delayed import delayed
from dask.distributed import Client

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.config import DATA_STORE
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Setup notebook: project helper from ncbi_remap.notebook; whatever it returns
# is kept in `nbconfig`.
nbconfig = Nb.setup_notebook()

# Connect to data store: the HDF5 file that the cells below read from and write
# to through `store`.
# NOTE(review): DATA_STORE is imported from ncbi_remap.config above, yet this
# hard-coded relative path is used instead -- confirm that is intentional.
store = pd.HDFStore('../sra_new.h5')

# Start up cluster: a dask.distributed Client created with no arguments.
# `get_vals` further down submits its tasks through this `client`.
client = Client()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-02-18 
Git hash: 52de887b1e498cf97f88ca6e19510826d15109e4


In [2]:
# Open a connection to the 'sra' Mongo database and keep a handle on its
# 'ncbi' collection. pymongo is imported here rather than in the import cell
# at the top of the notebook.
from pymongo import MongoClient
try:
    # Use the host name stored in this file when the file is present ...
    with open('../output/.mongodb_host', 'r') as fh:
        host = fh.read().strip()
except FileNotFoundError:
    # ... otherwise fall back to a server on this machine.
    host = 'localhost'

# Port 27022 is hard-coded here and again inside get_update_db_ids.
mongoClient = MongoClient(host=host, port=27022)
db = mongoClient['sra']
ncbi = db['ncbi']

## Initialize Store with IDs from SRA Mongo

In [5]:
def get_update_db_ids():
    """Return every (srx, srr) pair found in the SRA Mongo database.

    The Mongo host is read from ``../output/.mongodb_host`` when that file
    exists, otherwise ``localhost`` is used; the port is fixed at 27022.
    Documents of the ``ncbi`` collection in the ``sra`` database are unwound
    on ``sra.run`` so that each run carrying a ``run_id`` yields one row.

    Returns
    -------
    pandas.DataFrame
        Columns ``srx`` (the document ``_id``) and ``srr`` (the run id), one
        row per run. When the query matches nothing the frame is empty but
        still has both columns.
    """
    # Imported locally so the function carries its own dependency.
    from pymongo import MongoClient

    try:
        with open('../output/.mongodb_host', 'r') as fh:
            host = fh.read().strip()
    except FileNotFoundError:
        host = 'localhost'

    # Dump all ids out of database: one output document per run.
    pipeline = [
        {
            '$unwind': '$sra.run'
        },
        {
            '$match': {
                'sra.run.run_id': {'$exists': 1}
            }
        },
        {
            '$project': {
                '_id': 0,
                'srx': '$_id',
                'srr': '$sra.run.run_id'
            }
        },
    ]

    # The cursor is fully drained inside the `with` block, so the connection
    # can be closed as soon as the block exits instead of leaking.
    with MongoClient(host=host, port=27022) as mongoClient:
        records = list(mongoClient['sra']['ncbi'].aggregate(pipeline))

    # Passing `columns` keeps the srx/srr layout even for an empty result,
    # where selecting the columns afterwards would raise KeyError.
    return pd.DataFrame(records, columns=['srx', 'srr'])

In [6]:
# Pull the current srx/srr table out of Mongo (DataFrame with columns srx, srr).
curr_sra = get_update_db_ids()

In [8]:
# Save the id table under the 'ids' key of the HDF5 store.
store['ids'] = curr_sra

In [9]:
# Sanity check: (rows, columns) of the id table as read back from the store.
store['ids'].shape

(51036, 2)

In [10]:
# Copy the full id table to 'prealn/queue', written in table format with data
# columns enabled.
# NOTE(review): presumably the list of runs still waiting for the pre-alignment
# workflow -- confirm against the workflow code.
store.put('prealn/queue', store['ids'], data_columns=True, format='table')

In [11]:
# Sanity check: the queue should have the same shape as store['ids'].
store['prealn/queue'].shape

(51036, 2)

## Pre-Alignment Workflow

In [3]:
def check_file(fname):
    """Return the full text of ``fname`` with surrounding whitespace removed.

    Raises ``FileNotFoundError`` when the file does not exist.
    """
    with open(fname, 'r') as handle:
        contents = handle.read()
    return contents.strip()

def get_val(srx, srr, pattern, *args):
    """Read the flag file of one run and return ``[srx, srr, contents]``.

    ``pattern`` is a path template with ``{srx}`` and ``{srr}`` placeholders.
    When the resulting file does not exist the third element is ``np.nan``.
    Extra positional arguments are accepted and ignored.
    """
    path = pattern.format(srx=srx, srr=srr)
    try:
        contents = check_file(path)
    except FileNotFoundError:
        contents = np.nan
    return [srx, srr, contents]
    
    
def check_solid(srx, srr, *args):
    """Flag a run whose first FASTQ sequence line starts with ``'T'``.

    The run's ``LAYOUT`` file decides which FASTQ is inspected: ``SE``, ``PE``
    and ``keep_R1`` use ``{srr}_1.fastq.gz``, any other layout uses
    ``{srr}_2.fastq.gz``. Only the first record of that file is looked at.
    NOTE(review): a leading ``T`` is presumably the primer base of ABI SOLiD
    colour-space reads -- confirm this heuristic is specific enough.

    Parameters
    ----------
    srx, srr : str
        Experiment and run accessions used to build the sample directory.
    *args
        Accepted and ignored, so the function can be called like ``get_val``
        (which also receives a pattern).

    Returns
    -------
    tuple
        ``(srx, srr, True)`` when the sequence line starts with ``'T'``,
        otherwise ``(srx, srr, np.nan)``; the latter also covers missing or
        unreadable files.
    """
    sample_dir = f'../output/prealn-wf/samples/{srx}/{srr}'
    try:
        layout = check_file(f'{sample_dir}/LAYOUT')
        mate = '1' if layout in ('SE', 'PE', 'keep_R1') else '2'

        with gzip.open(f'{sample_dir}/{srr}_{mate}.fastq.gz', 'rt') as fh:
            fh.readline()              # header line of the first record
            sequence = fh.readline()   # sequence line of the first record

        if sequence.startswith('T'):
            return srx, srr, True
    except Exception:
        # Deliberately best effort: a missing LAYOUT/FASTQ or a corrupt gzip
        # must not abort a scan over every run. Unlike a bare `except:`, this
        # still lets KeyboardInterrupt and SystemExit propagate.
        pass

    return srx, srr, np.nan


def get_vals(pattern, func=get_val, ids=None):
    """Run ``func`` for every (srx, srr) pair on the dask cluster.

    Parameters
    ----------
    pattern : str
        Passed unchanged to ``func`` as its third argument; for ``get_val``
        this is a path template with ``{srx}`` and ``{srr}`` placeholders.
    func : callable, optional
        Called as ``func(srx, srr, pattern)`` once per row of ``ids``.
    ids : pandas.DataFrame, optional
        Two-column frame whose rows are (srx, srr). Defaults to
        ``store['ids']``, looked up when the function is called rather than
        when it is defined, so later updates to the store are picked up.

    Returns
    -------
    list
        The gathered results of ``func``, one entry per row of ``ids``.
    """
    if ids is None:
        ids = store['ids']

    tasks = [
        delayed(func)(srx, srr, pattern)
        for srx, srr in ids.itertuples(index=False, name=None)
    ]
    return client.gather(client.compute(tasks))

### Layout

In [15]:
# Read each run's LAYOUT file; runs without one are dropped before storing.
dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/LAYOUT')
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'layout'])
    .set_index(['srx', 'srr'])['layout']
    .dropna()
)
store.put('layout', srs, data_columns=True, format='table')

### Strand

In [25]:
# Read each run's STRAND file; runs without one are dropped before storing.
dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/STRAND')
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'strand'])
    .set_index(['srx', 'srr'])['strand']
    .dropna()
)
store.put('strand', srs, data_columns=True, format='table')

### Alignment Bad

In [32]:
# Keep only the srx/srr of runs that have an ALIGNMENT_BAD flag file.
dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/ALIGNMENT_BAD')
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'alignment_bad'])
    .dropna()
    .loc[:, ['srx', 'srr']]
    .reset_index(drop=True)
)
store.put('prealn/alignment_bad', srs, data_columns=True, format='table')

### Download Bad

In [46]:
# Keep only the srx/srr of runs that have a DOWNLOAD_BAD flag file.
dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/DOWNLOAD_BAD')
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'download_bad'])
    .dropna()
    .loc[:, ['srx', 'srr']]
    .reset_index(drop=True)
)
store.put('prealn/download_bad', srs, data_columns=True, format='table')

### Quality Scores Bad

In [64]:
# Keep only the srx/srr of runs that have a QUALITY flag file.
dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/QUALITY')
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'quality_scores_bad'])
    .dropna()
    .loc[:, ['srx', 'srr']]
    .reset_index(drop=True)
)
store.put('prealn/quality_scores_bad', srs, data_columns=True, format='table')

### ABI SOLiD

In [4]:
# check_solid ignores the pattern argument, hence the empty string; only the
# srx/srr of runs it flagged (non-NaN third value) are stored.
dat = get_vals('', func=check_solid)
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'abi_solid'])
    .dropna()
    .loc[:, ['srx', 'srr']]
    .reset_index(drop=True)
)
store.put('prealn/abi_solid', srs, data_columns=True, format='table')

In [31]:
# Scratch check of appending to the store: take the first three entries of the
# stored layout Series as sample rows.
bob = store['layout'].head(3)

In [35]:
# True for every sample row whose (srx, srr) index is already in store['layout'].
mask = bob.index.isin(store['layout'].index)

In [40]:
# Append only the rows not yet present in the stored layout table.
# NOTE(review): `bob` was read from store['layout'] itself, so `mask` is
# presumably all True and nothing is actually appended here -- verify.
store.append('layout', bob[~mask], data_columns=True, format='t')

In [41]:
# Sanity check: size of the stored layout Series after the append above.
store['layout'].shape

(28823,)