# Fix Alignment Workflow Store

In [26]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from dask import delayed
from dask.distributed import Client

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs
from ncbi_remap.parser import parse_hisat2

# Setup notebook
# Nb.setup_notebook() is a project helper; presumably it also emits the
# "last updated" / "Git hash" banner shown in this cell's output -- confirm.
nbconfig = Nb.setup_notebook()

# Connect to data store
# mode='a' opens the HDF5 file for reading and writing, so later cells can
# assign new keys into it.
# NOTE(review): no store.close() appears in the visible cells -- confirm the
# store is closed/flushed elsewhere.
store = pd.HDFStore('../sra.h5', mode='a')
# Dask distributed client; check_logs() and get_vals() submit their work
# through it.
client = Client()

last updated: 2018-02-14 
Git hash: 975ee2428bfd16c0da0345c335ffaa399cba8ecd


In [27]:
# Display the top-level groups currently present in the store.
store.root

/ (RootGroup) ''
  children := ['aln' (Group), 'ids' (Group), 'layout' (Group), 'prealn' (Group), 'strand' (Group)]

In [3]:
# Display the children of the 'aln' group.
store.root.aln

/aln (Group) ''
  children := ['alignment_bad' (Group), 'complete' (Group), 'queue' (Group)]

In [4]:
# Load the table stored under 'ids'; check_logs() and get_vals() iterate its
# rows and unpack each one as an (srx, srr) pair.
ids = store['ids']

In [13]:
def check_log(srx, srr):
    """Flag one run as badly aligned based on its parsed HISAT2 log.

    The log at ``../aln-wf/output/samples/{srx}/{srr}/{srr}.fq.bam.log`` is
    parsed with ``parse_hisat2``. When the ``per_alignment`` value of the
    first row of the parsed table is below 0.50, an ``ALIGNMENT_BAD`` file
    is touched in the same sample directory.

    Parameters
    ----------
    srx, srr
        Identifiers interpolated into the sample paths.

    Returns
    -------
    tuple
        ``(srx, srr, flagged)`` where ``flagged`` is ``True`` only if the
        ``ALIGNMENT_BAD`` file was touched by this call. A
        ``FileNotFoundError`` raised anywhere in the check yields ``False``.
    """
    log_path = f'../aln-wf/output/samples/{srx}/{srr}/{srr}.fq.bam.log'
    flag_file = Path(f'../aln-wf/output/samples/{srx}/{srr}/ALIGNMENT_BAD')

    flagged = False
    try:
        first_row = parse_hisat2(srx, srr, log_path).iloc[0, :]
        if first_row['per_alignment'] < 0.50:
            flag_file.touch()
            # Only set after touch() succeeds, so a failed touch reports False.
            flagged = True
    except FileNotFoundError:
        # Missing files are treated as "not flagged" rather than an error.
        pass

    return (srx, srr, flagged)
        
def check_logs():
    """Run ``check_log`` for every row of ``ids`` via the dask client.

    Each row of the module-level ``ids`` table is unpacked as ``(srx, srr)``
    and wrapped in a ``delayed`` call; the tasks are then submitted through
    the module-level ``client`` and gathered.

    Returns
    -------
    The gathered ``check_log`` results, one ``(srx, srr, flagged)`` tuple
    per row of ``ids``.
    """
    tasks = [
        delayed(check_log)(srx, srr)
        for _, (srx, srr) in ids.iterrows()
    ]
    return client.gather(client.compute(tasks))

In [None]:
# Run check_log for every row of `ids` and collect the (srx, srr, flag) tuples.
# NOTE: this has a filesystem side effect -- check_log touches an
# ALIGNMENT_BAD file for each run whose per_alignment is below 0.50.
log_res = check_logs()

In [21]:
# Tally how many runs were flagged (True) versus not flagged (False).
(
    pd.DataFrame(log_res, columns=['srx', 'srr', 'aln_bad'])
    .set_index(['srx', 'srr'])
    .iloc[:, 0]
    .value_counts()
)

False    35242
True       121
Name: aln_bad, dtype: int64

In [22]:
def check_file(fname):
    """Return the text content of ``fname`` with surrounding whitespace removed.

    Raises ``FileNotFoundError`` (from ``open``) when the file does not exist.
    """
    with open(fname, 'r') as handle:
        contents = handle.read()
    return contents.strip()

def get_val(srx, srr, pattern):
    """Read the file for one run and return its stripped content.

    Parameters
    ----------
    srx, srr
        Identifiers substituted into ``pattern``.
    pattern : str
        Path template containing ``{srx}`` and ``{srr}`` placeholders.

    Returns
    -------
    list
        ``[srx, srr, content]`` where ``content`` is the result of
        ``check_file`` on the formatted path, or the string ``'Missing'``
        when the file does not exist.
    """
    path = pattern.format(srx=srx, srr=srr)
    try:
        content = check_file(path)
    except FileNotFoundError:
        content = 'Missing'
    return [srx, srr, content]

def get_vals(pattern):
    """Run ``get_val`` with ``pattern`` for every row of ``ids`` via dask.

    Each row of the module-level ``ids`` table is unpacked as ``(srx, srr)``
    and wrapped in a ``delayed`` call; the tasks are submitted through the
    module-level ``client`` and gathered.

    Parameters
    ----------
    pattern : str
        Path template forwarded unchanged to ``get_val``.

    Returns
    -------
    The gathered ``get_val`` results, one ``[srx, srr, content]`` entry per
    row of ``ids``.
    """
    tasks = [
        delayed(get_val)(srx, srr, pattern)
        for _, (srx, srr) in ids.iterrows()
    ]
    return client.gather(client.compute(tasks))

In [24]:
# Read every run's ALIGNMENT_BAD flag file: get_val yields 'Missing' when the
# file is absent and '' when it exists but is empty (as a touched file is).
dat = get_vals('../aln-wf/output/samples/{srx}/{srr}/ALIGNMENT_BAD')
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'alignment_bad'])
    .set_index(['srx', 'srr'])
    .iloc[:, 0]
)
# Map the sentinels to booleans: absent file -> False, empty file -> True.
# NOTE(review): the flags are read from ../aln-wf/ but written under the
# 'prealn/' key, while the store also has an 'aln/alignment_bad' group --
# confirm 'prealn/alignment_bad' is the intended destination.
store['prealn/alignment_bad'] = (
    srs
    .replace(to_replace='Missing', value=False)
    .replace('', True)
)

In [30]:
# Read the series back from the store and tally flagged vs. unflagged runs.
store['prealn/alignment_bad'].value_counts()

False    35242
True       121
Name: alignment_bad, dtype: int64