# Fix Alignment Workflow Store

In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from dask import delayed
from dask.distributed import Client

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs
from ncbi_remap.parser import parse_hisat2

# Setup notebook (project helper; must run before the rest of the notebook)
nbconfig = Nb.setup_notebook()

# Connect to data store.
# Opened in append mode ('a') because this notebook writes the
# 'aln/alignment_bad' key back into the same HDF5 file further down.
store = pd.HDFStore('../output/sra.h5', mode='a')
# dask.distributed client with default arguments; check_logs() submits its
# delayed tasks through this object.
client = Client()

last updated: 2018-02-14 
Git hash: 95602bf685870de470e3ae5acec5a8111e0b6da7


In [8]:
# Safety switch: the cells that call check_logs() (which creates ALIGNMENT_BAD
# marker files on disk) only run when this is set to True.
ADD_FILES = False

In [2]:
# Inspect the top-level groups currently present in the HDF5 store.
store.root

/ (RootGroup) ''
  children := ['aln' (Group), 'ids' (Group), 'layout' (Group), 'prealn' (Group), 'strand' (Group)]

In [3]:
# Inspect the children of the 'aln' group.
store.root.aln

/aln (Group) ''
  children := ['alignment_bad' (Group), 'complete' (Group), 'queue' (Group)]

In [None]:
# Load the sample table; check_logs() iterates over its rows and unpacks each
# one into (srx, srr).
ids = store['ids']

## Add ALIGNMENT_BAD files

I found that the workflow was not creating the ALIGNMENT_BAD files, so I fixed the problem in the workflow and added this section to backfill the files that were missed.

In [13]:
def check_log(srx, srr, threshold=0.50):
    """Flag a run as badly aligned based on its HISAT2 log.

    The run's ``{srr}.fq.bam.log`` is parsed with ``parse_hisat2``. When the
    ``per_alignment`` value in the first row of the parsed table is below
    ``threshold``, an empty ``ALIGNMENT_BAD`` marker file is created in the
    run's output directory.

    Parameters
    ----------
    srx, srr
        Identifiers interpolated into the sample paths under
        ``../output/aln-wf/samples/``.
    threshold : float, optional
        Cut-off compared against ``per_alignment``. Defaults to 0.50, the
        value that was previously hard-coded.
        NOTE(review): assumes ``per_alignment`` is on the same scale as the
        threshold (fraction vs. percent) -- confirm against ``parse_hisat2``.

    Returns
    -------
    tuple
        ``(srx, srr, flagged)``. ``flagged`` is True only when this call
        wrote the marker file; it is False when the value is at or above the
        threshold or when ``parse_hisat2`` raises ``FileNotFoundError``.
    """
    pattern = f'../output/aln-wf/samples/{srx}/{srr}/{srr}.fq.bam.log'
    aln_bad = Path(f'../output/aln-wf/samples/{srx}/{srr}/ALIGNMENT_BAD')

    try:
        df = parse_hisat2(srx, srr, pattern)
    except FileNotFoundError:
        # Deliberate best-effort: a run without a log is reported as not flagged.
        return (srx, srr, False)

    if df.iloc[0]['per_alignment'] < threshold:
        try:
            aln_bad.touch()
        except FileNotFoundError:
            # Same outcome as before the refactor, where touch() sat inside
            # the original try block.
            return (srx, srr, False)
        return (srx, srr, True)

    return (srx, srr, False)
        
def check_logs():
    """Run ``check_log`` for every row of ``ids`` through the dask client.

    Returns the gathered ``check_log`` results, one entry per row of ``ids``.
    """
    # Every row must unpack into exactly two values, used as (srx, srr).
    tasks = [
        delayed(check_log)(srx, srr)
        for _, (srx, srr) in ids.iterrows()
    ]

    # Submit all tasks at once, then block until every result is available.
    return client.gather(client.compute(tasks))

In [None]:
# Backfill the ALIGNMENT_BAD markers only when explicitly enabled, then tally
# how many runs were flagged.
if ADD_FILES:
    log_res = check_logs()
    aln_bad_counts = (
        pd.DataFrame(log_res, columns=['srx', 'srr', 'aln_bad'])
        .set_index(['srx', 'srr'])
        .iloc[:, 0]
        .value_counts()
    )
else:
    aln_bad_counts = None

# A bare expression inside the ``if`` body is not rendered by Jupyter, so the
# tally is surfaced as the cell's last top-level expression instead.
aln_bad_counts

## Check ALIGNMENT_BAD

In [5]:
# Re-scan the logs only when ADD_FILES is enabled, since check_logs() creates
# ALIGNMENT_BAD marker files on disk.
if ADD_FILES:
    log_res = check_logs()

In [6]:
# NOTE(review): `get_vals` is not defined or imported in the visible cells of
# this notebook -- confirm where it comes from so the cell survives a
# Restart & Run All.
dat = get_vals('../output/aln-wf/samples/{srx}/{srr}/ALIGNMENT_BAD')

# Keep only the flag column, indexed by (srx, srr).
srs = (
    pd.DataFrame(dat, columns=['srx', 'srr', 'alignment_bad'])
    .set_index(['srx', 'srr'])
    .iloc[:, 0]
)

# Recode before persisting: 'Missing' becomes False and '' becomes True.
recoded = srs.replace(to_replace='Missing', value=False)
store['aln/alignment_bad'] = recoded.replace('', True)

In [7]:
# Sanity check: tally the True/False flags that were just written to the store.
store['aln/alignment_bad'].value_counts()

False    35242
True       121
Name: alignment_bad, dtype: int64