# Reset Pre-Aln Store

A while ago I did some restructuring of the data store. I am running into some residual issues that I need to figure out and fix now.

In [1]:
import os
import sys
import re
import gzip

import numpy as np
import pandas as pd
from dask import delayed, compute
from dask.distributed import Client

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Setup notebook (project helper; the returned config exposes `.cache`, used below)
nbconfig = Nb.setup_notebook()

# Turn on cache: joblib memoization object pointed at the notebook's cache directory.
# NOTE(review): newer joblib releases replaced `cachedir=` with `location=` --
# confirm the pinned joblib version before upgrading.
from joblib import Memory
memory = Memory(cachedir=nbconfig.cache, verbose=0)

# Connect to data store. mode='a' opens the HDF5 file for reading and writing,
# so the rebuild cells below overwrite tables in place.
store = pd.HDFStore('../sra.h5', mode='a')

# Dask distributed client used below to run the per-sample file checks.
# NOTE(review): with no arguments this presumably starts a local cluster -- confirm.
client = Client()

last updated: 2018-02-14 
Git hash: 95602bf685870de470e3ae5acec5a8111e0b6da7


In [3]:
# Which things to rebuild
# Each flag gates one rebuild cell below: when True, that cell re-reads the
# per-sample files under ../output/prealn-wf/samples/ and overwrites the
# corresponding table in the store. When False the cell is skipped.
LAYOUT = False  # store['layout']
STRAND = False  # store['strand']
DOWNLOAD_BAD = False  # store['prealn/download_bad']
QUALITY_BAD = False  # store['prealn/quality_scores_bad']
ALIGNMENT_BAD = False  # store['prealn/alignment_bad']
ABI_SOLID = False  # store['prealn/abi_solid']

In [4]:
# Display the top-level groups of the HDF5 store.
store.root

/ (RootGroup) ''
  children := ['aln' (Group), 'ids' (Group), 'layout' (Group), 'prealn' (Group), 'strand' (Group)]

In [5]:
# Display the groups stored under /prealn.
store.root.prealn

/prealn (Group) ''
  children := ['abi_solid' (Group), 'alignment_bad' (Group), 'complete' (Group), 'download_bad' (Group), 'flags' (Group), 'qc_passed' (Group), 'quality_scores_bad' (Group), 'queue' (Group), 'spearman' (Group), 'workflow' (Group)]

In [6]:
# Display the groups stored under /prealn/workflow.
store.root.prealn.workflow

/prealn/workflow (Group) ''
  children := ['bamtools_stats' (Group), 'collectrnaseqmetrics' (Group), 'fastq' (Group), 'fastq_screen' (Group), 'feature_counts' (Group), 'hisat2' (Group), 'markduplicates' (Group), 'merge' (Group), 'samtools_idxstats' (Group), 'samtools_stats' (Group)]

In [7]:
# Table of sample identifiers; the cells below iterate its rows as (srx, srr) pairs.
ids = store['ids']

In [8]:
def check_file(fname):
    """Return the text content of a file with surrounding whitespace removed.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    str
        The whole file content after ``str.strip()``; an empty file gives ``''``.

    Raises
    ------
    FileNotFoundError
        If ``fname`` does not exist (propagated from ``open``).
    """
    with open(fname) as handle:
        contents = handle.read()
    return contents.strip()

def get_val(srx, srr, pattern):
    """Read one sample's status file and return it alongside its identifiers.

    Parameters
    ----------
    srx, srr : str
        Identifiers substituted into ``pattern``.
    pattern : str
        Path template containing ``{srx}`` and ``{srr}`` placeholders.

    Returns
    -------
    list
        ``[srx, srr, value]`` where ``value`` is the stripped file content, or
        the string ``'Missing'`` when the file does not exist.
    """
    path = pattern.format(srx=srx, srr=srr)
    try:
        value = check_file(path)
    except FileNotFoundError:
        # An absent file is an expected outcome, recorded with a sentinel.
        value = 'Missing'
    return [srx, srr, value]

def get_vals(pattern):
    """Run ``get_val`` for every row of the global ``ids`` table via dask.

    Parameters
    ----------
    pattern : str
        Path template with ``{srx}`` and ``{srr}`` placeholders, passed
        unchanged to ``get_val``.

    Returns
    -------
    list
        The gathered ``get_val`` results, one ``[srx, srr, value]`` entry per
        row of ``ids``.
    """
    # Build one lazy task per row; each row of ``ids`` unpacks to (srx, srr).
    tasks = [
        delayed(get_val)(srx, srr, pattern)
        for _, (srx, srr) in ids.iterrows()
    ]

    # Submit to the cluster, then collect the results.
    submitted = client.compute(tasks)
    return client.gather(submitted)

To track the status of the running jobs, open the dashboard [here](http://localhost:8787).

## Set Layout

I noticed that somehow Layout and Strand are the same table. I am going to go ahead and re-build these datasets in the queue.

In [9]:
if LAYOUT:
    # Read every sample's LAYOUT file ('Missing' where the file is absent).
    dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/LAYOUT')
    layout_frame = pd.DataFrame(dat, columns=['srx', 'srr', 'layout'])
    # Index by (srx, srr); the single remaining column becomes the Series.
    srs = layout_frame.set_index(['srx', 'srr'])['layout']
    store['layout'] = srs

## Set Strand

In [10]:
if STRAND:
    # Read every sample's STRAND file ('Missing' where the file is absent).
    dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/STRAND')
    strand_frame = pd.DataFrame(dat, columns=['srx', 'srr', 'strand'])
    # Index by (srx, srr); the single remaining column becomes the Series.
    srs = strand_frame.set_index(['srx', 'srr'])['strand']
    store['strand'] = srs

## Set Alignment Bad

In [11]:
if ALIGNMENT_BAD:
    dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/ALIGNMENT_BAD')
    flag_frame = pd.DataFrame(dat, columns=['srx', 'srr', 'alignment_bad'])
    srs = flag_frame.set_index(['srx', 'srr'])['alignment_bad']
    # No flag file -> False; an empty flag file -> True.
    # NOTE(review): a flag file with non-empty content keeps its text as the
    # value -- confirm the flag files are always empty.
    without_missing = srs.replace(to_replace='Missing', value=False)
    store['prealn/alignment_bad'] = without_missing.replace('', True)

## Set Download Bad

In [12]:
if DOWNLOAD_BAD:
    dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/DOWNLOAD_BAD')
    flag_frame = pd.DataFrame(dat, columns=['srx', 'srr', 'download_bad'])
    srs = flag_frame.set_index(['srx', 'srr'])['download_bad']
    # No flag file -> False; an empty flag file -> True.
    # NOTE(review): a flag file with non-empty content keeps its text as the
    # value -- confirm the flag files are always empty.
    without_missing = srs.replace(to_replace='Missing', value=False)
    store['prealn/download_bad'] = without_missing.replace('', True)

## Set Quality Scores Bad

In [13]:
if QUALITY_BAD:
    # NOTE(review): this reads a file named QUALITY, while the sibling cells
    # read ALIGNMENT_BAD / DOWNLOAD_BAD -- confirm the file name is intended.
    dat = get_vals('../output/prealn-wf/samples/{srx}/{srr}/QUALITY')
    flag_frame = pd.DataFrame(dat, columns=['srx', 'srr', 'quality_scores_bad'])
    srs = flag_frame.set_index(['srx', 'srr'])['quality_scores_bad']
    # No flag file -> False; an empty flag file -> True.
    without_missing = srs.replace(to_replace='Missing', value=False)
    store['prealn/quality_scores_bad'] = without_missing.replace('', True)

## Set ABI SOLID

In [14]:
def check_solid(srx, srr):
    """Flag a run as ABI SOLiD based on the first read of its FASTQ file.

    The run's LAYOUT file selects which FASTQ to inspect: ``{srr}_1.fastq.gz``
    for layouts 'SE', 'PE' and 'keep_R1', otherwise ``{srr}_2.fastq.gz``. The
    second line of that file (the first read's sequence) is then checked for a
    leading 'T'.

    NOTE(review): base-space reads can also begin with 'T', so this heuristic
    may over-call -- confirm it is strict enough for the data at hand.

    Parameters
    ----------
    srx, srr : str
        Identifiers used to build the sample paths.

    Returns
    -------
    tuple
        ``(srx, srr, flag)`` where ``flag`` is ``True`` if the sequence line
        starts with 'T', ``False`` if it does not, or the string ``'Missing'``
        when the LAYOUT or FASTQ file does not exist.
    """
    try:
        layout = check_file(f'../output/prealn-wf/samples/{srx}/{srr}/LAYOUT')
        if layout in ('SE', 'PE', 'keep_R1'):
            fastq = f'../output/prealn-wf/samples/{srx}/{srr}/{srr}_1.fastq.gz'
        else:
            fastq = f'../output/prealn-wf/samples/{srx}/{srr}/{srr}_2.fastq.gz'

        with gzip.open(fastq, 'rt') as handle:
            handle.readline()  # skip the first line (read header)
            sequence = handle.readline()
    except FileNotFoundError:
        return srx, srr, 'Missing'

    return srx, srr, sequence.startswith('T')

In [15]:
if ABI_SOLID:
    # One lazy check_solid task per (srx, srr) row of ``ids``.
    futures = [
        delayed(check_solid)(srx, srr)
        for _, (srx, srr) in ids.iterrows()
    ]

    submitted = client.compute(futures)
    res = client.gather(submitted)

    _dd = pd.DataFrame(res, columns=['srx', 'srr', 'abi_solid'])
    # Runs whose files were not found are recorded as False.
    cleaned = _dd.replace({'Missing': False})
    store['prealn/abi_solid'] = cleaned.set_index(['srx', 'srr']).iloc[:, 0]

## Summary

In [16]:
# Count of samples per stored layout value.
store['layout'].value_counts()

SE         20338
PE          7888
Missing     6484
keep_R1      536
keep_R2      117
Name: layout, dtype: int64

In [17]:
# Count of samples per stored strand value.
store['strand'].value_counts()

unstranded         18808
Missing             8001
opposite_strand     5482
same_strand         3072
Name: strand, dtype: int64

In [18]:
# Count of samples flagged / not flagged as download_bad.
store['prealn/download_bad'].value_counts()

False    35082
True       281
Name: download_bad, dtype: int64

In [19]:
# Count of samples flagged / not flagged as quality_scores_bad.
store['prealn/quality_scores_bad'].value_counts()

False    35359
True         4
Name: quality_scores_bad, dtype: int64

In [20]:
# Count of samples flagged / not flagged as abi_solid.
store['prealn/abi_solid'].value_counts()

False    32389
True      2974
Name: abi_solid, dtype: int64

In [21]:
# Count of samples flagged / not flagged as alignment_bad.
store['prealn/alignment_bad'].value_counts()

False    33006
True      2357
Name: alignment_bad, dtype: int64

In [22]:
# Close the HDF5 store opened at the top of the notebook.
store.close()