# Testing Migrating Old Data

I have refactored the prealignment workflow. I need to move the old data over, so I want to test what needs to be done. For this notebook I am going to focus on a single sample that I have manually copied over.

In [2]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-10-20 
Git hash: 6b527ce9b88199c4f8ea2cfccb60c05c5b35093b


In [4]:
# %load ../../bin/load.py
# Open the project's MongoDB and expose the `remap` collection used by the
# aggregate queries in this notebook.
from pymongo import MongoClient
# The host name is read from a dot-file instead of being written in the notebook;
# surrounding whitespace/newlines are stripped.
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as fh:
    host = fh.read().strip()
# The server is reached on port 27022.
client = MongoClient(host=host, port=27022)
# Database `sra2`, collection `remap`.
db = client['sra2']
remap = db['remap']

In [59]:
from dask import delayed, compute
from dask.diagnostics import ProgressBar

In [75]:
srx = 'SRX287752'
srr = 'SRR869941'

There are two TSVs that are now created by the pipeline instead of dumping to the database:

* {SRR}.fastq.tsv
* {SRR}.hisat2.bam.tsv

## Create FASTQ tsv

Output of the table looks like this:

md5_R1 | libsize_R1 | avgLen_R1 | md5_R2 | libsize_R2 | avgLen_R2
------ | ---------- | --------- | ------ | ---------- | ---------
8a82f3f5ef231d8653702d5fb14e3b99 | 12752083 | 101.0 | | |

In [19]:
# One row per run (srx, srr) that has an md5 recorded. Every per-read field is
# projected through $ifNull so that a missing R1/R2 value comes back as NaN.
# The comprehension emits the keys in the order md5, libsize, avgLen for R1
# and then again for R2.
df = pd.DataFrame(list(remap.aggregate([
    {'$unwind': '$runs'},
    {'$match': {'runs.md5': {'$exists': 1}}},
    {'$project': {
        '_id': 0,
        'srx': '$_id',
        'srr': '$runs.srr',
        **{
            '{}_{}'.format(column, read): {
                '$ifNull': ['$runs.{}.{}'.format(field, read), np.nan]
            }
            for read in ('R1', 'R2')
            for column, field in (
                ('md5', 'md5'),
                ('libsize', 'libsize'),
                ('avgLen', 'avgReadLen'),
            )
        },
    }},
])))

In [20]:
# Index the table by (srx, srr). This mutates `df` in place and moves both
# columns into the index, so running this cell a second time on the same `df`
# fails because the columns no longer exist.
df.set_index(['srx', 'srr'], inplace=True)

In [28]:
# Column order of the {SRR}.fastq.tsv table.
cols = [
    'md5_R1', 'libsize_R1', 'avgLen_R1',
    'md5_R2', 'libsize_R2', 'avgLen_R2',
]
# Keep only the runs that have at least one non-null value, as an independent copy.
fastq = df.loc[df.notnull().any(axis=1), cols].copy()

In [71]:
# Bookkeeping shared with the cells below: (srx, srr) pairs that could not be
# written because the run directory is missing, and pairs that were written.
skipped = []
done = []
def make_fastq(srx, srr, df):
    """Write one run's FASTQ summary to ``{srr}.fastq.tsv``.

    Parameters
    ----------
    srx, srr : str
        Experiment and run accessions; together they select the output
        directory ``../../output/prealignment/raw/{srx}/{srr}/``.
    df : pandas.Series
        A single row of summary values (a Series despite the name). It is
        written as a one-row, tab-separated table with a header and no index.

    Returns
    -------
    None
        The outcome is recorded by appending ``(srx, srr)`` to the
        module-level ``done`` list on success, or to ``skipped`` when the
        output directory does not exist.
    """
    fname = '../../output/prealignment/raw/{srx}/{srr}/{srr}.fastq.tsv'.format(srx=srx, srr=srr)

    # Test for the directory explicitly: newer pandas releases raise a plain
    # OSError (not FileNotFoundError) from to_csv when the parent directory is
    # missing, which would otherwise abort the whole run instead of skipping.
    if not os.path.isdir(os.path.dirname(fname)):
        skipped.append((srx, srr))
        return

    try:
        df.to_frame().T.to_csv(fname, sep='\t', index=False)
    except FileNotFoundError:
        # The directory disappeared between the check and the write.
        skipped.append((srx, srr))
    else:
        done.append((srx, srr))

In [72]:
# One lazy make_fastq task per run; each row of `fastq` is labelled by its (srx, srr) pair.
dfs = [
    delayed(make_fastq)(srx_id, srr_id, row)
    for (srx_id, srr_id), row in fastq.iterrows()
]

In [73]:
with ProgressBar():
    compute(*dfs)

[########################################] | 100% Completed |  2min 43.7s


In [74]:
len(done), len(skipped)

(24452, 3163)

## Create Hisat2 Summary tsv

num_reads | num_reads_unpaired | num_unaligned | num_uniquely_aligned | num_multimappers | per_alignment
--------- | ------------------ | ------------- | -------------------- | ---------------- | -------------
12752083 | 12752083 | 2554251 | 8260350 | 1937482 | 79.97

In [115]:
# (srx, srr) for every run that has a hisat2 entry under pre_aln_workflow.
ids = [
    (doc['srx'], doc['srr'])
    for doc in remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_workflow.hisat2': {'$exists': 1}}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])
]

In [94]:
from pathlib import Path
from ncbi_remap.parser import parse_hisat2

In [109]:
# Bookkeeping shared with the cells below. Note that this rebinds the
# `skipped`/`done` names used by the FASTQ step earlier in the notebook.
skipped = []
done = []
bad = []
def parse(srx, srr):
    """Convert one run's hisat2 log into ``{srr}.hisat2.bam.tsv``.

    The log ``{srr}.hisat2.bam.log`` in
    ``../../output/prealignment/raw/{srx}/{srr}/`` is parsed with
    ``parse_hisat2`` and the result is written next to it as a tab-separated
    table without the index. When the first row's ``per_alignment`` is below
    0.50 the run is appended to ``bad`` and an empty ``ALIGNMENT_BAD`` flag
    file is touched in the run directory.

    Parameters
    ----------
    srx, srr : str
        Experiment and run accessions that locate the run directory.

    Returns
    -------
    None
        ``(srx, srr)`` is appended to ``done`` on success, or to ``skipped``
        when a FileNotFoundError is raised (e.g. the log is missing).
    """
    try:
        fn = '../../output/prealignment/raw/{srx}/{srr}/{srr}.hisat2.bam.log'.format(srx=srx, srr=srr)
        # Presumably a one-row DataFrame of alignment counts — confirm against ncbi_remap.parser.
        dd = parse_hisat2(srr, fn)
        output = '../../output/prealignment/raw/{srx}/{srr}/{srr}.hisat2.bam.tsv'.format(srx=srx, srr=srr)
        dd.to_csv(output, sep='\t', index=False)

        # `.ix` was deprecated in pandas 0.20 and removed in 1.0; read the
        # first row positionally instead.
        # NOTE(review): the example table in this notebook shows per_alignment
        # as 79.97 — confirm the units returned by parse_hisat2 match this
        # 0.50 cut-off.
        if dd['per_alignment'].iloc[0] < .50:
            fn = '../../output/prealignment/raw/{srx}/{srr}/ALIGNMENT_BAD'.format(srx=srx, srr=srr)
            bad.append((srx, srr))
            Path(fn).touch()

        done.append((srx, srr))
    except FileNotFoundError:
        skipped.append((srx, srr))

In [110]:
# One lazy parse task per (srx, srr) pair found in the database.
dfs = [
    delayed(parse)(srx_id, srr_id)
    for srx_id, srr_id in ids
]

In [111]:
with ProgressBar():
    compute(*dfs)

[########################################] | 100% Completed |  2min 44.6s


In [112]:
len(done), len(skipped)

(24419, 14)

In [114]:
len(bad)

2228

In [116]:
# (srx, srr) for every run whose pre_aln_flags in the database include 'alignment_bad'.
dbBad = [
    (doc['srx'], doc['srr'])
    for doc in remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'alignment_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])
]

In [122]:
len([x for x in bad if x in dbBad])

2228

In [126]:
# Runs flagged as bad in the database that were not flagged by the re-parse above.
problems = [
    pair
    for pair in dbBad
    if pair not in bad
]

In [138]:
# Second element (the srr) of every problem pair, as a plain Python list for
# the `$in` match in the next query.
bob = np.array(problems)[:, 1].tolist()

In [140]:
# Show the stored pre-alignment flags of the problem runs.
list(remap.aggregate([
    {'$unwind': '$runs'},
    {'$match': {'runs.srr': {'$in': bob}}},
    {'$project': {'flags': '$runs.pre_aln_flags'}},
]))

[{'_id': 'SRX886177', 'flags': ['SE', 'alignment_bad', 'unstranded']},
 {'_id': 'SRX765645', 'flags': ['SE', 'alignment_bad']},
 {'_id': 'SRX765644', 'flags': ['SE', 'alignment_bad', 'opposite_strand']},
 {'_id': 'SRX765642', 'flags': ['SE', 'alignment_bad', 'same_strand']},
 {'_id': 'SRX765641', 'flags': ['SE', 'alignment_bad', 'same_strand']},
 {'_id': 'SRX765640', 'flags': ['SE', 'alignment_bad', 'opposite_strand']},
 {'_id': 'SRX751576', 'flags': ['SE', 'alignment_bad', 'opposite_strand']},
 {'_id': 'SRX679372', 'flags': ['SE', 'alignment_bad', 'opposite_strand']},
 {'_id': 'SRX679372', 'flags': ['SE', 'alignment_bad', 'opposite_strand']},
 {'_id': 'SRX679369', 'flags': ['SE', 'alignment_bad']},
 {'_id': 'SRX679368', 'flags': ['SE', 'alignment_bad']},
 {'_id': 'SRX468097', 'flags': ['SE', 'alignment_bad']},
 {'_id': 'SRX156291', 'flags': ['SE', 'alignment_bad']},
 {'_id': 'SRX029216', 'flags': ['SE', 'keep_R1', 'alignment_bad']}]

In [141]:
# Open the project's HDF5 store, which the cells below modify.
# NOTE(review): no visible cell calls store.close() — confirm it is closed elsewhere.
store = pd.HDFStore('../../sra.h5')

In [145]:
problems = pd.DataFrame(problems, columns=['srx', 'srr'])

In [146]:
from ncbi_remap.io import remove_chunk, add_table

In [150]:
# Add the problem runs to the 'prealn/queue' table, printing its row count
# before and after.
# NOTE(review): the recorded output shows the same count (10643) both times —
# presumably these runs were already queued or add_table de-duplicates; confirm.
print(store['prealn/queue'].shape[0])
add_table(store, 'prealn/queue', data=problems)
print(store['prealn/queue'].shape[0])

10643
10643


In [155]:
# Take the problem runs out of the 'prealn/alignment_bad' table (selected by
# their srr values), printing its row count before and after. The recorded
# output shows the count dropping from 2242 to 2228.
# NOTE(review): exact matching semantics of remove_chunk are not visible here — confirm.
print(store['prealn/alignment_bad'].shape[0])
remove_chunk(store, 'prealn/alignment_bad', problems.srr)
print(store['prealn/alignment_bad'].shape[0])

2242
2228


In [166]:
# Walk the problem runs. The shell `rm` that would delete each run's
# {srr}*bam* files is commented out, so as written the loop only rebinds the
# notebook-level `srx` and `srr` names.
for i, row in problems.iterrows():
    srx, srr = row.srx, row.srr
    #!rm ../../output/prealignment/raw/{srx}/{srr}/{srr}*bam*