# Building Golden

In this notebook I want to build a golden set of samples after assembling all of my flags.

In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-10-11 
Git hash: 8c83e87b7c4eac097d2ea2f50eee0e3a81393eaa


In [2]:
# %load ../../bin/load.py
# Connect to MongoDB and expose the `remap` collection that the aggregation
# queries in the following cells run against.
from pymongo import MongoClient
# NOTE(review): absolute, user-specific path -- this cell only runs on a
# machine where this file exists; consider making the location configurable.
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as fh:
    # The file's content, stripped of surrounding whitespace, is the host name.
    host = fh.read().strip()
client = MongoClient(host=host, port=27022)
# `sra2` database, then its `remap` collection.
db = client['sra2']
remap = db['remap']


In [3]:
# Master table of every run: one row per (srx, srr) pair.
# `$unwind` emits one document per element of `runs`; `$project` keeps only
# the document `_id` (renamed to `srx`) and that run's `srr`.
all_runs_pipeline = [
    {'$unwind': '$runs'},
    {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
]
df = pd.DataFrame(list(remap.aggregate(all_runs_pipeline)))

In [4]:
# Flag runs that have completed pre-alignment.
# Select the (srx, srr) pairs whose `runs.pre_aln_flags` matches 'complete'.
_df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'complete'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    # Name the columns explicitly: if the query returns no documents the
    # frame would otherwise have no `srr` column and `_df.srr` below would
    # raise AttributeError instead of flagging nothing.
    columns=['srx', 'srr'],
)

# Boolean column: True where the run's srr was returned by the query above.
df['flag_complete'] = df.srr.isin(_df.srr)

In [5]:
# Flag runs marked as a bad download.
# Select the (srx, srr) pairs whose `runs.pre_aln_flags` matches 'download_bad'.
_df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'download_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    # Name the columns explicitly: if the query returns no documents the
    # frame would otherwise have no `srr` column and `_df.srr` below would
    # raise AttributeError instead of flagging nothing.
    columns=['srx', 'srr'],
)

# Boolean column: True where the run's srr was returned by the query above.
df['flag_download_bad'] = df.srr.isin(_df.srr)

In [6]:
# Flag runs marked as a bad alignment.
# Select the (srx, srr) pairs whose `runs.pre_aln_flags` matches 'alignment_bad'.
_df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'alignment_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    # Name the columns explicitly: if the query returns no documents the
    # frame would otherwise have no `srr` column and `_df.srr` below would
    # raise AttributeError instead of flagging nothing.
    columns=['srx', 'srr'],
)

# Boolean column: True where the run's srr was returned by the query above.
df['flag_alignment_bad'] = df.srr.isin(_df.srr)

In [7]:
# Flag ABI SOLiD runs.
# Select the (srx, srr) pairs whose `runs.pre_aln_flags` matches 'abi_solid'.
_df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'abi_solid'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    # Name the columns explicitly: if the query returns no documents the
    # frame would otherwise have no `srr` column and `_df.srr` below would
    # raise AttributeError instead of flagging nothing.
    columns=['srx', 'srr'],
)

# Boolean column: True where the run's srr was returned by the query above.
df['flag_abi'] = df.srr.isin(_df.srr)

In [8]:
# Flag runs whose FASTQ file is malformed.
# Select the (srx, srr) pairs whose `runs.pre_aln_flags` matches
# 'quality_scores_bad'.
_df = pd.DataFrame(
    list(remap.aggregate([
        {'$unwind': '$runs'},
        {'$match': {'runs.pre_aln_flags': 'quality_scores_bad'}},
        {'$project': {'_id': 0, 'srx': '$_id', 'srr': '$runs.srr'}},
    ])),
    # Name the columns explicitly: if the query returns no documents the
    # frame would otherwise have no `srr` column and `_df.srr` below would
    # raise AttributeError instead of flagging nothing.
    columns=['srx', 'srr'],
)

# Boolean column: True where the run's srr was returned by the query above.
df['flag_malformed'] = df.srr.isin(_df.srr)

In [14]:
# Index the flag table by (srx, srr).
# Guarded and non-inplace so that re-running this cell is a no-op; an
# unconditional set_index raises KeyError on the second run because the
# 'srx'/'srr' columns have already been moved into the index.
if list(df.index.names) != ['srx', 'srr']:
    df = df.set_index(['srx', 'srr'])

In [15]:
# Load the pickled library-size table (presumably produced by the libsize
# downstream-analysis step -- confirm); it is joined onto `df` further down.
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# generated by this project.
libsize = pd.read_pickle('../../output/libsize_downstream_analysis.pkl')

In [17]:
# Load the pickled contamination table (presumably produced by the
# contamination downstream-analysis step -- confirm); it is joined onto `df`
# further down.
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# generated by this project.
contamination = pd.read_pickle('../../output/contamination_downstream_analysis.pkl')

In [18]:
# Load the pickled correlation table (presumably produced by the correlation
# downstream-analysis step -- confirm); it is joined onto `df` further down.
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# generated by this project.
correlation = pd.read_pickle('../../output/correlation_downstream_analysis.pkl')

In [19]:
# Load the pickled mappability table; it is joined onto `df` further down.
# NOTE(review): the file name is spelled 'mapability' (one 'p') while the
# variable is `mappability` -- confirm the spelling matches the file on disk.
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# generated by this project.
mappability = pd.read_pickle('../../output/mapability_downstream_analysis.pkl')

In [21]:
# Combine the pre-alignment flag table with the four downstream-analysis
# tables. DataFrame.join aligns on the index and defaults to a left join, so
# the rows of `df` define the result.
# NOTE(review): assumes each loaded table is indexed by (srx, srr) like `df`
# -- TODO confirm.
merged = df.join([libsize, contamination, correlation, mappability])

In [25]:
# List every flag column in the merged table.
# `filter(regex=...)` applies a regular-expression search, not a glob: the
# pattern 'flag*' means "fla" followed by zero or more "g" and would match
# any column merely containing "fla". Anchor on the 'flag_' prefix instead.
merged.filter(regex=r'^flag_').columns

Index(['flag_complete', 'flag_download_bad', 'flag_alignment_bad', 'flag_abi',
       'flag_malformed', 'flag_low_libsize', 'flag_short_read_len',
       'flag_wolbachia', 'flag_hg19', 'flag_yeast', 'flag_ecoli', 'flag_ercc',
       'flag_missing_counts', 'flag_singleton', 'flag_doubleton', 'flag_multi',
       'flag_drop_corr', 'flag_low_unique_alignment'],
      dtype='object')

In [None]:
# FIXME(review): unfinished cell -- `pd.read_` is an incomplete attribute
# access and is expected to raise AttributeError if executed. Complete the
# intended `pd.read_*` call or delete this cell so Restart & Run All succeeds.
pd.read_