# Paul's List

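This notebook builds the list of golden samples to send to Paul for running read indexing. A run (SRR) is kept when it is the only run of its experiment (SRX), is well second stranded (cutoff 0.95), has at least 1 million reads, has at least 85% mapping, and belongs to an experiment annotated as RNA-Seq.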

In [1]:
# %load ../start2.py
# Imports
import os
import sys

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../../lib/python')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs
from ncbi_remap.prealn_wf import libsize, mappability, strandedness

# Setup notebook
nbconfig = Nb.setup_notebook()

# Turn on cache
from joblib import Memory
# The cache directory is passed positionally on purpose: it is the first
# parameter of Memory under both its older name (`cachedir`) and its newer
# name (`location`), so this call works across joblib versions. The
# `cachedir=` keyword is deprecated in favour of `location`.
memory = Memory(nbconfig.cache, verbose=0)

# Connect to data store (opened read-only via mode='r')
store = pd.HDFStore('../../output/sra.h5', mode='r')


last updated: 2017-11-17 
Git hash: d93d3e9dec278c9175123b44f16939fc03a6ef06


In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient

# The MongoDB host name is read from a file in the project's output directory.
# The path uses the same project-relative root ('../../') as the other paths
# in this notebook instead of a user-specific absolute home directory, so the
# notebook runs from any checkout of the project.
with open('../../output/.mongodb_host', 'r') as fh:
    host = fh.read().strip()  # drop the trailing newline/whitespace

client = MongoClient(host=host, port=27022)

# Handles to the database and the two collections used by this notebook
db = client['sra2']
remap = db['remap']
ncbi = db['ncbi']

In [3]:
# Keep only experiments (srx) that have exactly one run (srr)
complete = store['prealn/complete'].copy()

# Count the non-null entries of each column per srx
srr_cnts = complete.groupby('srx').count()

# An srx is a singleton when it has exactly one srr
has_one_run = srr_cnts['srr'] == 1
singletons = srr_cnts.loc[has_one_run].index.tolist()

# Runs that belong to those single-run experiments
in_singleton = complete['srx'].isin(singletons)
srr_single = complete.loc[in_singleton, 'srr'].tolist()

In [4]:
# Split the single-run SRRs into first-stranded, second-stranded and
# unstranded tables, passing a 0.95 cutoff to strandedness()
first, second, unstranded = strandedness(
    store,
    cutoff=0.95,
    keep_srrs=srr_single,
)

# Show the (rows, columns) shape of each table
tuple(table.shape for table in (first, second, unstranded))

((1751, 2), (3643, 2), (9369, 2))

In [5]:
# Among the well second-stranded runs, grab those with at least 1 million reads
second_srrs = second['srr'].tolist()
big = libsize(store, cutoff=1e6, keep_srrs=second_srrs)

In [6]:
# Among the runs in `big`, keep those with at least 85% mapping
big_srrs = big['srr'].tolist()
mapped = mappability(store, cutoff=0.85, keep_srrs=big_srrs)

In [7]:
# Grab samples annotated as RNA-Seq

In [8]:
# Peek at a single document (no filter) to see how the library strategy
# field is laid out; only that field is requested in the projection.
any_document = {}
strategy_only = {'sra.experiment.library_strategy': 1}
ncbi.find_one(any_document, strategy_only)

{'_id': 'SRX2581987', 'sra': {'experiment': {'library_strategy': 'OTHER'}}}

In [9]:
# Aggregation stages, applied in order:
#   1. keep experiments whose library strategy is 'RNA-Seq'
#   2. unwind the run array so there is one document per run
#   3. keep only runs whose id is listed in `mapped`
#   4. project each document down to a flat {srx, srr} record
rnaseq_only = {'$match': {'sra.experiment.library_strategy': 'RNA-Seq'}}
one_doc_per_run = {'$unwind': '$sra.run'}
in_mapped = {'$match': {'sra.run.run_id': {'$in': mapped.srr.tolist()}}}
srx_srr_record = {
    '$project': {
        '_id': 0,
        'srx': '$_id',
        'srr': '$sra.run.run_id',
    },
}

pipeline = [rnaseq_only, one_doc_per_run, in_mapped, srx_srr_record]
records = list(ncbi.aggregate(pipeline))

final = pd.DataFrame(records)
print(final.shape)
final.head()

(3474, 2)


Unnamed: 0,srr,srx
0,SRR5234256,SRX2541782
1,SRR5234255,SRX2541781
2,SRR5234254,SRX2541780
3,SRR5234253,SRX2541779
4,SRR5234252,SRX2541778


In [10]:
# Write the selected srx/srr pairs as a tab-separated file without the index
test_set_path = '../../output/dmel_stranded_test_set.tsv'
srx_srr_pairs = final[['srx', 'srr']]
srx_srr_pairs.to_csv(test_set_path, sep='\t', index=False)