# modEncode Runs

I need a list of modEncode runs so that I can start making wiggle tracks. I want to focus on the runs that the Thierry-Miegs decided to analyze, since I am still figuring out which settings to use.

In [2]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)

last updated: 2017-04-14 
Git hash: d016341af9f7e0bf172fb8dbe9e83813da94ce03


In [8]:
# Open a client for the local MongoDB instance used for metadata look-ups
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27022)

# Collection handles taken from the 'sra2' database
db = client['sra2']
ncbi, remap = db['ncbi'], db['remap']

In [7]:
# Fetch the SRA accessions of modEncode RNA-seq submissions from the
# modEncode InterMine web service.
from intermine.webservice import Service

service = Service("http://intermine.modencode.org/release-32/service")

# Query the Submission class; the accession is the only output column.
query = service.new_query("Submission")
query.add_view("databaseRecords.accession")

# To force a custom sort order, uncomment and edit the line below:
# query.add_sort_order("Submission.databaseRecords.accession", "ASC")

# Constraints: SRA database records, RNA-seq experiments, species melanogaster.
query.add_constraint("databaseRecords.database", "=", "SRA", code="A")
query.add_constraint("experimentType", "=", "RNA-seq", code="B")
query.add_constraint("organism.species", "=", "melanogaster", code="C")

# To specify custom constraint logic, uncomment and edit the line below:
# query.set_logic("A and B and C")

# Run the query once, then split the accessions into experiments (SRX) and
# runs (SRR); accessions with any other prefix are ignored.
accessions = [row["databaseRecords.accession"] for row in query.rows()]
modEncode_srx = [acc for acc in accessions if acc.startswith('SRX')]
modEncode_srr = [acc for acc in accessions if acc.startswith('SRR')]

len(modEncode_srx), len(modEncode_srr)

(12, 598)

In [19]:
# BioProject accessions listed in Zhenxia's notes; these are queried for as
# well, in addition to the accessions pulled from the modEncode database.
modEncodeProj = [
    "PRJNA63469",
    "PRJNA168994",
    "PRJNA200701",
    "PRJNA75285",
]

In [40]:
# Query and build a list of SRRs.
#
# A run is kept when its own run ID is in modEncode_srr, or when the document
# it belongs to has an _id in modEncode_srx or a BioProject accession in
# modEncodeProj.
modEncode_filter = {
    '$or': [
        {'sra.run.run_id': {'$in': modEncode_srr}},
        {'_id': {'$in': modEncode_srx}},
        {'bioproject.bioproject_accn': {'$in': modEncodeProj}},
    ]
}

m = ncbi.aggregate([
    # Filter before unwinding so that $unwind only expands candidate
    # documents instead of every document in the collection. Any run that
    # passes the per-run filter below comes from a document that passes here,
    # so the set of selected runs is unchanged. No $sort is applied, so the
    # order in which the runs are returned is not guaranteed.
    {'$match': modEncode_filter},
    # One output document per run.
    {'$unwind': '$sra.run'},
    # Re-apply the filter per run: a document matched only through one of its
    # run IDs must not contribute its other runs.
    {'$match': modEncode_filter},
    {'$project': {'srr': '$sra.run.run_id'}},
])

modEncodeSamples = [x['srr'] for x in m]

In [41]:
# Load the SRR accessions analyzed by the Miegs (each line is stripped of
# surrounding whitespace)
with open('../../data/13495_runs_analyzed_by_mieg.txt') as fh:
    miegs = [line.strip() for line in fh]

In [44]:
# Write the modEncode SRRs that were also analyzed by the Miegs, one per line.
# Membership is checked against a set rather than the `miegs` list so that each
# look-up is constant time instead of a scan over the whole list.
miegs_lookup = set(miegs)

with open('../../data/modEncode_srrs.txt', 'w') as fh:
    for srr in modEncodeSamples:
        if srr in miegs_lookup:
            fh.write(srr + '\n')