modENCODE may be a good reference point: the project produced a large number and variety of data sets. There may be some issues with this data because it is fairly old (~2012). Here I generate a sample table using the modENCODE database. I query their DB for all D. melanogaster sequencing data and use my copy of SRA to fill in SRX and SRR information. I then look at some basic summaries and output a sample sheet.

In [1]:
# %load ../config/defaults.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-09-15 
Git hash: ec5e4e30da99e64bf7cdc77a7333868aa6ba278e


In [89]:
# %load ../../bin/load.py
from pymongo import MongoClient
# Read the MongoDB host name from a file on disk (whitespace stripped).
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as fh:
    host = fh.read().strip()
# Connect on port 27022 and keep module-level handles to the 'sra2' database
# and its two collections; the helper functions below read `ncbi` as a global.
client = MongoClient(host=host, port=27022)
db = client['sra2']
ncbi = db['ncbi']
# NOTE(review): `remap` is not referenced by any cell visible in this notebook.
remap = db['remap']

# Building the table

In [183]:
# helper functions to complete srr and srx information from my database
def get_srx(srr):
    """Return the ``_id`` of the ``ncbi`` document that contains run *srr*.

    The collection is searched for a document whose ``sra.run.run_id``
    matches *srr*; this notebook uses the document ``_id`` as the SRX
    accession.

    Returns ``None`` when nothing matches: ``find_one`` then yields ``None``
    and subscripting it raises the ``TypeError`` handled here.
    """
    run_filter = {'sra.run.run_id': srr}
    try:
        record = ncbi.find_one(run_filter, {})
        return record['_id']
    except TypeError:
        return None

def get_srr(srx):
    """Return the list of run ids stored under the ``ncbi`` document *srx*.

    The document with ``_id == srx`` is unwound on ``sra.run`` so that each
    run becomes its own result, and each result's ``sra.run.run_id`` is
    collected in cursor order.

    Returns an empty list when no document matches, and ``None`` if a
    ``TypeError`` is raised while querying or collecting the results.
    """
    pipeline = [
        {'$match': {'_id': srx}},
        {'$unwind': '$sra.run'},
        {'$project': {'_id': 0, 'srr': '$sra.run.run_id'}},
    ]
    try:
        run_ids = []
        for doc in ncbi.aggregate(pipeline):
            run_ids.append(doc['srr'])
        return run_ids
    except TypeError:
        return None

def get_geo(gsm):
    """Return the ``_id`` of an ``ncbi`` document tagged with GEO accession *gsm*.

    The sample-level field ``sra.sample.GEO`` is tried first and the
    study-level field ``sra.study.GEO`` second. Despite the parameter name,
    the notebook passes both GSM and GSE accessions here.

    Only a single document is returned (``find_one``), so an accession that
    matches several documents resolves to just one of them.

    Returns ``None`` when neither field matches. Database errors are no
    longer swallowed: they propagate to the caller instead of being
    reported as "not found", matching how ``get_srx`` behaves.
    """
    # Same lookup order as before: sample-level first, then study-level.
    for field in ('sra.sample.GEO', 'sra.study.GEO'):
        doc = ncbi.find_one({field: gsm}, {})
        if doc is not None:
            return doc['_id']
    return None

def get_strategy(srx):
    """Return ``sra.experiment.library_strategy`` for the ``ncbi`` document *srx*.

    Returns ``None`` when no document has ``_id == srx`` (this includes
    ``srx=None``) or when the matched document has no library strategy.
    Database errors propagate to the caller rather than being turned into
    ``None`` by a bare ``except``.
    """
    pipeline = [
        {'$match': {'_id': srx}},
        {'$project': {'_id': 0, 'strat': '$sra.experiment.library_strategy'}},
    ]
    # `_id` is unique, so at most one document comes back; the default
    # argument to next() covers the "no match" case without an exception.
    doc = next(ncbi.aggregate(pipeline), None)
    if doc is None:
        return None
    # $project omits 'strat' when the source field is absent.
    return doc.get('strat')

In [219]:
# Download modENCODE table from modMine
from intermine.webservice import Service
service = Service("http://intermine.modencode.org/release-33/service")

# Get a new query on the class (table) you will be querying:
query = service.new_query("Submission")

# The view specifies the output columns
query.add_view("DCCid", "assayFactor", "experimentType", "databaseRecords.accession")

# Uncomment and edit the line below (the default) to select a custom sort order:
# query.add_sort_order("Submission.DCCid", "ASC")

# You can edit the constraint values below
query.add_constraint("organism.species", "=", "melanogaster", code="A")
query.add_constraint("databaseRecords.database", "ONE OF", ["GEO", "SRA"], code="B")
query.add_constraint("experimentType", "ONE OF", [
    "RNA-seq", "ChIP-seq", "RNA-seq, RNAi", "Computational annotation", "RIP-seq",
    "DNA-seq", "CAGE", "RACE",
], code="C")

# Uncomment and edit the code below to specify your own custom logic:
# query.set_logic("A")

# Expand every modENCODE submission record into one output row per SRR run.
rows = []
for row in query.rows():
    acc = row["databaseRecords.accession"].strip()

    # Resolve the accession to an (srx, [srr, ...]) pair based on its prefix.
    if acc.startswith('SRR'):
        # A run accession: keep it even if its SRX cannot be found, so that
        # the missing entry shows up as a null `srx` in the table.
        srrs = [acc]
        srx = get_srx(acc)
    elif acc.startswith('SRX'):
        srx = acc
        srrs = get_srr(srx)
    elif acc.startswith(('GSM', 'GSE')):
        srx = get_geo(acc)
        srrs = get_srr(srx)
    else:
        # Unrecognised accession format: report it and skip the record.
        print(acc, row['DCCid'])
        continue

    strat = get_strategy(srx)

    # get_srr() returns None on failure; treat that as "no runs" instead of
    # crashing the whole loop. NOTE: SRX/GEO accessions that resolve to no
    # runs therefore contribute no rows to the table.
    for srr in srrs or []:
        rows.append([srx, srr, row["DCCid"], row["assayFactor"], row["experimentType"], strat])

df = pd.DataFrame(rows, columns=['srx', 'srr', 'modENCODE_id', 'modENCODE_assay', 'modENCODE_type', 'sra_strategy'])

TMPID:GEO:3323_lane_1 modENCODE_3323
TMPID:GEO:3323_lane_2 modENCODE_3323
TMPID:GEO:3324_lane_1 modENCODE_3324
TMPID:GEO:3324_lane_2 modENCODE_3324
TMPID:GEO:3324_lane_3 modENCODE_3324
TMPID:GEO:3325_lane_1 modENCODE_3325
TMPID:GEO:3325_lane_2 modENCODE_3325
TMPID:GEO:3325_lane_3 modENCODE_3325


The above samples had strange entries for the GEO accessions. There is not much I can do with this data, so I just ignore these samples.

In [220]:
# Display every row that has a missing value in at least one column.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,srx,srr,modENCODE_id,modENCODE_assay,modENCODE_type,sra_strategy
542,,SRR069507,modENCODE_3193,small-RNA,RNA-seq,
1058,,SRR124149,modENCODE_4409,total-RNA,RNA-seq,
1363,,SRR488719,modENCODE_4771,small-RNA,RNA-seq,


The above 3 samples no longer exist in NCBI's SRA database. I double checked everything through the web interface, so I am just going to ignore these samples.

In [227]:
# Create cleaned dataset: drop rows with any missing value, then drop exact
# duplicate rows (keeping the first occurrence).
df_clean = df.dropna(how='any').drop_duplicates(keep='first')
print('My table has {:,} samples from modENCODE'.format(len(df_clean)))

My table has 2,311 samples from modENCODE


# Basic summary counts

## Compare modENCODE's assay type to SRA's.

In [223]:
# Count rows for each (modENCODE experiment type, SRA library strategy) pair,
# with row/column totals in the "All" margins.
pd.crosstab(
    index=df_clean['modENCODE_type'],
    columns=df_clean['sra_strategy'],
    margins=True,
)

sra_strategy,ChIP-Seq,EST,OTHER,RNA-Seq,WGS,All
modENCODE_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CAGE,0,0,0,55,0,55
ChIP-seq,963,0,0,0,0,963
Computational annotation,0,32,68,58,0,158
DNA-seq,8,0,22,0,27,57
RACE,0,0,0,24,0,24
RIP-seq,0,0,68,0,0,68
RNA-seq,0,553,25,201,0,779
"RNA-seq, RNAi",0,0,0,207,0,207
All,971,585,183,545,27,2311


## Compare modENCODE's assay factors to SRA's assay types.

modENCODE has a column called assay factor. This column contains some general values like 'PolyA-RNA'; it also has ChIP-seq targets, which I am ignoring here.

In [228]:
# Boolean mask selecting only the general (non ChIP-seq-target) assay factors.
factors = df_clean['modENCODE_assay'].isin([
    '5-prime-UTR',
    'UTR',
    'genomic_sequence',
    'small-RNA',
    'total-RNA',
    'PolyA-RNA',
])
# Cross-tabulate those rows by assay factor and SRA library strategy.
pd.crosstab(
    index=df_clean.loc[factors, 'modENCODE_assay'],
    columns=df_clean.loc[factors, 'sra_strategy'],
    margins=True,
)

sra_strategy,EST,OTHER,RNA-Seq,WGS,All
modENCODE_assay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5-prime-UTR,0,0,55,0,55
PolyA-RNA,0,0,24,0,24
UTR,0,0,24,0,24
genomic_sequence,0,0,0,21,21
small-RNA,5,25,58,0,88
total-RNA,568,5,140,0,713
All,573,30,301,21,925


## Output sample table.

In [225]:
# Preview the first five rows of the cleaned table before writing it out.
df_clean.head(n=5)

Unnamed: 0,srx,srr,modENCODE_id,modENCODE_assay,modENCODE_type,sra_strategy
0,SRX002599,SRR013488,modENCODE_1040,UTR,RACE,RNA-Seq
1,SRX002600,SRR013489,modENCODE_1040,UTR,RACE,RNA-Seq
2,SRX002600,SRR013490,modENCODE_1040,UTR,RACE,RNA-Seq
3,SRX002601,SRR013491,modENCODE_1040,UTR,RACE,RNA-Seq
4,SRX002602,SRR013492,modENCODE_1040,UTR,RACE,RNA-Seq


In [229]:
# Write the cleaned table as a tab-separated file, without the row index.
df_clean.to_csv(
    '../../output/modENCODE_sampletable.tsv',
    index=False,
    sep='\t',
)