We have a large RNA-seq dataset from the lab that I also want to use as an example dataset for classification. The samples come from SRA study SRP074593.

In [2]:
# %load ../config/defaults.py
# Notebook-wide defaults: load IPython extensions, extend the import path,
# import the common analysis/plotting stack and set display options.

# Activate the autoreload extension for easy reloading of external packages
# (mode 2 is intended to re-import modules before each cell is executed).
%reload_ext autoreload
%autoreload 2

# Turn on the watermark: stamps the notebook output with the last-updated
# date and the current git hash.
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension and point its cache directory at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to the front of the import path so it takes precedence
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline and use seaborn's large "poster" scaling
%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation when printing numpy arrays (5 digits of precision)
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-09-15 
Git hash: 5185ab6dc10f8090cb502805e2f296a8c2d0b431


In [5]:
# %load ../../bin/load.py
from pymongo import MongoClient

# Where the MongoDB host name is stored on disk, and the port the server
# listens on.
MONGO_HOST_FILE = '/home/fearjm/Projects/ncbi_remap/output/.mongodb_host'
MONGO_PORT = 27022

# The host file's content is used as the host name once surrounding
# whitespace (e.g. a trailing newline) has been stripped.
with open(MONGO_HOST_FILE, 'r') as fh:
    host = fh.read().strip()

# Connect, then keep handles on the 'sra2' database and the two collections
# used by the rest of the notebook.
client = MongoClient(host=host, port=MONGO_PORT)
db = client['sra2']
ncbi, remap = db['ncbi'], db['remap']

In [21]:
# Query database for SRP
# The aggregation pipeline is assembled from three named stages:
#   1. keep only documents belonging to the study of interest,
#   2. emit one document per entry of the `runs` array,
#   3. keep just the experiment id, the run id and the biosample attributes.
match_study = {'$match': {'sra.study.study_id': 'SRP074593'}}
unwind_runs = {'$unwind': '$runs'}
project_fields = {
    '$project': {
        '_id': 0,
        'srx': '$srx',
        'srr': '$runs.srr',
        'attrs': '$biosample.attributes',
    }
}
agg = ncbi.aggregate([match_study, unwind_runs, project_fields])

# Biosample attribute name -> column name in the sample table.
ATTR_TO_COLUMN = {
    'cell_line': 'cell_line',
    'RNAi target gene name': 'target_gene',
    'RNAi reagent from harvard drosophila rnai screening center': 'drsc',
}

samples = []
for record in agg:
    # Only the first element of the projected attributes is scanned;
    # presumably there is one attribute list per biosample -- TODO confirm.
    attributes = record.pop('attrs')[0]
    for attribute in attributes:
        column = ATTR_TO_COLUMN.get(attribute['name'])
        if column is not None:
            record[column] = attribute['value']

    # Every run in this table is labelled with the same library strategy.
    record['library_strategy'] = 'RNA-Seq'
    samples.append(record)

In [26]:
# Build the sample table: one row per parsed record, with the columns
# placed in the order used downstream.
SAMPLE_COLUMNS = ['srr', 'srx', 'library_strategy', 'cell_line', 'drsc', 'target_gene']
df = pd.DataFrame(samples)[SAMPLE_COLUMNS]

In [28]:
# Sanity check: print the (rows, columns) shape, then display the first rows.
print(df.shape)
df.head()

(1900, 6)


Unnamed: 0,srr,srx,library_strategy,cell_line,drsc,target_gene
0,SRR3488543,SRX1750547,RNA-Seq,S2R+,DRSC27017,CG11560
1,SRR3488542,SRX1750546,RNA-Seq,S2R+,DRSC23043,CG8765
2,SRR3488541,SRX1750545,RNA-Seq,S2R+,DRSC14626,CG13624
3,SRR3488540,SRX1750544,RNA-Seq,S2R+,DRSC08148,Bgb
4,SRR3488539,SRX1750543,RNA-Seq,S2R+,DRSC37662,Taf6


In [29]:
# Save the sample table as a tab-separated file, leaving out the row index.
SAMPLE_TABLE_PATH = '../../output/s2rnai_sampletable.tsv'
df.to_csv(SAMPLE_TABLE_PATH, index=False, sep='\t')