I need to create a dmel (*Drosophila melanogaster*) atlas sample table for running the RNA-seq pipeline. The atlas data come from Haiwang's project, but I have already downloaded some of it as part of the SRA.

In [1]:
# %load ../start.py
# Imports
import os
import sys
from tempfile import TemporaryDirectory
import re

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../../lib')
from larval_gonad.notebook import Nb

# Setup notebook
# `nbconfig` carries the project settings used below (`nbconfig.cache`,
# `nbconfig.email`).
nbconfig = Nb.setup_notebook()

# Turn on cache
# NOTE(review): `memory` is not used by any cell visible in this notebook.
# NOTE(review): later joblib releases deprecate `cachedir` in favour of
# `location` -- confirm against the pinned joblib version before upgrading.
from joblib import Memory
memory = Memory(cachedir=nbconfig.cache, verbose=0)

import GEOparse
import Bio.Entrez as Entrez

# Contact address sent along with Entrez requests.
Entrez.email = nbconfig.email

last updated: 2017-11-22 
Git hash: 6736e3124c8b7140a2310fe6a237ad3fd2f393b7


In [2]:
def get_srr(x):
    """Return the SRR run accessions mentioned in the SRA record for ``x``.

    Parameters
    ----------
    x : str
        Identifier passed as ``id`` to ``Entrez.efetch`` against the ``sra``
        database (the caller in this notebook passes an SRX accession).

    Returns
    -------
    list of str
        Unique ``SRR<digits>`` accessions found in the response, sorted so
        the order is the same on every run. Empty when none are found.
    """
    handle = Entrez.efetch('sra', id=x)
    try:
        res = handle.read()
    finally:
        # Always release the connection, even if reading fails.
        handle.close()

    # NOTE(review): some Biopython versions appear to hand back bytes rather
    # than text; decode so the str pattern below still applies -- confirm.
    if isinstance(res, bytes):
        res = res.decode('utf-8')

    # Raw string avoids the invalid "\d" escape; sorted() replaces the
    # arbitrary set iteration order with a reproducible one.
    return sorted(set(re.findall(r'SRR\d+', res)))

In [3]:
nbconfig

nb_name:	None
project_dir:	/spin1/users/fearjm/Projects/larval_gonad
config_dir:	/spin1/users/fearjm/Projects/larval_gonad/config
fig_dir:	/spin1/users/fearjm/Projects/larval_gonad/output/figures
table_dir:	/spin1/users/fearjm/Projects/larval_gonad/output/tables
cache:	/spin1/users/fearjm/Projects/larval_gonad/output/cache
formats:	['png', 'pdf']
styles:	['notebook', 'paper', 'talk', 'poster']
date:	2017-11-22
author:	Justin M Fear
email:	justin.m.fear@gmail.com
project:	Larval Gonad
git:	https://github.com/jfear/larval_gonad

In [4]:
# GEO series accession for the atlas; passed to GEOparse.get_GEO.
ATLAS = 'GSE99574'

In [5]:
# Download GEO entry
# GEOparse writes its download into a temporary directory (destdir=tmp.name).
# NOTE(review): presumably `gse` holds the parsed series in memory, so the
# directory is only scratch space -- confirm before relying on files in it.
tmp = TemporaryDirectory()
gse = GEOparse.get_GEO(ATLAS, destdir=tmp.name, silent=True)

In [6]:
# Build one record per Drosophila melanogaster sample (GSM) in the series,
# pulling the fields out of the GEO metadata strings with regular expressions.
data = []
for gsm, dat in gse.gsms.items():
    record = {}
    if dat.metadata['organism_ch1'][0] == 'Drosophila melanogaster':
        record['samplename'] = dat.metadata['title'][0]

        # Samples whose title contains 'leftover' or 'ercc' are left out.
        if 'leftover' in record['samplename']:
            continue

        if 'ercc' in record['samplename']:
            continue

        for attr in dat.metadata['characteristics_ch1']:
            if 'strain' in attr:
                record['strain'] = re.match(r'.*(w1118|Oregon-R).*', attr).groups()[0]
            elif 'Sex' in attr:
                record['sex'] = re.match(r'.*(Male|Female).*', attr).groups()[0].lower()
            elif 'replicate' in attr:
                # Take the whole last run of digits. The earlier pattern
                # r'.*(\d+).*' had a greedy prefix that swallowed every digit
                # but the final one, so "12" was recorded as "2".
                record['replicate'] = re.findall(r'\d+', attr)[-1]
            elif 'tissue' in attr:
                record['tissue'] = re.match(r'tissue: (.*)',
                                            attr).groups()[0].replace(' ', '_')
            elif 'plate and well id' in attr:
                # \d+ keeps multi-digit plate and column numbers intact; a
                # single \d would record a well such as A10 as column 1.
                groups = re.match(r'.*Plate(\d+)_([A-Z])(\d+)', attr).groups()
                record['plate'] = groups[0]
                record['row'] = groups[1]
                record['col'] = groups[2]

        # The SRA relation carries the experiment accession (SRX); the runs
        # (SRR) belonging to it are looked up through Entrez.
        for rel in dat.metadata['relation']:
            if 'SRA' in rel:
                record['srx'] = re.match(r'.*(SRX\d+).*', rel).groups()[0]
                record['srr'] = get_srr(record['srx'])
        data.append(record)

# Column order of the sample table.
header = [
    'samplename',
    'srx',
    'srr',
    'sex',
    'strain',
    'tissue',
    'replicate',
    'plate',
    'row',
    'col'
]
df = pd.DataFrame(data)[header]

# rename gonad to ovary or testes
df.loc[(df.sex == 'female') & (df.tissue == 'gonad'), 'tissue'] = 'ovary'
df.loc[(df.sex == 'male') & (df.tissue == 'gonad'), 'tissue'] = 'testes'

In [7]:
# unwind SRR: emit one row per run, so a sample with several SRRs becomes
# several rows, each with the run accession appended to its sample name.
rows = []
for _, row in df.iterrows():
    # sorted() keeps the row order of the written table identical between
    # runs; iterating a bare set of strings can reorder from run to run.
    for srr in sorted(set(row.srr)):
        curr = row.copy()
        curr['srr'] = srr
        curr['samplename'] = curr['samplename'] + '_' + srr
        rows.append(curr)

# Each element of `rows` is a Series; concatenating them as columns and
# transposing turns them back into one row per run.
sampletable = pd.concat(rows, axis=1).T

In [8]:
# Add group for lcdb-wf
# The group label is "<sex>_<tissue>", e.g. "female_ovary".
sampletable['group'] = sampletable['sex'] + '_' + sampletable['tissue']

In [9]:
# Write the sample table as tab-separated text, without the index column,
# into the dmel-atlas-wf config directory (path is relative to this notebook).
sampletable.to_csv('../../dmel-atlas-wf/config/sampletable.tsv', sep='\t', index=False)