# Build Example Metadata Table

In [3]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
# Put ../lib at the front of the module search path so that the
# `ncbi_remap` package imported below is resolved from there.
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Setup notebook
# NOTE(review): `Nb.setup_notebook()` is defined in ncbi_remap and is not
# visible here; presumably it applies shared notebook configuration and
# prints the "last updated" / git hash banner -- confirm against the library.
nbconfig = Nb.setup_notebook()

# Connect to data store
# mode='r' opens the HDF5 file read-only. The handle stays open for the
# later cells; no `store.close()` appears in the cells shown here.
store = pd.HDFStore('../sra_new.h5', mode='r')

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-02-23 
Git hash: f4aaace3e5b64fd8cddf3bc02a6b2c7d196089d7


In [4]:
from pymongo import MongoClient
# The MongoDB host name is read from a small dot-file; when that file is
# absent, fall back to a server on the local machine.
host_file = Path('../output/.mongodb_host')
try:
    host = host_file.read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Handles for the `sramongo` database and its `ncbi` collection.
mongoClient = MongoClient(host=host, port=27022)
db = mongoClient['sramongo']
ncbi = db['ncbi']


In [10]:
# Distinct `srx` values found in the `prealn/complete` table of the store,
# as a plain Python list.
prealn_complete = store['prealn/complete']
srxs = prealn_complete['srx'].unique().tolist()

In [129]:
# Canonical column name -> every raw attribute name that should be folded
# into it. Matching is exact and case-sensitive. The misspelled entries
# ('developemntal stage', 'celll line') are kept verbatim; presumably they
# mirror spellings that occur in the submitted metadata -- confirm.
col_mapper = {
    'tissue': [
        'OrganismPart', 'midgut part', 'midgut region', 'tissue',
        'tissue age', 'tissue source', 'tissue subtype', 'tissue type',
        'tissue-type', 'tissue/cell type', 'tissue/development stage',
        'tissue_type',
    ],
    'dev_stage': [
        'DevelopmentalStage', 'Stage', 'dev-stage', 'dev_stage',
        'developemntal stage', 'development point', 'development stage',
        'developmental stage', 'developmental time point',
        'embryonic stage', 'stage',
    ],
    'sex': [
        'Sex', 'gender', 'gender type', 'gender/age', 'sex', 'sex type',
        'sex/age',
    ],
    'cell_type': [
        'CellLine', 'CellType', 'cell line', 'cell line background',
        'cell line source', 'cell type', 'cell-type', 'cell_line',
        'cell_type', 'celll line', 'cells derived from',
    ],
}

# Inverse lookup: raw attribute name -> canonical column name.
col_mapper_rev = {
    alias: canonical
    for canonical, aliases in col_mapper.items()
    for alias in aliases
}

In [147]:
# Stage 1: keep only the documents whose `_id` is one of the ids in `srxs`.
match_stage = {'$match': {'_id': {'$in': srxs}}}

# Stage 2: rename/select the fields of interest; `'_id': 0` leaves the
# `_id` field out of the returned documents.
project_stage = {
    '$project': {
        '_id': 0,
        'SRX': '$srx',
        'SRR': '$runs.srr',
        'SRP': "$sra.experiment.study_id",
        'BioProject': '$bioproject.bioproject_accn',
        'BioSample': '$biosample.biosample_accn',
        'GEO': "$biosample.GEO",
        'Pubmed': '$pubmed.pubmed_id',
        'title': '$pubmed.title',
        'library_strategy': '$sra.experiment.library_strategy',
        'attributes': '$biosample.attributes',
        'attributes2': '$sra.sample.attributes',
    }
}

# Run the pipeline and materialise the cursor into a list of dicts.
res = list(ncbi.aggregate([match_stage, project_stage]))

def flatten(x, key):
    """Collapse ``x[key]`` to a simple value, modifying ``x`` in place.

    Rules, applied to the current value of ``x[key]``:

    * key absent, ``None``, NaN or an empty list -> replaced by ``''``
    * a string -> left untouched
    * a one-element sequence -> replaced by its single element
    * anything longer -> left untouched

    Parameters
    ----------
    x : dict
        Record to edit in place.
    key : hashable
        Field of ``x`` to collapse.

    Returns
    -------
    None
    """
    try:
        value = x[key]

        # NaN never compares equal to anything (itself included), so an
        # `== np.nan` test can never succeed; it has to be detected
        # explicitly. None is treated as missing as well.
        if value is None or (isinstance(value, float) and np.isnan(value)):
            x[key] = ''

        elif isinstance(value, str):
            return

        elif value == []:
            x[key] = ''

        elif len(value) == 1:
            x[key] = value[0]

    except KeyError:
        x[key] = ''

def map_attrs(x, mapper=None):
    """Add normalised ``tissue``/``dev_stage``/``sex``/``cell_type`` fields to ``x``.

    Attribute dicts (each with a ``'name'`` and a ``'value'``) are read from
    two places in the record: first ``x['attributes'][0]``, then
    ``x['attributes2']``. Each attribute name is translated through
    ``mapper``; when the translated name is one of the four canonical fields
    its value is recorded. Later attributes overwrite earlier ones, so a
    value from ``attributes2`` wins over one from ``attributes``.

    Missing or empty sources are treated as "no attributes", and an
    attribute lacking ``'name'`` or ``'value'`` is skipped on its own rather
    than discarding the attributes that follow it.

    Parameters
    ----------
    x : dict
        Record to update in place. The four canonical keys are always set,
        to ``''`` when nothing matched.
    mapper : dict, optional
        Raw attribute name -> canonical name. Defaults to the module-level
        ``col_mapper_rev``.

    Returns
    -------
    None
    """
    if mapper is None:
        mapper = col_mapper_rev

    normalized = {
        'tissue': '',
        'dev_stage': '',
        'sex': '',
        'cell_type': '',
    }

    # Only the first element of `attributes` is consulted, as before; an
    # absent key, an empty list or a None value all mean "nothing to read".
    try:
        biosample_attrs = x['attributes'][0]
    except (KeyError, IndexError, TypeError):
        biosample_attrs = []

    try:
        sample_attrs = x['attributes2']
    except KeyError:
        sample_attrs = []

    # Order matters: attributes2 is processed last and therefore wins.
    for source in (biosample_attrs, sample_attrs):
        for _attr in source or ():
            try:
                name = _attr['name']
                value = _attr['value']
            except (KeyError, TypeError):
                # Malformed attribute: skip it, keep going with the rest.
                continue
            normName = mapper.get(name, name)
            if normName in normalized:
                normalized[normName] = value

    x.update(normalized)
    
    
# Collapse the list-valued identifier fields of every record, then attach
# the normalised attribute columns. Records are edited in place.
for record in res:
    for field in ('GEO', 'BioSample', 'Pubmed', 'title'):
        flatten(record, field)
    map_attrs(record)

In [148]:
# One row per record; the two raw attribute-list columns are left out of
# the table.
df = pd.DataFrame(res).drop(labels=['attributes', 'attributes2'], axis='columns')

In [155]:
# Rows whose cell_type contains 'S2'.
df.loc[df['cell_type'].str.contains('S2')]

Unnamed: 0,BioProject,BioSample,GEO,Pubmed,SRP,SRR,SRX,cell_type,dev_stage,library_strategy,sex,tissue,title
220,PRJEB2754,SAMEA1464075,,,ERP001006,[ERR055260],ERX032305,S2,,ChIP-Seq,,,
264,PRJEB3031,SAMEA1487331,,,ERP001439,[ERR126230],ERX102366,S2,,ChIP-Seq,,,
265,PRJEB3031,SAMEA1487332,,,ERP001439,[ERR126231],ERX102367,S2,,ChIP-Seq,,,
267,PRJEB3031,SAMEA1487319,,,ERP001439,[ERR126232],ERX102369,S2,,ChIP-Seq,,,
279,PRJEB3031,SAMEA1487330,,,ERP001439,[ERR126229],ERX102381,S2,,ChIP-Seq,,,
430,PRJEB13075,SAMEA3903388,,,ERP014611,[ERR1331728],ERX1403348,S2,,ChIP-Seq,,,
431,PRJEB13075,SAMEA3903389,,,ERP014611,[ERR1331729],ERX1403349,S2,,ChIP-Seq,,,
432,PRJEB13075,SAMEA3903390,,,ERP014611,[ERR1331730],ERX1403350,S2,,ChIP-Seq,,,
433,PRJEB13075,SAMEA3903391,,,ERP014611,[ERR1331731],ERX1403351,S2,,ChIP-Seq,,,
1102,PRJEB670,SAMEA1573979,,,ERP001982,[ERR198935],ERX173561,S2,,RNA-Seq,,,


In [96]:
# Collect every attribute name seen in the records (duplicates kept), first
# from `attributes[0]` and then from `attributes2`. A record that lacks one
# of the keys simply contributes nothing for that source.
names = []
_attr_sources = (
    lambda record: record['attributes'][0],
    lambda record: record['attributes2'],
)
for record in res:
    for _pick in _attr_sources:
        try:
            for attribute in _pick(record):
                names.append(attribute['name'])
        except KeyError:
            continue

In [105]:
def get_vals(names, records=None):
    """Return the distinct values of the attributes whose name is in ``names``.

    For every record, attribute dicts are read from ``record['attributes'][0]``
    and from ``record['attributes2']``. Missing or empty sources are treated
    as "no attributes", and an attribute lacking ``'name'`` or ``'value'`` is
    skipped on its own.

    Parameters
    ----------
    names : container of str
        Attribute names to look for (tested with ``in``).
    records : iterable of dict, optional
        Records to scan. Defaults to the module-level ``res``.

    Returns
    -------
    set
        The distinct matching attribute values.
    """
    if records is None:
        records = res

    found = set()
    for record in records:
        # Only the first element of `attributes` is consulted; an absent
        # key, an empty list or a None value all mean "nothing to read".
        try:
            biosample_attrs = record['attributes'][0]
        except (KeyError, IndexError, TypeError):
            biosample_attrs = []

        try:
            sample_attrs = record['attributes2']
        except KeyError:
            sample_attrs = []

        for source in (biosample_attrs, sample_attrs):
            for attribute in source or ():
                try:
                    name = attribute['name']
                    value = attribute['value']
                except (KeyError, TypeError):
                    # Malformed attribute: skip it, keep going with the rest.
                    continue
                if name in names:
                    found.add(value)
    return found

In [110]:
# NOTE: despite its name, `dev` holds the distinct values found under the
# *tissue* attribute names (col_mapper['tissue']), not developmental stages.
dev = get_vals(col_mapper['tissue'])

In [111]:
# Show the collected set as the cell output.
dev

{'0-16h embryos',
 '0.5-2.5hr embryo',
 '12 hours larva',
 '24hr embryo',
 '3rd instar larvae',
 '3rd instar larvae wing discs',
 '4C_Growth_Protocol',
 'Accessory Gland',
 'Adult abdominal fatbody with associated abdominal cuticle, oenocytes and cuticular muscles',
 'Adult female ovary, no treatment, no GAL4 activation',
 'Adult fly heads',
 'Adult ovaries',
 'AdultFemale',
 'AdultMale',
 'BRAINS',
 'Body',
 'Body (head removed)',
 'Body tissue',
 'Brain',
 'CNS-derived cell-line',
 'Carcass',
 'Central Nervous System',
 'Developmental Stage Embryo 12-24h',
 'Digestive System',
 'Dorsal Organ Ganglia',
 'E12-16',
 'E4-8',
 'Early glial cells',
 'Embryo',
 'Embryo 20-24h',
 'Embryo-derived cell-line',
 'Embryos',
 'Eye-Antennal disc',
 'Eye/antennal imaginal discs',
 'FACS Sorted midgut cells',
 'Fat',
 'Fly Heads',
 'Full heads',
 'Head',
 'Head and Thorax',
 'Head-less body',
 'Heads',
 'Imaginal Disc',
 'Imaginal discs',
 'Imaginal discs and salivary glands',
 'Immortalized cells',
