# Developmental Time Course

In [115]:
import os
import sys
from pathlib import Path
from textwrap import dedent

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Set up the notebook through the project helper; the returned configuration
# object is kept in `nbconfig`.
nbconfig = Nb.setup_notebook()

# Open the project's HDF5 data store read-only (mode='r').
store = pd.HDFStore('../sra.h5', mode='r')

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-04-30 
Git hash: 69e627d87b83358b44de6959a32570f5edba2b8c


In [4]:
from pymongo import MongoClient
# Resolve the MongoDB host: prefer the name recorded in the host file and
# fall back to localhost when that file is absent.
try:
    host = Path('../output/.mongodb_host').read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Connect and grab handles on the two collections queried below.
mongoClient = MongoClient(host=host, port=27022)
db = mongoClient['sra']
ncbi, biometa = db['ncbi'], db['biometa']

In [51]:
# Distinct SRX accessions listed in the 'aln/complete' table of the data store
complete = (
    store.get('aln/complete')
    .srx
    .unique()
    .tolist()
)

In [52]:
# Ids of completed experiments whose library strategy is RNA-Seq.
rnaseq = [
    doc['_id']
    for doc in ncbi.aggregate([
        # Restrict to completed experiments sequenced as RNA-Seq
        {'$match': {
            '_id': {'$in': complete},
            'sra.experiment.library_strategy': 'RNA-Seq'
        }},
        # Only the id is needed
        {'$project': {'_id': 1}},
    ])
]

In [10]:
# Every distinct sample-attribute name present in the biometa collection.
names = {
    attribute['name']
    for attribute in biometa.aggregate([
        # One document per sample attribute
        {'$unwind': {'path': '$sample_attributes'}},
        # Ignore attributes that carry no name
        {'$match': {'sample_attributes.name': {'$exists': 1}}},
        # Keep just the attribute name
        {'$project': {'_id': 0, 'name': '$sample_attributes.name'}},
    ])
}

In [13]:
# Candidate sample-attribute names that look related to age or developmental
# stage (presumably picked by hand from `names` -- TODO confirm).
# These are bare string expressions: nothing is assigned, and the notebook
# only echoes the last one. Misspelled entries are kept exactly as written,
# since the `dev_stage` list defined in a later cell matches them literally.
"age"
"age_of_animals"
"age_of_flies"
"age_of_fly_in_days_post_eclosion"
"ages"
"biosamplemodel"
"biosourceprovider"
"biosourcetype"
"days_after_eclosion"
"developemntal_stage"
"developmentalstage"
"developmental_stage"
"developmental_temperature"
"developmental_time"
"developmental_time_point"
"development_point"
"development_stage"
"develpomental_stage"
"dev_stage"
"dev-stage"
"embryonic_stage"
"embryonic_stages"
"embryonic_temperature"
"embryos_hand-sorted_or_not"
"embryos_per_sample"
"gender/age"
"gestational_age"
"mating_status"
"mating_system"
"sex/age"
"tissue_age"
"tissue/development_stage"
"tissue_source"

In [26]:
# Attribute names treated as "developmental stage". The misspelled variants are
# intentional: they are matched literally against the stored attribute names.
dev_stage = [
    # "developmental stage" spellings
    "developemntal_stage", "developmentalstage", "developmental_stage",
    # "development ..." spellings
    "development_point", "development_stage", "develpomental_stage",
    # abbreviated forms
    "dev_stage", "dev-stage",
]

In [72]:
# RNA-Seq experiments whose sample carries at least one developmental-stage
# attribute (any of the names in `dev_stage`).
staged = list({
    doc['_id']
    for doc in biometa.aggregate([
        # One document per experiment, restricted to the RNA-Seq set
        {'$unwind': {'path': '$experiments'}},
        {'$match': {'experiments.srx': {'$in': rnaseq}}},
        # One document per sample attribute, restricted to stage-like names
        {'$unwind': {'path': '$sample_attributes'}},
        {'$match': {'sample_attributes.name': {'$in': dev_stage}}},
        # Collapse back to one row per experiment accession
        {'$group': {'_id': '$experiments.srx'}},
        {'$project': {'_id': 1}},
    ])
})

In [73]:
# Number of RNA-Seq experiments that have a developmental-stage attribute
len(staged)

5602

In [111]:
# Show the full ncbi document for the first staged experiment, to see which
# fields are available.
ncbi.find_one({'_id': staged[0]})

{'_cls': 'Ncbi',
 '_id': 'SRX1006753',
 'bioproject': {'bioproject_accn': 'PRJNA281652',
  'bioproject_id': '281652',
  'description': 'Mating and transfer of male Sex Peptide causes a number of physiological changes in the female fly, including increased egg production, decreased immune function and shortened life span. A panel of wild-type- derived strains of Drosophila melanogaster was used to assay the transcriptional changes in response to mating. The data represents whole transcriptome profiling of female heads before (Virgin) and after mating (Mated). RNA-Seq data was generated for the F1-heterozygous progeny resulting from crosses between inbred wild-type- derived strains and a common tester strain w[1118]. Virgin females from naturally-derived strains were crossed to males of the w[1118] tester strain to generate hybrid virgin female progeny. Half of these hybrid females were assayed as virgins at age 10 days, whereas the other half were mated to w[1118] males at a ratio of 30

In [112]:
# One row per (experiment, developmental-stage attribute value), together with
# the first PubMed title / abstract / id attached to the experiment.
df = pd.DataFrame(list(ncbi.aggregate([
    # Only the staged experiments
    {'$match': {
        '_id': {'$in': staged},
    }},
    # Flatten biosamples, then their attributes
    {'$unwind': {'path': '$biosample'}},
    {'$unwind': {'path': '$biosample.attributes'}},
    # Keep the stage-like attributes only
    {'$match': {'biosample.attributes.name': {'$in': dev_stage}}},
    # Shape the output columns
    {'$project': {
        '_id': 0,
        'srx': '$_id',
        'attribute_value': '$biosample.attributes.value',
        'title': {'$arrayElemAt': ['$pubmed.title', 0]},
        'abstract': {'$arrayElemAt': ['$pubmed.abstract', 0]},
        'pmid': {'$arrayElemAt': ['$pubmed.pubmed_id', 0]},
    }},
])))

In [117]:
# Write one corpus stanza per distinct developmental-stage value, in sorted
# order. The `id` field is written empty (presumably to be filled in later --
# TODO confirm).
#
# The stanza template is dedented once, *before* the value is substituted.
# Dedenting the already-formatted text breaks whenever a value contains a line
# break: the continuation line has no leading whitespace, so `dedent` finds no
# common margin and the whole stanza keeps its source indentation.
corpus_stanza = dedent(
    """\
    MAP:
        id: 
        synonym: {synonym}
    """)

with open('../data/dev_stage_corpus.yaml', 'w') as fh:
    for x in sorted(df.attribute_value.unique().tolist()):
        fh.write(corpus_stanza.format(synonym=x))

In [None]:
# Save the stage table ordered by attribute value.
# The original path ended in a bare '.', i.e. the extension was never filled
# in; '.csv' matches the comma-separated output `to_csv` writes by default.
df.sort_values('attribute_value').to_csv('../data/dev_stage.csv')

In [97]:
# One row per (experiment, biosample attribute) for staged experiments that
# have a `pubmed` field; this time every attribute is kept, along with its name.
df = pd.DataFrame(list(ncbi.aggregate([
    # Staged experiments with PubMed information attached
    {'$match': {
        '_id': {'$in': staged},
        'pubmed': {'$exists': 1}
    }},
    # Flatten biosamples, then their attributes
    {'$unwind': {'path': '$biosample'}},
    {'$unwind': {'path': '$biosample.attributes'}},
    # Shape the output columns
    {'$project': {
        '_id': 0,
        'srx': '$_id',
        'attribute_name': '$biosample.attributes.name',
        'attribute_value': '$biosample.attributes.value',
        'title': {'$arrayElemAt': ['$pubmed.title', 0]},
        'abstract': {'$arrayElemAt': ['$pubmed.abstract', 0]},
        'pmid': {'$arrayElemAt': ['$pubmed.pubmed_id', 0]},
    }},
])))

In [107]:
# Dump every attribute name followed by its distinct values (sorted, one per
# tab-indented line) to a scratch file for browsing.
with open('/tmp/names', 'w') as fh:
    for attr_name, attr_rows in df.groupby('attribute_name'):
        fh.write(attr_name + '\n')
        fh.writelines(
            '\t' + value + '\n'
            for value in sorted(attr_rows.attribute_value.unique().tolist())
        )