# Fly Meeting Abstract Prep

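I am preparing the Fly Meeting abstract and need a few summary numbers for it: how much published RNA-seq data there is relative to modENCODE, and the total number of reads and bases.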

In [3]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [4]:
# Setup notebook
# Calls the project helper Nb.setup_notebook() (from ncbi_remap.notebook) and
# keeps whatever it returns as `nbconfig`.
# NOTE(review): the saved output suggests the helper also reports a
# last-updated date and the current git hash — confirm against the helper.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-11-15 
Git hash: 58aa35e5aea674c7af407cd79cf6acd98196f546


In [5]:
# Open the project HDF5 data store in read-only mode so the notebook
# cannot modify it.
store = pd.HDFStore(path='../sra.h5', mode='r')

In [6]:
# Connect to the MongoDB server on this machine and take a handle on the
# `ncbi` collection of the `sra` database.
# NOTE(review): this import lives mid-notebook; consider moving it to the
# imports cell at the top.
from pymongo import MongoClient

host = 'localhost'
mongoClient = MongoClient(host, 27017)
db = mongoClient.sra
ncbi = db.ncbi

In [8]:
# Tab-separated RNA-seq metadata table; the first column is used as the index.
rnaseq = pd.read_csv(
    '../output/geo-wf/rnaseq_metadata.tsv',
    delimiter='\t',
    index_col=0,
)

In [16]:
# Number of rows in the RNA-seq metadata table.
num_pub = len(rnaseq.index)

In [12]:
# Tab-separated modENCODE RNA-seq sample table (default integer index).
mod = pd.read_csv(
    '../output/modENCODE_rnaseq_sampletable.tsv',
    delimiter='\t',
)

In [21]:
# Ratio of RNA-seq metadata rows (`num_pub`) to the number of distinct `srx`
# values in the modENCODE sample table, reported as a whole-number multiple.
_mul = num_pub / len(mod.srx.unique())
# Wording fixed: the message previously read "at much data than".
print(f'There is {_mul:.0f} times as much data as modENCODE')

There is 42 times at much data than modENCODE


In [75]:
# Fetch a single document from the `ncbi` collection (the empty filter matches
# any document) — presumably to inspect the record layout.
# NOTE(review): the saved output includes the submitter's name and email under
# sra.organization; consider clearing this output before sharing the notebook.
ncbi.find_one({})

{'_id': 'SRX4104113',
 '_cls': 'Ncbi',
 'sra': {'submission': {'submission_id': 'SRA707623',
   'external_id': [],
   'secondary_id': [],
   'submitter_id': [],
   'uuid': []},
  'organization': {'organization_type': 'institute',
   'name': 'Southern University of Science and Technology',
   'email': 'shenwei4907@foxmail.com',
   'first_name': 'wei',
   'last_name': 'shen'},
  'study': {'study_id': 'SRP148502',
   'BioProject': 'PRJNA470784',
   'external_id': [],
   'secondary_id': [],
   'submitter_id': [],
   'uuid': [],
   'title': 'Drosophila melanogaster PCR-free Hi-C Raw sequence reads',
   'study_type': 'Whole Genome Sequencing',
   'abstract': 'We modified on in situ Hi-C and developed new method which produces adequate DNA for direct High-throughput sequeuncing. Systematic comparison between non-amplified and amplified Hi-Cs were carried out.',
   'center_name': 'BioProject',
   'center_project_name': 'Drosophila melanogaster strain:S2',
   'related_studies': [],
   'url_link

In [87]:
# NOTE(review): `layout` is not defined in any cell visible here; it must
# already exist in the kernel (index with an 'srx' level) — confirm its origin.
def _srxs_with_layout(label):
    """Return the unique 'srx' index values of the `layout` entries equal to `label`."""
    matching = layout[(layout == label)]
    return matching.index.get_level_values('srx').unique().tolist()


pe_srxs = _srxs_with_layout('PE')
se_srxs = _srxs_with_layout('SE')
k1_srxs = _srxs_with_layout('keep_R1')
k2_srxs = _srxs_with_layout('keep_R2')

In [97]:
# Read counts and read lengths for the experiments listed in `pe_srxs`.
# $unwind emits one document per element of sra.run (presumably one per run),
# so `_id` can repeat in the resulting frame.
pe_pipeline = [
    {'$match': {'_id': {'$in': pe_srxs}}},
    {'$unwind': '$sra.run'},
    {'$project': {
        's1': '$sra.run.read_count_r1',
        's2': '$sra.run.read_count_r2',
        'l1': '$sra.run.read_len_r1',
        'l2': '$sra.run.read_len_r2',
    }},
]
pe_records = list(ncbi.aggregate(pe_pipeline))
pe = pd.DataFrame(pe_records).set_index('_id')

# Size is the sum of R1 read counts; bases add both read lengths per R1 read.
pe_size = pe['s1'].sum()
pe_bases = (pe['s1'] * (pe['l1'] + pe['l2'])).sum()

In [101]:
# Read counts and read lengths (R1 only) for the experiments in `se_srxs`.
# $unwind emits one document per element of sra.run (presumably one per run).
se_pipeline = [
    {'$match': {'_id': {'$in': se_srxs}}},
    {'$unwind': '$sra.run'},
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]
se_records = list(ncbi.aggregate(se_pipeline))
se = pd.DataFrame(se_records).set_index('_id')

# Total reads, and total bases as read length times read count.
se_size = se['s1'].sum()
se_bases = (se['s1'] * se['l1']).sum()

In [103]:
# Read counts and read lengths for the `keep_R1` experiments; only the R1
# fields are projected.
# $unwind emits one document per element of sra.run (presumably one per run).
k1_pipeline = [
    {'$match': {'_id': {'$in': k1_srxs}}},
    {'$unwind': '$sra.run'},
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]
k1_records = list(ncbi.aggregate(k1_pipeline))
k1 = pd.DataFrame(k1_records).set_index('_id')

# Total reads, and total bases as read length times read count.
k1_size = k1['s1'].sum()
k1_bases = (k1['s1'] * k1['l1']).sum()

In [104]:
# Read counts and read lengths for the `keep_R2` experiments; only the R2
# fields are projected.
# $unwind emits one document per element of sra.run (presumably one per run).
k2_pipeline = [
    {'$match': {'_id': {'$in': k2_srxs}}},
    {'$unwind': '$sra.run'},
    {'$project': {
        's2': '$sra.run.read_count_r2',
        'l2': '$sra.run.read_len_r2',
    }},
]
k2_records = list(ncbi.aggregate(k2_pipeline))
k2 = pd.DataFrame(k2_records).set_index('_id')

# Total reads, and total bases as read length times read count.
k2_size = k2['s2'].sum()
k2_bases = (k2['s2'] * k2['l2']).sum()

351029884742.0

In [124]:
# Combine the read totals of the four layout groups and report in billions.
total_size = sum((pe_size, se_size, k1_size, k2_size))
print(f'{total_size / 1e9:,.0f} billion reads')

696 billion reads


In [126]:
# Combine the base totals of the four layout groups and report in units of 1e12.
total_bases = sum((pe_bases, se_bases, k1_bases, k2_bases))
print(f'{total_bases / 1e12:,} tera bases')

74.90210148208793 tera bases


In [127]:
# NOTE(review): hard-coded counts with no stated source — document where these
# three numbers come from.
bob = (
    260_053,
    452_600,
    18_551,
)

In [129]:
# Each count as a fraction of the total of the three.
np.asarray(bob) / np.sum(bob)

array([0.35565, 0.61898, 0.02537])