# Fly Meeting Abstract Prep

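I am preparing the Fly Meeting abstract and need a few summary numbers: how much RNA-seq data we have relative to modENCODE, and the total number of reads and bases for (1) everything in the data store, (2) the RNA-seq samples, and (3) the testis RNA-seq samples.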

In [1]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [2]:
# Setup notebook: call the project's notebook-initialisation helper and keep
# its return value as `nbconfig`. Per the saved cell output, the call reports
# a last-updated date and a git hash.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-11-16 
Git hash: 2dab13c6a3b0c50698c294cc055129f6cbcc5d67


In [3]:
# Connect to data store
# The HDF5 file is opened read-only (mode='r'). No store.close() appears in
# the cells visible here, so the handle stays open for the kernel's lifetime.
store = pd.HDFStore('../sra.h5', mode='r')

In [4]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from pymongo import MongoClient
# Connect to the MongoDB server on localhost:27017 and keep a handle to the
# `ncbi` collection of the `sra` database; later cells run aggregations on it.
host = 'localhost'
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient['sra']
ncbi = db['ncbi']

In [5]:
# RNA-seq sample metadata (tab separated); the first column becomes the index.
# Presumably the index holds SRX accessions -- it is later used to subset `layout`.
rnaseq = pd.read_csv('../output/geo-wf/rnaseq_metadata.tsv', sep='\t', index_col=0)

In [6]:
# Number of rows in the RNA-seq metadata table.
num_pub = len(rnaseq)

In [7]:
# modENCODE RNA-seq sample table (tab separated); its `srx` column supplies
# the modENCODE experiment count used for the comparison below.
mod = pd.read_csv('../output/modENCODE_rnaseq_sampletable.tsv', sep='\t')

In [8]:
# Ratio of RNA-seq metadata rows to distinct modENCODE `srx` values.
_mul = num_pub / len(mod.srx.unique())
# Message wording fixed: "as much data as" (was "at much data than").
print(f'There is {_mul:.0f} times as much data as modENCODE')

There is 42 times at much data than modENCODE


In [9]:
# Load the `layout` table from the HDF5 store. Cells below compare its values
# against 'PE', 'SE', 'keep_R1' and 'keep_R2' and read the `srx` index level.
layout = store['layout']

In [10]:
# Unique SRX accessions for each layout label, in the order
# PE, SE, keep_R1, keep_R2.
pe_srxs, se_srxs, k1_srxs, k2_srxs = (
    layout[layout == kind].index.get_level_values('srx').unique().tolist()
    for kind in ('PE', 'SE', 'keep_R1', 'keep_R2')
)

In [11]:
# Paired-end experiments: one row per run with R1/R2 read counts and lengths.
pe = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': pe_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        's2': '$sra.run.read_count_r2',
        'l1': '$sra.run.read_len_r1',
        'l2': '$sra.run.read_len_r2',
    }},
]))).set_index('_id')

# Reads are counted from R1 only; bases use both mate lengths times the R1 count.
pe_size = pe['s1'].sum()
pe_bases = (pe['s1'] * (pe['l1'] + pe['l2'])).sum()

In [12]:
# Single-end experiments: one row per run with the R1 read count and length.
se = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': se_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]))).set_index('_id')

se_size = se['s1'].sum()
se_bases = (se['s1'] * se['l1']).sum()

In [13]:
# keep_R1 experiments: only the R1 read count and length are pulled per run.
k1 = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': k1_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]))).set_index('_id')

k1_size = k1['s1'].sum()
k1_bases = (k1['s1'] * k1['l1']).sum()

In [14]:
# keep_R2 experiments: only the R2 read count and length are pulled per run.
k2 = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': k2_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's2': '$sra.run.read_count_r2',
        'l2': '$sra.run.read_len_r2',
    }},
]))).set_index('_id')

k2_size = k2['s2'].sum()
k2_bases = (k2['s2'] * k2['l2']).sum()

In [15]:
# Total read count across the four layout groups (PE, SE, keep_R1, keep_R2).
total_size = sum((pe_size, se_size, k1_size, k2_size))
print(f'{total_size / 1e9:,.0f} billion reads')

696 billion reads


In [16]:
# Total base count across the four layout groups (PE, SE, keep_R1, keep_R2).
total_bases = pe_bases + se_bases + k1_bases + k2_bases
# Fixed precision: the bare ',' spec printed the full float repr (~17 digits).
print(f'{total_bases / 1e12:,.2f} tera bases')

74.90210148208793 tera bases


In [17]:
# Restrict the layout table to the experiments listed in the RNA-seq metadata.
# Assumes the rnaseq index and the first level of the layout index are SRX
# accessions -- TODO confirm.
srxs = rnaseq.index.unique().tolist()
# Slice a fresh copy from the store instead of narrowing `layout` in place, so
# this cell gives the same result when re-run or run after the testis subset.
layout = store['layout'].loc[(srxs, slice(None))]

In [18]:
# Unique SRX accessions per layout label within the current `layout` subset,
# in the order PE, SE, keep_R1, keep_R2.
pe_srxs, se_srxs, k1_srxs, k2_srxs = (
    layout[layout == kind].index.get_level_values('srx').unique().tolist()
    for kind in ('PE', 'SE', 'keep_R1', 'keep_R2')
)

In [19]:
# Paired-end experiments in the current subset: per-run R1/R2 counts and lengths.
pe = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': pe_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        's2': '$sra.run.read_count_r2',
        'l1': '$sra.run.read_len_r1',
        'l2': '$sra.run.read_len_r2',
    }},
]))).set_index('_id')

# Reads are counted from R1 only; bases use both mate lengths times the R1 count.
pe_size = pe['s1'].sum()
pe_bases = (pe['s1'] * (pe['l1'] + pe['l2'])).sum()

In [20]:
# Single-end experiments in the current subset: per-run R1 count and length.
se = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': se_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]))).set_index('_id')

se_size = se['s1'].sum()
se_bases = (se['s1'] * se['l1']).sum()

In [21]:
# keep_R1 experiments in the current subset: per-run R1 count and length.
k1 = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': k1_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]))).set_index('_id')

k1_size = k1['s1'].sum()
k1_bases = (k1['s1'] * k1['l1']).sum()

In [22]:
# keep_R2 experiments in the current subset: per-run R2 count and length.
k2 = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': k2_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's2': '$sra.run.read_count_r2',
        'l2': '$sra.run.read_len_r2',
    }},
]))).set_index('_id')

k2_size = k2['s2'].sum()
k2_bases = (k2['s2'] * k2['l2']).sum()

In [23]:
# Total read count across the four layout groups for the current subset.
total_size = sum((pe_size, se_size, k1_size, k2_size))
print(f'{total_size / 1e9:,.0f} billion reads')

275 billion reads


In [24]:
# Total base count across the four layout groups for the current subset.
total_bases = pe_bases + se_bases + k1_bases + k2_bases
# Fixed precision: the bare ',' spec printed the full float repr (~17 digits).
print(f'{total_bases / 1e12:,.2f} tera bases')

28.718069278185045 tera bases


In [25]:
# Restrict the layout table to RNA-seq samples whose `tissue` is 'testis'.
# Assumes the rnaseq index and the first level of the layout index are SRX
# accessions -- TODO confirm.
srxs = rnaseq[rnaseq.tissue == 'testis'].index.tolist()
# Slice a fresh copy from the store instead of narrowing `layout` in place, so
# the result does not depend on which subsetting cells ran before this one.
layout = store['layout'].loc[(srxs, slice(None))]

In [26]:
# Unique SRX accessions per layout label within the current `layout` subset,
# in the order PE, SE, keep_R1, keep_R2.
pe_srxs, se_srxs, k1_srxs, k2_srxs = (
    layout[layout == kind].index.get_level_values('srx').unique().tolist()
    for kind in ('PE', 'SE', 'keep_R1', 'keep_R2')
)

In [27]:
# Paired-end experiments in the current subset: per-run R1/R2 counts and lengths.
pe = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': pe_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        's2': '$sra.run.read_count_r2',
        'l1': '$sra.run.read_len_r1',
        'l2': '$sra.run.read_len_r2',
    }},
]))).set_index('_id')

# Reads are counted from R1 only; bases use both mate lengths times the R1 count.
pe_size = pe['s1'].sum()
pe_bases = (pe['s1'] * (pe['l1'] + pe['l2'])).sum()

In [28]:
# Single-end experiments in the current subset: per-run R1 count and length.
se = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': se_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]))).set_index('_id')

se_size = se['s1'].sum()
se_bases = (se['s1'] * se['l1']).sum()

In [29]:
# keep_R1 experiments in the current subset: per-run R1 count and length.
k1 = pd.DataFrame(list(ncbi.aggregate([
    {'$match': {'_id': {'$in': k1_srxs}}},
    {'$unwind': '$sra.run'},  # one document per entry of sra.run
    {'$project': {
        's1': '$sra.run.read_count_r1',
        'l1': '$sra.run.read_len_r1',
    }},
]))).set_index('_id')

k1_size = k1['s1'].sum()
k1_bases = (k1['s1'] * k1['l1']).sum()

In [30]:
# Total read count for the testis subset from the PE, SE and keep_R1 groups.
# NOTE(review): k2_srxs is built for this subset but no keep_R2 query is run
# and k2_size is not added here -- confirm the omission is intentional.
total_size = sum((pe_size, se_size, k1_size))
print(f'{total_size / 1e9:,.0f} billion reads')

4 billion reads


In [31]:
# Total base count for the testis subset from the PE, SE and keep_R1 groups.
# NOTE(review): keep_R2 bases are not included here -- confirm that is intentional.
total_bases = pe_bases + se_bases + k1_bases
# Fixed precision: the bare ',' spec printed the full float repr (~17 digits).
print(f'{total_bases / 1e12:,.2f} tera bases')

0.51784359414977 tera bases


In [32]:
# Display the paired-end read count from the most recent `pe_size` assignment
# (the testis subset).
pe_size

2052837076.0