FlyBase needs a table with the number of uniquely mapped reads and the average read length for each experiment (SRX) in the golden set. They use these numbers in their normalization pipeline.

In [1]:
# %load ../start.py
# Shared notebook preamble: load extensions, set up paths, imports and display options

# Activate the autoreload extension so edited external packages are reloaded automatically
%reload_ext autoreload
%autoreload 2

# Turn on the watermark: prints the "last updated" date and the current git hash
%reload_ext watermark
%watermark -u -d -g

# Load the ipycache extension and point its cache at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Put the project library at the front of the import path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline, using seaborn's 'poster' context
%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation when NumPy prints values (5 digits of precision)
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-05-05 
Git hash: d40ee230d57b6ca21eb5f009ae3fbbe19e777d8b


In [2]:
# Connect to the MongoDB server on port 27022 and grab a handle on the
# `remap` collection of the `sra` database.
from pymongo import MongoClient

client = MongoClient(port=27022)
db = client.get_database('sra')
remap = db.get_collection('remap')

In [3]:
# Build a run-level table (one row per SRR) holding the HISAT2 count of
# uniquely aligned reads and the samtools average read length.
#
# The same filter is applied twice on purpose:
#   * before `$unwind` it selects experiments in which at least one run has
#     HISAT2 results (cheap document-level pre-filter);
#   * after `$unwind` it is evaluated per run, so runs of those experiments
#     that have no HISAT2 results are dropped instead of showing up as NaN
#     rows that silently vanish from the per-experiment sum.
has_hisat2 = {'$match': {'runs.aln_workflow.hisat2': {'$exists': 1}}}

q = list(remap.aggregate([
    has_hisat2,
    {'$unwind': '$runs'},
    has_hisat2,
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
            'num_aln_reads': '$runs.aln_workflow.hisat2.num_reads_uniquely_aligned',
            'average_read_length': '$runs.aln_workflow.samtools_stats.average_length',
        }
    },
]))

# Naming the columns keeps the frame well-formed even when the query returns
# nothing (or a field is absent everywhere); `set_index` without `inplace`
# keeps the cell idempotent on re-run.
df = pd.DataFrame(
    q, columns=['srr', 'average_read_length', 'num_aln_reads', 'srx']
).set_index('srr')

In [4]:
# Show the run-level table ordered by experiment accession.
df.sort_values(by='srx', ascending=True)

Unnamed: 0_level_0,average_read_length,num_aln_reads,srx
srr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR489286,50.0,11452417.0,ERX455041
ERR489288,50.0,21743747.0,ERX455042
ERR489289,50.0,15554350.0,ERX455048
SRR2103786,85.0,24631429.0,SRX1098298
SRR2103787,85.0,27083219.0,SRX1098299
SRR2103788,85.0,28729365.0,SRX1098300
SRR2103789,85.0,25874553.0,SRX1098301
SRR2103790,85.0,26296079.0,SRX1098302
SRR2103791,85.0,28740249.0,SRX1098303
SRR2103792,85.0,27008023.0,SRX1098304


In [5]:
# Collapse run-level rows to one row per experiment (SRX): aligned-read counts
# are summed over runs, read lengths are averaged (unweighted mean over runs).
grp = df.groupby(by='srx')
dfMean = grp.agg({
    'average_read_length': 'mean',
    'num_aln_reads': 'sum',
})

In [6]:
# Show the per-experiment summary ordered by SRX accession.
dfMean.sort_index(axis=0, ascending=True)

Unnamed: 0_level_0,num_aln_reads,average_read_length
srx,Unnamed: 1_level_1,Unnamed: 2_level_1
ERX455041,11452417.0,50.0
ERX455042,21743747.0,50.0
ERX455048,15554350.0,50.0
SRX1098298,24631429.0,85.0
SRX1098299,27083219.0,85.0
SRX1098300,28729365.0,85.0
SRX1098301,25874553.0,85.0
SRX1098302,26296079.0,85.0
SRX1098303,28740249.0,85.0
SRX1098304,27008023.0,85.0


In [11]:
# Experiment accessions whose Wig tracks have already been given to FlyBase.
# Kept as a whitespace-separated block so accessions are easy to add or diff.
curr = """
    ERX455041  ERX455042  ERX455048  SRX1098298 SRX1098299
    SRX1098300 SRX1098301 SRX1098302 SRX1098303 SRX1098304
    SRX1098305 SRX1098306 SRX1098307 SRX1098308 SRX1098309
    SRX1331543 SRX1482989 SRX187085  SRX187086  SRX187087
    SRX187088  SRX466999  SRX467000  SRX467001  SRX467002
    SRX467003  SRX467004  SRX467005  SRX467006  SRX467007
    SRX469997  SRX469998  SRX469999  SRX470000  SRX470001
    SRX470002  SRX470004  SRX470005  SRX501031  SRX501033
    SRX501034  SRX674739  SRX674743  SRX674744  SRX674754
    SRX674758  SRX674760  SRX674761  SRX674762  SRX674769
    SRX674774  SRX674775  SRX674776  SRX674777  SRX674779
    SRX674781  SRX674788  SRX674793  SRX674798  SRX674801
    SRX674802  SRX674803  SRX674820  SRX674823  SRX674826
    SRX674844  SRX674873  SRX674880  SRX674881  SRX674943
    SRX674966  SRX674975  SRX674990  SRX675020  SRX675023
    SRX675025  SRX675027  SRX675045  SRX675058  SRX675086
    SRX675100  SRX675167  SRX675247  SRX675265  SRX675267
    SRX675282  SRX675284  SRX675285  SRX675314  SRX675315
    SRX675325  SRX675357  SRX675376  SRX675379  SRX675390
    SRX675398  SRX675409  SRX675414  SRX675432  SRX970009
""".split()
len(curr)

100

In [12]:
# Subset the per-experiment summary to the accessions already delivered to
# FlyBase. `isin` silently ignores requested accessions that are absent from
# `dfMean`, so check for them explicitly rather than exporting a short table.
missing = sorted(set(curr) - set(dfMean.index))
assert not missing, 'No alignment summary for: {}'.format(', '.join(missing))

# `.copy()` detaches the subset from `dfMean`.
forFB = dfMean.loc[dfMean.index.isin(curr)].copy()

In [13]:
# Sanity check: rows should equal the number of requested accessions, with
# two summary columns.
n_rows, n_cols = forFB.shape
(n_rows, n_cols)

(100, 2)

In [15]:
# Tab-delimited export for FlyBase; the index is written under the header 'srx'.
# NOTE(review): num_aln_reads is float in this frame, so counts are written as
# e.g. 11452417.0 -- confirm FlyBase's pipeline accepts that.
forFB.to_csv(
    '../../output/flybase_example_tracks_aln_summary.tsv',
    sep='\t',
    index_label='srx',
)