In [1]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from dask.distributed import Client
from dask import delayed
import dask.dataframe as dd

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Connect to data store
# Collect the distinct `srx` values from the table stored under 'aln/complete'.
# The store is opened read-only inside a context manager so the HDF5 handle is
# released even when the key lookup or column access raises.
with pd.HDFStore('../output/sra.h5', mode='r') as store:
    samples = store['aln/complete'].srx.unique().tolist()

In [3]:
from pymongo import MongoClient

# The MongoDB host name is read from a small text file when that file exists;
# otherwise the connection falls back to a server on this machine.
try:
    host = Path('../output/.mongodb_host').read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Handles used by later cells: the client, the 'sramongo' database and its
# 'ncbi' collection.
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient['sramongo']
ncbi = db['ncbi']

In [4]:
# Keep only the ids from `samples` whose library strategy is recorded as
# 'RNA-Seq'. The $project stage trims each returned document to its _id.
rnaseq = [
    doc['_id']
    for doc in ncbi.aggregate([
        {'$match': {'_id': {'$in': samples}, 'sra.experiment.library_strategy': 'RNA-Seq'}},
        {'$project': {'_id': 1}},
    ])
]

In [5]:
# Number of ids returned by the RNA-Seq query above.
len(rnaseq)

14423

In [6]:
# Start a dask.distributed client with default settings; the count cells below
# submit their delayed parquet reads through it.
client = Client()
# Bare expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:39073  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 26.14 GB


In [7]:
@delayed
def read_parquet(fname):
    """Return the ``count`` column of one parquet table, named by its SRX.

    Wrapped with ``dask.delayed``, so calling this builds a lazy task rather
    than reading the file immediately.

    Parameters
    ----------
    fname : str
        Path of a parquet file that has ``srx`` and ``count`` columns.

    Returns
    -------
    pandas.Series
        The ``count`` column, with its ``name`` set to the first distinct
        value found in the ``srx`` column.
    """
    table = pd.read_parquet(fname)
    # The first distinct srx value labels the column in the merged matrix.
    accession = table['srx'].unique()[0]
    return table['count'].rename(accession)

## Genic Counts

In [7]:
# Gene counts, batch 1: entries [0, 5000) of `rnaseq`.
# One delayed read per accession, built up front and then run on the cluster.
work = [
    read_parquet(f'../output/aln-wf/gene_counts/{srx}.parquet')
    for srx in rnaseq[:5000]
]
futures = client.compute(work)
# Each gathered Series becomes one column of the batch table.
df = pd.concat(client.gather(futures), axis=1)
df.to_csv('../geo-wf/gene_counts_1.tsv', sep='\t')

In [12]:
# Gene counts, batch 2: entries [5000, 10000) of `rnaseq`.
# One delayed read per accession, built up front and then run on the cluster.
work = [
    read_parquet(f'../output/aln-wf/gene_counts/{srx}.parquet')
    for srx in rnaseq[5000:10000]
]
futures = client.compute(work)
# Each gathered Series becomes one column of the batch table.
df = pd.concat(client.gather(futures), axis=1)
df.to_csv('../geo-wf/gene_counts_2.tsv', sep='\t')

In [7]:
# Gene counts, batch 3: every entry of `rnaseq` from position 10000 onward.
# One delayed read per accession, built up front and then run on the cluster.
work = [
    read_parquet(f'../output/aln-wf/gene_counts/{srx}.parquet')
    for srx in rnaseq[10000:]
]
futures = client.compute(work)
# Each gathered Series becomes one column of the batch table.
df = pd.concat(client.gather(futures), axis=1)
df.to_csv('../geo-wf/gene_counts_3.tsv', sep='\t')

In [2]:
# Reload the three gene-count batch tables (first column as the index) and
# join them side by side into a single table.
df = pd.concat(
    [
        pd.read_csv(batch_tsv, sep='\t', index_col=0)
        for batch_tsv in (
            '../geo-wf/gene_counts_1.tsv',
            '../geo-wf/gene_counts_2.tsv',
            '../geo-wf/gene_counts_3.tsv',
        )
    ],
    axis=1,
)

# Write the combined table to the upload directory.
df.to_csv('/media/psf/Promise_Pegasus/fearjm/ncbi_remap/geo-wf/justin.fear@nih.gov/gene_counts.tsv', sep='\t')

## Intergenic Counts

In [8]:
# Intergenic counts, batch 1: entries [0, 5000) of `rnaseq`.
# One delayed read per accession, built up front and then run on the cluster.
work = [
    read_parquet(f'../output/aln-wf/intergenic_counts/{srx}.parquet')
    for srx in rnaseq[:5000]
]
futures = client.compute(work)
# Each gathered Series becomes one column of the batch table.
df = pd.concat(client.gather(futures), axis=1)
df.to_csv('../geo-wf/intergenic_counts_1.tsv', sep='\t')

In [9]:
# Intergenic counts, batch 2: entries [5000, 10000) of `rnaseq`.
# One delayed read per accession, built up front and then run on the cluster.
work = [
    read_parquet(f'../output/aln-wf/intergenic_counts/{srx}.parquet')
    for srx in rnaseq[5000:10000]
]
futures = client.compute(work)
# Each gathered Series becomes one column of the batch table.
df = pd.concat(client.gather(futures), axis=1)
df.to_csv('../geo-wf/intergenic_counts_2.tsv', sep='\t')

In [10]:
# Intergenic counts, batch 3: every entry of `rnaseq` from position 10000 onward.
# One delayed read per accession, built up front and then run on the cluster.
work = [
    read_parquet(f'../output/aln-wf/intergenic_counts/{srx}.parquet')
    for srx in rnaseq[10000:]
]
futures = client.compute(work)
# Each gathered Series becomes one column of the batch table.
df = pd.concat(client.gather(futures), axis=1)
df.to_csv('../geo-wf/intergenic_counts_3.tsv', sep='\t')

In [3]:
# Reload the three intergenic-count batch tables (first column as the index)
# and join them side by side into a single table.
df = pd.concat(
    [
        pd.read_csv(batch_tsv, sep='\t', index_col=0)
        for batch_tsv in (
            '../geo-wf/intergenic_counts_1.tsv',
            '../geo-wf/intergenic_counts_2.tsv',
            '../geo-wf/intergenic_counts_3.tsv',
        )
    ],
    axis=1,
)

# Write the combined table to the upload directory.
df.to_csv('/media/psf/Promise_Pegasus/fearjm/ncbi_remap/geo-wf/justin.fear@nih.gov/intergenic_counts.tsv', sep='\t')