In [1]:
import os
import sys
from pathlib import Path
from more_itertools import chunked

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from dask.distributed import Client, as_completed
from dask import dataframe as dd
from dask import delayed

# Project level imports
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [2]:
# Set up the notebook through the project helper. As run here it printed a
# notice about output/fbgn2chrom.tsv plus the last-updated date and git hash.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2019-02-23 
Git hash: f9a755c9ef13ad8d0af3d21d28cffe6e5451402f


In [3]:
# Start dask cluster
# Client() is created without a scheduler address; the recorded output shows a
# scheduler on 127.0.0.1 with 32 workers.
dask_client = Client()
dask_client  # bare last expression so the notebook renders the client summary

0,1
Client  Scheduler: tcp://127.0.0.1:41575  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 32  Cores: 32  Memory: 33.65 GB


In [4]:
def read_gene_counts(srx, counts_dir='/home/fearjm/scratch/ncbi_remap/output/aln-wf/gene_counts'):
    """Load the ``count`` column of one sample's gene-count parquet file.

    Parameters
    ----------
    srx : str
        Sample identifier; the file read is ``{counts_dir}/{srx}.parquet``.
    counts_dir : str or os.PathLike, optional
        Directory holding the per-sample parquet files. The default is the
        location that used to be hard-coded in the body, so existing
        one-argument calls (e.g. through ``Client.map``) are unaffected.

    Returns
    -------
    pandas.Series
        The ``count`` column, renamed to ``srx``.
    """
    # Built with an f-string so the default produces the exact original path.
    parquet_path = f'{counts_dir}/{srx}.parquet'
    return pd.read_parquet(parquet_path, columns=['count'])['count'].rename(srx)


def chunk_and_run(iterable, chunk_size=1_000):
    """Lazily yield one wide DataFrame per batch of ``iterable``.

    The input is split into batches of at most ``chunk_size`` items. For each
    batch, ``read_gene_counts`` is mapped over the items on ``dask_client``,
    the results are gathered, and the gathered objects are concatenated
    column-wise (``axis=1``) into the frame that is yielded.
    """
    for batch in chunked(iterable, chunk_size):
        pending = dask_client.map(read_gene_counts, batch)
        gathered = dask_client.gather(pending)
        yield pd.concat(gathered, axis=1)
        
def build_table(iterable, chunk_size=1000):
    """Combine the per-chunk frames from ``chunk_and_run`` into one table.

    Parameters
    ----------
    iterable : iterable
        Items forwarded to ``chunk_and_run``.
    chunk_size : int, optional
        Maximum number of items handled per chunk (default 1000).

    Returns
    -------
    pandas.DataFrame
        The first chunk's frame successively merged with every later chunk on
        ``'FBgn'``. ``DataFrame.merge`` defaults to an inner join, so only keys
        present in every chunk are kept.

    Raises
    ------
    ValueError
        If ``iterable`` yields no items, so there is nothing to build.
    """
    tables = chunk_and_run(iterable, chunk_size)

    # A bare next() on an exhausted generator would leak StopIteration to the
    # caller; use a sentinel and fail with an explicit message instead.
    table = next(tables, None)
    if table is None:
        raise ValueError('build_table received an empty iterable; no table to build')

    for chunk_table in tables:
        table = table.merge(chunk_table, on='FBgn')
    return table

In [5]:
# Unique index labels of the rows whose `Fear_et_al_library_strategy` value
# (renamed to `strategy`) equals 'RNA-Seq'.
rnaseq = (
    pd.read_parquet('../output/metadata-wf/select_library_strategy.parquet')
    .rename(columns={'Fear_et_al_library_strategy': 'strategy'})
    .strategy
    .loc[lambda strategy: strategy == 'RNA-Seq']
    .index
    .unique()
    .tolist()
)

In [6]:
# Build the wide table from the identifiers in `rnaseq` (each sample's counts
# are read as a Series named after its identifier) and save it as parquet.
df = build_table(rnaseq)
df.to_parquet('/home/fearjm/scratch/ncbi_remap/rnaseq_wide.parquet')

In [7]:
# Rank with pandas defaults (DataFrame.rank: axis=0, i.e. values are ranked
# within each column; ties get the average rank) and save as parquet.
df.rank().to_parquet('/home/fearjm/scratch/ncbi_remap/rnaseq_wide_ranks.parquet')