In [2]:
# %load ../config/defaults.py
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [4]:
# Setup notebook
# Calls the project helper `Nb.setup_notebook()` and keeps its return value as
# `nbconfig`. The recorded output of this cell shows a reminder about
# output/fbgn2chrom.tsv, a "last updated" date and the current git hash.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-10-11 
Git hash: c5425d3c436714fc2475c8e5525cbd28738fd3eb


In [5]:
# Connect to data store
# The HDF5 file is opened read-only (mode='r').
# NOTE(review): `store` is not referenced by any of the cells visible here and
# is never closed in them -- confirm it is still needed further down.
store = pd.HDFStore('../sra.h5', mode='r')

In [9]:
# Toy integer vector spanning several orders of magnitude (plus a zero);
# used by the min-max scaling scratch cell that follows.
bob = np.array([10**3, 10**5, 10**6, 0])

In [11]:
# Min-max scale `bob` onto [0, 1]: subtract the minimum, divide by the range.
(bob - bob.min()) / (bob.max() - bob.min())

array([ 0.001,  0.1  ,  1.   ,  0.   ])

In [6]:
def _majority_strategy(votes):
    """Collapse a table of per-sample calls into one call per ``srx``.

    The frame is reshaped to long form keyed on ``srx``, the occurrences of
    each value are counted within every sample, and the most frequent value
    is kept for that sample.

    Parameters
    ----------
    votes : pandas.DataFrame
        Table that exposes ``srx`` as a column after ``reset_index()``; every
        other column is treated as one call for that sample.

    Returns
    -------
    pandas.DataFrame
        One column named ``data`` holding the most frequent value per ``srx``.
    """
    tallies = (
        votes.reset_index()
        .melt(id_vars='srx')
        .groupby('srx')['value']
        .value_counts()
        .unstack()
    )
    return tallies.idxmax(axis=1).to_frame(name='data')


# Library strategy annotation labelled `sra`
# (presumably the value recorded in SRA, judging by the file name).
sra = pd.read_parquet('../metadata-wf/output/munge_library_strategy_from_mongo.parquet')
sra.columns = ['sra']

# Library strategy annotation labelled `author`
# (presumably parsed from author free text, judging by the file name).
author = pd.read_parquet('../metadata-wf/output/free_text_library_strategy.parquet')
author.columns = ['author']

# Data-driven calls: reduce each table to a single majority call per sample.
# Both tables previously went through the same copy-pasted pipeline; they now
# share `_majority_strategy`.
data = _majority_strategy(
    pd.read_parquet('../metadata-wf/output/random_forest_library_strategy.parquet')
)
other = _majority_strategy(
    pd.read_parquet('../metadata-wf/output/random_forest_library_strategy_other.parquet')
)

# Stack the two sets of calls into one `data` column.
data = pd.concat([data, other])

In [7]:
# Put the sra, author and data annotations side by side (rows are matched on
# the index), then drop samples with no `data` value -- these have not
# completed the workflow.
libstrat = (
    pd.concat([sra, author, data], axis=1, sort=True)
    .dropna(subset=['data'])
)

In [8]:
# Row-wise agreement masks, all relative to the `sra` annotation.
# A missing value never compares equal, so rows with a null author are False
# in `author_same`.
author_same = libstrat['sra'].eq(libstrat['author'])
data_same = libstrat['sra'].eq(libstrat['data'])
all_same = data_same & author_same

In [114]:
# Samples on which the sra, author and data annotations all agree;
# keep the shared label under the name `library_strategy`.
_all = libstrat.loc[all_same, 'sra'].rename('library_strategy')
_all.value_counts()

RNA-Seq      11785
ChIP-Seq      2381
WGS           1107
AMPLICON        80
MNase-Seq       10
miRNA-Seq        5
ncRNA-Seq        2
Name: library_strategy, dtype: int64

In [113]:
# Samples where sra and data agree and there is no author annotation to
# compare against; keep the shared label under the name `library_strategy`.
_data = libstrat.loc[data_same & libstrat['author'].isnull(), 'sra'].rename('library_strategy')
_data.value_counts()

RNA-Seq      503
WGS          213
ChIP-Seq      55
miRNA-Seq     39
RIP-Seq        2
AMPLICON       1
EST            1
Name: library_strategy, dtype: int64

In [118]:
# these have non-conflicting support from multiple places
# Stacks the two label Series built earlier: `_all` (sra, author and data all
# agree) and `_data` (sra agrees with data, author annotation missing).
high_confidence = pd.concat([_all, _data])

In [120]:
# Number of high-confidence samples per library strategy.
high_confidence.value_counts()

RNA-Seq      12288
ChIP-Seq      2436
WGS           1320
AMPLICON        81
miRNA-Seq       44
MNase-Seq       10
RIP-Seq          2
ncRNA-Seq        2
EST              1
Name: library_strategy, dtype: int64

In [123]:
# Author and sra agree, but not all three sources do -- i.e. the data-driven
# (model) call is the one that differs. Tally these rows by their sra label.
_model = libstrat.loc[author_same & ~all_same]
_model['sra'].value_counts()

WGS          79
MNase-Seq    43
ChIP-Seq     28
ncRNA-Seq    14
RNA-Seq       8
miRNA-Seq     8
Name: sra, dtype: int64

In [127]:
# Data and sra agree, an author annotation exists, and not all three sources
# agree -- i.e. the author annotation is the one that differs.
# Tally these rows by their sra label.
_author = libstrat.loc[data_same & ~all_same & libstrat['author'].notnull()]
_author['sra'].value_counts()

EST                 4302
RNA-Seq             2122
WGS                  517
ChIP-Seq             498
Targeted-Capture      93
miRNA-Seq             63
ncRNA-Seq             25
MNase-Seq             19
RIP-Seq                3
Name: sra, dtype: int64

In [129]:
# For the same rows, tally what the author annotation says instead.
_author['author'].value_counts()

3Prime-Seq|CLONE|DNA-Seq|Kilo-Seq                   4158
END-Seq|RNA-Seq                                      580
ChIP-Seq|RNA-Seq                                     568
END-Seq                                              311
3Prime-Seq|RNA-Seq                                   259
3Prime-Seq|WGS                                       212
RNA-Seq|mmPCR-Seq                                    113
RNA-Seq                                               78
END-Seq|Pool-Seq                                      77
END-Seq|WGS                                           77
WGS                                                   64
3Prime-Seq|CLONE|EST|RNA-Seq                          58
TADA-Seq                                              56
Pool-Seq                                              52
ChIP-Seq|MNase-Seq                                    51
ATAC-Seq|ChIP-Seq|RNA-Seq                             45
ChIP-Seq|RNA-Seq|WGS                                  38
mmPCR-Seq                      

In [135]:
# Author annotations for samples whose sra label is 'OTHER'.
libstrat.loc[libstrat['sra'] == 'OTHER', 'author'].value_counts()

4C-Seq                                                       721
mmPCR-Seq                                                    266
3Prime-Seq                                                   245
RNA-Seq                                                      140
ATAC-Seq|END-Seq                                              78
4C-Seq|HiC-Seq                                                78
DNA-Seq                                                       62
DNA-Seq|WGS                                                   60
RAD-Seq|WGS                                                   55
WGS                                                           39
STARR-Seq                                                     39
PAL-Seq|RNA-Seq                                               25
Pool-Seq                                                      23
CLIP-Seq|RNA-Seq                                              22
Repli-Seq                                                     22
HiC-Seq                  

In [136]:
# Data-driven calls for samples whose sra label is 'OTHER'.
libstrat.loc[libstrat['sra'] == 'OTHER', 'data'].value_counts()

RNA-Seq     1362
ChIP-Seq     926
WGS          226
EST           58
Name: data, dtype: int64

In [35]:
# Remove samples for which there is no data-driven class (null `data`).
# Original author note: this includes OTHER and low-frequency classes.
# NOTE(review): the same null-`data` filter is already applied in the cell
# that first builds `libstrat` -- confirm whether this cell is still needed.
libstrat = libstrat.loc[libstrat['data'].notna()]

In [55]:
# True where the author annotation is either missing or identical to sra,
# i.e. the author does not contradict the sra label.
author_mask = libstrat['author'].isnull() | libstrat['sra'].eq(libstrat['author'])

In [60]:
# Rows that have no author annotation.
libstrat.loc[libstrat['author'].isnull()]

Unnamed: 0,sra,author,data
DRX015073,WGS,,WGS
DRX015074,WGS,,WGS
DRX015075,WGS,,WGS
DRX015076,WGS,,WGS
DRX027351,RNA-Seq,,RNA-Seq
DRX027352,RNA-Seq,,RNA-Seq
DRX027353,RNA-Seq,,RNA-Seq
DRX027354,RNA-Seq,,RNA-Seq
DRX027355,RNA-Seq,,RNA-Seq
DRX027356,RNA-Seq,,RNA-Seq


In [49]:
# Rows where sra agrees with data and the author annotation does not
# contradict it (missing or identical, per `author_mask`).
all_the_same = libstrat.loc[author_mask & libstrat['sra'].eq(libstrat['data'])]

In [50]:
# Display the rows where sra agrees with data and the author annotation is
# either missing or also agrees.
all_the_same

Unnamed: 0,sra,author,data
DRX013093,RNA-Seq,RNA-Seq,RNA-Seq
DRX013094,RNA-Seq,RNA-Seq,RNA-Seq
DRX014765,RNA-Seq,RNA-Seq,RNA-Seq
DRX014766,RNA-Seq,RNA-Seq,RNA-Seq
DRX014767,RNA-Seq,RNA-Seq,RNA-Seq
DRX014768,RNA-Seq,RNA-Seq,RNA-Seq
DRX014769,RNA-Seq,RNA-Seq,RNA-Seq
DRX014770,RNA-Seq,RNA-Seq,RNA-Seq
DRX014771,RNA-Seq,RNA-Seq,RNA-Seq
DRX014772,RNA-Seq,RNA-Seq,RNA-Seq


In [39]:
# Rows where sra agrees with data but not with author. A missing author
# never compares equal to sra, so rows without an author annotation are
# included as well.
focus = libstrat.loc[libstrat['sra'].ne(libstrat['author']) & libstrat['sra'].eq(libstrat['data'])]

In [46]:
# Which of the `focus` rows have no author annotation at all?
focus['author'].isnull()

DRX015073     True
DRX015074     True
DRX015075     True
DRX015076     True
DRX027351     True
DRX027352     True
DRX027353     True
DRX027354     True
DRX027355     True
DRX027356     True
DRX027357     True
DRX027358     True
DRX027359     True
DRX027360     True
DRX027361     True
DRX027362     True
DRX027363     True
DRX027364     True
DRX027365     True
DRX027366     True
DRX027367     True
DRX027368     True
DRX027369     True
DRX027370     True
DRX027371     True
DRX027372     True
DRX027373     True
DRX027374     True
DRX042143    False
DRX042144    False
             ...  
SRX978778    False
SRX978779    False
SRX978780    False
SRX978781    False
SRX978782    False
SRX978783    False
SRX978784    False
SRX978785    False
SRX978786    False
SRX978787    False
SRX978788    False
SRX978789    False
SRX978790    False
SRX978791    False
SRX978792    False
SRX978793    False
SRX978794    False
SRX978795    False
SRX978796    False
SRX978797    False
SRX978798    False
SRX978799   