In [10]:
# Notebook setup: autoreload, inline plotting, and warning suppression.
# (Re)load the autoreload extension; mode 2 reloads all modules before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Suppress all warnings for the rest of the kernel session.
warnings.filterwarnings('ignore')

## Ingesting data into tileDB 

In [11]:
from seqdataloader.dbingest import * 

The header of the input task file should contain one or more of the following fields: 
    * dataset (this one's required -- it's a unique label for your dataset) 
    * pval_bigwig 
    * fc_bigwig 
    * count_bigwig_plus_5p 
    * count_bigwig_minus_5p
    * idr_peak
    * overlap_peak 
    * ambig_peak 
    
The file paths can be either local paths or web-based URLs. 

In [12]:
# Print the task file whose idr_peak and fc_bigwig columns point at remote ENCODE URLs.
!cat tasks.dbingest.tsv

dataset	idr_peak	fc_bigwig	ambig_peak
ENCFF209DJG	https://www.encodeproject.org/files/ENCFF209DJG/@@download/ENCFF209DJG.bed.gz	https://www.encodeproject.org/files/ENCFF842XRQ/@@download/ENCFF842XRQ.bigWig	hg38.blacklist.bed.gz


In [13]:
# Print the task file variant that uses local file paths for every column.
!cat tasks.dbingest.local.tsv

dataset	idr_peak	fc_bigwig	ambig_peak
ENCFF209DJG	ENCFF209DJG.bed.gz	ENCFF842XRQ.bigWig	hg38.blacklist.bed.gz


### run as script

In [15]:
## Multi-threaded ingest through the db_ingest command-line script.
## The multi-threaded version is faster, but more QC is needed to pin down an edge case in which a lot of memory gets consumed.
!db_ingest --tiledb_metadata tasks.dbingest.local.tsv \
    --tiledb_group hepg2_dnase_encode \
    --overwrite \
    --chrom_sizes hg38.chr10.size \
    --chrom_threads 1 \
    --task_threads 1 \
    --write_threads 30 \
    --attribute_config encode_pipeline \
    --tile_size 9000 \
    --batch_size 1000000


loaded tiledb metadata
loaded chrom sizes
tiledb group already exists
parsed pool inputs
made pool!
store_summits:True
summit_indicator:2
got:idr_peak for chrom:chr10
store_summits:False
summit_indicator:None
13.610400199890137
got:fc_bigwig for chrom:chr10
store_summits:False
summit_indicator:None
got:ambig_peak for chrom:chr10
starting to write output
got cur vals
dict_to_write[key].shape:(133797422,)
dict_to_write[key].shape:(133797422,)
dict_to_write[key].shape:(133797422,)
updated data dict for writing
finalizing the write
length of pool inputs:134
made pool
start:0, end:1000000
start:1000000, end:2000000
start:2000000, end:3000000
start:3000000, end:4000000
start:4000000, end:5000000
start:5000000, end:6000000
start:6000000, end:7000000
start:7000000, end:8000000
start:8000000, end:9000000
start:9000000, end:10000000
start:10000000, end:11000000
start:11000000, end:12000000
start:12000000, end:13000000
start:13000000, end:14000000
start:14000000, end:15000000
start:15000000, end:

done with chunk start:100000000, end:101000000
start:119000000, end:120000000
done with chunk start:9000000, end:10000000
done
done
done
done
done
done
done
done
done
done
done
start:120000000, end:121000000
done with chunk start:102000000, end:103000000
start:121000000, end:122000000
done with chunk start:103000000, end:104000000
start:122000000, end:123000000
done with chunk start:104000000, end:105000000
start:123000000, end:124000000
done with chunk start:105000000, end:106000000
start:124000000, end:125000000
done with chunk start:106000000, end:107000000
start:125000000, end:126000000
done with chunk start:107000000, end:108000000
start:126000000, end:127000000
done with chunk start:108000000, end:109000000
start:127000000, end:128000000
done with chunk start:109000000, end:110000000
start:128000000, end:129000000
done with chunk start:23000000, end:24000000
start:129000000, end:130000000
done with chunk start:111000000, end:112000000
start:130000000, end:131000000
done with chun

In [None]:
## Single-threaded ingest through the db_ingest_single_threaded command-line script.
## This version relies on tiledb's internal parallel writes, without any additional parallelization across chroms/tasks.
!db_ingest_single_threaded --tiledb_metadata tasks.dbingest.local.tsv \
    --tiledb_group hepg2_dnase_encode_single_threaded \
    --overwrite \
    --chrom_sizes hg38.chr10.size \
    --attribute_config encode_pipeline \
    --tile_size 9000


You can also run the ingest code as a Python function: 

In [None]:
# Arguments for ingest(); the keys mirror the flags of the db_ingest command-line script.
# NOTE(review): attribute_config, tile_size and batch_size are not set here, unlike the
# command-line invocation -- presumably ingest() falls back to defaults; confirm.
args=dict(
    tiledb_metadata="tasks.dbingest.local.tsv",
    tiledb_group="hepg2_dnase_encode",
    overwrite=True,
    chrom_sizes="hg38.chr10.size",
    chrom_threads=1,
    task_threads=1,
    write_threads=1,
)

ingest(args)

## Reading data from tiledb

In [25]:
# Chromosome queried in the read examples, and the number of positions read from it.
chrom,chromsize="chr10",133797422


In [26]:
# Open the ingested chr10 array of dataset ENCFF209DJG in read mode ('r') so it can be examined.
# NOTE(review): the handle is not closed in the cells shown here -- consider data.close() once the reads are done.
import tiledb 
data=tiledb.DenseArray("hepg2_dnase_encode/ENCFF209DJG.chr10",'r')


In [27]:
%%time
# Slice positions 0..chromsize out of the tiledb array; %%time reports how long the read takes.
subset=data[0:chromsize]

CPU times: user 35.4 s, sys: 28.3 s, total: 1min 3s
Wall time: 5.17 s


In [28]:
# List the attribute names returned by the tiledb read.
print(subset.keys())

odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minus_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])


In [29]:
import pyBigWig
import numpy as np 
# Open the fc_bigwig file from the task table directly with pyBigWig (presumably to compare read times with tiledb).
# NOTE(review): this rebinds `data`, which until this cell referred to the tiledb array opened earlier.
data=pyBigWig.open("ENCFF842XRQ.bigWig")


In [30]:
%%time
# Read positions 0..chromsize of `chrom` from the bigWig; np.nan_to_num replaces NaN entries with 0.
signal_data=np.nan_to_num(data.values(chrom,0,chromsize))


CPU times: user 10.8 s, sys: 4.25 s, total: 15 s
Wall time: 14.9 s


## Integration with batch producers 

In [24]:
#unit tests for class seqdataloader.batchproducers.coordbased.coordstovals.BasicTiledbProfileCoordsToVals
from seqdataloader.batchproducers.coordbased.coordstovals.tiledb import *



Using TensorFlow backend.


In [31]:
# Build the test Coord objects: the interval 1000000-2000000 on each of chr1..chr7,
# first with isplusstrand=True for all seven chromosomes, then with isplusstrand=False.
from collections import namedtuple
Coord=namedtuple('Coord','chrom start end isplusstrand')
coords=[Coord('chr%d'%chrom_index,1000000,2000000,plus_strand)
        for plus_strand in (True,False)
        for chrom_index in range(1,8)]

# The positive- and negative-label source attributes are both "fc_bigwig".
pos_label_source_attribute=neg_label_source_attribute="fc_bigwig"



In [32]:

#case 1: tiledb_paths is a string
# NOTE(review): the path is aligned with the one used by the list and dict cases; the earlier
# "/mnt/data/tiledb/ENCSR000EOY" failed with "Cannot open array; Array does not exist".
# Verify that this location exists on your machine.
tiledb_paths="/mnt/data/tiledb/encode/dnase/ENCSR000EOY"
ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,
                                    pos_label_source_attribute=pos_label_source_attribute,
                                    neg_label_source_attribute=neg_label_source_attribute)
# Call the instance directly instead of going through the __call__ dunder.
string_vals=ctov(coords)
string_vals.shape


TileDBError: [TileDB::StorageManager] Error: Cannot open array; Array does not exist

In [None]:
# Rebuild coords with the shorter interval 1000-2000 on chr1..chr7:
# isplusstrand=True for all seven chromosomes first, then isplusstrand=False.
coords=[Coord('chr%d'%chrom_index,1000,2000,plus_strand)
        for plus_strand in (True,False)
        for chrom_index in range(1,8)]

In [None]:
# Re-run the converter on the current coords; call the instance directly rather than via __call__.
string_vals=ctov(coords)

In [None]:
#case2: tiledb_paths is a list
# The same array path is listed three times to exercise the list input form.
tiledb_paths=["/mnt/data/tiledb/encode/dnase/ENCSR000EOY",
              "/mnt/data/tiledb/encode/dnase/ENCSR000EOY",
              "/mnt/data/tiledb/encode/dnase/ENCSR000EOY"]
ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,
                                    pos_label_source_attribute=pos_label_source_attribute,
                                    neg_label_source_attribute=neg_label_source_attribute)
# Call the instance directly instead of going through the __call__ dunder.
list_vals=ctov(coords)
list_vals

In [None]:
#case3: tiledb_paths is a dict
# Three keys ('mode0'..'mode2') map to the same array path to exercise the dict input form.
tiledb_paths={'mode0':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY",
              'mode1':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY",
              'mode2':"/mnt/data/tiledb/encode/dnase/ENCSR000EOY"}

ctov=BasicTiledbProfileCoordsToVals(tiledb_paths=tiledb_paths,
                                    pos_label_source_attribute=pos_label_source_attribute,
                                    neg_label_source_attribute=neg_label_source_attribute)
# Call the instance directly instead of going through the __call__ dunder.
dict_vals=ctov(coords)
dict_vals
