In [1]:
import pandas as pd 
import numpy as np
import h5py
import tiledb 
import random 
import s3fs
import pyBigWig
# Seed Python's global `random` generator; the region start positions drawn
# later with random.randint depend on this state.
random.seed(1234)

In [27]:
# --- what to read ---
task = "ENCSR000EID"                  # identifier used to build the array/file names
chrom = "chr1"                        # chromosome the regions are drawn from
chromsize = 248956422                 # length of `chrom` in bp (presumably the hg38 chr1 size -- confirm)
attribute_of_interest = 'fc_bigwig'   # TileDB attribute extracted for every region

# --- batch shape ---
batch_size = 100                      # number of regions per batch
vector_length = 1000                  # number of positions per region

In [28]:
## random batch of data -- `batch_size` genomic regions of `vector_length` bp each
# Re-seed inside this cell so that re-running it always reproduces the same
# batch. Without this the result depends on how many values have already been
# drawn from the global generator since it was seeded in the import cell.
random.seed(1234)
# random.randint is inclusive on both ends, so capping the start here keeps
# every region's end at or below `chromsize`.
max_start=chromsize-vector_length
region_starts=[random.randint(0,max_start) for _ in range(batch_size)]
# Each region is [chromosome, start, end] with end = start + vector_length.
regions=[[chrom,cur_start,cur_start+vector_length] for cur_start in region_starts]
print(regions[0:10])

[['chr1', 106928163, 106929163], ['chr1', 14092580, 14093580], ['chr1', 93825126, 93826126], ['chr1', 235533422, 235534422], ['chr1', 224012078, 224013078], ['chr1', 118260606, 118261606], ['chr1', 62410463, 62411463], ['chr1', 208544026, 208545026], ['chr1', 131667959, 131668959], ['chr1', 44377938, 44378938]]


In [29]:
## Tiledb Test 1: open tiledb array
# The array name is "<task>.<chrom>"; it is opened read-only with a new context.
tdb_array=tiledb.open("{}.{}".format(task,chrom),mode='r',ctx=tiledb.Ctx())

In [30]:
## Tiledb Test 2: extract values for a batch of data
# One slice per region. multi_index treats the upper bound as inclusive, so
# `end-1` selects the same positions as the half-open interval [start, end).
tdb_indices=[slice(start,end-1) for _chrom,start,end in regions]
batch_tdb=(
    tdb_array
    .query(attrs=[attribute_of_interest])
    .multi_index[tdb_indices][attribute_of_interest]
)
# Fold the returned values into one row per region.
batch_tdb=np.reshape(batch_tdb,(batch_size,-1))


In [31]:
## HDF5 Test 1: open hdf5 file for reading
# The file name is "<task>.<chrom>.hdf5".
hdf5_local=h5py.File("{}.{}.hdf5".format(task,chrom),'r')

In [32]:
## HDF5 Test 2: read regions for `task` from a local hdf5 file
# Pre-allocate a NaN-filled (batch_size, vector_length) array; each row is
# overwritten with one region's values below.
batch_hdf5=np.full((batch_size,vector_length),np.nan)
# Look the dataset up once instead of indexing the file by name on every
# iteration of the loop.
hdf5_data=hdf5_local['data']
for region_index,(region_chrom,region_start,region_end) in enumerate(regions):
    # Plain Python slicing: half-open [region_start, region_end).
    batch_hdf5[region_index,:]=hdf5_data[region_start:region_end]

In [33]:
## pyBigWig  Test 1: open BigWig for reading
# Derive the file name from `task`, as the TileDB and HDF5 opens do, so that
# changing `task` cannot leave this backend pointing at a different dataset.
bigwig_local=pyBigWig.open(task+".merged.nodup.fc.signal.bigwig",'r')

In [34]:
## pyBigWig  Test 2: read regions for task ENCSR000EID from a local BigWig
# Start from a NaN-filled (batch_size, vector_length) array and fill one row
# per region with the values returned for (chromosome, start, end).
batch_bw=np.full((batch_size,vector_length),np.nan)
for region_index,(bw_chrom,bw_start,bw_end) in enumerate(regions):
    batch_bw[region_index,:]=bigwig_local.values(bw_chrom,bw_start,bw_end,numpy=True)

In [35]:
#make sure all three backends return the same batch
# np.array_equal requires identical shapes and element-wise equality at every
# position, so the check no longer hard-codes the expected element count
# (batch_size*vector_length). NaN does not compare equal to NaN here, exactly
# as with a plain `==` comparison.
assert np.array_equal(batch_tdb,batch_hdf5), "TileDB and HDF5 batches differ"
assert np.array_equal(batch_tdb,batch_bw), "TileDB and BigWig batches differ"
assert np.array_equal(batch_hdf5,batch_bw), "HDF5 and BigWig batches differ"
           