In [1]:
import pandas as pd 
import numpy as np
import h5py
import tiledb 
import random 
import s3fs
import pyBigWig
# Seed Python's global `random` module so the randomly drawn regions below
# come out the same on every fresh Restart & Run All.
random.seed(1234)

In [2]:
# --- Benchmark configuration -------------------------------------------------
# What to read.
task = "ENCSR000EID"                   # dataset identifier used to build the TileDB / HDF5 names
chrom = "chr1"                         # chromosome every sampled region lies on
attribute_of_interest = 'fc_bigwig'    # TileDB attribute that is read back

# How much to read.
batch_size = 100000                    # number of regions in the batch
vector_length = 1000                   # number of positions per region

# Upper bound used when sampling region start coordinates.
# NOTE(review): presumably the chr1 length minus a 50 Mb margin -- confirm.
chromsize = 248956422 - 50000000

In [3]:
## Random batch of data -- `batch_size` genomic regions of `vector_length` bp each,
## stored as [chrom, start, end] with end = start + vector_length.
# A dedicated generator seeded with the notebook's seed draws the same sequence as the
# freshly seeded global `random` module, so a full run is unchanged, while re-running
# this cell on its own regenerates the identical regions instead of advancing hidden
# RNG state.
region_rng = random.Random(1234)
regions = []
for _batch_entry in range(batch_size):
    # Starts are drawn from [vector_length, chromsize - vector_length] (both inclusive).
    cur_start = region_rng.randint(vector_length, chromsize - vector_length)
    regions.append([chrom, cur_start, cur_start + vector_length])
print(regions[0:10])

[['chr1', 118309257, 118310257], ['chr1', 31367822, 31368822], ['chr1', 2011977, 2012977], ['chr1', 24332956, 24333956], ['chr1', 156291525, 156292525], ['chr1', 9376421, 9377421], ['chr1', 180272449, 180273449], ['chr1', 186050237, 186051237], ['chr1', 22532996, 22533996], ['chr1', 26420966, 26421966]]


In [4]:
## TileDB test 1: open the array named "<task>.<chrom>" in read mode
tdb_array = tiledb.open(task + "." + chrom, 'r', ctx=tiledb.Ctx())

In [5]:
## TileDB test 2: extract the values for the whole batch with a single multi-range query
# One slice per region, with the end coordinate reduced by one.
# NOTE(review): presumably because multi_index treats both slice bounds as inclusive -- confirm.
tdb_indices = [slice(start, end - 1) for _chrom, start, end in regions]
# Read only the attribute of interest, then fold the flat result into one row per region.
batch_tdb = np.reshape(
    tdb_array.query(attrs=[attribute_of_interest]).multi_index[tdb_indices][attribute_of_interest],
    (batch_size, -1),
)


In [6]:
## HDF5 test 1: open the local "<task>.<chrom>.hdf5" file read-only
hdf5_local = h5py.File('.'.join([task, chrom, "hdf5"]), mode='r')

In [7]:
## HDF5 test 2: read every region of the batch from the local HDF5 file
# Look the dataset up once: indexing the file by name inside the loop repeats the
# same lookup for each of the batch_size regions.
hdf5_data = hdf5_local['data']
# Pre-filled with NaN so any row that was not written stands out.
batch_hdf5 = np.full((batch_size, vector_length), np.nan)
for region_index, region in enumerate(regions):
    # region is [chrom, start, end]; only the coordinates are needed here.
    batch_hdf5[region_index, :] = hdf5_data[region[1]:region[2]]

In [8]:
## pyBigWig test 1: open the local BigWig signal track for reading
# The file name is built from `task` (as the TileDB and HDF5 cells do) so that all
# backends keep pointing at the same dataset if the configuration changes.
bigwig_local = pyBigWig.open(task + ".merged.nodup.fc.signal.bigwig", 'r')

In [9]:
## pyBigWig test 2: read every region of the batch from the local BigWig
# Pre-filled with NaN so any row that was not written stands out.
batch_bw = np.full((batch_size, vector_length), np.nan)
for row, (region_chrom, start, end) in enumerate(regions):
    # One values() call per region, returned as a numpy array.
    batch_bw[row, :] = bigwig_local.values(region_chrom, start, end, numpy=True)

In [10]:
# numpy test 1: extract the configured chromosome from the BigWig and save it as .npy
# `task` and `chrom` are used instead of repeating their values: the regions are drawn
# from `chrom`, so extracting a hard-coded chromosome would silently mismatch them if
# the configuration changed.

bigwig_local = pyBigWig.open(task + ".merged.nodup.fc.signal.bigwig", 'r')
# NOTE(review): end=-1 is relied on to mean "through the end of the chromosome" --
# confirm against the pyBigWig documentation.
signal = bigwig_local.values(chrom, 0, -1, numpy=True)
np.save(task + "." + chrom + ".npy", signal)

In [11]:
# numpy test 2: read the batch from a memory-mapped .npy array
# The path is derived from `task` and `chrom` so it follows the configuration; with
# the current settings it resolves to "ENCSR000EID.chr1.npy".

numpy_local = np.load(task + "." + chrom + ".npy", mmap_mode='r')

# Pre-filled with NaN so any row that was not written stands out.
batch_npy = np.full((batch_size, vector_length), np.nan)
for region_index, region in enumerate(regions):
    # region is [chrom, start, end]; slice the memory-mapped array by coordinates.
    batch_npy[region_index] = numpy_local[region[1]:region[2]]

In [12]:
# numpy test 3: read the batch from the in-memory `signal` array

# Pre-filled with NaN so any row that was not written stands out.
batch_npy2 = np.full((batch_size, vector_length), np.nan)
for row, (_chrom, start, end) in enumerate(regions):
    batch_npy2[row] = signal[start:end]

In [13]:
# Total signal of each backend's batch, displayed side by side as a quick sanity check.
tuple(batch.sum() for batch in (batch_tdb, batch_hdf5, batch_bw, batch_npy, batch_npy2))

(45326624.0,
 45326639.46270242,
 45326639.46270242,
 45326639.46270242,
 45326639.46270242)

In [14]:
# Make sure every backend returned the same batch.
# Summing the element-wise equality mask of two identical batches yields
# batch_size * vector_length matches, not batch_size, so comparing that count to
# 100000 fails even when the data agree. Compare the arrays directly instead;
# assert_array_equal also reports how many elements differ and by how much.
# Note: assert_array_equal treats NaNs found at the same position as equal.
expected_shape = (batch_size, vector_length)
for batch_name, batch in [
    ("batch_tdb", batch_tdb),
    ("batch_hdf5", batch_hdf5),
    ("batch_bw", batch_bw),
    ("batch_npy", batch_npy),
    ("batch_npy2", batch_npy2),
]:
    # A shape mismatch (e.g. an off-by-one in a slice bound) is reported by name.
    assert batch.shape == expected_shape, (
        "%s has shape %s, expected %s" % (batch_name, batch.shape, expected_shape)
    )
np.testing.assert_array_equal(batch_tdb, batch_hdf5)
np.testing.assert_array_equal(batch_tdb, batch_bw)
np.testing.assert_array_equal(batch_hdf5, batch_bw)
np.testing.assert_array_equal(batch_bw, batch_npy)
np.testing.assert_array_equal(batch_bw, batch_npy2)
           

AssertionError: 