Skip to content

Commit

Permalink
Fix consistency in array shape output when track indexing on bigWig f…
Browse files Browse the repository at this point in the history
…iles (#69)

* Fix consistency in array shape output when track indexcing on bigWig files
  • Loading branch information
EricR86 committed Nov 1, 2023
1 parent a95cb3a commit 020d7e9
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 23 deletions.
1 change: 1 addition & 0 deletions NEWS
@@ -1,5 +1,6 @@
1.7.2:
* required Python is now >=3.9
* fixed consistency in array shape output when track indexing on bigWig files

1.7.1:
* fix array dimensionality consistency for summary statistics on bigWig files
Expand Down
1 change: 1 addition & 0 deletions doc/genomedata.rst
Expand Up @@ -374,6 +374,7 @@ format and the bigWig file format:

- There is only one track per bigWig file and it is implicitly set to the
filename of the bigWig.
- Track indexing is only used to shape dimensionality of output.
- Summary statistics are taken from the bigWig file formation definition
which are stored as integers. There may be some differences precision.
- Each :class:`Chromosome <Chromosome>` is represented with 1 underlying
Expand Down
37 changes: 33 additions & 4 deletions genomedata/_bigwig.py
@@ -1,6 +1,6 @@
import struct

from numpy import array
from numpy import array, ndarray
import pyBigWig

from ._chromosome import (Chromosome, CONTINUOUS_DTYPE, _ChromosomeList,
Expand Down Expand Up @@ -141,8 +141,12 @@ def __getitem__(self, key):
"""Same functionality as parent Chromosome without second key
specifying a track or track range"""

# Get the chromosomal base key and track key
# Effectively ignore any track key to maintain a consistent interface
base_key, track_key = self._get_base_and_track_key(key)

# Convert key to valid chromosomal indexing coordinates
start, end = _key_to_tuple(key)
start, end = _key_to_tuple(base_key)

range_length = end - start

Expand All @@ -151,8 +155,33 @@ def __getitem__(self, key):
# Return an empty numpy array
return array([], dtype=CONTINUOUS_DTYPE)

# Otherwise return the data in range
return self.bw_file.values(self.name, start, end, numpy=True)
# Get the data in range
data = self.bw_file.values(self.name, start, end, numpy=True)

# Expected shape output
# c[i, j] == data
# c[i] == [data] <-- current shape
# c[i:j, k] = [data] <-- current shape
# c[i:j, [k]] = [[data]]

# If directly indexing a base (no slice)
# and there is any track index specified
if (isinstance(base_key, int) and
track_key != slice(None)):
# Return a scalar value (no array)
data = data[0]
# If indexing based on track is a list-type
elif isinstance(track_key, (list, ndarray)):
# Shape the data depending on track indexing
# (compatability with PyTables/HDF5)
data = data.reshape((range_length, 1))

# NB: When track_key is a list that contains non-integers,
# there is no behaviour defined and is not numpy-like (will error)
# This is equivalent to the PyTables/HDF5 implementation
data = data[:, array(track_key)]

return data

@property
def seq(self):
Expand Down
52 changes: 33 additions & 19 deletions test/run_tests.py
Expand Up @@ -2,13 +2,12 @@

from __future__ import absolute_import, division, print_function

from math import isnan
import os
from os import chdir, remove
import subprocess
import unittest

from numpy import nan
import numpy as np
from path import Path

from genomedata._load_seq import load_seq
Expand All @@ -17,7 +16,6 @@
from genomedata import Genome

import test_genomedata
from six.moves import range

"""
DESCRIPTION
Expand Down Expand Up @@ -321,9 +319,32 @@ def test_interface(self):
# chr1 569801 569829 0.02215

self.assertAlmostEqualList(
chr1[569798:569802],
[nan, nan, 0.01108, 0.02215],
places=5 # NB: Number of original input decimal places
chr1[569798:569802, "foo.track"], # should ignore tracknames
[np.nan, np.nan, 0.01108, 0.02215],
)

# Single element should result in a single value
# Expected shape output
# c[i, j] == data
# c[i] == [data]
# c[i:j, k] = [data]
# c[i:j, [k]] = [[data]]
single_val = chr1[569800, 0]
self.assertAlmostEqual(single_val, 0.01108, places=5)
self.assertFalse(isinstance(single_val, np.ndarray))

single_val_array = chr1[569800]
self.assertTrue(isinstance(single_val_array, np.ndarray))
self.assertTrue(single_val_array.shape == (1,))

single_val_array = chr1[569800:569801, 0]
self.assertTrue(isinstance(single_val_array, np.ndarray))
self.assertTrue(single_val_array.shape == (1,))

# Track index shape should shape data output
self.assertAlmostEqualList(
np.array([[np.nan], [0.01108], [0.02215]]),
chr1[569799:569802, [0]]
)

# Test the trackname is implictly the filename
Expand All @@ -344,24 +365,17 @@ def test_interface(self):

# Slice indexing
self.assertAlmostEqualList(supercontig.continuous[1:2],
[0.02215, 0.02215],
places=5)
[0.02215, 0.02215])
# Single indexing
self.assertAlmostEqualList(supercontig.continuous[0], [0.01108],
places=5)
self.assertAlmostEqualList(supercontig.continuous[0], [0.01108])

def assertAlmostEqualList(self, first, second, places=1):
""" Compares floating point numbers in a list based on decimal places.
""" Compares floating point numbers in a list if close enough.
Does not apply to nans """

for first_elem, second_elem in zip(first, second):
# NB: NaN values are never considered equal
is_first_nan = isnan(first_elem)
is_second_nan = isnan(second_elem)
if is_first_nan or is_second_nan:
self.assertTrue(is_first_nan and is_second_nan)
else:
self.assertAlmostEqual(first_elem, second_elem, places)
self.assertTrue(
np.allclose(first, second, equal_nan=True)
)


def main():
Expand Down

0 comments on commit 020d7e9

Please sign in to comment.