Fix consistency in array shape output when track indexing on bigWig f…

…iles (#69) * Fix consistency in array shape output when track indexcing on bigWig files
hoffmangroup · Nov 1, 2023 · 020d7e9 · 020d7e9
1 parent a95cb3a
commit 020d7e9
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 23 deletions.
diff --git a/NEWS b/NEWS
@@ -1,5 +1,6 @@
 1.7.2:
 * required Python is now >=3.9
+* fixed consistency in array shape output when track indexing on bigWig files
 
 1.7.1:
 * fix array dimensionality consistency for summary statistics on bigWig files

diff --git a/doc/genomedata.rst b/doc/genomedata.rst
@@ -374,6 +374,7 @@ format and the bigWig file format:
 
 - There is only one track per bigWig file and it is implicitly set to the
   filename of the bigWig.
+- Track indexing is only used to shape dimensionality of output.
 - Summary statistics are taken from the bigWig file formation definition
   which are stored as integers. There may be some differences precision.
 - Each :class:`Chromosome <Chromosome>` is represented with 1 underlying

diff --git a/genomedata/_bigwig.py b/genomedata/_bigwig.py
@@ -1,6 +1,6 @@
 import struct
 
-from numpy import array
+from numpy import array, ndarray
 import pyBigWig
 
 from ._chromosome import (Chromosome, CONTINUOUS_DTYPE, _ChromosomeList,
@@ -141,8 +141,12 @@ def __getitem__(self, key):
         """Same functionality as parent Chromosome without second key
         specifying a track or track range"""
 
+        # Get the chromosomal base key and track key
+        # Effectively ignore any track key to maintain a consistent interface
+        base_key, track_key = self._get_base_and_track_key(key)
+
         # Convert key to valid chromosomal indexing coordinates
-        start, end = _key_to_tuple(key)
+        start, end = _key_to_tuple(base_key)
 
         range_length = end - start
 
@@ -151,8 +155,33 @@ def __getitem__(self, key):
             # Return an empty numpy array
             return array([], dtype=CONTINUOUS_DTYPE)
 
-        # Otherwise return the data in range
-        return self.bw_file.values(self.name, start, end, numpy=True)
+        # Get the data in range
+        data = self.bw_file.values(self.name, start, end, numpy=True)
+
+        # Expected shape output
+        # c[i, j] == data
+        # c[i] == [data] <-- current shape
+        # c[i:j, k] = [data] <-- current shape
+        # c[i:j, [k]] = [[data]]
+
+        # If directly indexing a base (no slice)
+        # and there is any track index specified
+        if (isinstance(base_key, int) and
+           track_key != slice(None)):
+            # Return a scalar value (no array)
+            data = data[0]
+        # If indexing based on track is a list-type
+        elif isinstance(track_key, (list, ndarray)):
+            # Shape the data depending on track indexing
+            # (compatability with PyTables/HDF5)
+            data = data.reshape((range_length, 1))
+
+            # NB: When track_key is a list that contains non-integers,
+            # there is no behaviour defined and is not numpy-like (will error)
+            # This is equivalent to the PyTables/HDF5 implementation
+            data = data[:, array(track_key)]
+
+        return data
 
     @property
     def seq(self):

diff --git a/test/run_tests.py b/test/run_tests.py
@@ -2,13 +2,12 @@
 
 from __future__ import absolute_import, division, print_function
 
-from math import isnan
 import os
 from os import chdir, remove
 import subprocess
 import unittest
 
-from numpy import nan
+import numpy as np
 from path import Path
 
 from genomedata._load_seq import load_seq
@@ -17,7 +16,6 @@
 from genomedata import Genome
 
 import test_genomedata
-from six.moves import range
 
 """
 DESCRIPTION
@@ -321,9 +319,32 @@ def test_interface(self):
             # chr1	569801	569829	0.02215
 
             self.assertAlmostEqualList(
-                chr1[569798:569802],
-                [nan, nan, 0.01108, 0.02215],
-                places=5  # NB: Number of original input decimal places
+                chr1[569798:569802, "foo.track"],  # should ignore tracknames
+                [np.nan, np.nan, 0.01108, 0.02215],
+            )
+
+            # Single element should result in a single value
+            # Expected shape output
+            # c[i, j] == data
+            # c[i] == [data]
+            # c[i:j, k] = [data]
+            # c[i:j, [k]] = [[data]]
+            single_val = chr1[569800, 0]
+            self.assertAlmostEqual(single_val, 0.01108, places=5)
+            self.assertFalse(isinstance(single_val, np.ndarray))
+
+            single_val_array = chr1[569800]
+            self.assertTrue(isinstance(single_val_array, np.ndarray))
+            self.assertTrue(single_val_array.shape == (1,))
+
+            single_val_array = chr1[569800:569801, 0]
+            self.assertTrue(isinstance(single_val_array, np.ndarray))
+            self.assertTrue(single_val_array.shape == (1,))
+
+            # Track index shape should shape data output
+            self.assertAlmostEqualList(
+                np.array([[np.nan], [0.01108], [0.02215]]),
+                chr1[569799:569802, [0]]
             )
 
             # Test the trackname is implictly the filename
@@ -344,24 +365,17 @@ def test_interface(self):
 
             # Slice indexing
             self.assertAlmostEqualList(supercontig.continuous[1:2],
-                                       [0.02215, 0.02215],
-                                       places=5)
+                                       [0.02215, 0.02215])
             # Single indexing
-            self.assertAlmostEqualList(supercontig.continuous[0], [0.01108],
-                                       places=5)
+            self.assertAlmostEqualList(supercontig.continuous[0], [0.01108])
 
     def assertAlmostEqualList(self, first, second, places=1):
-        """ Compares floating point numbers in a list based on decimal places.
+        """ Compares floating point numbers in a list if close enough.
         Does not apply to nans """
 
-        for first_elem, second_elem in zip(first, second):
-            # NB: NaN values are never considered equal
-            is_first_nan = isnan(first_elem)
-            is_second_nan = isnan(second_elem)
-            if is_first_nan or is_second_nan:
-                self.assertTrue(is_first_nan and is_second_nan)
-            else:
-                self.assertAlmostEqual(first_elem, second_elem, places)
+        self.assertTrue(
+            np.allclose(first, second, equal_nan=True)
+        )
 
 
 def main():