Commit

Merge pull request #110 from ilastik/faster-hdf5
Speed up HDF5 processing
emilmelnikov committed Aug 24, 2023
2 parents c7f982c + 5737f05 commit 08c603e
Showing 18 changed files with 1,535 additions and 944 deletions.
27 changes: 22 additions & 5 deletions pom.xml
@@ -1,12 +1,13 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
 
     <parent>
         <groupId>org.scijava</groupId>
         <artifactId>pom-scijava</artifactId>
         <version>36.0.0</version>
-        <relativePath />
+        <relativePath/>
     </parent>
 
     <groupId>org.ilastik</groupId>
@@ -24,7 +25,7 @@
     <licenses>
         <license>
             <name>mit</name>
-            <url />
+            <url/>
             <distribution>repo</distribution>
         </license>
     </licenses>
@@ -33,12 +34,12 @@
         <developer>
             <id>chaubold</id>
             <name>Carsten Haubold</name>
-            <url />
+            <url/>
         </developer>
         <developer>
             <id>awolny</id>
             <name>Adrian Wolny</name>
-            <url />
+            <url/>
         </developer>
 
     </developers>
@@ -171,5 +172,21 @@
             <artifactId>junit</artifactId>
             <scope>test</scope>
         </dependency>
+        <!-- JUnit 5 -->
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-engine</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-params</artifactId>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
 </project>
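
The three JUnit Jupiter artifacts added above (API, engine, params) are test-scoped; no versions are declared here, so they are presumably managed by the pom-scijava parent. A minimal sketch of the kind of test they enable (the class, methods, and values below are illustrative, not taken from this commit):

    import org.junit.jupiter.api.Test;
    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.ValueSource;

    import static org.junit.jupiter.api.Assertions.assertEquals;

    class ExampleTest {
        // junit-jupiter-api provides @Test and the assertion methods.
        @Test
        void addition() {
            assertEquals(4, 2 + 2);
        }

        // junit-jupiter-params adds parameterized tests; junit-jupiter-engine runs both kinds.
        @ParameterizedTest
        @ValueSource(ints = {1, 2, 3})
        void absOfPositiveIsIdentity(int n) {
            assertEquals(n, Math.abs(n));
        }
    }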
80 changes: 80 additions & 0 deletions scripts/generate-dataset.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python

"""Generate HDF5 dataset of the specified size filled with zeros."""

import argparse
import json

import h5py
import numpy


def parse_shape(s: str) -> tuple[int, ...]:
    # CLI shapes are written column-major; reverse into the row-major order h5py uses.
    return tuple(map(int, reversed(s.split(","))))


def show_shape(shape: tuple[int, ...]) -> str:
    # Inverse of parse_shape: render a row-major shape in column-major notation.
    return ",".join(map(str, reversed(shape)))


def human_size(shape: tuple[int, ...], itemsize: int) -> str:
    # Total byte count, divided down until it fits under 1024 of the current unit.
    n = numpy.prod(shape) * itemsize
    suffixes = "bytes", "KiB", "MiB", "GiB", "TiB", "PiB"
    suffix = suffixes[0]
    for suffix in suffixes:
        if n < 1024:
            break
        n /= 1024
    return f"{n:.2f} {suffix}"


def main():
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument(
"-p",
"--path",
help="path to HDF5 file (default: test-dataset.h5)",
default="test-dataset.h5",
)
ap.add_argument(
"-d", "--dataset", help="dataset name (default: /data)", default="/data"
)
ap.add_argument("-t", "--dtype", help="data type (default: uint8)", default="uint8")
ap.add_argument(
"-s",
"--shape",
help="comma-separated column-major shape (default: 64,64,3,64,10)",
default="64,64,3,64,10",
)
ap.add_argument(
"-c",
"--chunk",
help="comma-separated column-major chunk shape (default: no chunking)",
)
args = ap.parse_args()

path = args.path
dataset = "/" + args.dataset.removeprefix("/")
dtype = numpy.dtype(args.dtype)
shape = parse_shape(args.shape)
chunk = parse_shape(args.chunk) if args.chunk is not None else None

report = {
"path": path,
"dataset": dataset,
"dtype": str(dtype),
"shape": show_shape(shape),
"shape_size": human_size(shape, dtype.itemsize),
}
if chunk is not None:
report["chunk"] = show_shape(chunk)
report["chunk_size"] = human_size(chunk, dtype.itemsize)
print(json.dumps(report, indent=2))

with h5py.File(path, "w") as f:
ds = f.create_dataset(dataset, shape=shape, dtype=dtype, chunks=chunk)
ds.write_direct(numpy.zeros(shape, dtype=dtype))


if __name__ == "__main__":
main()
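
A usage sketch (the chunk shape here is an illustrative choice, not taken from this commit): generate a chunked uint8 dataset at /data and print the JSON report:

    python scripts/generate-dataset.py -p test-dataset.h5 -d data -t uint8 -s 64,64,3,64,10 -c 64,64,3,1,1

Shapes are given column-major on the command line and reversed internally, and the dataset name gains a leading slash if it is missing.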
120 changes: 120 additions & 0 deletions src/main/java/org/ilastik/ilastik4ij/hdf5/DatasetDescription.java
@@ -0,0 +1,120 @@
package org.ilastik.ilastik4ij.hdf5;

import ch.systemsx.cisd.hdf5.HDF5DataSetInformation;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import hdf.hdf5lib.exceptions.HDF5AttributeException;
import net.imagej.axis.AxisType;
import org.ilastik.ilastik4ij.util.ImgUtils;
import org.json.JSONException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

import static org.ilastik.ilastik4ij.util.ImgUtils.guessAxes;
import static org.ilastik.ilastik4ij.util.ImgUtils.parseAxes;

/**
 * Metadata for an HDF5 dataset.
*/
public final class DatasetDescription {
/**
* Internal dataset path in a file.
*/
public final String path;

/**
* Type of the dataset.
*/
public final DatasetType type;

/**
* Dataset dimensions in the <em>column-major</em> order.
*/
public final long[] dims;

/**
* Dimension axes.
*/
public final List<AxisType> axes;

/**
     * Whether {@link #axes} were inferred with {@link ImgUtils#guessAxes}
     * rather than read from the dataset's "axistags" attribute by {@link ImgUtils#parseAxes}.
*/
public boolean axesGuessed;

/**
* Try to get dataset description for HDF5 dataset.
*/
static Optional<DatasetDescription> ofHdf5(IHDF5Reader reader, String path) {
Objects.requireNonNull(reader);
Objects.requireNonNull(path);

HDF5DataSetInformation info = reader.object().getDataSetInformation(path);

Optional<DatasetType> type = DatasetType.ofHdf5(info.getTypeInformation());
if (!type.isPresent()) {
return Optional.empty();
}

long[] dims = ImgUtils.reversed(info.getDimensions());
if (!(2 <= dims.length && dims.length <= 5)) {
return Optional.empty();
}

List<AxisType> axes;
boolean axesGuessed;
try {
axes = parseAxes(reader.string().getAttr(path, "axistags"));
axesGuessed = false;
} catch (HDF5AttributeException | JSONException ignored) {
axes = guessAxes(dims);
axesGuessed = true;
}

path = "/" + path.replaceFirst("/+", "");
return Optional.of(new DatasetDescription(path, type.get(), dims, axes, axesGuessed));
}

public DatasetDescription(
String path, DatasetType type, long[] dims, List<AxisType> axes, boolean axesGuessed) {
this.path = Objects.requireNonNull(path);
this.type = Objects.requireNonNull(type);
this.dims = Objects.requireNonNull(dims).clone();
this.axes = new ArrayList<>(Objects.requireNonNull(axes));
this.axesGuessed = axesGuessed;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
DatasetDescription that = (DatasetDescription) o;
return Objects.equals(path, that.path) &&
type == that.type &&
Arrays.equals(dims, that.dims) &&
Objects.equals(axes, that.axes) &&
axesGuessed == that.axesGuessed;
}

@Override
public int hashCode() {
int result = Objects.hash(path, type, axes, axesGuessed);
result = 31 * result + Arrays.hashCode(dims);
return result;
}

@Override
public String toString() {
return String.format(
"DatasetDescription{path='%s', type=%s, dims=%s, axes=%s, axesGuessed=%s}",
path,
type,
Arrays.toString(dims),
axes,
axesGuessed);
}
}
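
A sketch of how this class might be consumed (assumptions: JHDF5's HDF5Factory.openForReading supplies the IHDF5Reader, the caller lives in org.ilastik.ilastik4ij.hdf5 because ofHdf5 is package-private, and the file and dataset names refer to the output of scripts/generate-dataset.py above):

    package org.ilastik.ilastik4ij.hdf5;

    import ch.systemsx.cisd.hdf5.HDF5Factory;
    import ch.systemsx.cisd.hdf5.IHDF5Reader;

    import java.util.Optional;

    class DescribeDataset {
        public static void main(String[] args) {
            // Hypothetical input: the file written by scripts/generate-dataset.py.
            IHDF5Reader reader = HDF5Factory.openForReading("test-dataset.h5");
            try {
                Optional<DatasetDescription> desc = DatasetDescription.ofHdf5(reader, "/data");
                // Empty when the element type is unsupported or the rank is outside 2..5.
                System.out.println(desc.map(Object::toString).orElse("unsupported dataset"));
            } finally {
                reader.close();
            }
        }
    }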