Commit

Merge pull request #110 from ilastik/faster-hdf5
Speed up HDF5 processing
emilmelnikov committed Aug 24, 2023
2 parents c7f982c + 5737f05 commit 08c603e
Showing 18 changed files with 1,535 additions and 944 deletions.
27 changes: 22 additions & 5 deletions pom.xml
@@ -1,12 +1,13 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
 
     <parent>
         <groupId>org.scijava</groupId>
         <artifactId>pom-scijava</artifactId>
         <version>36.0.0</version>
-        <relativePath />
+        <relativePath/>
     </parent>
 
     <groupId>org.ilastik</groupId>
@@ -24,7 +25,7 @@
     <licenses>
         <license>
             <name>mit</name>
-            <url />
+            <url/>
             <distribution>repo</distribution>
         </license>
     </licenses>
@@ -33,12 +34,12 @@
         <developer>
             <id>chaubold</id>
             <name>Carsten Haubold</name>
-            <url />
+            <url/>
         </developer>
         <developer>
             <id>awolny</id>
             <name>Adrian Wolny</name>
-            <url />
+            <url/>
         </developer>
 
     </developers>
@@ -171,5 +172,21 @@
             <artifactId>junit</artifactId>
             <scope>test</scope>
         </dependency>
+        <!-- JUnit 5 -->
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-engine</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-params</artifactId>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
 </project>
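
The three JUnit Jupiter artifacts added above (API, engine, params) are test-scoped; no versions are declared here, so they are presumably managed by the pom-scijava parent. A minimal sketch of the kind of test they enable (the class, methods, and values below are illustrative, not taken from this commit):

    import org.junit.jupiter.api.Test;
    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.ValueSource;

    import static org.junit.jupiter.api.Assertions.assertEquals;

    class ExampleTest {
        // junit-jupiter-api provides @Test and the assertion methods.
        @Test
        void addition() {
            assertEquals(4, 2 + 2);
        }

        // junit-jupiter-params adds parameterized tests; junit-jupiter-engine runs both kinds.
        @ParameterizedTest
        @ValueSource(ints = {1, 2, 3})
        void absOfPositiveIsIdentity(int n) {
            assertEquals(n, Math.abs(n));
        }
    }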
80 changes: 80 additions & 0 deletions scripts/generate-dataset.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python

"""Generate HDF5 dataset of the specified size filled with zeros."""

import argparse
import json

import h5py
import numpy


def parse_shape(s: str) -> tuple[int, ...]:
    # CLI shapes are written column-major; reverse into the row-major order h5py uses.
    return tuple(map(int, reversed(s.split(","))))


def show_shape(shape: tuple[int, ...]) -> str:
    # Inverse of parse_shape: render a row-major shape in column-major notation.
    return ",".join(map(str, reversed(shape)))


def human_size(shape: tuple[int, ...], itemsize: int) -> str:
    # Total byte count, divided down until it fits under 1024 of the current unit.
    n = numpy.prod(shape) * itemsize
    suffixes = "bytes", "KiB", "MiB", "GiB", "TiB", "PiB"
    suffix = suffixes[0]
    for suffix in suffixes:
        if n < 1024:
            break
        n /= 1024
    return f"{n:.2f} {suffix}"


def main():
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument(
"-p",
"--path",
help="path to HDF5 file (default: test-dataset.h5)",
default="test-dataset.h5",
)
ap.add_argument(
"-d", "--dataset", help="dataset name (default: /data)", default="/data"
)
ap.add_argument("-t", "--dtype", help="data type (default: uint8)", default="uint8")
ap.add_argument(
"-s",
"--shape",
help="comma-separated column-major shape (default: 64,64,3,64,10)",
default="64,64,3,64,10",
)
ap.add_argument(
"-c",
"--chunk",
help="comma-separated column-major chunk shape (default: no chunking)",
)
args = ap.parse_args()

path = args.path
dataset = "/" + args.dataset.removeprefix("/")
dtype = numpy.dtype(args.dtype)
shape = parse_shape(args.shape)
chunk = parse_shape(args.chunk) if args.chunk is not None else None

report = {
"path": path,
"dataset": dataset,
"dtype": str(dtype),
"shape": show_shape(shape),
"shape_size": human_size(shape, dtype.itemsize),
}
if chunk is not None:
report["chunk"] = show_shape(chunk)
report["chunk_size"] = human_size(chunk, dtype.itemsize)
print(json.dumps(report, indent=2))

with h5py.File(path, "w") as f:
ds = f.create_dataset(dataset, shape=shape, dtype=dtype, chunks=chunk)
ds.write_direct(numpy.zeros(shape, dtype=dtype))


if __name__ == "__main__":
main()
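
A usage sketch (the chunk shape here is an illustrative choice, not taken from this commit): generate a chunked uint8 dataset at /data and print the JSON report:

    python scripts/generate-dataset.py -p test-dataset.h5 -d data -t uint8 -s 64,64,3,64,10 -c 64,64,3,1,1

Shapes are given column-major on the command line and reversed internally, and the dataset name gains a leading slash if it is missing.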
120 changes: 120 additions & 0 deletions src/main/java/org/ilastik/ilastik4ij/hdf5/DatasetDescription.java
@@ -0,0 +1,120 @@
package org.ilastik.ilastik4ij.hdf5;

import ch.systemsx.cisd.hdf5.HDF5DataSetInformation;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import hdf.hdf5lib.exceptions.HDF5AttributeException;
import net.imagej.axis.AxisType;
import org.ilastik.ilastik4ij.util.ImgUtils;
import org.json.JSONException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

import static org.ilastik.ilastik4ij.util.ImgUtils.guessAxes;
import static org.ilastik.ilastik4ij.util.ImgUtils.parseAxes;

/**
 * Metadata for an HDF5 dataset.
*/
public final class DatasetDescription {
/**
* Internal dataset path in a file.
*/
public final String path;

/**
* Type of the dataset.
*/
public final DatasetType type;

/**
* Dataset dimensions in the <em>column-major</em> order.
*/
public final long[] dims;

/**
* Dimension axes.
*/
public final List<AxisType> axes;

/**
     * Whether {@link #axes} were inferred with {@link ImgUtils#guessAxes}
     * rather than read from the dataset's "axistags" attribute by {@link ImgUtils#parseAxes}.
*/
public boolean axesGuessed;

/**
* Try to get dataset description for HDF5 dataset.
*/
static Optional<DatasetDescription> ofHdf5(IHDF5Reader reader, String path) {
Objects.requireNonNull(reader);
Objects.requireNonNull(path);

HDF5DataSetInformation info = reader.object().getDataSetInformation(path);

Optional<DatasetType> type = DatasetType.ofHdf5(info.getTypeInformation());
if (!type.isPresent()) {
return Optional.empty();
}

long[] dims = ImgUtils.reversed(info.getDimensions());
if (!(2 <= dims.length && dims.length <= 5)) {
return Optional.empty();
}

List<AxisType> axes;
boolean axesGuessed;
try {
axes = parseAxes(reader.string().getAttr(path, "axistags"));
axesGuessed = false;
} catch (HDF5AttributeException | JSONException ignored) {
axes = guessAxes(dims);
axesGuessed = true;
}

path = "/" + path.replaceFirst("/+", "");
return Optional.of(new DatasetDescription(path, type.get(), dims, axes, axesGuessed));
}

public DatasetDescription(
String path, DatasetType type, long[] dims, List<AxisType> axes, boolean axesGuessed) {
this.path = Objects.requireNonNull(path);
this.type = Objects.requireNonNull(type);
this.dims = Objects.requireNonNull(dims).clone();
this.axes = new ArrayList<>(Objects.requireNonNull(axes));
this.axesGuessed = axesGuessed;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
DatasetDescription that = (DatasetDescription) o;
return Objects.equals(path, that.path) &&
type == that.type &&
Arrays.equals(dims, that.dims) &&
Objects.equals(axes, that.axes) &&
axesGuessed == that.axesGuessed;
}

@Override
public int hashCode() {
int result = Objects.hash(path, type, axes, axesGuessed);
result = 31 * result + Arrays.hashCode(dims);
return result;
}

@Override
public String toString() {
return String.format(
"DatasetDescription{path='%s', type=%s, dims=%s, axes=%s, axesGuessed=%s}",
path,
type,
Arrays.toString(dims),
axes,
axesGuessed);
}
}
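
A sketch of how this class might be consumed (assumptions: JHDF5's HDF5Factory.openForReading supplies the IHDF5Reader, the caller lives in org.ilastik.ilastik4ij.hdf5 because ofHdf5 is package-private, and the file and dataset names refer to the output of scripts/generate-dataset.py above):

    package org.ilastik.ilastik4ij.hdf5;

    import ch.systemsx.cisd.hdf5.HDF5Factory;
    import ch.systemsx.cisd.hdf5.IHDF5Reader;

    import java.util.Optional;

    class DescribeDataset {
        public static void main(String[] args) {
            // Hypothetical input: the file written by scripts/generate-dataset.py.
            IHDF5Reader reader = HDF5Factory.openForReading("test-dataset.h5");
            try {
                Optional<DatasetDescription> desc = DatasetDescription.ofHdf5(reader, "/data");
                // Empty when the element type is unsupported or the rank is outside 2..5.
                System.out.println(desc.map(Object::toString).orElse("unsupported dataset"));
            } finally {
                reader.close();
            }
        }
    }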