From d26656dd0d9f241ea110d60aab56248a0056228e Mon Sep 17 00:00:00 2001 From: k-dominik Date: Tue, 16 Apr 2024 13:29:26 +0200 Subject: [PATCH 01/13] remove `VigraRfAdaptiveMaskPixelwiseClassifier` - was never used --- .../vigraRfAdaptiveMaskPixelwiseClassifier.py | 220 ------------------ 1 file changed, 220 deletions(-) delete mode 100644 lazyflow/classifiers/vigraRfAdaptiveMaskPixelwiseClassifier.py diff --git a/lazyflow/classifiers/vigraRfAdaptiveMaskPixelwiseClassifier.py b/lazyflow/classifiers/vigraRfAdaptiveMaskPixelwiseClassifier.py deleted file mode 100644 index c3bdd9b2bc..0000000000 --- a/lazyflow/classifiers/vigraRfAdaptiveMaskPixelwiseClassifier.py +++ /dev/null @@ -1,220 +0,0 @@ -from __future__ import absolute_import -from future import standard_library - -standard_library.install_aliases() -from builtins import zip -from builtins import range -import os -import tempfile -import pickle as pickle - -import numpy -import scipy.ndimage -import vigra -import h5py - -from .lazyflowClassifier import LazyflowPixelwiseClassifierABC, LazyflowPixelwiseClassifierFactoryABC -from lazyflow.utility import Timer -from lazyflow.utility.helpers import bigintprod - -import logging - -logger = logging.getLogger(__name__) - - -class VigraRfAdaptiveMaskPixelwiseClassifierFactory(LazyflowPixelwiseClassifierFactoryABC): - """ - An implementation of LazyflowPixelwiseClassifierFactoryABC using a vigra RandomForest with adaptive masking. - This exists for testing purposes only. (it is normally better to use the vector-wise - classifier so lazyflow can cache the feature matrices). - This implementation is simple and un-optimized. - """ - - VERSION = 1 # This is used to determine compatibility of pickled classifier factories. - # You must bump this if any instance members are added/removed/renamed. 
- - def __init__(self, *args, **kwargs): - self._args = args - self._kwargs = kwargs - - def create_and_train_pixelwise(self, feature_images, label_images, axistags=None): - logger.debug("training pixel-wise vigra RF") - - all_features = numpy.ndarray(shape=(0, feature_images[0].shape[-1]), dtype=numpy.float32) - all_labels = numpy.ndarray(shape=(0, 1), dtype=numpy.uint32) - - # Extract label points and corresponding feature vectors - for feature_image, label_image in zip(feature_images, label_images): - label_coords = numpy.nonzero(label_image)[:-1] # discard channel - label_vector = label_image[label_coords] - feature_matrix = feature_image[label_coords] - all_features = numpy.concatenate((all_features, feature_matrix), axis=0) - all_labels = numpy.concatenate((all_labels, label_vector), axis=0) - - # Save for future reference - known_labels = numpy.sort(vigra.analysis.unique(all_labels)) - - assert len(all_features) == len(all_labels) - classifier = vigra.learning.RandomForest(*self._args, **self._kwargs) - classifier.learnRF(all_features, all_labels) - - return VigraRfAdaptiveMaskPixelwiseClassifier(classifier, known_labels) - - def get_halo_shape(self, data_axes): - # No halo necessary, but since this classifier is for testing purposes, let's add one anyway. 
- halo = tuple(range(len(data_axes) - 1)) - return halo + (0,) # (no channel halo) - - def estimated_ram_usage_per_requested_predictionchannel(self): - return 4 - - @property - def description(self): - temp_rf = vigra.learning.RandomForest(*self._args, **self._kwargs) - return "Vigra Random Forest ({} trees)".format(temp_rf.treeCount()) - - def __eq__(self, other): - return isinstance(other, type(self)) and self._args == other._args and self._kwargs == other._kwargs - - def __ne__(self, other): - return not self.__eq__(other) - - -assert issubclass(VigraRfAdaptiveMaskPixelwiseClassifierFactory, LazyflowPixelwiseClassifierFactoryABC) - - -class VigraRfAdaptiveMaskPixelwiseClassifier(LazyflowPixelwiseClassifierABC): - """ - This class adapt the vigra RandomForest class to the interface expected by lazyflow, and implements pixel-wise adaptive masking for prediction. - - TODO: - -Find an alternative to scipy binary dilation, since the running-time takes too long. - -Enable the user to set the parameters FRAME_SPAN, DILATION_RADIUS, and BACKGROUND_LABEL in the class constructor. - -Parallelize prediction. Right now it is single-threaded. - -Take into account the image borders with spatially-divided blocks. Right now masking only works with full frames. - """ - - def __init__(self, vigra_rf, known_labels): - self._known_labels = known_labels - self._vigra_rf = vigra_rf - - def predict_probabilities_pixelwise(self, X, roi, axistags=None): - logger.debug("predicting PIXELWISE vigra RF") - - # This classifier doesn't benefit from any context around the input, (does it?) - # so just strip it off and only use the given roi. 
- assert len(roi[0]) == len(roi[1]) == X.ndim - 1 - X = X[roi_to_slice(*roi)] - - FRAME_SPAN = 10 # Number of frames to wait until the mask is recalculated - DILATION_RADIUS = 50 # In pixels - BACKGROUND_LABEL = 1 - - # Allocate memory for probability volume and mask - prob_vol = numpy.zeros((X.shape[:-1] + (len(self._known_labels),)), dtype=numpy.float32) - mask = numpy.ones(bigintprod(X.shape[1:-1]), dtype=bool) - - frm_cnt = 0 - - for X_t in X: - if frm_cnt % FRAME_SPAN == 0: - mask = numpy.ones(bigintprod(X.shape[1:-1]), dtype=bool) - - prob_mat = numpy.zeros((bigintprod(X.shape[1:-1]), len(self._known_labels)), dtype=numpy.float32) - - # Reshape the image into a 2D feature matrix - mat_shape = (bigintprod(X_t.shape[:-1]), X_t.shape[-1]) - feature_mat = numpy.reshape(X_t, mat_shape) - - # Mask the feature matrix - feature_mat_masked = feature_mat[mask == 1, :] - - # Run classifier - prob_mat_masked = self._vigra_rf.predictProbabilities(feature_mat_masked.view(numpy.ndarray)) - - prob_mat[mask == 1, :] = prob_mat_masked - prob_mat[mask == 0, 0] = 1.0 # Fill background - - prob_img = prob_mat.reshape((1,) + X_t.shape[:-1] + (prob_mat.shape[-1],)) - - # Recalculate the mask every 20 frames - if frm_cnt % FRAME_SPAN == 0: - predicted_labels = numpy.argmax(prob_img[0], axis=-1) + 1 - prob_slice = (predicted_labels != BACKGROUND_LABEL).astype(bool) - - kernel = numpy.ones((DILATION_RADIUS * 2 + 1), dtype=bool) - - with Timer() as morpho_timer: - prob_slice_dilated = scipy.ndimage.morphology.binary_dilation(prob_slice, kernel[None, :]) - prob_slice_dilated = scipy.ndimage.morphology.binary_dilation(prob_slice_dilated, kernel[:, None]) - - logger.debug("[PROF] Morphology took {} ".format(morpho_timer.seconds())) - - mask = prob_slice_dilated.reshape(bigintprod(prob_slice_dilated.shape)) - - prob_vol[frm_cnt, :, :, :] = prob_img - - frm_cnt = frm_cnt + 1 - - # Reshape into an image. 
- # Choose the prediction image shape carefully: - # - # Most classifiers omit a channel entirely if there are no labels given for a particular class, - # So the number of prediction channels we got is the same as the number of known_classes - # But if the classifier attempts to "help us out" by including channels for "missing" labels, - # then we want to just return the whole thing. - num_probability_channels = max(len(self.known_classes), prob_vol.shape[-1]) - - prediction_shape = X.shape[:-1] + (num_probability_channels,) - return numpy.reshape(prob_vol, prediction_shape) - - @property - def known_classes(self): - return self._known_labels - - @property - def feature_count(self): - return self._vigra_rf.featureCount() - - def get_halo_shape(self, data_axes): - # No halo necessary, but since this classifier is for testing purposes, let's add one anyway. - halo = tuple(range(len(data_axes) - 1)) - return halo + (0,) # (no channel halo) - - def serialize_hdf5(self, h5py_group): - # Due to non-shared hdf5 dlls, vigra can't write directly to - # our open hdf5 group. Instead, we'll use vigra to write the - # classifier to a temporary file. - tmpDir = tempfile.mkdtemp() - cachePath = os.path.join(tmpDir, "tmp_classifier_cache.h5").replace("\\", "/") - self._vigra_rf.writeHDF5(cachePath, "forest") - - # Open the temp file and copy to our project group - with h5py.File(cachePath, "r") as cacheFile: - h5py_group.copy(cacheFile["forest"], "forest") - - h5py_group["known_labels"] = self._known_labels - - # This field is required for all classifiers - h5py_group["pickled_type"] = pickle.dumps(type(self), 0) - - @classmethod - def deserialize_hdf5(cls, h5py_group): - # Due to non-shared hdf5 dlls, vigra can't read directly - # from our open hdf5 group. Instead, we'll copy the - # classfier data to a temporary file and give it to vigra. 
- tmpDir = tempfile.mkdtemp() - cachePath = os.path.join(tmpDir, "tmp_classifier_cache.h5").replace("\\", "/") - with h5py.File(cachePath, "w") as cacheFile: - cacheFile.copy(h5py_group, "forest") - - forest = vigra.learning.RandomForest(cachePath, "forest") - known_labels = list(h5py_group["known_labels"][:]) - - os.remove(cachePath) - os.rmdir(tmpDir) - - return VigraRfAdaptiveMaskPixelwiseClassifier(forest, known_labels) - - -assert issubclass(VigraRfAdaptiveMaskPixelwiseClassifier, LazyflowPixelwiseClassifierABC) From cbb8b3dc783b05538cecdeda10d64b4119bdd9af Mon Sep 17 00:00:00 2001 From: k-dominik Date: Tue, 16 Apr 2024 13:38:35 +0200 Subject: [PATCH 02/13] Refactor: use h5py.group.require_group instead getOrCreateGroup no need to wrap this functionality (and leave testing it to h5py). Added some minor type annotations to help with the refactor. Also removed unused imports in files touched anyways. --- .../applets/base/appletSerializer/__init__.py | 1 - .../base/appletSerializer/appletSerializer.py | 24 ++++---- .../base/appletSerializer/serializerUtils.py | 13 +---- .../base/appletSerializer/slotSerializer.py | 37 +++++++----- .../dataSelection/dataSelectionSerializer.py | 6 +- .../objectExtractionSerializer.py | 14 ++--- .../annotations/annotationsSerializer.py | 35 ++++++----- .../manual/manualTrackingSerializer.py | 9 ++- .../workflows/carving/carvingSerializer.py | 7 +-- .../carving/preprocessingSerializer.py | 4 +- .../test_applets/base/testSerializer.py | 58 ------------------- .../test_applets/base/test_serializerUtils.py | 22 +++++++ 12 files changed, 92 insertions(+), 138 deletions(-) create mode 100644 tests/test_ilastik/test_applets/base/test_serializerUtils.py diff --git a/ilastik/applets/base/appletSerializer/__init__.py b/ilastik/applets/base/appletSerializer/__init__.py index d0920a4bda..9e80b401a7 100644 --- a/ilastik/applets/base/appletSerializer/__init__.py +++ b/ilastik/applets/base/appletSerializer/__init__.py @@ -20,7 +20,6 @@ 
############################################################################### from .appletSerializer import AppletSerializer as AppletSerializer from .serializerUtils import deleteIfPresent as deleteIfPresent -from .serializerUtils import getOrCreateGroup as getOrCreateGroup from .slotSerializer import JSONSerialSlot as JSONSerialSlot from .slotSerializer import SerialBlockSlot as SerialBlockSlot from .slotSerializer import SerialClassifierFactorySlot as SerialClassifierFactorySlot diff --git a/ilastik/applets/base/appletSerializer/appletSerializer.py b/ilastik/applets/base/appletSerializer/appletSerializer.py index 3ac46c596d..0911a8ccca 100644 --- a/ilastik/applets/base/appletSerializer/appletSerializer.py +++ b/ilastik/applets/base/appletSerializer/appletSerializer.py @@ -19,20 +19,20 @@ # http://ilastik.org/license.html ############################################################################### import logging -from abc import ABCMeta +from abc import ABC -from future.utils import with_metaclass +import h5py from ilastik.config import cfg as ilastik_config from ilastik.utility.maybe import maybe from lazyflow.utility.orderedSignal import OrderedSignal -from .serializerUtils import deleteIfPresent, getOrCreateGroup +from .serializerUtils import deleteIfPresent logger = logging.getLogger(__name__) -class AppletSerializer(with_metaclass(ABCMeta, object)): +class AppletSerializer(ABC): """ Base class for all AppletSerializers. """ @@ -49,14 +49,16 @@ class IncompatibleProjectVersionError(Exception): # Semi-abstract methods # ######################### - def _serializeToHdf5(self, topGroup, hdf5File, projectFilePath): + def _serializeToHdf5(self, topGroup: h5py.Group, hdf5File: h5py.File, projectFilePath): """Child classes should override this function, if necessary. 
""" pass - def _deserializeFromHdf5(self, topGroup, groupVersion, hdf5File, projectFilePath, headless=False): + def _deserializeFromHdf5( + self, topGroup: h5py.Group, groupVersion, hdf5File: h5py.File, projectFilePath, headless=False + ): """Child classes should override this function, if necessary. @@ -67,7 +69,7 @@ def _deserializeFromHdf5(self, topGroup, groupVersion, hdf5File, projectFilePath # Base class implementation # ############################# - def __init__(self, topGroupName, slots=None, operator=None): + def __init__(self, topGroupName: str, slots=None, operator=None): """Constructor. Subclasses must call this method in their own __init__ functions. If they fail to do so, the shell raises an exception. @@ -95,7 +97,7 @@ def isDirty(self): """ return any(list(ss.dirty for ss in self.serialSlots)) - def shouldSerialize(self, hdf5File): + def shouldSerialize(self, hdf5File: h5py.File): """Whether to serialize or not.""" if self.isDirty(): @@ -104,7 +106,7 @@ def shouldSerialize(self, hdf5File): # Need to check if slots should be serialized. First must verify that self.topGroupName is not an empty string # (as this seems to happen sometimes). if self.topGroupName: - topGroup = getOrCreateGroup(hdf5File, self.topGroupName) + topGroup = hdf5File.require_group(self.topGroupName) return any([ss.shouldSerialize(topGroup) for ss in self.serialSlots]) return False @@ -135,7 +137,7 @@ def progressIncrement(self, group=None): return 0 return divmod(100, nslots)[0] - def serializeToHdf5(self, hdf5File, projectFilePath): + def serializeToHdf5(self, hdf5File: h5py.File, projectFilePath): """Serialize the current applet state to the given hdf5 file. Subclasses should **not** override this method. Instead, @@ -148,7 +150,7 @@ def serializeToHdf5(self, hdf5File, projectFilePath): (Most serializers do not use this parameter.) 
""" - topGroup = getOrCreateGroup(hdf5File, self.topGroupName) + topGroup = hdf5File.require_group(self.topGroupName) progress = 0 self.progressSignal(progress) diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index ba758b92bd..64aa35cde4 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -18,18 +18,7 @@ # on the ilastik web site at: # http://ilastik.org/license.html ############################################################################### - - -def getOrCreateGroup(parentGroup, groupName): - """Returns parentGroup[groupName], creating first it if - necessary. - - """ - - return parentGroup.require_group(groupName) - - -def deleteIfPresent(parentGroup, name): +def deleteIfPresent(parentGroup: h5py.Group, name: str) -> None: """Deletes parentGroup[name], if it exists.""" # Check first. If we try to delete a non-existent key, # hdf5 will complain on the console. 
diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index 6442b3c3f5..2ddbb83f9b 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -25,7 +25,7 @@ import re import tempfile import warnings -from typing import List, Tuple +from typing import Any, List, Optional, Tuple import h5py import numpy @@ -46,7 +46,16 @@ class SerialSlot(object): """Implements the logic for serializing a slot.""" - def __init__(self, slot, inslot=None, name=None, subname=None, default=None, depends=None, selfdepends=True): + def __init__( + self, + slot: Slot, + inslot: Optional[Slot] = None, + name: Optional[str] = None, + subname: Optional[str] = None, + default: Any = None, + depends: Optional[List[Slot]] = None, + selfdepends: bool = True, + ): """ :param slot: where to get data to save @@ -76,7 +85,7 @@ def __init__(self, slot, inslot=None, name=None, subname=None, default=None, dep self.inslot = inslot self.default = default - self.depends = maybe(depends, []) + self.depends: List[Slot] = maybe(depends, []) if selfdepends: self.depends.append(slot) if name is None: @@ -86,23 +95,23 @@ def __init__(self, slot, inslot=None, name=None, subname=None, default=None, dep subname = "{:04d}" self.subname = subname - self._dirty = False + self._dirty: bool = False self._bind() - self.ignoreDirty = False + self.ignoreDirty: bool = False @property def dirty(self): return self._dirty @dirty.setter - def dirty(self, isDirty): + def dirty(self, isDirty: bool): if not isDirty or (isDirty and not self.ignoreDirty): self._dirty = isDirty def setDirty(self, *args, **kwargs): self.dirty = True - def _bind(self, slot=None): + def _bind(self, slot: Optional[Slot] = None): """Setup so that when slot is dirty, set appropriate dirty flag. 
@@ -120,7 +129,7 @@ def doMulti(slot, index, size): slot.notifyInserted(doMulti) slot.notifyRemoved(self.setDirty) - def shouldSerialize(self, group): + def shouldSerialize(self, group: h5py.Group): """Whether to serialize or not.""" result = self.dirty result |= self.name not in list(group.keys()) @@ -128,7 +137,7 @@ def shouldSerialize(self, group): result &= s.ready() return result - def serialize(self, group): + def serialize(self, group: h5py.Group): """Performs tasks common to all serializations, like changing dirty status. @@ -149,7 +158,7 @@ def serialize(self, group): self.dirty = False @staticmethod - def _saveValue(group, name, value): + def _saveValue(group: h5py.Group, name: str, value): """Separate so that subclasses can override, if necessary. For instance, SerialListSlot needs to save an extra attribute @@ -161,7 +170,7 @@ def _saveValue(group, name, value): value = value.encode("utf-8") group.create_dataset(name, data=value) - def _serialize(self, group, name, slot): + def _serialize(self, group: h5py.Group, name: str, slot): """ :param group: The parent group. :type group: h5py.Group @@ -182,7 +191,7 @@ def _serialize(self, group, name, slot): subname = self.subname.format(i) self._serialize(subgroup, subname, slot[i]) - def deserialize(self, group): + def deserialize(self, group: h5py.Group): """Performs tasks common to all deserializations. Do not override (unless for some reason this function does not @@ -200,14 +209,14 @@ def deserialize(self, group): self.dirty = False @staticmethod - def _getValue(subgroup, slot): + def _getValue(subgroup: h5py.Group, slot: Slot): val = subgroup[()] if isinstance(val, bytes): # h5py can't store unicode, so we store all strings as encoded utf-8 bytes val = val.decode("utf-8") slot.setValue(val) - def _deserialize(self, subgroup, slot): + def _deserialize(self, subgroup: h5py.Group, slot: Slot): """ :param subgroup: *not* the parent group. This slot's group. 
:type subgroup: h5py.Group diff --git a/ilastik/applets/dataSelection/dataSelectionSerializer.py b/ilastik/applets/dataSelection/dataSelectionSerializer.py index 76348bd283..0485fc0533 100644 --- a/ilastik/applets/dataSelection/dataSelectionSerializer.py +++ b/ilastik/applets/dataSelection/dataSelectionSerializer.py @@ -25,7 +25,7 @@ import vigra import ilastik.utility.globals -from ilastik.applets.base.appletSerializer import AppletSerializer, getOrCreateGroup, deleteIfPresent +from ilastik.applets.base.appletSerializer import AppletSerializer, deleteIfPresent from ilastik.exceptions import UserAbort from ilastik.utility import bind from lazyflow.utility import PathComponents @@ -114,13 +114,13 @@ def handleNewLane(multislot, laneIndex): @timeLogged(logger, logging.DEBUG) def _serializeToHdf5(self, topGroup, hdf5File, projectFilePath): - getOrCreateGroup(topGroup, "local_data") + topGroup.require_group("local_data") deleteIfPresent(topGroup, "Role Names") role_names = [name.encode("utf-8") for name in self.topLevelOperator.DatasetRoles.value] topGroup.create_dataset("Role Names", data=role_names) # Access the info group - infoDir = getOrCreateGroup(topGroup, "infos") + infoDir = topGroup.require_group("infos") # Delete all infos infoDir.clear() diff --git a/ilastik/applets/objectExtraction/objectExtractionSerializer.py b/ilastik/applets/objectExtraction/objectExtractionSerializer.py index d2fdaa3a1e..45d276f19f 100644 --- a/ilastik/applets/objectExtraction/objectExtractionSerializer.py +++ b/ilastik/applets/objectExtraction/objectExtractionSerializer.py @@ -18,21 +18,15 @@ # on the ilastik web site at: # http://ilastik.org/license.html ############################################################################### -from builtins import range import logging -import warnings -from functools import partial import numpy -from lazyflow.rtype import SubRegion -from lazyflow.roi import getIntersectingBlocks, TinyVector, getBlockBounds, roiToSlice -from lazyflow.request 
import Request, RequestLock, RequestPool +from lazyflow.roi import roiToSlice from ilastik.applets.base.appletSerializer import ( AppletSerializer, deleteIfPresent, - getOrCreateGroup, SerialSlot, SerialBlockSlot, SerialObjectFeatureNamesSlot, @@ -53,11 +47,11 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator for i in range(len(mainOperator)): - subgroup = getOrCreateGroup(group, "{:04}".format(i)) + subgroup = group.require_group(f"{i:04d}") cleanBlockRois = self.blockslot[i].value for roi in cleanBlockRois: @@ -68,7 +62,7 @@ def serialize(self, group): roi_grp = subgroup.create_group(name=str(roi_string)) logger.debug('Saving region features into group: "{}"'.format(roi_grp.name)) for key, val in region_features.items(): - plugin_group = getOrCreateGroup(roi_grp, key) + plugin_group = roi_grp.require_group(key) for featname, featval in val.items(): plugin_group.create_dataset(name=featname, data=featval) diff --git a/ilastik/applets/tracking/annotations/annotationsSerializer.py b/ilastik/applets/tracking/annotations/annotationsSerializer.py index 5b0417b69e..9c36977a33 100644 --- a/ilastik/applets/tracking/annotations/annotationsSerializer.py +++ b/ilastik/applets/tracking/annotations/annotationsSerializer.py @@ -22,7 +22,6 @@ AppletSerializer, SerialSlot, deleteIfPresent, - getOrCreateGroup, ) @@ -31,22 +30,22 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): - gr = getOrCreateGroup(group, str(i)) - labels_gr = getOrCreateGroup(gr, str("labels")) + gr = group.require_group(str(i)) + labels_gr = gr.require_group(str("labels")) if "labels" in 
op.Annotations.value.keys(): for t in op.Annotations.value["labels"].keys(): - t_gr = getOrCreateGroup(labels_gr, str(t)) + t_gr = labels_gr.require_group(str(t)) for oid in op.Annotations.value["labels"][t].keys(): l = op.Annotations.value["labels"][t][oid] dset = list(l) if len(dset) > 0: t_gr.create_dataset(name=str(oid), data=dset) - divisions_gr = getOrCreateGroup(gr, str("divisions")) + divisions_gr = gr.require_group(str("divisions")) dset = [] if "divisions" in op.Annotations.value.keys(): for trackid in op.Annotations.value["divisions"].keys(): @@ -94,7 +93,7 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): @@ -127,13 +126,13 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): - gr = getOrCreateGroup(group, str(i)) + gr = group.require_group(str(i)) for t in list(op.labels.keys()): - t_gr = getOrCreateGroup(gr, str(t)) + t_gr = gr.require_group(str(t)) for oid in list(op.labels[t].keys()): l = op.labels[t][oid] dset = list(l) @@ -165,15 +164,15 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): - gr = getOrCreateGroup(group, str(i)) + gr = group.require_group(str(i)) for t in list(op.appearances.keys()): - t_gr = getOrCreateGroup(gr, str(t)) + t_gr = gr.require_group(str(t)) for oid in list(op.appearances[t].keys()): - 
oid_gr = getOrCreateGroup(t_gr, str(oid)) + oid_gr = t_gr.require_group(str(oid)) for track in list(op.appearances[t][oid].keys()): app = op.appearances[t][oid][track] if app: @@ -208,15 +207,15 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): - gr = getOrCreateGroup(group, str(i)) + gr = group.require_group(str(i)) for t in list(op.disappearances.keys()): - t_gr = getOrCreateGroup(gr, str(t)) + t_gr = gr.require_group(str(t)) for oid in list(op.disappearances[t].keys()): - oid_gr = getOrCreateGroup(t_gr, str(oid)) + oid_gr = t_gr.require_group(str(oid)) for track in list(op.disappearances[t][oid].keys()): app = op.disappearances[t][oid][track] if app: diff --git a/ilastik/applets/tracking/manual/manualTrackingSerializer.py b/ilastik/applets/tracking/manual/manualTrackingSerializer.py index 7b688b694b..31f3c73dfe 100644 --- a/ilastik/applets/tracking/manual/manualTrackingSerializer.py +++ b/ilastik/applets/tracking/manual/manualTrackingSerializer.py @@ -22,7 +22,6 @@ AppletSerializer, SerialSlot, deleteIfPresent, - getOrCreateGroup, ) @@ -31,7 +30,7 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): @@ -64,13 +63,13 @@ def serialize(self, group): if not self.shouldSerialize(group): return deleteIfPresent(group, self.name) - group = getOrCreateGroup(group, self.name) + group = group.require_group(self.name) mainOperator = self.slot.operator innerops = mainOperator.innerOperators for i, op in enumerate(innerops): - gr = getOrCreateGroup(group, str(i)) + gr = 
group.require_group(str(i)) for t in list(op.labels.keys()): - t_gr = getOrCreateGroup(gr, str(t)) + t_gr = gr.require_group(str(t)) for oid in list(op.labels[t].keys()): l = op.labels[t][oid] dset = list(l) diff --git a/ilastik/workflows/carving/carvingSerializer.py b/ilastik/workflows/carving/carvingSerializer.py index b56ee73c64..10779df6df 100644 --- a/ilastik/workflows/carving/carvingSerializer.py +++ b/ilastik/workflows/carving/carvingSerializer.py @@ -20,8 +20,7 @@ ############################################################################### from typing import TYPE_CHECKING -from builtins import range -from ilastik.applets.base.appletSerializer import AppletSerializer, getOrCreateGroup, deleteIfPresent, SerialSlot +from ilastik.applets.base.appletSerializer import AppletSerializer, deleteIfPresent, SerialSlot import numpy from lazyflow.roi import roiFromShape, roiToSlice @@ -40,7 +39,7 @@ def __init__(self, operator: "OpCarving", groupName): self._o = operator def _serializeToHdf5(self, topGroup, hdf5File, projectFilePath): - obj = getOrCreateGroup(topGroup, "objects") + obj = topGroup.require_group("objects") for imageIndex, opCarving in enumerate(self._o.innerOperators): mst = opCarving._mst @@ -72,7 +71,7 @@ def _serializeToHdf5(self, topGroup, hdf5File, projectFilePath): deleteIfPresent(obj, name) continue - g = getOrCreateGroup(obj, name) + g = obj.require_group(name) deleteIfPresent(g, "fg_voxels") deleteIfPresent(g, "bg_voxels") deleteIfPresent(g, "sv") diff --git a/ilastik/workflows/carving/preprocessingSerializer.py b/ilastik/workflows/carving/preprocessingSerializer.py index ecd9162885..daee1307d8 100644 --- a/ilastik/workflows/carving/preprocessingSerializer.py +++ b/ilastik/workflows/carving/preprocessingSerializer.py @@ -20,7 +20,7 @@ # on the ilastik web site at: # http://ilastik.org/license.html ############################################################################### -from ilastik.applets.base.appletSerializer import AppletSerializer, 
getOrCreateGroup, deleteIfPresent +from ilastik.applets.base.appletSerializer import AppletSerializer, deleteIfPresent import h5py import numpy import os @@ -57,7 +57,7 @@ def _serializeToHdf5(self, topGroup, hdf5File, projectFilePath): preproc.create_dataset("size_regularizer", data=opPre.cachedSizeRegularizer) preproc.create_dataset("reduce_to", data=opPre.cachedReduceTo) - preprocgraph = getOrCreateGroup(preproc, "graph") + preprocgraph = preproc.require_group("graph") mst.saveH5G(preprocgraph) opPre.hasUnsavedData = False diff --git a/tests/test_ilastik/test_applets/base/testSerializer.py b/tests/test_ilastik/test_applets/base/testSerializer.py index 792ab7efdc..338a3022f5 100644 --- a/tests/test_ilastik/test_applets/base/testSerializer.py +++ b/tests/test_ilastik/test_applets/base/testSerializer.py @@ -26,19 +26,15 @@ import shutil import tempfile import pytest -from pathlib import Path from copy import deepcopy from ilastik.applets.base.appletSerializer import SerialObjectFeatureNamesSlot from lazyflow.graph import Graph, Operator, InputSlot, Slot, OperatorWrapper from lazyflow.operators import OpCompressedUserLabelArray -from lazyflow.operators.opArrayPiper import OpArrayPiper from lazyflow.stype import Opaque from lazyflow.rtype import List from ilastik.applets.base.appletSerializer import jsonSerializerRegistry from ilastik.applets.base.appletSerializer import ( - getOrCreateGroup, - deleteIfPresent, SerialSlot, SerialListSlot, AppletSerializer, @@ -77,60 +73,6 @@ def randArray(): return numpy.random.randn(10, 10) -class TestHDF5HelperFunctions(unittest.TestCase): - def setUp(self): - self.tmpDir = tempfile.mkdtemp() - self.tmpFile = h5py.File(os.path.join(self.tmpDir, "test.h5"), "a") - self.tmpFile.create_group("a") - self.tmpFile.create_dataset("c", (2, 2), dtype=numpy.int64) - - def test_getOrCreateGroup_1(self): - self.assertTrue("a" in self.tmpFile) - self.assertTrue(isinstance(self.tmpFile["a"], h5py.Group)) - - group = 
getOrCreateGroup(self.tmpFile, "a") - - self.assertEqual(group.name, "/a") - self.assertTrue("a" in self.tmpFile) - self.assertTrue(isinstance(self.tmpFile["a"], h5py.Group)) - - def test_getOrCreateGroup_2(self): - self.assertTrue("b" not in self.tmpFile) - - group = getOrCreateGroup(self.tmpFile, "b") - - self.assertEqual(group.name, "/b") - self.assertTrue("b" in self.tmpFile) - self.assertTrue(isinstance(self.tmpFile["b"], h5py.Group)) - - def test_getOrCreateGroup_3(self): - self.assertTrue("c" in self.tmpFile) - self.assertTrue(isinstance(self.tmpFile["c"], h5py.Dataset)) - - self.assertRaises(TypeError, lambda: getOrCreateGroup(self.tmpFile, "c")) - - self.assertTrue("c" in self.tmpFile) - self.assertTrue(isinstance(self.tmpFile["c"], h5py.Dataset)) - - def test_deleteIfPresent_1(self): - self.assertTrue("a" in self.tmpFile) - - deleteIfPresent(self.tmpFile, "a") - - self.assertTrue("a" not in self.tmpFile) - - def test_deleteIfPresent_2(self): - self.assertTrue("b" not in self.tmpFile) - - deleteIfPresent(self.tmpFile, "b") - - self.assertTrue("b" not in self.tmpFile) - - def tearDown(self): - self.tmpFile.close() - shutil.rmtree(self.tmpDir) - - class TestSerializer(unittest.TestCase): def setUp(self): g = Graph() diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py new file mode 100644 index 0000000000..d70580c404 --- /dev/null +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -0,0 +1,22 @@ +import h5py + +from ilastik.applets.base.appletSerializer.serializerUtils import deleteIfPresent + + +def test_deleteIfPresent_present(empty_in_memory_project_file: h5py.File): + test_group_name = "test_group_42" + _ = empty_in_memory_project_file.create_group(test_group_name) + assert test_group_name in empty_in_memory_project_file + + deleteIfPresent(empty_in_memory_project_file, test_group_name) + + assert test_group_name not in empty_in_memory_project_file + + 
+def test_deleteIfPresent_not_present(empty_in_memory_project_file: h5py.File): + test_group_name = "test_group_42" + assert test_group_name not in empty_in_memory_project_file + + deleteIfPresent(empty_in_memory_project_file, test_group_name) + + assert test_group_name not in empty_in_memory_project_file From 929b7ac63c5c1f17ed3b38a9bba796662f6a0005 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Tue, 16 Apr 2024 13:45:58 +0200 Subject: [PATCH 03/13] Add tests for slice serializers there were also some implicit assumptions on the slices being serialized that would in part silently be ignored -> those raise now. --- .../base/appletSerializer/serializerUtils.py | 20 ++++- .../test_applets/base/test_serializerUtils.py | 81 ++++++++++++++++++- 2 files changed, 98 insertions(+), 3 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index 64aa35cde4..ba271e363d 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -26,12 +26,24 @@ def deleteIfPresent(parentGroup: h5py.Group, name: str) -> None: del parentGroup[name] -def slicingToString(slicing): +def slicingToString(slicing: Sequence[slice]) -> bytes: """Convert the given slicing into a string of the form '[0:1,2:3,4:5]' + slices need to have integer start and stop values, step-size of 1 + is assumed + The result is a utf-8 encoded bytes, for easy storage via h5py """ + if any(sl.step not in [None, 1] for sl in slicing): + raise ValueError("Only slices with step size of `1` or `None` are supported.") + + if any(sl.start == None for sl in slicing): + raise ValueError("Start indices for slicing must be integer, got `None`.") + + if any(sl.stop == None for sl in slicing): + raise ValueError("Stop indices for slicing must be integer, got `None`.") + strSlicing = "[" for s in slicing: strSlicing += str(s.start) @@ -44,7 +56,7 @@ def 
slicingToString(slicing): return strSlicing.encode("utf-8") -def stringToSlicing(strSlicing): +def stringToSlicing(strSlicing: Union[bytes, str]) -> Tuple[slice, ...]: """Parse a string of the form '[0:1,2:3,4:5]' into a slicing (i.e. tuple of slices) @@ -52,11 +64,15 @@ def stringToSlicing(strSlicing): if isinstance(strSlicing, bytes): strSlicing = strSlicing.decode("utf-8") + assert isinstance(strSlicing, str) + slicing = [] strSlicing = strSlicing[1:-1] # Drop brackets sliceStrings = strSlicing.split(",") for s in sliceStrings: ends = s.split(":") + if len(ends) != 2: + raise ValueError(f"Did not expect slice element of form {s}") start = int(ends[0]) stop = int(ends[1]) slicing.append(slice(start, stop)) diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py index d70580c404..946b0c00eb 100644 --- a/tests/test_ilastik/test_applets/base/test_serializerUtils.py +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -1,6 +1,12 @@ +from typing import Sequence, Tuple, Union import h5py +import pytest -from ilastik.applets.base.appletSerializer.serializerUtils import deleteIfPresent +from ilastik.applets.base.appletSerializer.serializerUtils import ( + deleteIfPresent, + slicingToString, + stringToSlicing, +) def test_deleteIfPresent_present(empty_in_memory_project_file: h5py.File): @@ -20,3 +26,76 @@ def test_deleteIfPresent_not_present(empty_in_memory_project_file: h5py.File): deleteIfPresent(empty_in_memory_project_file, test_group_name) assert test_group_name not in empty_in_memory_project_file + + +@pytest.mark.parametrize( + "slicing,expected_string", + [ + ((slice(0, 1),), b"[0:1]"), + ((slice(0, 1), slice(5, 42)), b"[0:1,5:42]"), + ], +) +def test_slicingToString(slicing: Sequence[slice], expected_string: bytes): + assert slicingToString(slicing) == expected_string + + +@pytest.mark.parametrize( + "slicing", + [ + (slice(0, 1, 5),), + (slice(0, 1), slice(5, 42, 
13)), + ], +) +def test_slicingToString_invalid_step_raises(slicing): + with pytest.raises(ValueError, match="Only slices with step size of `1` or `None` are supported."): + _ = slicingToString(slicing) + + +@pytest.mark.parametrize( + "slicing", + [ + (slice(None, 1),), + (slice(None, 1), slice(5, 42)), + (slice(0, 1), slice(None, 42)), + ], +) +def test_slicingToString_start_none_raises(slicing): + with pytest.raises(ValueError, match="Start indices for slicing must be integer, got `None`."): + _ = slicingToString(slicing) + + +@pytest.mark.parametrize( + "slicing", + [ + (slice(0, None),), + (slice(0, None), slice(5, 42, None)), + ], +) +def test_slicingToString_stop_none_raises(slicing): + with pytest.raises(ValueError, match="Stop indices for slicing must be integer, got `None`"): + _ = slicingToString(slicing) + + +@pytest.mark.parametrize( + "slice_string,expected_slicing", + [ + (b"[0:1]", (slice(0, 1),)), + (b"[0:1,5:42]", (slice(0, 1), slice(5, 42))), + ("[0:1,5:42]", (slice(0, 1), slice(5, 42))), + ], +) +def test_stringToSlicing(slice_string: Union[bytes, str], expected_slicing: Tuple[slice, ...]): + assert stringToSlicing(slice_string) == expected_slicing + + +@pytest.mark.parametrize( + "slice_string", + [ + b"[0:None]", + b"[None:1,5:42]", + "[0:1:5,5:42]", + ], +) +def test_stringToSlicing_raises(slice_string: Union[bytes, str]): + with pytest.raises(ValueError): + _ = stringToSlicing(slice_string) From ad2e9b88e1f6ae66b3e54ffa0be88df9b2af046e Mon Sep 17 00:00:00 2001 From: k-dominik Date: Tue, 16 Apr 2024 14:14:51 +0200 Subject: [PATCH 04/13] Added regex-based classifier type deserialization This avoids using pickle to deserialize the classifier type. 
--- .../base/appletSerializer/serializerUtils.py | 93 +++++++++++++++++++ .../base/appletSerializer/slotSerializer.py | 9 +- .../test_applets/base/test_serializerUtils.py | 42 +++++++++ 3 files changed, 142 insertions(+), 2 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index ba271e363d..8ae27ae5b5 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -18,6 +18,13 @@ # on the ilastik web site at: # http://ilastik.org/license.html ############################################################################### +import re +from dataclasses import dataclass +from typing import Type, Union + +import h5py + + def deleteIfPresent(parentGroup: h5py.Group, name: str) -> None: """Deletes parentGroup[name], if it exists.""" # Check first. If we try to delete a non-existent key, @@ -78,3 +85,89 @@ def stringToSlicing(strSlicing: Union[bytes, str]) -> Tuple[slice, ...]: slicing.append(slice(start, stop)) return tuple(slicing) + + +def deserialize_string_from_h5(ds: h5py.Dataset): + return ds[()].decode() + + +LazyflowClassifierABCs = Union[LazyflowPixelwiseClassifierABC, LazyflowVectorwiseClassifierABC] + +LazyflowClassifierTypeABCs = Union[Type[LazyflowPixelwiseClassifierABC], Type[LazyflowVectorwiseClassifierABC]] + + +_lazyflow_classifier_factory_submodule_allow_list = [ + "vigraRfPixelwiseClassifier", + "vigraRfLazyflowClassifier", + "parallelVigraRfLazyflowClassifier", + "sklearnLazyflowClassifier", +] + + +_lazyflow_classifier_type_allow_list = [ + "VigraRfPixelwiseClassifier", + "VigraRfLazyflowClassifier", + "ParallelVigraRfLazyflowClassifier", + "SklearnLazyflowClassifier", +] + + +@dataclass +class ClassifierInfo: + submodule_name: str + type_name: str + + @property + def classifier_type(self) -> LazyflowClassifierTypeABCs: + submodule = getattr(lazyflow.classifiers, self.submodule_name) + 
classifier_type = getattr(submodule, self.type_name) + return classifier_type + + +def deserialize_legacy_classifier_type_info(ds: h5py.Dataset) -> ClassifierInfo: + """Legacy helper for classifier type_info deserialization + + in order to avoid unpickling, the protocol0-style pickle string is + parsed to extract the classifier typename of the form + `lazyflow.classifier.myclassifier.MyClassifierType`, e.g. + `lazyflow.classifier.vigraRfLazyflowClassifier.VigraRfLazyflowClassifier`. + + Args: + ds: h5py dataset with that holds the pickled string - usually in + `PixelClassification/ClassifierForests/pickled_type` + + Returns: + Dictionary with two keys: `submodule_name`, and `typename` + + Raises: + ValueError if pickled string does not conform to required pattern + """ + class_string: str = deserialize_string_from_h5(ds) + classifier_pickle_string_matcher = re.compile( + r""" + c # GLOBAL + lazyflow\.classifiers\.(?P<submodule_name>\w+) + \n + (?P<type_name>\w+) + \n + p\d+ + \n + \. # all pickles end in "." STOP + $ + """, + re.X, + ) + + # legacy support - ilastik used to pickle the classifier type + if class_string.isascii() and (m := classifier_pickle_string_matcher.match(class_string)): + groupdict = m.groupdict() + + if groupdict["submodule_name"] not in _lazyflow_classifier_factory_submodule_allow_list: + raise ValueError(f"Could not load classifier: submodule {groupdict['submodule_name']} not allowed.") + + if groupdict["type_name"] not in _lazyflow_classifier_type_allow_list: + raise ValueError(f"Could not load classifier: type {groupdict['type_name']} not allowed.") + + return ClassifierInfo(**groupdict) + + raise ValueError(f"Could not load classifier type {class_string=}") diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index 2ddbb83f9b..dbbf4f2f31 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -38,7 +38,12 @@ from 
lazyflow.utility import timeLogged from . import jsonSerializerRegistry -from .serializerUtils import deleteIfPresent, slicingToString, stringToSlicing +from .serializerUtils import ( + deleteIfPresent, + slicingToString, + stringToSlicing, + deserialize_legacy_classifier_type_info, +) logger = logging.getLogger(__name__) @@ -636,7 +641,7 @@ def deserialize(self, group): def _deserialize(self, classifierGroup, slot): try: - classifier_type = pickle.loads(classifierGroup["pickled_type"][()]) + classifier_type = deserialize_legacy_classifier_type_info(classifierGroup["pickled_type"]).classifier_type except KeyError: # For compatibility with old project files, choose the default classifier. from lazyflow.classifiers import ParallelVigraRfLazyflowClassifier diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py index 946b0c00eb..6bb3b4ee27 100644 --- a/tests/test_ilastik/test_applets/base/test_serializerUtils.py +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -1,12 +1,17 @@ +import pickle from typing import Sequence, Tuple, Union + import h5py import pytest from ilastik.applets.base.appletSerializer.serializerUtils import ( deleteIfPresent, + deserialize_legacy_classifier_type_info, + deserialize_string_from_h5, slicingToString, stringToSlicing, ) +from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifier def test_deleteIfPresent_present(empty_in_memory_project_file: h5py.File): @@ -99,3 +104,40 @@ def test_stringToSlicing(slice_string: Union[bytes, str], expected_slicing: Tupl def test_stringToSlicing_raises(slice_string: Union[bytes, str]): with pytest.raises(ValueError): _ = stringToSlicing(slice_string) + + +def test_deserialize_string_from_h5(empty_in_memory_project_file: h5py.File): + test_string = "this is a test string" + ds = empty_in_memory_project_file.create_dataset("test", data=test_string.encode("utf-8")) + + assert 
deserialize_string_from_h5(ds) == test_string + + +def test_deserialize_classifier(empty_in_memory_project_file: h5py.File): + classifier_bytes = b"clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifier\np0\n." + expected_submodule = "vigraRfLazyflowClassifier" + expected_type = "VigraRfLazyflowClassifier" + ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) + + cl_info = deserialize_legacy_classifier_type_info(ds) + + assert cl_info.submodule_name == expected_submodule + assert cl_info.type_name == expected_type + + assert issubclass(cl_info.classifier_type, VigraRfLazyflowClassifier) + + +@pytest.mark.parametrize( + "classifier_bytes", + [ + b"clazyflow.class.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", + b"csome.other_submodule.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", + b"clazyflow.classifiers.sneakyVigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", + b"clazyflow.classifiers.vigraRfLazyflowClassifier\nSneakyVigraRfLazyflowClassifierFactory\np0\n.", + b"random.", + ], +) +def test_deserialize_classifier_raises(empty_in_memory_project_file: h5py.File, classifier_bytes: bytes): + ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) + with pytest.raises(ValueError): + _ = deserialize_legacy_classifier_type_info(ds) From 78d5b95218abc0f06165182f3f19d1b4465b2196 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Tue, 16 Apr 2024 14:22:13 +0200 Subject: [PATCH 05/13] Add regex-based classifier factory type deserialization this avoids the use of pickle for deserializing instances of classifier factories. This commit is a bit more involved, as deserializing the factories involves quite a bit of parsing of the pickled strings. The basic idea is to go in two stages from the pickled strings: 1. Deserialize to an intermediate representation (dataclass). 
The deserializers make sure this information is already safe, as in no modules/classes outside the spectrum of classifiers already seen in ilastik are allowed. 2. Creation of the actual classifier factory instances. --- .../base/appletSerializer/serializerUtils.py | 485 +++++++++++++++++- .../base/appletSerializer/slotSerializer.py | 4 +- lazyflow/classifiers/lazyflowClassifier.py | 14 +- .../test_applets/base/test_serializerUtils.py | 411 ++++++++++++++- 4 files changed, 903 insertions(+), 11 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index 8ae27ae5b5..28c1c01b41 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -19,10 +19,30 @@ # http://ilastik.org/license.html ############################################################################### import re +from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Type, Union +from typing import Dict, List, Sequence, Tuple, Type, Union import h5py +import sklearn +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC, NuSVC +from sklearn.tree import DecisionTreeClassifier + +import lazyflow.classifiers +from lazyflow.classifiers.lazyflowClassifier import ( + LazyflowPixelwiseClassifierABC, + LazyflowPixelwiseClassifierFactoryABC, + LazyflowVectorwiseClassifierABC, + LazyflowVectorwiseClassifierFactoryABC, +) +from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory +from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfLazyflowClassifier import 
VigraRfLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfPixelwiseClassifier import VigraRfPixelwiseClassifierFactory def deleteIfPresent(parentGroup: h5py.Group, name: str) -> None: @@ -103,6 +123,12 @@ def deserialize_string_from_h5(ds: h5py.Dataset): "sklearnLazyflowClassifier", ] +_lazyflow_classifier_factory_type_allow_list = [ + "VigraRfPixelwiseClassifierFactory", + "VigraRfLazyflowClassifierFactory", + "ParallelVigraRfLazyflowClassifierFactory", + "SklearnLazyflowClassifierFactory", +] _lazyflow_classifier_type_allow_list = [ "VigraRfPixelwiseClassifier", @@ -171,3 +197,460 @@ def deserialize_legacy_classifier_type_info(ds: h5py.Dataset) -> ClassifierInfo: return ClassifierInfo(**groupdict) raise ValueError(f"Could not load classifier type {class_string=}") + + +LazyflowClassifierFactoryABCs = Union[LazyflowPixelwiseClassifierFactoryABC, LazyflowVectorwiseClassifierFactoryABC] + +LazyflowClassifierFactoryTypeABCs = Union[ + Type[LazyflowPixelwiseClassifierFactoryABC], Type[LazyflowVectorwiseClassifierFactoryABC] +] + + +@dataclass +class ClassifierFactoryTypeInfo: + factory_submodule: str + factory_typename: str + factory_version: int + + @property + def classifier_factory_type(self) -> LazyflowClassifierFactoryTypeABCs: + submod = getattr(lazyflow.classifiers, self.factory_submodule) + classifier_factory_type = getattr(submod, self.factory_typename) + return classifier_factory_type + + +class ClassifierFactoryInfo(ABC): + + @property + @abstractmethod + def instance(self) -> LazyflowClassifierFactoryABCs: ... 
+ + +def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABCs: + pickle_string: str = deserialize_string_from_h5(ds) + clasifier_factory_info = _deserialize_legacy_classifier_factory_type_info(pickle_string) + + classifier_factory_type = clasifier_factory_info.classifier_factory_type + classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) + return classifier_factory_details.instance + + +def _deserialize_legacy_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: + """Legacy helper for classifier type_info deserialization + + in order to avoid unpickling, the protocol0-style pickle string is + parsed to extract the classifier typename of the form + `lazyflow.classifier.myclassifier.MyClassifierTypeFactory`, e.g. + `lazyflow.classifier.vigraRfLazyflowClassifier.VigraRfLazyflowClassifierFactory`. + + Args: + pickle_string: string from pickling a LazyflowClassifierFactory instance + + Returns: + ClassifierFactoryTypeInfo with classifier information + + Raises: + ValueError if pickled string does not conform to required pattern + """ + + classifier_factory_pickle_string_matcher = re.compile( + r""" + clazyflow\.classifiers\.(?P<factory_submodule>\w+) + \n + (?P<type_name>\w+) + \n + """, + re.X, + ) + + classifier_factory_version_pickle_string_matcher = re.compile( + r""" + VVERSION\n + p\d+\n + I(?P<factory_version>\d+)\n + """, + re.X, + ) + + if pickle_string.isascii() and (m := classifier_factory_pickle_string_matcher.search(pickle_string)): + groupdict = m.groupdict() + submodule = groupdict["factory_submodule"] + typename = groupdict["type_name"] + + if submodule not in _lazyflow_classifier_factory_submodule_allow_list: + raise ValueError(f"Could not load classifier: submodule {submodule} not allowed. 
{pickle_string=}") + + if typename not in _lazyflow_classifier_factory_type_allow_list: + raise ValueError(f"Could not load classifier factory: type {typename} not allowed.") + else: + raise ValueError(f"Could not load classifier factory type submodule and type not found {pickle_string=}") + + if m := classifier_factory_version_pickle_string_matcher.search(pickle_string): + version = int(m.groupdict()["factory_version"]) + else: + raise ValueError(f"Could not load classifier type, no version found {pickle_string=}") + + return ClassifierFactoryTypeInfo(factory_submodule=submodule, factory_typename=typename, factory_version=version) + + +def _deserialize_classifier_factory_details( + classifier_factory: LazyflowClassifierFactoryTypeABCs, pickle_str: str +) -> ClassifierFactoryInfo: + + if issubclass(classifier_factory, (VigraRfPixelwiseClassifierFactory, VigraRfLazyflowClassifierFactory)): + return _deserialize_legacy_VigraRflassifierFactory(pickle_str) + + if issubclass(classifier_factory, ParallelVigraRfLazyflowClassifierFactory): + return _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_str) + + if issubclass(classifier_factory, SklearnLazyflowClassifierFactory): + return _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_str) + + raise ValueError(f"Don't know how to deserialize classifier of type {classifier_factory!r}") + + +@dataclass +class VigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): + args: List[int] + + @property + def instance(self) -> VigraRfLazyflowClassifierFactory: + return VigraRfLazyflowClassifierFactory(*self.args) + + +def _deserialize_legacy_VigraRflassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: + """ + These classifier factories have only been used with a single arg + """ + classifier_factory_args_pickle_string_matcher = re.compile( + r""" + V_args\n + p\d+\n + (\(I)? 
+ (?P<arg>(?<=\(I)\d+)\n # we _only_ expect one integer element in _args for this type + """, + re.X, + ) + + if m := classifier_factory_args_pickle_string_matcher.search(pickle_string): + arg = int(m.groupdict()["arg"]) + else: + raise ValueError( + f"Could not load VigraRfLazyflowClassifierFactory, no argument found not found in {pickle_string=}" + ) + + return VigraRfLazyflowClassifierFactoryInfo(args=[arg]) + + +@dataclass +class ParallelVigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): + num_trees: int + label_proportion: Union[float, None] + variable_importance_path: Union[str, None] + variable_importance_enabled: bool + num_forests: int + + # we don't do kwargs here - there is no evidence that in ilastik + # history kwargs were ever used + # kwargs + @property + def instance(self) -> ParallelVigraRfLazyflowClassifierFactory: + return ParallelVigraRfLazyflowClassifierFactory( + num_trees_total=self.num_trees, + num_forests=self.num_forests, + variable_importance_path=self.variable_importance_path, + label_proportion=self.label_proportion, + variable_importance_enabled=self.variable_importance_enabled, + ) + + +def _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory( + pickle_string, +) -> ParallelVigraRfLazyflowClassifierFactoryInfo: + classifier_factory_num_trees_pickle_string_matcher = re.compile( + r""" + V_num_trees\n + p\d+\n + I(?P<num_trees>\d+)\n + """, + re.X, + ) + + if m := classifier_factory_num_trees_pickle_string_matcher.search(pickle_string): + num_trees = int(m.groupdict()["num_trees"]) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _num_trees not found in {pickle_string=}" + ) + + # this can be None, otherwise float between 0.0 and 1.0 + classifier_factory_label_proportion_pickle_string_matcher = re.compile( + r""" + V_label_proportion\n + p\d+\n + F?(?P<label_proportion>((?<=F)[01]\.\d+(?=\n))|N(?=s)) # positive lookbehind for float + """, + re.X, + ) + + if m := 
classifier_factory_label_proportion_pickle_string_matcher.search(pickle_string): + label_prop_string = m.groupdict()["label_proportion"] + label_proportion = None if label_prop_string == "N" else float(label_prop_string) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _label_proportion not found in {pickle_string=}" + ) + + # this can be None, otherwise string (V) + classifier_factory_variable_importance_path_pickle_string_matcher = re.compile( + r""" + V_variable_importance_path\n + p\d+\n + V?(?P<variable_importance_path>((?<=V).+(?=\n))|N(?=s)) # positive lookbehind for string + """, + re.X, + ) + + if m := classifier_factory_variable_importance_path_pickle_string_matcher.search(pickle_string): + variable_importance_pth_string = m.groupdict()["variable_importance_path"] + variable_importance_path = None if variable_importance_pth_string == "N" else variable_importance_pth_string + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_path not found in {pickle_string=}" + ) + + # will be a bool, either I00, or I01 + classifier_factory_variable_importance_enabled_pickle_string_matcher = re.compile( + r""" + V_variable_importance_enabled\n + p\d+\n + I(?P<variable_importance_enabled>[01]{2})\n + """, + re.X, + ) + + if m := classifier_factory_variable_importance_enabled_pickle_string_matcher.search(pickle_string): + variable_importance_enabled = bool(int(m.groupdict()["variable_importance_enabled"])) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_enabled not found in {pickle_string=}" + ) + + classifier_factory_num_forests_pickle_string_matcher = re.compile( + r""" + V_num_forests\n + p\d+\n + I(?P<num_forests>\d+)\n + """, + re.X, + ) + + if m := classifier_factory_num_forests_pickle_string_matcher.search(pickle_string): + num_forests = int(m.groupdict()["num_forests"]) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _num_forests not 
found in {pickle_string=}" + ) + + return ParallelVigraRfLazyflowClassifierFactoryInfo( + num_trees=num_trees, + label_proportion=label_proportion, + variable_importance_path=variable_importance_path, + variable_importance_enabled=variable_importance_enabled, + num_forests=num_forests, + ) + + +SklearnClassifierType = Union[ + Type[AdaBoostClassifier], + Type[DecisionTreeClassifier], + Type[GaussianNB], + Type[KNeighborsClassifier], + Type[LinearDiscriminantAnalysis], + Type[NuSVC], + Type[QuadraticDiscriminantAnalysis], + Type[RandomForestClassifier], + Type[SVC], +] + + +@dataclass +class SklearnClassifierTypeInfo: + submodules: List[str] + typename: str + + @property + def classifier_type(self) -> SklearnClassifierType: + submodule = getattr(sklearn, self.submodules[0]) + for sm_name in self.submodules[1:]: + submodule = getattr(submodule, sm_name) + + classifier_type = getattr(submodule, self.typename) + return classifier_type + + +@dataclass +class SklearnClassifierFactoryInfo(ClassifierFactoryInfo): + classifier_type: SklearnClassifierType + args: List[int] + kwargs: Dict[str, Union[bool, int, float]] + + @property + def instance(self) -> LazyflowClassifierFactoryABCs: + return SklearnLazyflowClassifierFactory(self.classifier_type, *self.args, **self.kwargs) + + +def _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClassifierFactoryInfo: + """ + _args : RandomForestClassifier, 100 | GaussianNB | AdaBoostClassifier | DecisionTreeClassifier | KNeighborsClassifier | LDA | QDA | SVC | NuSVC + _kwargs NONE | NONE | n_estimators=100 | max_depth=5 | NONE | N NONE | NONE | probability=True | probability=True + _classifier_type + + """ + classifier_factory_sklearn_type_pickle_string_matcher = re.compile( + """ + V_classifier_type\\n + p\\d+\\n + c + sklearn\\.(?P<submodules>[\\w+\\.]+)\\n + (?P<typename>[\\w]+)\\n + """, + re.X, + ) + + sklearn_submodule_allow_list = [ + "discriminant_analysis", + "ensemble._forest", + "ensemble._weight_boosting", + 
"naive_bayes", + "neighbors._classification", + "svm._classes", + "tree._classes", + ] + + sklearn_classifier_allow_list = [ + "AdaBoostClassifier", + "DecisionTreeClassifier", + "GaussianNB", + "KNeighborsClassifier", + "LinearDiscriminantAnalysis", + "NuSVC", + "QuadraticDiscriminantAnalysis", + "RandomForestClassifier", + "SVC", + ] + + if m := classifier_factory_sklearn_type_pickle_string_matcher.search(pickle_string): + groupdict = m.groupdict() + submodules = groupdict["submodules"] + typename = groupdict["typename"] + + if submodules not in sklearn_submodule_allow_list or typename not in sklearn_classifier_allow_list: + raise ValueError(f"Classifier of type sklearn.{submodules}.{typename} not permitted.") + + else: + raise ValueError(f"Could not load classifier type {pickle_string=}") + + classifier_info = SklearnClassifierTypeInfo(submodules=submodules.split("."), typename=typename) + classifier_type = classifier_info.classifier_type + + return _deserialize_sklearn_classifier_details(classifier_type, pickle_string) + + +def _deserialize_sklearn_classifier_details( + classifier_type: SklearnClassifierType, pickle_str: str +) -> SklearnClassifierFactoryInfo: + if issubclass(classifier_type, RandomForestClassifier): + return _deserialize_sklearn_RandomForest_details(pickle_str) + + if issubclass(classifier_type, AdaBoostClassifier): + return _deserialize_sklearn_AdaBoostClassifier_details(pickle_str) + + if issubclass(classifier_type, DecisionTreeClassifier): + return _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str) + + if issubclass(classifier_type, (SVC, NuSVC)): + return _deserialize_sklearn_SVC_details(pickle_str, classifier_type) + + if issubclass( + classifier_type, + ( + GaussianNB, + KNeighborsClassifier, + LinearDiscriminantAnalysis, + QuadraticDiscriminantAnalysis, + ), + ): + return SklearnClassifierFactoryInfo(classifier_type=classifier_type, args=[], kwargs={}) + + +def _deserialize_sklearn_RandomForest_details(pickle_str: str) -> 
SklearnClassifierFactoryInfo: + classifier_factory_args_pickle_string_matcher = re.compile( + r""" + V_args\n + p\d+\n + \( + I(?P<arg>\d+)\n # we _only_ expect one integer element in _args for this type + """, + re.X, + ) + + if m := classifier_factory_args_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=RandomForestClassifier, args=[int(m.groupdict()["arg"])], kwargs={} + ) + else: + raise ValueError("Could not deserialize sklearn RandomForest classifier.") + + +def _deserialize_sklearn_AdaBoostClassifier_details(pickle_str: str) -> SklearnClassifierFactoryInfo: + classifier_factory_n_estimators_pickle_string_matcher = re.compile( + r""" + Vn_estimators\n + p\d+\n + I(?P<n_estimators>\d+)\n + """, + re.X, + ) + if m := classifier_factory_n_estimators_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": int(m.groupdict()["n_estimators"])} + ) + else: + raise ValueError("Could not deserialize sklearn AdaBoostClassifier.") + + +def _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str: str) -> SklearnClassifierFactoryInfo: + classifier_factory_max_depth_pickle_string_matcher = re.compile( + r""" + Vmax_depth\n + p\d+\n + I(?P<max_depth>\d+)\n + """, + re.X, + ) + if m := classifier_factory_max_depth_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": int(m.groupdict()["max_depth"])} + ) + else: + raise ValueError("Could not deserialize sklearn DecisionTreeClassifier") + + +def _deserialize_sklearn_SVC_details( + pickle_str: str, classifier_type: Union[Type[SVC], Type[NuSVC]] +) -> SklearnClassifierFactoryInfo: + classifier_factory_probability_pickle_string_matcher = re.compile( + r""" + Vprobability\n + p\d+\n + I(?P<probability>[01]{2})\n + """, + re.X, + ) + if m := classifier_factory_probability_pickle_string_matcher.search(pickle_str): + return 
SklearnClassifierFactoryInfo( + classifier_type=classifier_type, args=[], kwargs={"probability": int(m.groupdict()["probability"]) != 0} + ) + else: + raise ValueError("Could not deserialize sklearn SVC/NuSVC classifier.") diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index dbbf4f2f31..6f35e8da74 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -842,10 +842,8 @@ def shouldSerialize(self, group): return super(SerialClassifierFactorySlot, self).shouldSerialize(group) def _getValue(self, dset, slot): - pickled = dset[()] try: - # Attempt to unpickle - value = pickle.loads(pickled) + value = deserialize_legacy_classifier_factory(dset) # Verify that the VERSION of the classifier factory in the currently executing code # has not changed since this classifier was stored. diff --git a/lazyflow/classifiers/lazyflowClassifier.py b/lazyflow/classifiers/lazyflowClassifier.py index 103cfcffdf..a9814e6511 100644 --- a/lazyflow/classifiers/lazyflowClassifier.py +++ b/lazyflow/classifiers/lazyflowClassifier.py @@ -1,6 +1,4 @@ -from builtins import object import abc -from future.utils import with_metaclass def _has_attribute(cls, attr): @@ -11,13 +9,15 @@ def _has_attributes(cls, attrs): return all(_has_attribute(cls, a) for a in attrs) -class LazyflowVectorwiseClassifierFactoryABC(with_metaclass(abc.ABCMeta, object)): +class LazyflowVectorwiseClassifierFactoryABC(abc.ABC): """ Defines an interface for vector-wise classifier 'factory' objects, which lazyflow classifier operators use to construct new vector-wise classifiers. A "vector-wise" classifier is trained with a 2D feature matrix and a 1D label vector. """ + VERSION: int + def __new__(cls, *args, **kwargs): # Force the VERSION class member to be copied to an instance member. 
obj = object.__new__(cls) @@ -61,7 +61,7 @@ def __subclasshook__(cls, C): return NotImplemented -class LazyflowVectorwiseClassifierABC(with_metaclass(abc.ABCMeta, object)): +class LazyflowVectorwiseClassifierABC(abc.ABC): """ Defines an interface for "vector-wise" classifier objects that can be used by the lazyflow classifier operators. A "vector-wise" classifier is trained with a 2D feature matrix and a 1D label vector. @@ -122,7 +122,7 @@ def deserialize_hdf5(cls, h5py_group): raise NotImplementedError -class LazyflowPixelwiseClassifierFactoryABC(with_metaclass(abc.ABCMeta, object)): +class LazyflowPixelwiseClassifierFactoryABC(abc.ABC): """ Defines an interface for pixel-wise classifier 'factory' objects, which lazyflow classifier operators use to construct new pixel-wise classifiers. @@ -132,6 +132,8 @@ class LazyflowPixelwiseClassifierFactoryABC(with_metaclass(abc.ABCMeta, object)) Note: It is assumed here that 'channel' is always the last axis of the image. """ + VERSION: int + def __new__(cls, *args, **kwargs): # Force the VERSION class member to be copied to an instance member. obj = object.__new__(cls) @@ -202,7 +204,7 @@ def __ne__(self, other): raise NotImplementedError -class LazyflowPixelwiseClassifierABC(with_metaclass(abc.ABCMeta, object)): +class LazyflowPixelwiseClassifierABC(abc.ABC): """ Defines an interface for "pixel-wise" classifier objects that can be used by the lazyflow classifier operators. A "pixel-wise" classifier expects its input be given as a list of ND feature images (with M feature channels). 
diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py index 6bb3b4ee27..b54ac2361b 100644 --- a/tests/test_ilastik/test_applets/base/test_serializerUtils.py +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -3,15 +3,32 @@ import h5py import pytest +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC, NuSVC +from sklearn.tree import DecisionTreeClassifier from ilastik.applets.base.appletSerializer.serializerUtils import ( + ClassifierFactoryTypeInfo, + SklearnClassifierFactoryInfo, + _deserialize_classifier_factory_details, + _deserialize_legacy_classifier_factory_type_info, + _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory, + _deserialize_legacy_SklearnLazyflowClassifierFactory, + _deserialize_legacy_VigraRflassifierFactory, + _deserialize_sklearn_classifier_details, deleteIfPresent, + deserialize_legacy_classifier_factory, deserialize_legacy_classifier_type_info, deserialize_string_from_h5, slicingToString, stringToSlicing, ) -from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifier +from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory +from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifier, VigraRfLazyflowClassifierFactory def test_deleteIfPresent_present(empty_in_memory_project_file: h5py.File): @@ -141,3 +158,395 @@ def test_deserialize_classifier_raises(empty_in_memory_project_file: h5py.File, ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) with 
pytest.raises(ValueError): _ = deserialize_legacy_classifier_type_info(ds) + + +@pytest.mark.parametrize( + "classifier_type, c_args, c_kwargs, expected_info", + [ + ( + AdaBoostClassifier, + [], + {"n_estimators": 257}, + SklearnClassifierFactoryInfo(classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": 257}), + ), + ( + DecisionTreeClassifier, + [], + {"max_depth": 257}, + SklearnClassifierFactoryInfo(classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": 257}), + ), + (GaussianNB, [], {}, SklearnClassifierFactoryInfo(classifier_type=GaussianNB, args=[], kwargs={})), + ( + KNeighborsClassifier, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=KNeighborsClassifier, args=[], kwargs={}), + ), + ( + LinearDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=LinearDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + QuadraticDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=QuadraticDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + RandomForestClassifier, + [143], + {}, + SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[143], kwargs={}), + ), + ( + SVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=SVC, args=[], kwargs={"probability": False}), + ), + ( + NuSVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=NuSVC, args=[], kwargs={"probability": False}), + ), + ], +) +def test_sklearn_lazyflow_classifier_pickled_deserialization( + classifier_type, c_args, c_kwargs, expected_info: SklearnClassifierFactoryInfo +): + pickled_classifier = pickle.dumps( + SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 + ).decode() + deserialized_info = _deserialize_sklearn_classifier_details(classifier_type, pickled_classifier) + assert deserialized_info == expected_info + + +@pytest.mark.parametrize( + "classifier_type", + [ + AdaBoostClassifier, + 
DecisionTreeClassifier, + RandomForestClassifier, + SVC, + NuSVC, + ], +) +def test_sklearn_lazyflow_classifier_pickled_deserialization_raises( + classifier_type, +): + with pytest.raises(ValueError, match="Could not deserialize"): + _ = _deserialize_sklearn_classifier_details(classifier_type, "someRandomString") + + +@pytest.mark.parametrize( + "classifier_type, c_args, c_kwargs, expected_info", + [ + ( + AdaBoostClassifier, + [], + {"n_estimators": 257}, + SklearnClassifierFactoryInfo(classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": 257}), + ), + ( + DecisionTreeClassifier, + [], + {"max_depth": 257}, + SklearnClassifierFactoryInfo(classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": 257}), + ), + (GaussianNB, [], {}, SklearnClassifierFactoryInfo(classifier_type=GaussianNB, args=[], kwargs={})), + ( + KNeighborsClassifier, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=KNeighborsClassifier, args=[], kwargs={}), + ), + ( + LinearDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=LinearDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + QuadraticDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=QuadraticDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + RandomForestClassifier, + [143], + {}, + SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[143], kwargs={}), + ), + ( + SVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=SVC, args=[], kwargs={"probability": False}), + ), + ( + NuSVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=NuSVC, args=[], kwargs={"probability": False}), + ), + ], +) +def test_deserialize_legacy_SklearnLazyflowClassifierFactory(classifier_type, c_args, c_kwargs, expected_info): + assert True + pickled_classifier = pickle.dumps( + SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 + ).decode() + classifier_factory_info 
= _deserialize_legacy_SklearnLazyflowClassifierFactory(pickled_classifier) + + assert classifier_factory_info == expected_info + + +@pytest.mark.parametrize( + "pickle_string", + [ + "V_classifier_type\np0\ncsklearn.some.submodules.not.in.list\nAdaBoostClassifier\n", + "V_classifier_type\np0\ncsklearn.neighbors._classification\nMyMeanClassifier42\n", + "someRandomString", + ], +) +def test_deserialize_legacy_SklearnLazyflowClassifierFactory_raises( + pickle_string, +): + with pytest.raises(ValueError): + _ = _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(42, None, None, None, False), + ParallelVigraRfLazyflowClassifierFactory(43, 2, None, None, False), + ParallelVigraRfLazyflowClassifierFactory(44, None, "test_variable_importance_path", None, True), + ParallelVigraRfLazyflowClassifierFactory(45, None, None, 0.33, False), + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + ], +) +def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(classifier_factory): + info = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) + + assert info.instance == classifier_factory + + +@pytest.mark.parametrize( + "pickle_string", + [ + "something_funny", + # wrong value types will raise: + "V_num_trees\np7\nF46\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nI300\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nF3.0\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\nI1\ns", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\nI01\nsV_num_forests\np14\nF89\n", + # missing values will raise: + "V_num_trees\np7\nF46\n", + 
"V_num_trees\np7\nI46\nsV_label_proportion\np8\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\n\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\ns", + ], +) +def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory_raises(pickle_string): + with pytest.raises(ValueError): + _ = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_string) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + VigraRfLazyflowClassifierFactory(30), + VigraRfLazyflowClassifierFactory(42), + ], +) +def test_deserialize_legacy_VigraRflassifierFactory(classifier_factory): + info = _deserialize_legacy_VigraRflassifierFactory(pickle.dumps(classifier_factory, 0).decode()) + + assert info.instance == classifier_factory + + +def test_deserialize_legacy_VigraRflassifierFactory_raises(): + with pytest.raises(ValueError): + _ = _deserialize_legacy_VigraRflassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 0).decode()) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + VigraRfLazyflowClassifierFactory(42), + ], +) +def 
test_deserialize_classifier_factory_details(classifier_factory): + info = _deserialize_classifier_factory_details( + type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() + ) + + assert info.instance == classifier_factory + + +class MyTestClassifierFactory: + def __init__(self, *args, **kwargs): + pass + + +def test_deserialize_classifier_factory_details_raises(): + + classifier_factory = MyTestClassifierFactory(42) + + with pytest.raises(ValueError): + _ = _deserialize_classifier_factory_details( + type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() # type: ignore[reportArgumentType] + ) + + +@pytest.mark.parametrize( + "classifier_factory, expected_info", + [ + ( + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + ClassifierFactoryTypeInfo( + factory_submodule="parallelVigraRfLazyflowClassifier", + factory_typename="ParallelVigraRfLazyflowClassifierFactory", + factory_version=ParallelVigraRfLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + ClassifierFactoryTypeInfo( + 
factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + VigraRfLazyflowClassifierFactory(42), + ClassifierFactoryTypeInfo( + factory_submodule="vigraRfLazyflowClassifier", + factory_typename="VigraRfLazyflowClassifierFactory", + factory_version=VigraRfLazyflowClassifierFactory.VERSION, + ), + ), + ], +) +def test_deserialize_legacy_classifier_factory_type_info(classifier_factory, expected_info): + info = 
_deserialize_legacy_classifier_factory_type_info(pickle.dumps(classifier_factory, 0).decode()) + assert info == expected_info + + +@pytest.mark.parametrize( + "pickle_string", + [ + "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nMyPhantasyFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", + "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.someothermodule\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", + "ccopy_reg\n_reconstructor\np0\n(\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", + "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nsV_args", + ], +) +def test_deserialize_legacy_classifier_factory_type_info_raises(pickle_string): + with pytest.raises(ValueError): + _ = _deserialize_legacy_classifier_factory_type_info(pickle_string) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + VigraRfLazyflowClassifierFactory(42), + ], +) +def 
test_deserialize_legacy_classifier_factory(empty_in_memory_project_file, classifier_factory): + + ds = empty_in_memory_project_file.create_dataset(name="classifier", data=pickle.dumps(classifier_factory, 0)) + + factory = deserialize_legacy_classifier_factory(ds) + + assert factory == classifier_factory From 82967632300b40ceacd4e38c280e538b3d03e7f4 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Fri, 19 Apr 2024 15:27:09 +0200 Subject: [PATCH 06/13] review suggestions * Use singular for type aliases (e.g LazyflowClassifierABCs -> LazyflowClassifierABC) * `is` comparison to `None` instead of `==` * Fix some typos * More accurate comments Co-authored-by: Benedikt Best <63287233+btbest@users.noreply.github.com> --- .../base/appletSerializer/serializerUtils.py | 37 +++++++++---------- .../base/appletSerializer/slotSerializer.py | 1 + .../test_applets/base/test_serializerUtils.py | 6 +-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index 28c1c01b41..5d414fc8e6 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -65,10 +65,10 @@ def slicingToString(slicing: Sequence[slice]) -> bytes: if any(sl.step not in [None, 1] for sl in slicing): raise ValueError("Only slices with step size of `1` or `None` are supported.") - if any(sl.start == None for sl in slicing): + if any(sl.start is None for sl in slicing): raise ValueError("Start indices for slicing must be integer, got `None`.") - if any(sl.stop == None for sl in slicing): + if any(sl.stop is None for sl in slicing): raise ValueError("Stop indices for slicing must be integer, got `None`.") strSlicing = "[" @@ -111,9 +111,9 @@ def deserialize_string_from_h5(ds: h5py.Dataset): return ds[()].decode() -LazyflowClassifierABCs = Union[LazyflowPixelwiseClassifierABC, LazyflowVectorwiseClassifierABC] 
+LazyflowClassifierABC = Union[LazyflowPixelwiseClassifierABC, LazyflowVectorwiseClassifierABC] -LazyflowClassifierTypeABCs = Union[Type[LazyflowPixelwiseClassifierABC], Type[LazyflowVectorwiseClassifierABC]] +LazyflowClassifierTypeABC = Union[Type[LazyflowPixelwiseClassifierABC], Type[LazyflowVectorwiseClassifierABC]] _lazyflow_classifier_factory_submodule_allow_list = [ @@ -144,7 +144,7 @@ class ClassifierInfo: type_name: str @property - def classifier_type(self) -> LazyflowClassifierTypeABCs: + def classifier_type(self) -> LazyflowClassifierTypeABC: submodule = getattr(lazyflow.classifiers, self.submodule_name) classifier_type = getattr(submodule, self.type_name) return classifier_type @@ -199,9 +199,9 @@ def deserialize_legacy_classifier_type_info(ds: h5py.Dataset) -> ClassifierInfo: raise ValueError(f"Could not load classifier type {class_string=}") -LazyflowClassifierFactoryABCs = Union[LazyflowPixelwiseClassifierFactoryABC, LazyflowVectorwiseClassifierFactoryABC] +LazyflowClassifierFactoryABC = Union[LazyflowPixelwiseClassifierFactoryABC, LazyflowVectorwiseClassifierFactoryABC] -LazyflowClassifierFactoryTypeABCs = Union[ +LazyflowClassifierFactoryTypeABC = Union[ Type[LazyflowPixelwiseClassifierFactoryABC], Type[LazyflowVectorwiseClassifierFactoryABC] ] @@ -213,7 +213,7 @@ class ClassifierFactoryTypeInfo: factory_version: int @property - def classifier_factory_type(self) -> LazyflowClassifierFactoryTypeABCs: + def classifier_factory_type(self) -> LazyflowClassifierFactoryTypeABC: submod = getattr(lazyflow.classifiers, self.factory_submodule) classifier_factory_type = getattr(submod, self.factory_typename) return classifier_factory_type @@ -223,14 +223,14 @@ class ClassifierFactoryInfo(ABC): @property @abstractmethod - def instance(self) -> LazyflowClassifierFactoryABCs: ... + def instance(self) -> LazyflowClassifierFactoryABC: ... 
-def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABCs: +def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABC: pickle_string: str = deserialize_string_from_h5(ds) - clasifier_factory_info = _deserialize_legacy_classifier_factory_type_info(pickle_string) + classifier_factory_info = _deserialize_legacy_classifier_factory_type_info(pickle_string) - classifier_factory_type = clasifier_factory_info.classifier_factory_type + classifier_factory_type = classifier_factory_info.classifier_factory_type classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) return classifier_factory_details.instance @@ -294,11 +294,11 @@ def _deserialize_legacy_classifier_factory_type_info(pickle_string: str) -> Clas def _deserialize_classifier_factory_details( - classifier_factory: LazyflowClassifierFactoryTypeABCs, pickle_str: str + classifier_factory: LazyflowClassifierFactoryTypeABC, pickle_str: str ) -> ClassifierFactoryInfo: if issubclass(classifier_factory, (VigraRfPixelwiseClassifierFactory, VigraRfLazyflowClassifierFactory)): - return _deserialize_legacy_VigraRflassifierFactory(pickle_str) + return _deserialize_legacy_VigraRfClassifierFactory(pickle_str) if issubclass(classifier_factory, ParallelVigraRfLazyflowClassifierFactory): return _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_str) @@ -318,7 +318,7 @@ def instance(self) -> VigraRfLazyflowClassifierFactory: return VigraRfLazyflowClassifierFactory(*self.args) -def _deserialize_legacy_VigraRflassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: +def _deserialize_legacy_VigraRfClassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: """ These classifier factories have only been used with a single arg """ @@ -350,9 +350,8 @@ class ParallelVigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): variable_importance_enabled: bool 
num_forests: int - # we don't do kwargs here - there is no evidence that in ilastik - # history kwargs were ever used - # kwargs + # ParallelVigraRfLazyflowClassifierFactory accepts additional kwargs, but we cannot deserialize arbitrary input. + # The parameters listed are all that we ever used in ilastik history. @property def instance(self) -> ParallelVigraRfLazyflowClassifierFactory: return ParallelVigraRfLazyflowClassifierFactory( @@ -496,7 +495,7 @@ class SklearnClassifierFactoryInfo(ClassifierFactoryInfo): kwargs: Dict[str, Union[bool, int, float]] @property - def instance(self) -> LazyflowClassifierFactoryABCs: + def instance(self) -> LazyflowClassifierFactoryABC: return SklearnLazyflowClassifierFactory(self.classifier_type, *self.args, **self.kwargs) diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index 6f35e8da74..d9bab5b92e 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -43,6 +43,7 @@ slicingToString, stringToSlicing, deserialize_legacy_classifier_type_info, + deserialize_legacy_classifier_factory, ) logger = logging.getLogger(__name__) diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py index b54ac2361b..b870c9f556 100644 --- a/tests/test_ilastik/test_applets/base/test_serializerUtils.py +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -17,7 +17,7 @@ _deserialize_legacy_classifier_factory_type_info, _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory, _deserialize_legacy_SklearnLazyflowClassifierFactory, - _deserialize_legacy_VigraRflassifierFactory, + _deserialize_legacy_VigraRfClassifierFactory, _deserialize_sklearn_classifier_details, deleteIfPresent, deserialize_legacy_classifier_factory, @@ -366,14 +366,14 @@ def 
test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory_raises(pick ], ) def test_deserialize_legacy_VigraRflassifierFactory(classifier_factory): - info = _deserialize_legacy_VigraRflassifierFactory(pickle.dumps(classifier_factory, 0).decode()) + info = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) assert info.instance == classifier_factory def test_deserialize_legacy_VigraRflassifierFactory_raises(): with pytest.raises(ValueError): - _ = _deserialize_legacy_VigraRflassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 0).decode()) + _ = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 0).decode()) @pytest.mark.parametrize( From 91a7f1c81135ff0424dc2cfcb828f5798ef23083 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Fri, 26 Apr 2024 09:53:11 +0200 Subject: [PATCH 07/13] ensure classifier VERSION is correctly deserialized --- ilastik/applets/base/appletSerializer/serializerUtils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index 5d414fc8e6..bd50c2382a 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -232,7 +232,9 @@ def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifie classifier_factory_type = classifier_factory_info.classifier_factory_type classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) - return classifier_factory_details.instance + factory_instance = classifier_factory_details.instance + factory_instance.VERSION = classifier_factory_info.factory_version + return factory_instance def _deserialize_legacy_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: From b419ab2a310b01d2979933c101406ce2d7cbfa45 Mon Sep 17 00:00:00 
2001 From: k-dominik Date: Fri, 26 Apr 2024 10:04:33 +0200 Subject: [PATCH 08/13] finer grained exception handling if factory cannot be deserialized Added test for SerialClassifierFactorySlot --- .../base/appletSerializer/slotSerializer.py | 22 ++-- .../test_applets/base/testSerializer.py | 113 +++++++++++++++--- 2 files changed, 111 insertions(+), 24 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index d9bab5b92e..415197b754 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -846,17 +846,23 @@ def _getValue(self, dset, slot): try: value = deserialize_legacy_classifier_factory(dset) - # Verify that the VERSION of the classifier factory in the currently executing code - # has not changed since this classifier was stored. - assert "VERSION" in value.__dict__ and value.VERSION == type(value).VERSION - except: + except ValueError: self._failed_to_deserialize = True warnings.warn( - "This project file uses an old or unsupported classifier storage format. " - "The classifier will be stored in the new format when you save your project." + "This project file uses an old or unsupported classifier-factory storage format. " + "The classifier-factory will be stored in the new format when you save your project." ) - else: - slot.setValue(value) + return + + # Verify that the VERSION of the classifier factory in the currently executing code + # has not changed since this classifier was stored. + if not hasattr(value, "VERSION") or value.VERSION != type(value).VERSION: + warnings.warn( + "This project file uses an old or unsupported classifier-factory storage format. " + "When retraining, the default classifier-factory will be used." 
+ ) + return + slot.setValue(value) class SerialPickleableSlot(SerialSlot): diff --git a/tests/test_ilastik/test_applets/base/testSerializer.py b/tests/test_ilastik/test_applets/base/testSerializer.py index 338a3022f5..8225acc48d 100644 --- a/tests/test_ilastik/test_applets/base/testSerializer.py +++ b/tests/test_ilastik/test_applets/base/testSerializer.py @@ -1,7 +1,7 @@ ############################################################################### # ilastik: interactive learning and segmentation toolkit # -# Copyright (C) 2011-2014, the ilastik developers +# Copyright (C) 2011-2024, the ilastik developers # # # This program is free software; you can redistribute it and/or @@ -19,29 +19,42 @@ # http://ilastik.org/license.html ############################################################################### import os -import h5py -import numpy -import vigra -import unittest +import pickle import shutil import tempfile -import pytest +import unittest from copy import deepcopy -from ilastik.applets.base.appletSerializer import SerialObjectFeatureNamesSlot -from lazyflow.graph import Graph, Operator, InputSlot, Slot, OperatorWrapper -from lazyflow.operators import OpCompressedUserLabelArray -from lazyflow.stype import Opaque -from lazyflow.rtype import List -from ilastik.applets.base.appletSerializer import jsonSerializerRegistry +import h5py +import numpy +import pytest +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC, NuSVC +from sklearn.tree import DecisionTreeClassifier +import vigra + from ilastik.applets.base.appletSerializer import ( - SerialSlot, - SerialListSlot, AppletSerializer, - SerialDictSlot, - SerialBlockSlot, JSONSerialSlot, + SerialBlockSlot, + SerialDictSlot, + SerialListSlot, + 
SerialObjectFeatureNamesSlot, + SerialSlot, + jsonSerializerRegistry, ) +from ilastik.applets.base.appletSerializer.slotSerializer import SerialClassifierFactorySlot +from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory +from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifierFactory +from lazyflow.graph import Graph, InputSlot, Operator, OperatorWrapper, Slot +from lazyflow.operators import OpCompressedUserLabelArray +from lazyflow.rtype import List +from lazyflow.slot import OutputSlot +from lazyflow.stype import Opaque class OpMock(Operator): @@ -861,3 +874,71 @@ def test_deserializing_no_value(self, operator, registry, tmpdir, serializer): group = f.create_group("test") slot.deserialize(group) assert not operator.TestSlot.ready() + + +class TestSerialClassifierFactorySlot: + default_factory = VigraRfLazyflowClassifierFactory(48) + + @pytest.fixture + def operator(self, graph): + class OpPassThrough(Operator): + ClassifierFactory = InputSlot(stype=Opaque, value=self.default_factory) + Out = OutputSlot() + + def setupOutputs(self): + self.Out.meta.assignFrom(self.ClassifierFactory.meta) + + def execute(self, slot, subindex, roi, result): + result[:] = self.ClassifierFactory[:].wait() + + def propagateDirty(self, *args, **kwargs): + pass + + op = OpPassThrough(graph=graph) + op.name = "classifier" + return op + + @pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + 
SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + VigraRfLazyflowClassifierFactory(100), + ], + ) + def test_deserialization(self, operator, empty_in_memory_project_file, classifier_factory): + serializer = SerialClassifierFactorySlot(operator.ClassifierFactory) + g = empty_in_memory_project_file.create_group("classifier") + g.create_dataset("ClassifierFactory", data=pickle.dumps(classifier_factory, 0)) + serializer.deserialize(g) + + assert operator.Out.value == classifier_factory + + def test_deserialization_skip_on_exc(self, operator, empty_in_memory_project_file): + serializer = SerialClassifierFactorySlot(operator.ClassifierFactory) + g = empty_in_memory_project_file.create_group("classifier") + g.create_dataset("ClassifierFactory", data=b"some random string") + with pytest.warns(UserWarning): + serializer.deserialize(g) + + assert operator.Out.value == self.default_factory + + def test_deserialization_skip_on_version_mismatch(self, operator, empty_in_memory_project_file): + serializer = SerialClassifierFactorySlot(operator.ClassifierFactory) + g = empty_in_memory_project_file.create_group("classifier") + classifier_factory = VigraRfLazyflowClassifierFactory(100) + classifier_factory.VERSION += 1 + g.create_dataset("ClassifierFactory", data=pickle.dumps(classifier_factory, 0)) + + with pytest.warns(UserWarning): + serializer.deserialize(g) + + assert operator.Out.value == self.default_factory From b15b22cbbc281bf4ff43d733e85f3645e954e3eb Mon Sep 17 00:00:00 2001 From: k-dominik Date: Fri, 26 Apr 2024 14:20:59 +0200 Subject: [PATCH 09/13] refactor: reorganize modules move legacy deserialization to own submodule --- 
.../appletSerializer/legacyClassifiers.py | 592 ++++++++++++++++++ .../base/appletSerializer/serializerUtils.py | 570 +---------------- .../base/appletSerializer/slotSerializer.py | 20 +- .../base/test_legacyClassifierDeserialiers.py | 463 ++++++++++++++ .../test_applets/base/test_serializerUtils.py | 462 +------------- 5 files changed, 1091 insertions(+), 1016 deletions(-) create mode 100644 ilastik/applets/base/appletSerializer/legacyClassifiers.py create mode 100644 tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py diff --git a/ilastik/applets/base/appletSerializer/legacyClassifiers.py b/ilastik/applets/base/appletSerializer/legacyClassifiers.py new file mode 100644 index 0000000000..b8ff632f61 --- /dev/null +++ b/ilastik/applets/base/appletSerializer/legacyClassifiers.py @@ -0,0 +1,592 @@ +############################################################################### +# ilastik: interactive learning and segmentation toolkit +# +# Copyright (C) 2011-2024, the ilastik developers +# +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# In addition, as a special exception, the copyright holders of +# ilastik give you permission to combine ilastik with applets, +# workflows and plugins which are not covered under the GNU +# General Public License. +# +# See the LICENSE file for details. 
License information is also available +# on the ilastik web site at: +# http://ilastik.org/license.html +############################################################################### +import re +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, List, Type, Union + +import h5py +import sklearn +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC, NuSVC +from sklearn.tree import DecisionTreeClassifier + +import lazyflow.classifiers +from lazyflow.classifiers.lazyflowClassifier import ( + LazyflowPixelwiseClassifierABC, + LazyflowPixelwiseClassifierFactoryABC, + LazyflowVectorwiseClassifierABC, + LazyflowVectorwiseClassifierFactoryABC, +) +from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory +from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfPixelwiseClassifier import VigraRfPixelwiseClassifierFactory + +from .serializerUtils import deserialize_string_from_h5 + +LazyflowClassifierABC = Union[LazyflowPixelwiseClassifierABC, LazyflowVectorwiseClassifierABC] + +LazyflowClassifierTypeABC = Union[Type[LazyflowPixelwiseClassifierABC], Type[LazyflowVectorwiseClassifierABC]] + + +_lazyflow_classifier_factory_submodule_allow_list = [ + "vigraRfPixelwiseClassifier", + "vigraRfLazyflowClassifier", + "parallelVigraRfLazyflowClassifier", + "sklearnLazyflowClassifier", +] + +_lazyflow_classifier_factory_type_allow_list = [ + "VigraRfPixelwiseClassifierFactory", + "VigraRfLazyflowClassifierFactory", + "ParallelVigraRfLazyflowClassifierFactory", +
"SklearnLazyflowClassifierFactory", +] + +_lazyflow_classifier_type_allow_list = [ + "VigraRfPixelwiseClassifier", + "VigraRfLazyflowClassifier", + "ParallelVigraRfLazyflowClassifier", + "SklearnLazyflowClassifier", +] + + +@dataclass +class ClassifierInfo: + submodule_name: str + type_name: str + + @property + def classifier_type(self) -> LazyflowClassifierTypeABC: + submodule = getattr(lazyflow.classifiers, self.submodule_name) + classifier_type = getattr(submodule, self.type_name) + return classifier_type + + +def deserialize_legacy_classifier_type(ds: h5py.Dataset) -> LazyflowClassifierTypeABC: + """Legacy helper for classifier type_info deserialization + + in order to avoid unpickling, the protocol0-style pickle string is + parsed to extract the classifier typename of the form + `lazyflow.classifiers.myclassifier.MyClassifierType`, e.g. + `lazyflow.classifiers.vigraRfLazyflowClassifier.VigraRfLazyflowClassifier`. + + Args: + ds: h5py dataset that holds the pickled string - usually in + `PixelClassification/ClassifierForests/pickled_type` + + Returns: + The classifier type (class object) named in the pickled string + + Raises: + ValueError if pickled string does not conform to required pattern + """ + class_string: str = deserialize_string_from_h5(ds) + classifier_pickle_string_matcher = re.compile( + r""" + c # GLOBAL + lazyflow\.classifiers\.(?P<submodule_name>\w+) + \n + (?P<type_name>\w+) + \n + p\d+ + \n + \. # all pickles end in "."
STOP + $ + """, + re.X, + ) + + # legacy support - ilastik used to pickle the classifier type + if class_string.isascii() and (m := classifier_pickle_string_matcher.match(class_string)): + groupdict = m.groupdict() + + if groupdict["submodule_name"] not in _lazyflow_classifier_factory_submodule_allow_list: + raise ValueError(f"Could not load classifier: submodule {groupdict['submodule_name']} not allowed.") + + if groupdict["type_name"] not in _lazyflow_classifier_type_allow_list: + raise ValueError(f"Could not load classifier: type {groupdict['type_name']} not allowed.") + + return ClassifierInfo(**groupdict).classifier_type + + raise ValueError(f"Could not load classifier type {class_string=}") + + +LazyflowClassifierFactoryABC = Union[LazyflowPixelwiseClassifierFactoryABC, LazyflowVectorwiseClassifierFactoryABC] + +LazyflowClassifierFactoryTypeABC = Union[ + Type[LazyflowPixelwiseClassifierFactoryABC], Type[LazyflowVectorwiseClassifierFactoryABC] +] + + +@dataclass +class ClassifierFactoryTypeInfo: + factory_submodule: str + factory_typename: str + factory_version: int + + @property + def classifier_factory_type(self) -> LazyflowClassifierFactoryTypeABC: + submod = getattr(lazyflow.classifiers, self.factory_submodule) + classifier_factory_type = getattr(submod, self.factory_typename) + return classifier_factory_type + + +class ClassifierFactoryInfo(ABC): + + @property + @abstractmethod + def instance(self) -> LazyflowClassifierFactoryABC: ... 
+ + +def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABC: + pickle_string: str = deserialize_string_from_h5(ds) + classifier_factory_info = _deserialize_legacy_classifier_factory_type_info(pickle_string) + + classifier_factory_type = classifier_factory_info.classifier_factory_type + classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) + factory_instance = classifier_factory_details.instance + factory_instance.VERSION = classifier_factory_info.factory_version + return factory_instance + + +def _deserialize_legacy_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: + """Legacy helper for classifier type_info deserialization + + in order to avoid unpickling, the protocol0-style pickle string is + parsed to extract the classifier typename of the form + `lazyflow.classifiers.myclassifier.MyClassifierTypeFactory`, e.g. + `lazyflow.classifiers.vigraRfLazyflowClassifier.VigraRfLazyflowClassifierFactory`. + + Args: + pickle_string: string from pickling a LazyflowClassifierFactory instance + + Returns: + ClassifierFactoryTypeInfo with classifier information + + Raises: + ValueError if pickled string does not conform to required pattern + """ + + classifier_factory_pickle_string_matcher = re.compile( + r""" + clazyflow\.classifiers\.(?P<factory_submodule>\w+) + \n + (?P<type_name>\w+) + \n + """, + re.X, + ) + + classifier_factory_version_pickle_string_matcher = re.compile( + r""" + VVERSION\n + p\d+\n + I(?P<factory_version>\d+)\n + """, + re.X, + ) + + if pickle_string.isascii() and (m := classifier_factory_pickle_string_matcher.search(pickle_string)): + groupdict = m.groupdict() + submodule = groupdict["factory_submodule"] + typename = groupdict["type_name"] + + if submodule not in _lazyflow_classifier_factory_submodule_allow_list: + raise ValueError(f"Could not load classifier: submodule {submodule} not allowed.
{pickle_string=}") + + if typename not in _lazyflow_classifier_factory_type_allow_list: + raise ValueError(f"Could not load classifier factory: type {typename} not allowed.") + else: + raise ValueError(f"Could not load classifier factory type submodule and type not found {pickle_string=}") + + if m := classifier_factory_version_pickle_string_matcher.search(pickle_string): + version = int(m.groupdict()["factory_version"]) + else: + raise ValueError(f"Could not load classifier type, no version found {pickle_string=}") + + return ClassifierFactoryTypeInfo(factory_submodule=submodule, factory_typename=typename, factory_version=version) + + +def _deserialize_classifier_factory_details( + classifier_factory: LazyflowClassifierFactoryTypeABC, pickle_str: str +) -> ClassifierFactoryInfo: + + if issubclass(classifier_factory, (VigraRfPixelwiseClassifierFactory, VigraRfLazyflowClassifierFactory)): + return _deserialize_legacy_VigraRfClassifierFactory(pickle_str) + + if issubclass(classifier_factory, ParallelVigraRfLazyflowClassifierFactory): + return _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_str) + + if issubclass(classifier_factory, SklearnLazyflowClassifierFactory): + return _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_str) + + raise ValueError(f"Don't know how to deserialize classifier of type {classifier_factory!r}") + + +@dataclass +class VigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): + args: List[int] + + @property + def instance(self) -> VigraRfLazyflowClassifierFactory: + return VigraRfLazyflowClassifierFactory(*self.args) + + +def _deserialize_legacy_VigraRfClassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: + """ + These classifier factories have only been used with a single arg + """ + classifier_factory_args_pickle_string_matcher = re.compile( + r""" + V_args\n + p\d+\n + (\(I)? 
+ (?P<arg>(?<=\(I)\d+)\n # we _only_ expect one integer element in _args for this type + """, + re.X, + ) + + if m := classifier_factory_args_pickle_string_matcher.search(pickle_string): + arg = int(m.groupdict()["arg"]) + else: + raise ValueError( + f"Could not load VigraRfLazyflowClassifierFactory, no argument found not found in {pickle_string=}" + ) + + return VigraRfLazyflowClassifierFactoryInfo(args=[arg]) + + +@dataclass +class ParallelVigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): + num_trees: int + label_proportion: Union[float, None] + variable_importance_path: Union[str, None] + variable_importance_enabled: bool + num_forests: int + + # ParallelVigraRfLazyflowClassifierFactory accepts additional kwargs, but we cannot deserialize arbitrary input. + # The parameters listed are all that we ever used in ilastik history. + @property + def instance(self) -> ParallelVigraRfLazyflowClassifierFactory: + return ParallelVigraRfLazyflowClassifierFactory( + num_trees_total=self.num_trees, + num_forests=self.num_forests, + variable_importance_path=self.variable_importance_path, + label_proportion=self.label_proportion, + variable_importance_enabled=self.variable_importance_enabled, + ) + + +def _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory( + pickle_string, +) -> ParallelVigraRfLazyflowClassifierFactoryInfo: + classifier_factory_num_trees_pickle_string_matcher = re.compile( + r""" + V_num_trees\n + p\d+\n + I(?P<num_trees>\d+)\n + """, + re.X, + ) + + if m := classifier_factory_num_trees_pickle_string_matcher.search(pickle_string): + num_trees = int(m.groupdict()["num_trees"]) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _num_trees not found in {pickle_string=}" + ) + + # this can be None, otherwise float between 0.0 and 1.0 + classifier_factory_label_proportion_pickle_string_matcher = re.compile( + r""" + V_label_proportion\n + p\d+\n + F?(?P<label_proportion>((?<=F)[01]\.\d+(?=\n))|N(?=s)) # positive lookbehind for float + """,
+ re.X, + ) + + if m := classifier_factory_label_proportion_pickle_string_matcher.search(pickle_string): + label_prop_string = m.groupdict()["label_proportion"] + label_proportion = None if label_prop_string == "N" else float(label_prop_string) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _label_proportion not found in {pickle_string=}" + ) + + # this can be None, otherwise string (V) + classifier_factory_variable_importance_path_pickle_string_matcher = re.compile( + r""" + V_variable_importance_path\n + p\d+\n + V?(?P<variable_importance_path>((?<=V).+(?=\n))|N(?=s)) # positive lookbehind for string + """, + re.X, + ) + + if m := classifier_factory_variable_importance_path_pickle_string_matcher.search(pickle_string): + variable_importance_pth_string = m.groupdict()["variable_importance_path"] + variable_importance_path = None if variable_importance_pth_string == "N" else variable_importance_pth_string + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_path not found in {pickle_string=}" + ) + + # will be a bool, either I00, or I01 + classifier_factory_variable_importance_enabled_pickle_string_matcher = re.compile( + r""" + V_variable_importance_enabled\n + p\d+\n + I(?P<variable_importance_enabled>[01]{2})\n + """, + re.X, + ) + + if m := classifier_factory_variable_importance_enabled_pickle_string_matcher.search(pickle_string): + variable_importance_enabled = bool(int(m.groupdict()["variable_importance_enabled"])) + else: + raise ValueError( + f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_enabled not found in {pickle_string=}" + ) + + classifier_factory_num_forests_pickle_string_matcher = re.compile( + r""" + V_num_forests\n + p\d+\n + I(?P<num_forests>\d+)\n + """, + re.X, + ) + + if m := classifier_factory_num_forests_pickle_string_matcher.search(pickle_string): + num_forests = int(m.groupdict()["num_forests"]) + else: + raise ValueError( + f"Could not load
ParallelVigraRfLazyflowClassifierFactory, _num_forests not found in {pickle_string=}" + ) + + return ParallelVigraRfLazyflowClassifierFactoryInfo( + num_trees=num_trees, + label_proportion=label_proportion, + variable_importance_path=variable_importance_path, + variable_importance_enabled=variable_importance_enabled, + num_forests=num_forests, + ) + + +SklearnClassifierType = Union[ + Type[AdaBoostClassifier], + Type[DecisionTreeClassifier], + Type[GaussianNB], + Type[KNeighborsClassifier], + Type[LinearDiscriminantAnalysis], + Type[NuSVC], + Type[QuadraticDiscriminantAnalysis], + Type[RandomForestClassifier], + Type[SVC], +] + + +@dataclass +class SklearnClassifierTypeInfo: + submodules: List[str] + typename: str + + @property + def classifier_type(self) -> SklearnClassifierType: + submodule = getattr(sklearn, self.submodules[0]) + for sm_name in self.submodules[1:]: + submodule = getattr(submodule, sm_name) + + classifier_type = getattr(submodule, self.typename) + return classifier_type + + +@dataclass +class SklearnClassifierFactoryInfo(ClassifierFactoryInfo): + classifier_type: SklearnClassifierType + args: List[int] + kwargs: Dict[str, Union[bool, int, float]] + + @property + def instance(self) -> LazyflowClassifierFactoryABC: + return SklearnLazyflowClassifierFactory(self.classifier_type, *self.args, **self.kwargs) + + +def _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClassifierFactoryInfo: + """ + _args : RandomForestClassifier, 100 | GaussianNB | AdaBoostClassifier | DecisionTreeClassifier | KNeighborsClassifier | LDA | QDA | SVC | NuSVC + _kwargs NONE | NONE | n_estimators=100 | max_depth=5 | NONE | N NONE | NONE | probability=True | probability=True + _classifier_type + + """ + classifier_factory_sklearn_type_pickle_string_matcher = re.compile( + """ + V_classifier_type\\n + p\\d+\\n + c + sklearn\\.(?P<submodules>[\\w+\\.]+)\\n + (?P<typename>[\\w]+)\\n + """, + re.X, + ) + + sklearn_submodule_allow_list = [ + "discriminant_analysis", +
"ensemble._forest", + "ensemble._weight_boosting", + "naive_bayes", + "neighbors._classification", + "svm._classes", + "tree._classes", + ] + + sklearn_classifier_allow_list = [ + "AdaBoostClassifier", + "DecisionTreeClassifier", + "GaussianNB", + "KNeighborsClassifier", + "LinearDiscriminantAnalysis", + "NuSVC", + "QuadraticDiscriminantAnalysis", + "RandomForestClassifier", + "SVC", + ] + + if m := classifier_factory_sklearn_type_pickle_string_matcher.search(pickle_string): + groupdict = m.groupdict() + submodules = groupdict["submodules"] + typename = groupdict["typename"] + + if submodules not in sklearn_submodule_allow_list or typename not in sklearn_classifier_allow_list: + raise ValueError(f"Classifier of type sklearn.{submodules}.{typename} not permitted.") + + else: + raise ValueError(f"Could not load classifier type {pickle_string=}") + + classifier_info = SklearnClassifierTypeInfo(submodules=submodules.split("."), typename=typename) + classifier_type = classifier_info.classifier_type + + return _deserialize_sklearn_classifier_details(classifier_type, pickle_string) + + +def _deserialize_sklearn_classifier_details( + classifier_type: SklearnClassifierType, pickle_str: str +) -> SklearnClassifierFactoryInfo: + if issubclass(classifier_type, RandomForestClassifier): + return _deserialize_sklearn_RandomForest_details(pickle_str) + + if issubclass(classifier_type, AdaBoostClassifier): + return _deserialize_sklearn_AdaBoostClassifier_details(pickle_str) + + if issubclass(classifier_type, DecisionTreeClassifier): + return _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str) + + if issubclass(classifier_type, (SVC, NuSVC)): + return _deserialize_sklearn_SVC_details(pickle_str, classifier_type) + + if issubclass( + classifier_type, + ( + GaussianNB, + KNeighborsClassifier, + LinearDiscriminantAnalysis, + QuadraticDiscriminantAnalysis, + ), + ): + return SklearnClassifierFactoryInfo(classifier_type=classifier_type, args=[], kwargs={}) + + +def 
_deserialize_sklearn_RandomForest_details(pickle_str: str) -> SklearnClassifierFactoryInfo: + classifier_factory_args_pickle_string_matcher = re.compile( + r""" + V_args\n + p\d+\n + \( + I(?P<arg>\d+)\n # we _only_ expect one integer element in _args for this type + """, + re.X, + ) + + if m := classifier_factory_args_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=RandomForestClassifier, args=[int(m.groupdict()["arg"])], kwargs={} + ) + else: + raise ValueError("Could not deserialize sklearn RandomForest classifier.") + + +def _deserialize_sklearn_AdaBoostClassifier_details(pickle_str: str) -> SklearnClassifierFactoryInfo: + classifier_factory_n_estimators_pickle_string_matcher = re.compile( + r""" + Vn_estimators\n + p\d+\n + I(?P<n_estimators>\d+)\n + """, + re.X, + ) + if m := classifier_factory_n_estimators_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": int(m.groupdict()["n_estimators"])} + ) + else: + raise ValueError("Could not deserialize sklearn AdaBoostClassifier.") + + +def _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str: str) -> SklearnClassifierFactoryInfo: + classifier_factory_max_depth_pickle_string_matcher = re.compile( + r""" + Vmax_depth\n + p\d+\n + I(?P<max_depth>\d+)\n + """, + re.X, + ) + if m := classifier_factory_max_depth_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": int(m.groupdict()["max_depth"])} + ) + else: + raise ValueError("Could not deserialize sklearn DecisionTreeClassifier") + + +def _deserialize_sklearn_SVC_details( + pickle_str: str, classifier_type: Union[Type[SVC], Type[NuSVC]] +) -> SklearnClassifierFactoryInfo: + classifier_factory_probability_pickle_string_matcher = re.compile( + r""" + Vprobability\n + p\d+\n + I(?P<probability>[01]{2})\n + """, + re.X, + ) + if m :=
classifier_factory_probability_pickle_string_matcher.search(pickle_str): + return SklearnClassifierFactoryInfo( + classifier_type=classifier_type, args=[], kwargs={"probability": int(m.groupdict()["probability"]) != 0} + ) + else: + raise ValueError("Could not deserialize sklearn SVC/NuSVC classifier.") diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index bd50c2382a..29e15a8867 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -18,31 +18,9 @@ # on the ilastik web site at: # http://ilastik.org/license.html ############################################################################### -import re -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Dict, List, Sequence, Tuple, Type, Union +from typing import Sequence, Tuple, Union import h5py -import sklearn -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis -from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier -from sklearn.naive_bayes import GaussianNB -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC, NuSVC -from sklearn.tree import DecisionTreeClassifier - -import lazyflow.classifiers -from lazyflow.classifiers.lazyflowClassifier import ( - LazyflowPixelwiseClassifierABC, - LazyflowPixelwiseClassifierFactoryABC, - LazyflowVectorwiseClassifierABC, - LazyflowVectorwiseClassifierFactoryABC, -) -from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory -from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory -from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifierFactory -from lazyflow.classifiers.vigraRfPixelwiseClassifier import VigraRfPixelwiseClassifierFactory def deleteIfPresent(parentGroup: 
h5py.Group, name: str) -> None: @@ -109,549 +87,3 @@ def stringToSlicing(strSlicing: Union[bytes, str]) -> Tuple[slice, ...]: def deserialize_string_from_h5(ds: h5py.Dataset): return ds[()].decode() - - -LazyflowClassifierABC = Union[LazyflowPixelwiseClassifierABC, LazyflowVectorwiseClassifierABC] - -LazyflowClassifierTypeABC = Union[Type[LazyflowPixelwiseClassifierABC], Type[LazyflowVectorwiseClassifierABC]] - - -_lazyflow_classifier_factory_submodule_allow_list = [ - "vigraRfPixelwiseClassifier", - "vigraRfLazyflowClassifier", - "parallelVigraRfLazyflowClassifier", - "sklearnLazyflowClassifier", -] - -_lazyflow_classifier_factory_type_allow_list = [ - "VigraRfPixelwiseClassifierFactory", - "VigraRfLazyflowClassifierFactory", - "ParallelVigraRfLazyflowClassifierFactory", - "SklearnLazyflowClassifierFactory", -] - -_lazyflow_classifier_type_allow_list = [ - "VigraRfPixelwiseClassifier", - "VigraRfLazyflowClassifier", - "ParallelVigraRfLazyflowClassifier", - "SklearnLazyflowClassifier", -] - - -@dataclass -class ClassifierInfo: - submodule_name: str - type_name: str - - @property - def classifier_type(self) -> LazyflowClassifierTypeABC: - submodule = getattr(lazyflow.classifiers, self.submodule_name) - classifier_type = getattr(submodule, self.type_name) - return classifier_type - - -def deserialize_legacy_classifier_type_info(ds: h5py.Dataset) -> ClassifierInfo: - """Legacy helper for classifier type_info deserialization - - in order to avoid unpickling, the protocol0-style pickle string is - parsed to extract the classifier typename of the form - `lazyflow.classifier.myclassifier.MyClassifierType`, e.g. - `lazyflow.classifier.vigraRfLazyflowClassifier.VigraRfLazyflowClassifier`. 
- - Args: - ds: h5py dataset with that holds the pickled string - usually in - `PixelClassification/ClassifierForests/pickled_type` - - Returns: - Dictionary with two keys: `submodule_name`, and `typename` - - Raises: - ValueError if pickled string does not conform to required pattern - """ - class_string: str = deserialize_string_from_h5(ds) - classifier_pickle_string_matcher = re.compile( - r""" - c # GLOBAL - lazyflow\.classifiers\.(?P\w+) - \n - (?P\w+) - \n - p\d+ - \n - \. # all pickles end in "." STOP - $ - """, - re.X, - ) - - # legacy support - ilastik used to pickle the classifier type - if class_string.isascii() and (m := classifier_pickle_string_matcher.match(class_string)): - groupdict = m.groupdict() - - if groupdict["submodule_name"] not in _lazyflow_classifier_factory_submodule_allow_list: - raise ValueError(f"Could not load classifier: submodule {groupdict['submodule_name']} not allowed.") - - if groupdict["type_name"] not in _lazyflow_classifier_type_allow_list: - raise ValueError(f"Could not load classifier: type {groupdict['type_name']} not allowed.") - - return ClassifierInfo(**groupdict) - - raise ValueError(f"Could not load classifier type {class_string=}") - - -LazyflowClassifierFactoryABC = Union[LazyflowPixelwiseClassifierFactoryABC, LazyflowVectorwiseClassifierFactoryABC] - -LazyflowClassifierFactoryTypeABC = Union[ - Type[LazyflowPixelwiseClassifierFactoryABC], Type[LazyflowVectorwiseClassifierFactoryABC] -] - - -@dataclass -class ClassifierFactoryTypeInfo: - factory_submodule: str - factory_typename: str - factory_version: int - - @property - def classifier_factory_type(self) -> LazyflowClassifierFactoryTypeABC: - submod = getattr(lazyflow.classifiers, self.factory_submodule) - classifier_factory_type = getattr(submod, self.factory_typename) - return classifier_factory_type - - -class ClassifierFactoryInfo(ABC): - - @property - @abstractmethod - def instance(self) -> LazyflowClassifierFactoryABC: ... 
- - -def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABC: - pickle_string: str = deserialize_string_from_h5(ds) - classifier_factory_info = _deserialize_legacy_classifier_factory_type_info(pickle_string) - - classifier_factory_type = classifier_factory_info.classifier_factory_type - classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) - factory_instance = classifier_factory_details.instance - factory_instance.VERSION = classifier_factory_info.factory_version - return factory_instance - - -def _deserialize_legacy_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: - """Legacy helper for classifier type_info deserialization - - in order to avoid unpickling, the protocol0-style pickle string is - parsed to extract the classifier typename of the form - `lazyflow.classifier.myclassifier.MyClassifierTypeFactory`, e.g. - `lazyflow.classifier.vigraRfLazyflowClassifier.VigraRfLazyflowClassifierFactory`. - - Args: - pickle_string: string from pickling a LazyflowClassifierFactory instance - - Returns: - ClassifierFactoryTypeInfo with classifier information - - Raises: - ValueError if pickled string does not conform to required pattern - """ - - classifier_factory_pickle_string_matcher = re.compile( - r""" - clazyflow\.classifiers\.(?P\w+) - \n - (?P\w+) - \n - """, - re.X, - ) - - classifier_factory_version_pickle_string_matcher = re.compile( - r""" - VVERSION\n - p\d+\n - I(?P\d+)\n - """, - re.X, - ) - - if pickle_string.isascii() and (m := classifier_factory_pickle_string_matcher.search(pickle_string)): - groupdict = m.groupdict() - submodule = groupdict["factory_submodule"] - typename = groupdict["type_name"] - - if submodule not in _lazyflow_classifier_factory_submodule_allow_list: - raise ValueError(f"Could not load classifier: submodule {submodule} not allowed. 
{pickle_string=}") - - if typename not in _lazyflow_classifier_factory_type_allow_list: - raise ValueError(f"Could not load classifier factory: type {typename} not allowed.") - else: - raise ValueError(f"Could not load classifier factory type submodule and type not found {pickle_string=}") - - if m := classifier_factory_version_pickle_string_matcher.search(pickle_string): - version = int(m.groupdict()["factory_version"]) - else: - raise ValueError(f"Could not load classifier type, no version found {pickle_string=}") - - return ClassifierFactoryTypeInfo(factory_submodule=submodule, factory_typename=typename, factory_version=version) - - -def _deserialize_classifier_factory_details( - classifier_factory: LazyflowClassifierFactoryTypeABC, pickle_str: str -) -> ClassifierFactoryInfo: - - if issubclass(classifier_factory, (VigraRfPixelwiseClassifierFactory, VigraRfLazyflowClassifierFactory)): - return _deserialize_legacy_VigraRfClassifierFactory(pickle_str) - - if issubclass(classifier_factory, ParallelVigraRfLazyflowClassifierFactory): - return _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_str) - - if issubclass(classifier_factory, SklearnLazyflowClassifierFactory): - return _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_str) - - raise ValueError(f"Don't know how to deserialize classifier of type {classifier_factory!r}") - - -@dataclass -class VigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): - args: List[int] - - @property - def instance(self) -> VigraRfLazyflowClassifierFactory: - return VigraRfLazyflowClassifierFactory(*self.args) - - -def _deserialize_legacy_VigraRfClassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: - """ - These classifier factories have only been used with a single arg - """ - classifier_factory_args_pickle_string_matcher = re.compile( - r""" - V_args\n - p\d+\n - (\(I)? 
- (?P(?<=\(I)\d+)\n # we _only_ expect one integer element in _args for this type - """, - re.X, - ) - - if m := classifier_factory_args_pickle_string_matcher.search(pickle_string): - arg = int(m.groupdict()["arg"]) - else: - raise ValueError( - f"Could not load VigraRfLazyflowClassifierFactory, no argument found not found in {pickle_string=}" - ) - - return VigraRfLazyflowClassifierFactoryInfo(args=[arg]) - - -@dataclass -class ParallelVigraRfLazyflowClassifierFactoryInfo(ClassifierFactoryInfo): - num_trees: int - label_proportion: Union[float, None] - variable_importance_path: Union[str, None] - variable_importance_enabled: bool - num_forests: int - - # ParallelVigraRfLazyflowClassifierFactory accepts additional kwargs, but we cannot deserialize arbitrary input. - # The parameters listed are all that we ever used in ilastik history. - @property - def instance(self) -> ParallelVigraRfLazyflowClassifierFactory: - return ParallelVigraRfLazyflowClassifierFactory( - num_trees_total=self.num_trees, - num_forests=self.num_forests, - variable_importance_path=self.variable_importance_path, - label_proportion=self.label_proportion, - variable_importance_enabled=self.variable_importance_enabled, - ) - - -def _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory( - pickle_string, -) -> ParallelVigraRfLazyflowClassifierFactoryInfo: - classifier_factory_num_trees_pickle_string_matcher = re.compile( - r""" - V_num_trees\n - p\d+\n - I(?P\d+)\n - """, - re.X, - ) - - if m := classifier_factory_num_trees_pickle_string_matcher.search(pickle_string): - num_trees = int(m.groupdict()["num_trees"]) - else: - raise ValueError( - f"Could not load ParallelVigraRfLazyflowClassifierFactory, _num_trees not found in {pickle_string=}" - ) - - # this can be None, otherwise float between 0.0 and 1.0 - classifier_factory_label_proportion_pickle_string_matcher = re.compile( - r""" - V_label_proportion\n - p\d+\n - F?(?P((?<=F)[01]\.\d+(?=\n))|N(?=s)) # positive lookbehind for float - """, 
- re.X, - ) - - if m := classifier_factory_label_proportion_pickle_string_matcher.search(pickle_string): - label_prop_string = m.groupdict()["label_proportion"] - label_proportion = None if label_prop_string == "N" else float(label_prop_string) - else: - raise ValueError( - f"Could not load ParallelVigraRfLazyflowClassifierFactory, _label_proportion not found in {pickle_string=}" - ) - - # this can be None, otherwise string (V) - classifier_factory_variable_importance_path_pickle_string_matcher = re.compile( - r""" - V_variable_importance_path\n - p\d+\n - V?(?P((?<=V).+(?=\n))|N(?=s)) # positive lookbehind for string - """, - re.X, - ) - - if m := classifier_factory_variable_importance_path_pickle_string_matcher.search(pickle_string): - variable_importance_pth_string = m.groupdict()["variable_importance_path"] - variable_importance_path = None if variable_importance_pth_string == "N" else variable_importance_pth_string - else: - raise ValueError( - f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_path not found in {pickle_string=}" - ) - - # will be a bool, either I00, or I01 - classifier_factory_variable_importance_enabled_pickle_string_matcher = re.compile( - r""" - V_variable_importance_enabled\n - p\d+\n - I(?P[01]{2})\n - """, - re.X, - ) - - if m := classifier_factory_variable_importance_enabled_pickle_string_matcher.search(pickle_string): - variable_importance_enabled = bool(int(m.groupdict()["variable_importance_enabled"])) - else: - raise ValueError( - f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_enabled not found in {pickle_string=}" - ) - - classifier_factory_num_forests_pickle_string_matcher = re.compile( - r""" - V_num_forests\n - p\d+\n - I(?P\d+)\n - """, - re.X, - ) - - if m := classifier_factory_num_forests_pickle_string_matcher.search(pickle_string): - num_forests = int(m.groupdict()["num_forests"]) - else: - raise ValueError( - f"Could not load 
ParallelVigraRfLazyflowClassifierFactory, _num_forests not found in {pickle_string=}" - ) - - return ParallelVigraRfLazyflowClassifierFactoryInfo( - num_trees=num_trees, - label_proportion=label_proportion, - variable_importance_path=variable_importance_path, - variable_importance_enabled=variable_importance_enabled, - num_forests=num_forests, - ) - - -SklearnClassifierType = Union[ - Type[AdaBoostClassifier], - Type[DecisionTreeClassifier], - Type[GaussianNB], - Type[KNeighborsClassifier], - Type[LinearDiscriminantAnalysis], - Type[NuSVC], - Type[QuadraticDiscriminantAnalysis], - Type[RandomForestClassifier], - Type[SVC], -] - - -@dataclass -class SklearnClassifierTypeInfo: - submodules: List[str] - typename: str - - @property - def classifier_type(self) -> SklearnClassifierType: - submodule = getattr(sklearn, self.submodules[0]) - for sm_name in self.submodules[1:]: - submodule = getattr(submodule, sm_name) - - classifier_type = getattr(submodule, self.typename) - return classifier_type - - -@dataclass -class SklearnClassifierFactoryInfo(ClassifierFactoryInfo): - classifier_type: SklearnClassifierType - args: List[int] - kwargs: Dict[str, Union[bool, int, float]] - - @property - def instance(self) -> LazyflowClassifierFactoryABC: - return SklearnLazyflowClassifierFactory(self.classifier_type, *self.args, **self.kwargs) - - -def _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClassifierFactoryInfo: - """ - _args : RandomForestClassifier, 100 | GaussianNB | AdaBoostClassifier | DecisionTreeClassifier | KNeighborsClassifier | LDA | QDA | SVC | NuSVC - _kwargs NONE | NONE | n_estimators=100 | max_depth=5 | NONE | N NONE | NONE | probability=True | probability=True - _classifier_type - - """ - classifier_factory_sklearn_type_pickle_string_matcher = re.compile( - """ - V_classifier_type\\n - p\\d+\\n - c - sklearn\\.(?P[\\w+\\.]+)\\n - (?P[\\w]+)\\n - """, - re.X, - ) - - sklearn_submodule_allow_list = [ - "discriminant_analysis", - 
"ensemble._forest", - "ensemble._weight_boosting", - "naive_bayes", - "neighbors._classification", - "svm._classes", - "tree._classes", - ] - - sklearn_classifier_allow_list = [ - "AdaBoostClassifier", - "DecisionTreeClassifier", - "GaussianNB", - "KNeighborsClassifier", - "LinearDiscriminantAnalysis", - "NuSVC", - "QuadraticDiscriminantAnalysis", - "RandomForestClassifier", - "SVC", - ] - - if m := classifier_factory_sklearn_type_pickle_string_matcher.search(pickle_string): - groupdict = m.groupdict() - submodules = groupdict["submodules"] - typename = groupdict["typename"] - - if submodules not in sklearn_submodule_allow_list or typename not in sklearn_classifier_allow_list: - raise ValueError(f"Classifier of type sklearn.{submodules}.{typename} not permitted.") - - else: - raise ValueError(f"Could not load classifier type {pickle_string=}") - - classifier_info = SklearnClassifierTypeInfo(submodules=submodules.split("."), typename=typename) - classifier_type = classifier_info.classifier_type - - return _deserialize_sklearn_classifier_details(classifier_type, pickle_string) - - -def _deserialize_sklearn_classifier_details( - classifier_type: SklearnClassifierType, pickle_str: str -) -> SklearnClassifierFactoryInfo: - if issubclass(classifier_type, RandomForestClassifier): - return _deserialize_sklearn_RandomForest_details(pickle_str) - - if issubclass(classifier_type, AdaBoostClassifier): - return _deserialize_sklearn_AdaBoostClassifier_details(pickle_str) - - if issubclass(classifier_type, DecisionTreeClassifier): - return _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str) - - if issubclass(classifier_type, (SVC, NuSVC)): - return _deserialize_sklearn_SVC_details(pickle_str, classifier_type) - - if issubclass( - classifier_type, - ( - GaussianNB, - KNeighborsClassifier, - LinearDiscriminantAnalysis, - QuadraticDiscriminantAnalysis, - ), - ): - return SklearnClassifierFactoryInfo(classifier_type=classifier_type, args=[], kwargs={}) - - -def 
_deserialize_sklearn_RandomForest_details(pickle_str: str) -> SklearnClassifierFactoryInfo: - classifier_factory_args_pickle_string_matcher = re.compile( - r""" - V_args\n - p\d+\n - \( - I(?P\d+)\n # we _only_ expect one integer element in _args for this type - """, - re.X, - ) - - if m := classifier_factory_args_pickle_string_matcher.search(pickle_str): - return SklearnClassifierFactoryInfo( - classifier_type=RandomForestClassifier, args=[int(m.groupdict()["arg"])], kwargs={} - ) - else: - raise ValueError("Could not deserialize sklearn RandomForest classifier.") - - -def _deserialize_sklearn_AdaBoostClassifier_details(pickle_str: str) -> SklearnClassifierFactoryInfo: - classifier_factory_n_estimators_pickle_string_matcher = re.compile( - r""" - Vn_estimators\n - p\d+\n - I(?P\d+)\n - """, - re.X, - ) - if m := classifier_factory_n_estimators_pickle_string_matcher.search(pickle_str): - return SklearnClassifierFactoryInfo( - classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": int(m.groupdict()["n_estimators"])} - ) - else: - raise ValueError("Could not deserialize sklearn AdaBoostClassifier.") - - -def _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str: str) -> SklearnClassifierFactoryInfo: - classifier_factory_max_depth_pickle_string_matcher = re.compile( - r""" - Vmax_depth\n - p\d+\n - I(?P\d+)\n - """, - re.X, - ) - if m := classifier_factory_max_depth_pickle_string_matcher.search(pickle_str): - return SklearnClassifierFactoryInfo( - classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": int(m.groupdict()["max_depth"])} - ) - else: - raise ValueError("Could not deserialize sklearn DecisionTreeClassifier") - - -def _deserialize_sklearn_SVC_details( - pickle_str: str, classifier_type: Union[Type[SVC], Type[NuSVC]] -) -> SklearnClassifierFactoryInfo: - classifier_factory_probability_pickle_string_matcher = re.compile( - r""" - Vprobability\n - p\d+\n - I(?P[01]{2})\n - """, - re.X, - ) - if m := 
classifier_factory_probability_pickle_string_matcher.search(pickle_str): - return SklearnClassifierFactoryInfo( - classifier_type=classifier_type, args=[], kwargs={"probability": int(m.groupdict()["probability"]) != 0} - ) - else: - raise ValueError("Could not deserialize sklearn SVC/NuSVC classifier.") diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index 415197b754..33f90667bd 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -33,19 +33,23 @@ from ilastik import Project from ilastik.utility.commandLineProcessing import convertStringToList from ilastik.utility.maybe import maybe +from lazyflow.operators.valueProviders import OpValueCache from lazyflow.roi import TinyVector, roiToSlice, sliceToRoi from lazyflow.slot import OutputSlot, Slot from lazyflow.utility import timeLogged from . import jsonSerializerRegistry +from .legacyClassifiers import ( + deserialize_legacy_classifier_type, + deserialize_legacy_classifier_factory, +) from .serializerUtils import ( deleteIfPresent, slicingToString, stringToSlicing, - deserialize_legacy_classifier_type_info, - deserialize_legacy_classifier_factory, ) + logger = logging.getLogger(__name__) @@ -607,7 +611,7 @@ def extract_index(s): class SerialClassifierSlot(SerialSlot): """For saving a classifier. 
Here we assume the classifier is stored in the .""" - def __init__(self, slot, cache, inslot=None, name=None, default=None, depends=None, selfdepends=True): + def __init__(self, slot, cache: OpValueCache, inslot=None, name=None, default=None, depends=None, selfdepends=True): super(SerialClassifierSlot, self).__init__(slot, inslot, name, None, default, depends, selfdepends) self.cache = cache if self.name is None: @@ -642,13 +646,19 @@ def deserialize(self, group): def _deserialize(self, classifierGroup, slot): try: - classifier_type = deserialize_legacy_classifier_type_info(classifierGroup["pickled_type"]).classifier_type + classifier_type = deserialize_legacy_classifier_type(classifierGroup["pickled_type"]) except KeyError: # For compatibility with old project files, choose the default classifier. from lazyflow.classifiers import ParallelVigraRfLazyflowClassifier classifier_type = ParallelVigraRfLazyflowClassifier + except ValueError: + warnings.warn( + "Unexpected classifier found in project file - cannot deserialize - classifier will need to be retrained." 
+ ) + return + try: classifier = classifier_type.deserialize_hdf5(classifierGroup) except: @@ -668,7 +678,7 @@ def _deserialize(self, classifierGroup, slot): class SerialCountingSlot(SerialSlot): """For saving a random forest classifier.""" - def __init__(self, slot, cache, inslot=None, name=None, default=None, depends=None, selfdepends=True): + def __init__(self, slot, cache: OpValueCache, inslot=None, name=None, default=None, depends=None, selfdepends=True): super(SerialCountingSlot, self).__init__(slot, inslot, name, "wrapper{:04d}", default, depends, selfdepends) self.cache = cache if self.name is None: diff --git a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py new file mode 100644 index 0000000000..524f18e976 --- /dev/null +++ b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py @@ -0,0 +1,463 @@ +############################################################################### +# ilastik: interactive learning and segmentation toolkit +# +# Copyright (C) 2011-2024, the ilastik developers +# +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# In addition, as a special exception, the copyright holders of +# ilastik give you permission to combine ilastik with applets, +# workflows and plugins which are not covered under the GNU +# General Public License. +# +# See the LICENSE file for details. 
License information is also available +# on the ilastik web site at: +# http://ilastik.org/license.html +############################################################################### +import pickle + +import h5py +import pytest +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC, NuSVC +from sklearn.tree import DecisionTreeClassifier + +from ilastik.applets.base.appletSerializer.legacyClassifiers import ( + ClassifierFactoryTypeInfo, + SklearnClassifierFactoryInfo, + _deserialize_classifier_factory_details, + _deserialize_legacy_classifier_factory_type_info, + _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory, + _deserialize_legacy_SklearnLazyflowClassifierFactory, + _deserialize_legacy_VigraRfClassifierFactory, + _deserialize_sklearn_classifier_details, + deserialize_legacy_classifier_factory, + deserialize_legacy_classifier_type, +) +from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory +from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory +from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifier, VigraRfLazyflowClassifierFactory + + +def test_deserialize_classifier(empty_in_memory_project_file: h5py.File): + classifier_bytes = b"clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifier\np0\n." 
+ ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) + + classifier = deserialize_legacy_classifier_type(ds) + + assert issubclass(classifier, VigraRfLazyflowClassifier) + + +@pytest.mark.parametrize( + "classifier_bytes", + [ + b"clazyflow.class.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", + b"csome.other_submodule.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", + b"clazyflow.classifiers.sneakyVigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", + b"clazyflow.classifiers.vigraRfLazyflowClassifier\nSneakyVigraRfLazyflowClassifierFactory\np0\n.", + b"random.", + ], +) +def test_deserialize_classifier_raises(empty_in_memory_project_file: h5py.File, classifier_bytes: bytes): + ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) + with pytest.raises(ValueError): + _ = deserialize_legacy_classifier_type(ds) + + +@pytest.mark.parametrize( + "classifier_type, c_args, c_kwargs, expected_info", + [ + ( + AdaBoostClassifier, + [], + {"n_estimators": 257}, + SklearnClassifierFactoryInfo(classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": 257}), + ), + ( + DecisionTreeClassifier, + [], + {"max_depth": 257}, + SklearnClassifierFactoryInfo(classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": 257}), + ), + (GaussianNB, [], {}, SklearnClassifierFactoryInfo(classifier_type=GaussianNB, args=[], kwargs={})), + ( + KNeighborsClassifier, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=KNeighborsClassifier, args=[], kwargs={}), + ), + ( + LinearDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=LinearDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + QuadraticDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=QuadraticDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + RandomForestClassifier, + [143], + {}, + 
SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[143], kwargs={}), + ), + ( + SVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=SVC, args=[], kwargs={"probability": False}), + ), + ( + NuSVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=NuSVC, args=[], kwargs={"probability": False}), + ), + ], +) +def test_sklearn_lazyflow_classifier_pickled_deserialization( + classifier_type, c_args, c_kwargs, expected_info: SklearnClassifierFactoryInfo +): + pickled_classifier = pickle.dumps( + SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 + ).decode() + deserialized_info = _deserialize_sklearn_classifier_details(classifier_type, pickled_classifier) + assert deserialized_info == expected_info + + +@pytest.mark.parametrize( + "classifier_type", + [ + AdaBoostClassifier, + DecisionTreeClassifier, + RandomForestClassifier, + SVC, + NuSVC, + ], +) +def test_sklearn_lazyflow_classifier_pickled_deserialization_raises( + classifier_type, +): + with pytest.raises(ValueError, match="Could not deserialize"): + _ = _deserialize_sklearn_classifier_details(classifier_type, "someRandomString") + + +@pytest.mark.parametrize( + "classifier_type, c_args, c_kwargs, expected_info", + [ + ( + AdaBoostClassifier, + [], + {"n_estimators": 257}, + SklearnClassifierFactoryInfo(classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": 257}), + ), + ( + DecisionTreeClassifier, + [], + {"max_depth": 257}, + SklearnClassifierFactoryInfo(classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": 257}), + ), + (GaussianNB, [], {}, SklearnClassifierFactoryInfo(classifier_type=GaussianNB, args=[], kwargs={})), + ( + KNeighborsClassifier, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=KNeighborsClassifier, args=[], kwargs={}), + ), + ( + LinearDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=LinearDiscriminantAnalysis, args=[], 
kwargs={}), + ), + ( + QuadraticDiscriminantAnalysis, + [], + {}, + SklearnClassifierFactoryInfo(classifier_type=QuadraticDiscriminantAnalysis, args=[], kwargs={}), + ), + ( + RandomForestClassifier, + [143], + {}, + SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[143], kwargs={}), + ), + ( + SVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=SVC, args=[], kwargs={"probability": False}), + ), + ( + NuSVC, + [], + {"probability": False}, + SklearnClassifierFactoryInfo(classifier_type=NuSVC, args=[], kwargs={"probability": False}), + ), + ], +) +def test_deserialize_legacy_SklearnLazyflowClassifierFactory(classifier_type, c_args, c_kwargs, expected_info): + assert True + pickled_classifier = pickle.dumps( + SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 + ).decode() + classifier_factory_info = _deserialize_legacy_SklearnLazyflowClassifierFactory(pickled_classifier) + + assert classifier_factory_info == expected_info + + +@pytest.mark.parametrize( + "pickle_string", + [ + "V_classifier_type\np0\ncsklearn.some.submodules.not.in.list\nAdaBoostClassifier\n", + "V_classifier_type\np0\ncsklearn.neighbors._classification\nMyMeanClassifier42\n", + "someRandomString", + ], +) +def test_deserialize_legacy_SklearnLazyflowClassifierFactory_raises( + pickle_string, +): + with pytest.raises(ValueError): + _ = _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(42, None, None, None, False), + ParallelVigraRfLazyflowClassifierFactory(43, 2, None, None, False), + ParallelVigraRfLazyflowClassifierFactory(44, None, "test_variable_importance_path", None, True), + ParallelVigraRfLazyflowClassifierFactory(45, None, None, 0.33, False), + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + ], +) +def 
test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(classifier_factory): + info = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) + + assert info.instance == classifier_factory + + +@pytest.mark.parametrize( + "pickle_string", + [ + "something_funny", + # wrong value types will raise: + "V_num_trees\np7\nF46\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nI300\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nF3.0\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\nI1\ns", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\nI01\nsV_num_forests\np14\nF89\n", + # missing values will raise: + "V_num_trees\np7\nF46\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\n\n", + "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\ns", + ], +) +def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory_raises(pickle_string): + with pytest.raises(ValueError): + _ = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_string) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + VigraRfLazyflowClassifierFactory(30), + VigraRfLazyflowClassifierFactory(42), + ], +) +def test_deserialize_legacy_VigraRflassifierFactory(classifier_factory): + info = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) + + assert info.instance == classifier_factory + + +def test_deserialize_legacy_VigraRflassifierFactory_raises(): + with pytest.raises(ValueError): + _ = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 
0).decode()) + + +@pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + VigraRfLazyflowClassifierFactory(42), + ], +) +def test_deserialize_classifier_factory_details(classifier_factory): + info = _deserialize_classifier_factory_details( + type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() + ) + + assert info.instance == classifier_factory + + +class MyTestClassifierFactory: + def __init__(self, *args, **kwargs): + pass + + +def test_deserialize_classifier_factory_details_raises(): + + classifier_factory = MyTestClassifierFactory(42) + + with pytest.raises(ValueError): + _ = _deserialize_classifier_factory_details( + type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() # type: ignore[reportArgumentType] + ) + + +@pytest.mark.parametrize( + "classifier_factory, expected_info", + [ + ( + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + ClassifierFactoryTypeInfo( + factory_submodule="parallelVigraRfLazyflowClassifier", + factory_typename="ParallelVigraRfLazyflowClassifierFactory", + factory_version=ParallelVigraRfLazyflowClassifierFactory.VERSION, + ), + ), + ( + 
SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + 
SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + ClassifierFactoryTypeInfo( + factory_submodule="sklearnLazyflowClassifier", + factory_typename="SklearnLazyflowClassifierFactory", + factory_version=SklearnLazyflowClassifierFactory.VERSION, + ), + ), + ( + VigraRfLazyflowClassifierFactory(42), + ClassifierFactoryTypeInfo( + factory_submodule="vigraRfLazyflowClassifier", + factory_typename="VigraRfLazyflowClassifierFactory", + factory_version=VigraRfLazyflowClassifierFactory.VERSION, + ), + ), + ], +) +def test_deserialize_legacy_classifier_factory_type_info(classifier_factory, expected_info): + info = _deserialize_legacy_classifier_factory_type_info(pickle.dumps(classifier_factory, 0).decode()) + assert info == expected_info + + +@pytest.mark.parametrize( + "pickle_string", + [ + "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nMyPhantasyFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", + "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.someothermodule\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", + "ccopy_reg\n_reconstructor\np0\n(\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", + "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nsV_args", + ], +) +def test_deserialize_legacy_classifier_factory_type_info_raises(pickle_string): + with pytest.raises(ValueError): + _ = _deserialize_legacy_classifier_factory_type_info(pickle_string) + + 
+@pytest.mark.parametrize( + "classifier_factory", + [ + ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), + SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), + SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), + SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), + SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), + SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), + SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), + SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), + SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), + VigraRfLazyflowClassifierFactory(42), + ], +) +def test_deserialize_legacy_classifier_factory(empty_in_memory_project_file, classifier_factory): + + ds = empty_in_memory_project_file.create_dataset(name="classifier", data=pickle.dumps(classifier_factory, 0)) + + factory = deserialize_legacy_classifier_factory(ds) + + assert factory == classifier_factory diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py index b870c9f556..23a159600b 100644 --- a/tests/test_ilastik/test_applets/base/test_serializerUtils.py +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -1,34 +1,34 @@ -import pickle +############################################################################### +# ilastik: interactive learning and segmentation toolkit +# +# Copyright (C) 2011-2024, the ilastik developers +# +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# In addition, as a special exception, the copyright holders of +# ilastik give you permission to combine ilastik with applets, +# workflows and plugins which are not covered under the GNU +# General Public License. +# +# See the LICENSE file for details. License information is also available +# on the ilastik web site at: +# http://ilastik.org/license.html +############################################################################### from typing import Sequence, Tuple, Union import h5py import pytest -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis -from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier -from sklearn.naive_bayes import GaussianNB -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC, NuSVC -from sklearn.tree import DecisionTreeClassifier from ilastik.applets.base.appletSerializer.serializerUtils import ( - ClassifierFactoryTypeInfo, - SklearnClassifierFactoryInfo, - _deserialize_classifier_factory_details, - _deserialize_legacy_classifier_factory_type_info, - _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory, - _deserialize_legacy_SklearnLazyflowClassifierFactory, - _deserialize_legacy_VigraRfClassifierFactory, - _deserialize_sklearn_classifier_details, deleteIfPresent, - deserialize_legacy_classifier_factory, - deserialize_legacy_classifier_type_info, deserialize_string_from_h5, slicingToString, stringToSlicing, ) -from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory -from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory -from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifier, VigraRfLazyflowClassifierFactory def test_deleteIfPresent_present(empty_in_memory_project_file: h5py.File): @@ -128,425 +128,3 @@ def test_deserialize_string_from_h5(empty_in_memory_project_file: h5py.File): ds = 
empty_in_memory_project_file.create_dataset("test", data=test_string.encode("utf-8")) assert deserialize_string_from_h5(ds) == test_string - - -def test_deserialize_classifier(empty_in_memory_project_file: h5py.File): - classifier_bytes = b"clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifier\np0\n." - expected_submodule = "vigraRfLazyflowClassifier" - expected_type = "VigraRfLazyflowClassifier" - ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) - - cl_info = deserialize_legacy_classifier_type_info(ds) - - assert cl_info.submodule_name == expected_submodule - assert cl_info.type_name == expected_type - - assert issubclass(cl_info.classifier_type, VigraRfLazyflowClassifier) - - -@pytest.mark.parametrize( - "classifier_bytes", - [ - b"clazyflow.class.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", - b"csome.other_submodule.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", - b"clazyflow.classifiers.sneakyVigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np0\n.", - b"clazyflow.classifiers.vigraRfLazyflowClassifier\nSneakyVigraRfLazyflowClassifierFactory\np0\n.", - b"random.", - ], -) -def test_deserialize_classifier_raises(empty_in_memory_project_file: h5py.File, classifier_bytes: bytes): - ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) - with pytest.raises(ValueError): - _ = deserialize_legacy_classifier_type_info(ds) - - -@pytest.mark.parametrize( - "classifier_type, c_args, c_kwargs, expected_info", - [ - ( - AdaBoostClassifier, - [], - {"n_estimators": 257}, - SklearnClassifierFactoryInfo(classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": 257}), - ), - ( - DecisionTreeClassifier, - [], - {"max_depth": 257}, - SklearnClassifierFactoryInfo(classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": 257}), - ), - (GaussianNB, [], {}, 
SklearnClassifierFactoryInfo(classifier_type=GaussianNB, args=[], kwargs={})), - ( - KNeighborsClassifier, - [], - {}, - SklearnClassifierFactoryInfo(classifier_type=KNeighborsClassifier, args=[], kwargs={}), - ), - ( - LinearDiscriminantAnalysis, - [], - {}, - SklearnClassifierFactoryInfo(classifier_type=LinearDiscriminantAnalysis, args=[], kwargs={}), - ), - ( - QuadraticDiscriminantAnalysis, - [], - {}, - SklearnClassifierFactoryInfo(classifier_type=QuadraticDiscriminantAnalysis, args=[], kwargs={}), - ), - ( - RandomForestClassifier, - [143], - {}, - SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[143], kwargs={}), - ), - ( - SVC, - [], - {"probability": False}, - SklearnClassifierFactoryInfo(classifier_type=SVC, args=[], kwargs={"probability": False}), - ), - ( - NuSVC, - [], - {"probability": False}, - SklearnClassifierFactoryInfo(classifier_type=NuSVC, args=[], kwargs={"probability": False}), - ), - ], -) -def test_sklearn_lazyflow_classifier_pickled_deserialization( - classifier_type, c_args, c_kwargs, expected_info: SklearnClassifierFactoryInfo -): - pickled_classifier = pickle.dumps( - SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 - ).decode() - deserialized_info = _deserialize_sklearn_classifier_details(classifier_type, pickled_classifier) - assert deserialized_info == expected_info - - -@pytest.mark.parametrize( - "classifier_type", - [ - AdaBoostClassifier, - DecisionTreeClassifier, - RandomForestClassifier, - SVC, - NuSVC, - ], -) -def test_sklearn_lazyflow_classifier_pickled_deserialization_raises( - classifier_type, -): - with pytest.raises(ValueError, match="Could not deserialize"): - _ = _deserialize_sklearn_classifier_details(classifier_type, "someRandomString") - - -@pytest.mark.parametrize( - "classifier_type, c_args, c_kwargs, expected_info", - [ - ( - AdaBoostClassifier, - [], - {"n_estimators": 257}, - SklearnClassifierFactoryInfo(classifier_type=AdaBoostClassifier, args=[], 
kwargs={"n_estimators": 257}), - ), - ( - DecisionTreeClassifier, - [], - {"max_depth": 257}, - SklearnClassifierFactoryInfo(classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": 257}), - ), - (GaussianNB, [], {}, SklearnClassifierFactoryInfo(classifier_type=GaussianNB, args=[], kwargs={})), - ( - KNeighborsClassifier, - [], - {}, - SklearnClassifierFactoryInfo(classifier_type=KNeighborsClassifier, args=[], kwargs={}), - ), - ( - LinearDiscriminantAnalysis, - [], - {}, - SklearnClassifierFactoryInfo(classifier_type=LinearDiscriminantAnalysis, args=[], kwargs={}), - ), - ( - QuadraticDiscriminantAnalysis, - [], - {}, - SklearnClassifierFactoryInfo(classifier_type=QuadraticDiscriminantAnalysis, args=[], kwargs={}), - ), - ( - RandomForestClassifier, - [143], - {}, - SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[143], kwargs={}), - ), - ( - SVC, - [], - {"probability": False}, - SklearnClassifierFactoryInfo(classifier_type=SVC, args=[], kwargs={"probability": False}), - ), - ( - NuSVC, - [], - {"probability": False}, - SklearnClassifierFactoryInfo(classifier_type=NuSVC, args=[], kwargs={"probability": False}), - ), - ], -) -def test_deserialize_legacy_SklearnLazyflowClassifierFactory(classifier_type, c_args, c_kwargs, expected_info): - assert True - pickled_classifier = pickle.dumps( - SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 - ).decode() - classifier_factory_info = _deserialize_legacy_SklearnLazyflowClassifierFactory(pickled_classifier) - - assert classifier_factory_info == expected_info - - -@pytest.mark.parametrize( - "pickle_string", - [ - "V_classifier_type\np0\ncsklearn.some.submodules.not.in.list\nAdaBoostClassifier\n", - "V_classifier_type\np0\ncsklearn.neighbors._classification\nMyMeanClassifier42\n", - "someRandomString", - ], -) -def test_deserialize_legacy_SklearnLazyflowClassifierFactory_raises( - pickle_string, -): - with pytest.raises(ValueError): - _ = 
_deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) - - -@pytest.mark.parametrize( - "classifier_factory", - [ - ParallelVigraRfLazyflowClassifierFactory(42, None, None, None, False), - ParallelVigraRfLazyflowClassifierFactory(43, 2, None, None, False), - ParallelVigraRfLazyflowClassifierFactory(44, None, "test_variable_importance_path", None, True), - ParallelVigraRfLazyflowClassifierFactory(45, None, None, 0.33, False), - ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), - ], -) -def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(classifier_factory): - info = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) - - assert info.instance == classifier_factory - - -@pytest.mark.parametrize( - "pickle_string", - [ - "something_funny", - # wrong value types will raise: - "V_num_trees\np7\nF46\n", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\nI300\n", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nF3.0\n", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\nI1\ns", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\nI01\nsV_num_forests\np14\nF89\n", - # missing values will raise: - "V_num_trees\np7\nF46\n", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\n", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\n\n", - "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\ns", - ], -) -def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory_raises(pickle_string): - with pytest.raises(ValueError): - _ = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_string) - - -@pytest.mark.parametrize( 
- "classifier_factory", - [ - VigraRfLazyflowClassifierFactory(30), - VigraRfLazyflowClassifierFactory(42), - ], -) -def test_deserialize_legacy_VigraRflassifierFactory(classifier_factory): - info = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) - - assert info.instance == classifier_factory - - -def test_deserialize_legacy_VigraRflassifierFactory_raises(): - with pytest.raises(ValueError): - _ = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 0).decode()) - - -@pytest.mark.parametrize( - "classifier_factory", - [ - ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), - SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), - SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), - SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), - SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), - SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), - SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), - SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), - SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), - SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), - VigraRfLazyflowClassifierFactory(42), - ], -) -def test_deserialize_classifier_factory_details(classifier_factory): - info = _deserialize_classifier_factory_details( - type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() - ) - - assert info.instance == classifier_factory - - -class MyTestClassifierFactory: - def __init__(self, *args, **kwargs): - pass - - -def test_deserialize_classifier_factory_details_raises(): - - classifier_factory = MyTestClassifierFactory(42) - - with pytest.raises(ValueError): - _ = _deserialize_classifier_factory_details( - type(classifier_factory), 
pickle.dumps(classifier_factory, 0).decode() # type: ignore[reportArgumentType] - ) - - -@pytest.mark.parametrize( - "classifier_factory, expected_info", - [ - ( - ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), - ClassifierFactoryTypeInfo( - factory_submodule="parallelVigraRfLazyflowClassifier", - factory_typename="ParallelVigraRfLazyflowClassifierFactory", - factory_version=ParallelVigraRfLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), - 
ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), - ClassifierFactoryTypeInfo( - factory_submodule="sklearnLazyflowClassifier", - factory_typename="SklearnLazyflowClassifierFactory", - factory_version=SklearnLazyflowClassifierFactory.VERSION, - ), - ), - ( - VigraRfLazyflowClassifierFactory(42), - ClassifierFactoryTypeInfo( - factory_submodule="vigraRfLazyflowClassifier", - factory_typename="VigraRfLazyflowClassifierFactory", - factory_version=VigraRfLazyflowClassifierFactory.VERSION, - ), - ), - ], -) -def test_deserialize_legacy_classifier_factory_type_info(classifier_factory, expected_info): - info = _deserialize_legacy_classifier_factory_type_info(pickle.dumps(classifier_factory, 0).decode()) - assert info == expected_info - - -@pytest.mark.parametrize( - "pickle_string", - [ - "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nMyPhantasyFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", - "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.someothermodule\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", - 
"ccopy_reg\n_reconstructor\np0\n(\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nVVERSION\np6\nI1\nsV_args", - "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nsV_args", - ], -) -def test_deserialize_legacy_classifier_factory_type_info_raises(pickle_string): - with pytest.raises(ValueError): - _ = _deserialize_legacy_classifier_factory_type_info(pickle_string) - - -@pytest.mark.parametrize( - "classifier_factory", - [ - ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), - SklearnLazyflowClassifierFactory(RandomForestClassifier, 143), - SklearnLazyflowClassifierFactory(classifier_type=AdaBoostClassifier, n_estimators=257), - SklearnLazyflowClassifierFactory(classifier_type=DecisionTreeClassifier, max_depth=257), - SklearnLazyflowClassifierFactory(classifier_type=GaussianNB), - SklearnLazyflowClassifierFactory(classifier_type=KNeighborsClassifier), - SklearnLazyflowClassifierFactory(classifier_type=LinearDiscriminantAnalysis), - SklearnLazyflowClassifierFactory(classifier_type=NuSVC, probability=False), - SklearnLazyflowClassifierFactory(classifier_type=QuadraticDiscriminantAnalysis), - SklearnLazyflowClassifierFactory(classifier_type=SVC, probability=False), - VigraRfLazyflowClassifierFactory(42), - ], -) -def test_deserialize_legacy_classifier_factory(empty_in_memory_project_file, classifier_factory): - - ds = empty_in_memory_project_file.create_dataset(name="classifier", data=pickle.dumps(classifier_factory, 0)) - - factory = deserialize_legacy_classifier_factory(ds) - - assert factory == classifier_factory From bcbb3534a1a953107b40d3097cba845cef785ca9 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Fri, 26 Apr 2024 14:34:23 +0200 Subject: [PATCH 10/13] rename: remove legacy term in functions Co-authored-by: Benedikt Best <63287233+btbest@users.noreply.github.com> --- 
.../appletSerializer/legacyClassifiers.py | 20 ++++---- .../base/appletSerializer/slotSerializer.py | 8 ++-- .../base/test_legacyClassifierDeserialiers.py | 48 +++++++++---------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/legacyClassifiers.py b/ilastik/applets/base/appletSerializer/legacyClassifiers.py index b8ff632f61..e915716da5 100644 --- a/ilastik/applets/base/appletSerializer/legacyClassifiers.py +++ b/ilastik/applets/base/appletSerializer/legacyClassifiers.py @@ -85,7 +85,7 @@ def classifier_type(self) -> LazyflowClassifierTypeABC: return classifier_type -def deserialize_legacy_classifier_type(ds: h5py.Dataset) -> LazyflowClassifierTypeABC: +def deserialize_classifier_type(ds: h5py.Dataset) -> LazyflowClassifierTypeABC: """Legacy helper for classifier type_info deserialization in order to avoid unpickling, the protocol0-style pickle string is @@ -161,9 +161,9 @@ class ClassifierFactoryInfo(ABC): def instance(self) -> LazyflowClassifierFactoryABC: ... 
-def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABC: +def deserialize_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABC: pickle_string: str = deserialize_string_from_h5(ds) - classifier_factory_info = _deserialize_legacy_classifier_factory_type_info(pickle_string) + classifier_factory_info = _deserialize_classifier_factory_type_info(pickle_string) classifier_factory_type = classifier_factory_info.classifier_factory_type classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) @@ -172,7 +172,7 @@ def deserialize_legacy_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifie return factory_instance -def _deserialize_legacy_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: +def _deserialize_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: """Legacy helper for classifier type_info deserialization in order to avoid unpickling, the protocol0-style pickle string is @@ -235,13 +235,13 @@ def _deserialize_classifier_factory_details( ) -> ClassifierFactoryInfo: if issubclass(classifier_factory, (VigraRfPixelwiseClassifierFactory, VigraRfLazyflowClassifierFactory)): - return _deserialize_legacy_VigraRfClassifierFactory(pickle_str) + return _deserialize_VigraRfClassifierFactory(pickle_str) if issubclass(classifier_factory, ParallelVigraRfLazyflowClassifierFactory): - return _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_str) + return _deserialize_ParallelVigraRfLazyflowClassifierFactory(pickle_str) if issubclass(classifier_factory, SklearnLazyflowClassifierFactory): - return _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_str) + return _deserialize_SklearnLazyflowClassifierFactory(pickle_str) raise ValueError(f"Don't know how to deserialize classifier of type {classifier_factory!r}") @@ -255,7 +255,7 @@ def instance(self) -> VigraRfLazyflowClassifierFactory: return 
VigraRfLazyflowClassifierFactory(*self.args) -def _deserialize_legacy_VigraRfClassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: +def _deserialize_VigraRfClassifierFactory(pickle_string: str) -> VigraRfLazyflowClassifierFactoryInfo: """ These classifier factories have only been used with a single arg """ @@ -300,7 +300,7 @@ def instance(self) -> ParallelVigraRfLazyflowClassifierFactory: ) -def _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory( +def _deserialize_ParallelVigraRfLazyflowClassifierFactory( pickle_string, ) -> ParallelVigraRfLazyflowClassifierFactoryInfo: classifier_factory_num_trees_pickle_string_matcher = re.compile( @@ -436,7 +436,7 @@ def instance(self) -> LazyflowClassifierFactoryABC: return SklearnLazyflowClassifierFactory(self.classifier_type, *self.args, **self.kwargs) -def _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClassifierFactoryInfo: +def _deserialize_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClassifierFactoryInfo: """ _args : RandomForestClassifier, 100 | GaussianNB | AdaBoostClassifier | DecisionTreeClassifier | KNeighborsClassifier | LDA | QDA | SVC | NuSVC _kwargs NONE | NONE | n_estimators=100 | max_depth=5 | NONE | N NONE | NONE | probability=True | probability=True diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index 33f90667bd..9c96ad9eed 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -40,8 +40,8 @@ from . 
import jsonSerializerRegistry from .legacyClassifiers import ( - deserialize_legacy_classifier_type, - deserialize_legacy_classifier_factory, + deserialize_classifier_type, + deserialize_classifier_factory, ) from .serializerUtils import ( deleteIfPresent, @@ -646,7 +646,7 @@ def deserialize(self, group): def _deserialize(self, classifierGroup, slot): try: - classifier_type = deserialize_legacy_classifier_type(classifierGroup["pickled_type"]) + classifier_type = deserialize_classifier_type(classifierGroup["pickled_type"]) except KeyError: # For compatibility with old project files, choose the default classifier. from lazyflow.classifiers import ParallelVigraRfLazyflowClassifier @@ -854,7 +854,7 @@ def shouldSerialize(self, group): def _getValue(self, dset, slot): try: - value = deserialize_legacy_classifier_factory(dset) + value = deserialize_classifier_factory(dset) except ValueError: self._failed_to_deserialize = True diff --git a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py index 524f18e976..0a15dc06ee 100644 --- a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py +++ b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py @@ -33,13 +33,13 @@ ClassifierFactoryTypeInfo, SklearnClassifierFactoryInfo, _deserialize_classifier_factory_details, - _deserialize_legacy_classifier_factory_type_info, - _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory, - _deserialize_legacy_SklearnLazyflowClassifierFactory, - _deserialize_legacy_VigraRfClassifierFactory, + _deserialize_classifier_factory_type_info, + _deserialize_ParallelVigraRfLazyflowClassifierFactory, + _deserialize_SklearnLazyflowClassifierFactory, + _deserialize_VigraRfClassifierFactory, _deserialize_sklearn_classifier_details, - deserialize_legacy_classifier_factory, - deserialize_legacy_classifier_type, + deserialize_classifier_factory, + 
deserialize_classifier_type, ) from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory @@ -50,7 +50,7 @@ def test_deserialize_classifier(empty_in_memory_project_file: h5py.File): classifier_bytes = b"clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifier\np0\n." ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) - classifier = deserialize_legacy_classifier_type(ds) + classifier = deserialize_classifier_type(ds) assert issubclass(classifier, VigraRfLazyflowClassifier) @@ -68,7 +68,7 @@ def test_deserialize_classifier(empty_in_memory_project_file: h5py.File): def test_deserialize_classifier_raises(empty_in_memory_project_file: h5py.File, classifier_bytes: bytes): ds = empty_in_memory_project_file.create_dataset("classifier_type", data=classifier_bytes) with pytest.raises(ValueError): - _ = deserialize_legacy_classifier_type(ds) + _ = deserialize_classifier_type(ds) @pytest.mark.parametrize( @@ -206,12 +206,12 @@ def test_sklearn_lazyflow_classifier_pickled_deserialization_raises( ), ], ) -def test_deserialize_legacy_SklearnLazyflowClassifierFactory(classifier_type, c_args, c_kwargs, expected_info): +def test_deserialize_SklearnLazyflowClassifierFactory(classifier_type, c_args, c_kwargs, expected_info): assert True pickled_classifier = pickle.dumps( SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 ).decode() - classifier_factory_info = _deserialize_legacy_SklearnLazyflowClassifierFactory(pickled_classifier) + classifier_factory_info = _deserialize_SklearnLazyflowClassifierFactory(pickled_classifier) assert classifier_factory_info == expected_info @@ -224,11 +224,11 @@ def test_deserialize_legacy_SklearnLazyflowClassifierFactory(classifier_type, c_ "someRandomString", ], ) -def test_deserialize_legacy_SklearnLazyflowClassifierFactory_raises( 
+def test_deserialize_SklearnLazyflowClassifierFactory_raises( pickle_string, ): with pytest.raises(ValueError): - _ = _deserialize_legacy_SklearnLazyflowClassifierFactory(pickle_string) + _ = _deserialize_SklearnLazyflowClassifierFactory(pickle_string) @pytest.mark.parametrize( @@ -241,8 +241,8 @@ def test_deserialize_legacy_SklearnLazyflowClassifierFactory_raises( ParallelVigraRfLazyflowClassifierFactory(46, 89, "VVmyfunnyteststringVV", 1.0, True), ], ) -def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(classifier_factory): - info = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) +def test_deserialize_ParallelVigraRfLazyflowClassifierFactory(classifier_factory): + info = _deserialize_ParallelVigraRfLazyflowClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) assert info.instance == classifier_factory @@ -264,9 +264,9 @@ def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(classifier_ "V_num_trees\np7\nI46\nsV_label_proportion\np8\nF1.0\nsV_variable_importance_path\np9\nVtest\np10\nsV_variable_importance_enabled\np11\ns", ], ) -def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory_raises(pickle_string): +def test_deserialize_ParallelVigraRfLazyflowClassifierFactory_raises(pickle_string): with pytest.raises(ValueError): - _ = _deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory(pickle_string) + _ = _deserialize_ParallelVigraRfLazyflowClassifierFactory(pickle_string) @pytest.mark.parametrize( @@ -277,14 +277,14 @@ def test_deserialize_legacy_ParallelVigraRfLazyflowClassifierFactory_raises(pick ], ) def test_deserialize_legacy_VigraRflassifierFactory(classifier_factory): - info = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) + info = _deserialize_VigraRfClassifierFactory(pickle.dumps(classifier_factory, 0).decode()) assert info.instance == classifier_factory def 
test_deserialize_legacy_VigraRflassifierFactory_raises(): with pytest.raises(ValueError): - _ = _deserialize_legacy_VigraRfClassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 0).decode()) + _ = _deserialize_VigraRfClassifierFactory(pickle.dumps(VigraRfLazyflowClassifierFactory(), 0).decode()) @pytest.mark.parametrize( @@ -419,8 +419,8 @@ def test_deserialize_classifier_factory_details_raises(): ), ], ) -def test_deserialize_legacy_classifier_factory_type_info(classifier_factory, expected_info): - info = _deserialize_legacy_classifier_factory_type_info(pickle.dumps(classifier_factory, 0).decode()) +def test_deserialize_classifier_factory_type_info(classifier_factory, expected_info): + info = _deserialize_classifier_factory_type_info(pickle.dumps(classifier_factory, 0).decode()) assert info == expected_info @@ -433,9 +433,9 @@ def test_deserialize_legacy_classifier_factory_type_info(classifier_factory, exp "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nsV_args", ], ) -def test_deserialize_legacy_classifier_factory_type_info_raises(pickle_string): +def test_deserialize_classifier_factory_type_info_raises(pickle_string): with pytest.raises(ValueError): - _ = _deserialize_legacy_classifier_factory_type_info(pickle_string) + _ = _deserialize_classifier_factory_type_info(pickle_string) @pytest.mark.parametrize( @@ -454,10 +454,10 @@ def test_deserialize_legacy_classifier_factory_type_info_raises(pickle_string): VigraRfLazyflowClassifierFactory(42), ], ) -def test_deserialize_legacy_classifier_factory(empty_in_memory_project_file, classifier_factory): +def test_deserialize_classifier_factory(empty_in_memory_project_file, classifier_factory): ds = empty_in_memory_project_file.create_dataset(name="classifier", data=pickle.dumps(classifier_factory, 0)) - factory = deserialize_legacy_classifier_factory(ds) + factory = 
deserialize_classifier_factory(ds) assert factory == classifier_factory From b8e3761b4202c2ccc2f402936e2d696430d3ec80 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Fri, 26 Apr 2024 15:49:18 +0200 Subject: [PATCH 11/13] rename: remove return type from some functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit also added earlier failing of classifier factory deserialization. Co-authored-by: Benedikt Best <63287233+btbest@users.noreply.github.com> Co-authored-by: Fynn Beuttenmüller --- .../appletSerializer/legacyClassifiers.py | 42 ++++++++++++++----- .../base/appletSerializer/slotSerializer.py | 8 ---- .../base/test_legacyClassifierDeserialiers.py | 28 ++++++------- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/legacyClassifiers.py b/ilastik/applets/base/appletSerializer/legacyClassifiers.py index e915716da5..5fcc67db61 100644 --- a/ilastik/applets/base/appletSerializer/legacyClassifiers.py +++ b/ilastik/applets/base/appletSerializer/legacyClassifiers.py @@ -151,6 +151,12 @@ class ClassifierFactoryTypeInfo: def classifier_factory_type(self) -> LazyflowClassifierFactoryTypeABC: submod = getattr(lazyflow.classifiers, self.factory_submodule) classifier_factory_type = getattr(submod, self.factory_typename) + + if classifier_factory_type.VERSION != self.factory_version: + raise ValueError( + f"Version mismatch for classifier factory {self.factory_typename}, supporting {classifier_factory_type.VERSION}, but found {self.factory_version}." + ) + return classifier_factory_type @@ -162,17 +168,33 @@ def instance(self) -> LazyflowClassifierFactoryABC: ... 
def deserialize_classifier_factory(ds: h5py.Dataset) -> LazyflowClassifierFactoryABC: + """Load legacy pickled classifier factory from ilp dataset + + Deserialization happens in two steps: + 1) Common to all classifier factories are values for module, typename + and version + 2) Details for each factory are then deserialized separately + + args: + ds: dataset containing the classifier factory as a pickled string + + Returns: + instance of classifier factory as saved in the project file + + Raises: + ValueError: in case of problems, including version mismatch of the factory + """ pickle_string: str = deserialize_string_from_h5(ds) - classifier_factory_info = _deserialize_classifier_factory_type_info(pickle_string) + classifier_factory_info = _deserialize_classifier_factory_type(pickle_string) + + classifier_factory_details = _deserialize_classifier_factory_impl( + classifier_factory_info.classifier_factory_type, pickle_string + ) - classifier_factory_type = classifier_factory_info.classifier_factory_type - classifier_factory_details = _deserialize_classifier_factory_details(classifier_factory_type, pickle_string) - factory_instance = classifier_factory_details.instance - factory_instance.VERSION = classifier_factory_info.factory_version - return factory_instance + return classifier_factory_details.instance -def _deserialize_classifier_factory_type_info(pickle_string: str) -> ClassifierFactoryTypeInfo: +def _deserialize_classifier_factory_type(pickle_string: str) -> ClassifierFactoryTypeInfo: """Legacy helper for classifier type_info deserialization in order to avoid unpickling, the protocol0-style pickle string is @@ -230,7 +252,7 @@ def _deserialize_classifier_factory_type_info(pickle_string: str) -> ClassifierF return ClassifierFactoryTypeInfo(factory_submodule=submodule, factory_typename=typename, factory_version=version) -def _deserialize_classifier_factory_details( +def _deserialize_classifier_factory_impl( classifier_factory: LazyflowClassifierFactoryTypeABC, 
pickle_str: str ) -> ClassifierFactoryInfo: @@ -490,10 +512,10 @@ def _deserialize_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClass classifier_info = SklearnClassifierTypeInfo(submodules=submodules.split("."), typename=typename) classifier_type = classifier_info.classifier_type - return _deserialize_sklearn_classifier_details(classifier_type, pickle_string) + return _deserialize_sklearn_classifier(classifier_type, pickle_string) -def _deserialize_sklearn_classifier_details( +def _deserialize_sklearn_classifier( classifier_type: SklearnClassifierType, pickle_str: str ) -> SklearnClassifierFactoryInfo: if issubclass(classifier_type, RandomForestClassifier): diff --git a/ilastik/applets/base/appletSerializer/slotSerializer.py b/ilastik/applets/base/appletSerializer/slotSerializer.py index 9c96ad9eed..b128139b21 100644 --- a/ilastik/applets/base/appletSerializer/slotSerializer.py +++ b/ilastik/applets/base/appletSerializer/slotSerializer.py @@ -864,14 +864,6 @@ def _getValue(self, dset, slot): ) return - # Verify that the VERSION of the classifier factory in the currently executing code - # has not changed since this classifier was stored. - if not hasattr(value, "VERSION") or value.VERSION != type(value).VERSION: - warnings.warn( - "This project file uses an old or unsupported classifier-factory storage format. " - "When retraining, the default classifier-factory will be used." 
- ) - return slot.setValue(value) diff --git a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py index 0a15dc06ee..51070dc5b3 100644 --- a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py +++ b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py @@ -32,12 +32,12 @@ from ilastik.applets.base.appletSerializer.legacyClassifiers import ( ClassifierFactoryTypeInfo, SklearnClassifierFactoryInfo, - _deserialize_classifier_factory_details, - _deserialize_classifier_factory_type_info, + _deserialize_classifier_factory_impl, + _deserialize_classifier_factory_type, _deserialize_ParallelVigraRfLazyflowClassifierFactory, _deserialize_SklearnLazyflowClassifierFactory, _deserialize_VigraRfClassifierFactory, - _deserialize_sklearn_classifier_details, + _deserialize_sklearn_classifier, deserialize_classifier_factory, deserialize_classifier_type, ) @@ -131,7 +131,7 @@ def test_sklearn_lazyflow_classifier_pickled_deserialization( pickled_classifier = pickle.dumps( SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 ).decode() - deserialized_info = _deserialize_sklearn_classifier_details(classifier_type, pickled_classifier) + deserialized_info = _deserialize_sklearn_classifier(classifier_type, pickled_classifier) assert deserialized_info == expected_info @@ -149,7 +149,7 @@ def test_sklearn_lazyflow_classifier_pickled_deserialization_raises( classifier_type, ): with pytest.raises(ValueError, match="Could not deserialize"): - _ = _deserialize_sklearn_classifier_details(classifier_type, "someRandomString") + _ = _deserialize_sklearn_classifier(classifier_type, "someRandomString") @pytest.mark.parametrize( @@ -303,10 +303,8 @@ def test_deserialize_legacy_VigraRflassifierFactory_raises(): VigraRfLazyflowClassifierFactory(42), ], ) -def test_deserialize_classifier_factory_details(classifier_factory): - info = 
_deserialize_classifier_factory_details( - type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() - ) +def test_deserialize_classifier_factory_impl(classifier_factory): + info = _deserialize_classifier_factory_impl(type(classifier_factory), pickle.dumps(classifier_factory, 0).decode()) assert info.instance == classifier_factory @@ -316,12 +314,12 @@ def __init__(self, *args, **kwargs): pass -def test_deserialize_classifier_factory_details_raises(): +def test_deserialize_classifier_factory_raises(): classifier_factory = MyTestClassifierFactory(42) with pytest.raises(ValueError): - _ = _deserialize_classifier_factory_details( + _ = _deserialize_classifier_factory_impl( type(classifier_factory), pickle.dumps(classifier_factory, 0).decode() # type: ignore[reportArgumentType] ) @@ -419,8 +417,8 @@ def test_deserialize_classifier_factory_details_raises(): ), ], ) -def test_deserialize_classifier_factory_type_info(classifier_factory, expected_info): - info = _deserialize_classifier_factory_type_info(pickle.dumps(classifier_factory, 0).decode()) +def test_deserialize_classifier_factory_type(classifier_factory, expected_info): + info = _deserialize_classifier_factory_type(pickle.dumps(classifier_factory, 0).decode()) assert info == expected_info @@ -433,9 +431,9 @@ def test_deserialize_classifier_factory_type_info(classifier_factory, expected_i "ccopy_reg\n_reconstructor\np0\n(clazyflow.classifiers.vigraRfLazyflowClassifier\nVigraRfLazyflowClassifierFactory\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nsV_args", ], ) -def test_deserialize_classifier_factory_type_info_raises(pickle_string): +def test_deserialize_classifier_factory_type_raises(pickle_string): with pytest.raises(ValueError): - _ = _deserialize_classifier_factory_type_info(pickle_string) + _ = _deserialize_classifier_factory_type(pickle_string) @pytest.mark.parametrize( From cf13402d8ae1d1229340c0d92895406c5f149f34 Mon Sep 17 00:00:00 2001 From: k-dominik Date: Fri, 26 Apr 2024 16:37:27 +0200 
Subject: [PATCH 12/13] use direct access to groups in match objects Co-authored-by: Emil Melnikov --- .../appletSerializer/legacyClassifiers.py | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/legacyClassifiers.py b/ilastik/applets/base/appletSerializer/legacyClassifiers.py index 5fcc67db61..4bc8d161ec 100644 --- a/ilastik/applets/base/appletSerializer/legacyClassifiers.py +++ b/ilastik/applets/base/appletSerializer/legacyClassifiers.py @@ -121,15 +121,15 @@ def deserialize_classifier_type(ds: h5py.Dataset) -> LazyflowClassifierTypeABC: # legacy support - ilastik used to pickle the classifier type if class_string.isascii() and (m := classifier_pickle_string_matcher.match(class_string)): - groupdict = m.groupdict() + m - if groupdict["submodule_name"] not in _lazyflow_classifier_factory_submodule_allow_list: - raise ValueError(f"Could not load classifier: submodule {groupdict['submodule_name']} not allowed.") + if m["submodule_name"] not in _lazyflow_classifier_factory_submodule_allow_list: + raise ValueError(f"Could not load classifier: submodule {m['submodule_name']} not allowed.") - if groupdict["type_name"] not in _lazyflow_classifier_type_allow_list: - raise ValueError(f"Could not load classifier: type {groupdict['type_name']} not allowed.") + if m["type_name"] not in _lazyflow_classifier_type_allow_list: + raise ValueError(f"Could not load classifier: type {m['type_name']} not allowed.") - return ClassifierInfo(**groupdict).classifier_type + return ClassifierInfo(**m.groupdict()).classifier_type raise ValueError(f"Could not load classifier type {class_string=}") @@ -232,9 +232,8 @@ def _deserialize_classifier_factory_type(pickle_string: str) -> ClassifierFactor ) if pickle_string.isascii() and (m := classifier_factory_pickle_string_matcher.search(pickle_string)): - groupdict = m.groupdict() - submodule = groupdict["factory_submodule"] - typename = groupdict["type_name"] + submodule = 
m["factory_submodule"] + typename = m["type_name"] if submodule not in _lazyflow_classifier_factory_submodule_allow_list: raise ValueError(f"Could not load classifier: submodule {submodule} not allowed. {pickle_string=}") @@ -245,7 +244,7 @@ def _deserialize_classifier_factory_type(pickle_string: str) -> ClassifierFactor raise ValueError(f"Could not load classifier factory type submodule and type not found {pickle_string=}") if m := classifier_factory_version_pickle_string_matcher.search(pickle_string): - version = int(m.groupdict()["factory_version"]) + version = int(m["factory_version"]) else: raise ValueError(f"Could not load classifier type, no version found {pickle_string=}") @@ -292,7 +291,7 @@ def _deserialize_VigraRfClassifierFactory(pickle_string: str) -> VigraRfLazyflow ) if m := classifier_factory_args_pickle_string_matcher.search(pickle_string): - arg = int(m.groupdict()["arg"]) + arg = int(m["arg"]) else: raise ValueError( f"Could not load VigraRfLazyflowClassifierFactory, no argument found not found in {pickle_string=}" @@ -335,7 +334,7 @@ def _deserialize_ParallelVigraRfLazyflowClassifierFactory( ) if m := classifier_factory_num_trees_pickle_string_matcher.search(pickle_string): - num_trees = int(m.groupdict()["num_trees"]) + num_trees = int(m["num_trees"]) else: raise ValueError( f"Could not load ParallelVigraRfLazyflowClassifierFactory, _num_trees not found in {pickle_string=}" @@ -352,7 +351,7 @@ def _deserialize_ParallelVigraRfLazyflowClassifierFactory( ) if m := classifier_factory_label_proportion_pickle_string_matcher.search(pickle_string): - label_prop_string = m.groupdict()["label_proportion"] + label_prop_string = m["label_proportion"] label_proportion = None if label_prop_string == "N" else float(label_prop_string) else: raise ValueError( @@ -370,7 +369,7 @@ def _deserialize_ParallelVigraRfLazyflowClassifierFactory( ) if m := classifier_factory_variable_importance_path_pickle_string_matcher.search(pickle_string): - 
variable_importance_pth_string = m.groupdict()["variable_importance_path"] + variable_importance_pth_string = m["variable_importance_path"] variable_importance_path = None if variable_importance_pth_string == "N" else variable_importance_pth_string else: raise ValueError( @@ -388,7 +387,7 @@ def _deserialize_ParallelVigraRfLazyflowClassifierFactory( ) if m := classifier_factory_variable_importance_enabled_pickle_string_matcher.search(pickle_string): - variable_importance_enabled = bool(int(m.groupdict()["variable_importance_enabled"])) + variable_importance_enabled = bool(int(m["variable_importance_enabled"])) else: raise ValueError( f"Could not load ParallelVigraRfLazyflowClassifierFactory, _variable_importance_enabled not found in {pickle_string=}" @@ -404,7 +403,7 @@ def _deserialize_ParallelVigraRfLazyflowClassifierFactory( ) if m := classifier_factory_num_forests_pickle_string_matcher.search(pickle_string): - num_forests = int(m.groupdict()["num_forests"]) + num_forests = int(m["num_forests"]) else: raise ValueError( f"Could not load ParallelVigraRfLazyflowClassifierFactory, _num_forests not found in {pickle_string=}" @@ -499,9 +498,8 @@ def _deserialize_SklearnLazyflowClassifierFactory(pickle_string) -> SklearnClass ] if m := classifier_factory_sklearn_type_pickle_string_matcher.search(pickle_string): - groupdict = m.groupdict() - submodules = groupdict["submodules"] - typename = groupdict["typename"] + submodules = m["submodules"] + typename = m["typename"] if submodules not in sklearn_submodule_allow_list or typename not in sklearn_classifier_allow_list: raise ValueError(f"Classifier of type sklearn.{submodules}.{typename} not permitted.") @@ -554,9 +552,7 @@ def _deserialize_sklearn_RandomForest_details(pickle_str: str) -> SklearnClassif ) if m := classifier_factory_args_pickle_string_matcher.search(pickle_str): - return SklearnClassifierFactoryInfo( - classifier_type=RandomForestClassifier, args=[int(m.groupdict()["arg"])], kwargs={} - ) + return 
SklearnClassifierFactoryInfo(classifier_type=RandomForestClassifier, args=[int(m["arg"])], kwargs={}) else: raise ValueError("Could not deserialize sklearn RandomForest classifier.") @@ -572,7 +568,7 @@ def _deserialize_sklearn_AdaBoostClassifier_details(pickle_str: str) -> SklearnC ) if m := classifier_factory_n_estimators_pickle_string_matcher.search(pickle_str): return SklearnClassifierFactoryInfo( - classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": int(m.groupdict()["n_estimators"])} + classifier_type=AdaBoostClassifier, args=[], kwargs={"n_estimators": int(m["n_estimators"])} ) else: raise ValueError("Could not deserialize sklearn AdaBoostClassifier.") @@ -589,7 +585,7 @@ def _deserialize_sklearn_DecisionTreeClassifier_details(pickle_str: str) -> Skle ) if m := classifier_factory_max_depth_pickle_string_matcher.search(pickle_str): return SklearnClassifierFactoryInfo( - classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": int(m.groupdict()["max_depth"])} + classifier_type=DecisionTreeClassifier, args=[], kwargs={"max_depth": int(m["max_depth"])} ) else: raise ValueError("Could not deserialize sklearn DecisionTreeClassifier") @@ -608,7 +604,7 @@ def _deserialize_sklearn_SVC_details( ) if m := classifier_factory_probability_pickle_string_matcher.search(pickle_str): return SklearnClassifierFactoryInfo( - classifier_type=classifier_type, args=[], kwargs={"probability": int(m.groupdict()["probability"]) != 0} + classifier_type=classifier_type, args=[], kwargs={"probability": int(m["probability"]) != 0} ) else: raise ValueError("Could not deserialize sklearn SVC/NuSVC classifier.") From fb29c447cbac60d2684bba1b9a5aa8c23fbfd93f Mon Sep 17 00:00:00 2001 From: k-dominik Date: Mon, 6 May 2024 14:46:35 +0200 Subject: [PATCH 13/13] fix deserialization of sklearn classifiers * string decoding from `hdf5` should work also for strings wrapped in `numpy.void`. 
* `sklearn` classifier types serialize more data - so for generic classifier type deserialization previous assumptions of the pickle structure don't hold anymore (there could be more data before `STOP`). Co-authored-by: Benedikt Best <63287233+btbest@users.noreply.github.com> --- .../appletSerializer/legacyClassifiers.py | 4 +--- .../base/appletSerializer/serializerUtils.py | 7 ++++++- .../base/test_legacyClassifierDeserialiers.py | 20 ++++++++++++++++--- .../test_applets/base/test_serializerUtils.py | 8 ++++++++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/ilastik/applets/base/appletSerializer/legacyClassifiers.py b/ilastik/applets/base/appletSerializer/legacyClassifiers.py index 4bc8d161ec..e6f1efe91e 100644 --- a/ilastik/applets/base/appletSerializer/legacyClassifiers.py +++ b/ilastik/applets/base/appletSerializer/legacyClassifiers.py @@ -113,14 +113,12 @@ def deserialize_classifier_type(ds: h5py.Dataset) -> LazyflowClassifierTypeABC: \n p\d+ \n - \. # all pickles end in "." 
STOP - $ """, re.X, ) # legacy support - ilastik used to pickle the classifier type - if class_string.isascii() and (m := classifier_pickle_string_matcher.match(class_string)): + if class_string.isascii() and (m := classifier_pickle_string_matcher.search(class_string)): m if m["submodule_name"] not in _lazyflow_classifier_factory_submodule_allow_list: diff --git a/ilastik/applets/base/appletSerializer/serializerUtils.py b/ilastik/applets/base/appletSerializer/serializerUtils.py index 29e15a8867..dfbf5cb768 100644 --- a/ilastik/applets/base/appletSerializer/serializerUtils.py +++ b/ilastik/applets/base/appletSerializer/serializerUtils.py @@ -21,6 +21,7 @@ from typing import Sequence, Tuple, Union import h5py +import numpy def deleteIfPresent(parentGroup: h5py.Group, name: str) -> None: @@ -86,4 +87,8 @@ def stringToSlicing(strSlicing: Union[bytes, str]) -> Tuple[slice, ...]: def deserialize_string_from_h5(ds: h5py.Dataset): - return ds[()].decode() + data = ds[()] + if isinstance(data, numpy.void): + data = data.tobytes() + + return data.decode() diff --git a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py index 51070dc5b3..6b67473bbd 100644 --- a/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py +++ b/tests/test_ilastik/test_applets/base/test_legacyClassifierDeserialiers.py @@ -21,6 +21,7 @@ import pickle import h5py +import numpy import pytest from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier @@ -35,14 +36,14 @@ _deserialize_classifier_factory_impl, _deserialize_classifier_factory_type, _deserialize_ParallelVigraRfLazyflowClassifierFactory, + _deserialize_sklearn_classifier, _deserialize_SklearnLazyflowClassifierFactory, _deserialize_VigraRfClassifierFactory, - _deserialize_sklearn_classifier, 
deserialize_classifier_factory, deserialize_classifier_type, ) from lazyflow.classifiers.parallelVigraRfLazyflowClassifier import ParallelVigraRfLazyflowClassifierFactory -from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifierFactory +from lazyflow.classifiers.sklearnLazyflowClassifier import SklearnLazyflowClassifier, SklearnLazyflowClassifierFactory from lazyflow.classifiers.vigraRfLazyflowClassifier import VigraRfLazyflowClassifier, VigraRfLazyflowClassifierFactory @@ -135,6 +136,20 @@ def test_sklearn_lazyflow_classifier_pickled_deserialization( assert deserialized_info == expected_info +def test_sklearn_deserialization_from_project_file(empty_in_memory_project_file): + """Ensure loading of sklearn classifiers as saved in ilastik""" + + classifier = SklearnLazyflowClassifier( + KNeighborsClassifier(), known_classes=3, feature_count=3, feature_names=["a", "b", "c"] + ) + classifier_bytes = pickle.dumps(classifier, 0) + # note that sklearn classifiers are saved wrapped in numpy.void + ds = empty_in_memory_project_file.create_dataset("classifier_type", data=numpy.void(classifier_bytes)) + + classifier_type = deserialize_classifier_type(ds) + assert issubclass(classifier_type, SklearnLazyflowClassifier) + + @pytest.mark.parametrize( "classifier_type", [ @@ -207,7 +222,6 @@ def test_sklearn_lazyflow_classifier_pickled_deserialization_raises( ], ) def test_deserialize_SklearnLazyflowClassifierFactory(classifier_type, c_args, c_kwargs, expected_info): - assert True pickled_classifier = pickle.dumps( SklearnLazyflowClassifierFactory(classifier_type, *c_args, **c_kwargs), 0 ).decode() diff --git a/tests/test_ilastik/test_applets/base/test_serializerUtils.py b/tests/test_ilastik/test_applets/base/test_serializerUtils.py index 23a159600b..982400a364 100644 --- a/tests/test_ilastik/test_applets/base/test_serializerUtils.py +++ b/tests/test_ilastik/test_applets/base/test_serializerUtils.py @@ -21,6 +21,7 @@ from typing import Sequence, Tuple, 
Union import h5py +import numpy import pytest from ilastik.applets.base.appletSerializer.serializerUtils import ( @@ -128,3 +129,10 @@ def test_deserialize_string_from_h5(empty_in_memory_project_file: h5py.File): ds = empty_in_memory_project_file.create_dataset("test", data=test_string.encode("utf-8")) assert deserialize_string_from_h5(ds) == test_string + + +def test_deserialize_void_wrapped_string_from_h5(empty_in_memory_project_file: h5py.File): + test_string = "this is a another test string" + ds = empty_in_memory_project_file.create_dataset("test", data=numpy.void(test_string.encode("utf-8"))) + + assert deserialize_string_from_h5(ds) == test_string