diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py index 9903cb2579..78a26075d1 100644 --- a/keras_hub/api/layers/__init__.py +++ b/keras_hub/api/layers/__init__.py @@ -50,6 +50,9 @@ from keras_hub.src.models.sam.sam_image_converter import SAMImageConverter from keras_hub.src.models.sam.sam_mask_decoder import SAMMaskDecoder from keras_hub.src.models.sam.sam_prompt_encoder import SAMPromptEncoder +from keras_hub.src.models.segformer.segformer_image_converter import ( + SegFormerImageConverter, +) from keras_hub.src.models.vgg.vgg_image_converter import VGGImageConverter from keras_hub.src.models.whisper.whisper_audio_converter import ( WhisperAudioConverter, diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py index 71c8ca9f82..e0e8773a35 100644 --- a/keras_hub/api/models/__init__.py +++ b/keras_hub/api/models/__init__.py @@ -266,6 +266,13 @@ from keras_hub.src.models.sam.sam_image_segmenter_preprocessor import ( SAMImageSegmenterPreprocessor, ) +from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone +from keras_hub.src.models.segformer.segformer_image_segmenter import ( + SegFormerImageSegmenter, +) +from keras_hub.src.models.segformer.segformer_image_segmenter_preprocessor import ( + SegFormerImageSegmenterPreprocessor, +) from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone import ( diff --git a/keras_hub/src/models/mit/mit_backbone.py b/keras_hub/src/models/mit/mit_backbone.py index 4ac0402fbd..a6c57816c4 100644 --- a/keras_hub/src/models/mit/mit_backbone.py +++ b/keras_hub/src/models/mit/mit_backbone.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import keras import numpy as np from keras import ops @@ -100,7 +111,7 @@ def __init__( ] transformer_blocks.append(transformer_block) cur += depths[i] - layer_norms.append(keras.layers.LayerNormalization()) + layer_norms.append(keras.layers.LayerNormalization(epsilon=1e-5)) # === Functional Model === image_input = keras.layers.Input(shape=image_shape) diff --git a/keras_hub/src/models/mit/mit_backbone_test.py b/keras_hub/src/models/mit/mit_backbone_test.py index 553a266e5b..88c58e96a2 100644 --- a/keras_hub/src/models/mit/mit_backbone_test.py +++ b/keras_hub/src/models/mit/mit_backbone_test.py @@ -9,7 +9,7 @@ class MiTBackboneTest(TestCase): def setUp(self): self.init_kwargs = { "depths": [2, 2], - "image_shape": (16, 16, 3), + "image_shape": (32, 32, 3), "hidden_dims": [4, 8], "num_layers": 2, "blockwise_num_heads": [1, 2], @@ -18,7 +18,7 @@ def setUp(self): "patch_sizes": [7, 3], "strides": [4, 2], } - self.input_size = 16 + self.input_size = 32 self.input_data = np.ones( (2, self.input_size, self.input_size, 3), dtype="float32" ) @@ -28,9 +28,9 @@ def test_backbone_basics(self): cls=MiTBackbone, init_kwargs=self.init_kwargs, input_data=self.input_data, - expected_output_shape=(2, 2, 2, 8), + expected_output_shape=(2, 4, 4, 8), expected_pyramid_output_keys=["P1", "P2"], - expected_pyramid_image_sizes=[(4, 4), (2, 2)], + expected_pyramid_image_sizes=[(8, 8), (4, 4)], run_quantization_check=False, run_mixed_precision_check=False, run_data_format_check=False, diff --git a/keras_hub/src/models/mit/mit_image_classifier_test.py b/keras_hub/src/models/mit/mit_image_classifier_test.py index 058ea4fbb2..32055c47ed 100644 --- a/keras_hub/src/models/mit/mit_image_classifier_test.py +++ b/keras_hub/src/models/mit/mit_image_classifier_test.py @@ -9,11 +9,11 @@ class MiTImageClassifierTest(TestCase): def setUp(self): # Setup model. 
- self.images = np.ones((2, 16, 16, 3), dtype="float32") + self.images = np.ones((2, 32, 32, 3), dtype="float32") self.labels = [0, 3] self.backbone = MiTBackbone( depths=[2, 2, 2, 2], - image_shape=(16, 16, 3), + image_shape=(32, 32, 3), hidden_dims=[4, 8], num_layers=2, blockwise_num_heads=[1, 2], @@ -40,7 +40,7 @@ def test_classifier_basics(self): cls=MiTImageClassifier, init_kwargs=self.init_kwargs, train_data=self.train_data, - expected_output_shape=(2, 2), + expected_output_shape=(4, 4), ) @pytest.mark.large diff --git a/keras_hub/src/models/mit/mit_layers.py b/keras_hub/src/models/mit/mit_layers.py index fc5180ca90..b949fcb6e2 100644 --- a/keras_hub/src/models/mit/mit_layers.py +++ b/keras_hub/src/models/mit/mit_layers.py @@ -183,20 +183,21 @@ def __init__(self, project_dim, num_heads, sr_ratio): self.k = keras.layers.Dense(project_dim) self.v = keras.layers.Dense(project_dim) self.proj = keras.layers.Dense(project_dim) + self.dropout = keras.layers.Dropout(0.1) + self.proj_drop = keras.layers.Dropout(0.1) if sr_ratio > 1: self.sr = keras.layers.Conv2D( filters=project_dim, kernel_size=sr_ratio, strides=sr_ratio, - padding="same", ) - self.norm = keras.layers.LayerNormalization() + self.norm = keras.layers.LayerNormalization(epsilon=1e-5) def call(self, x): input_shape = ops.shape(x) H, W = int(math.sqrt(input_shape[1])), int(math.sqrt(input_shape[1])) - B, C = input_shape[0], input_shape[2] + B, N, C = input_shape[0], input_shape[1], input_shape[2] q = self.q(x) q = ops.reshape( @@ -212,12 +213,11 @@ def call(self, x): if self.sr_ratio > 1: x = ops.reshape( - ops.transpose(x, [0, 2, 1]), + x, (B, H, W, C), ) x = self.sr(x) - x = ops.reshape(x, [input_shape[0], input_shape[2], -1]) - x = ops.transpose(x, [0, 2, 1]) + x = ops.reshape(x, [B, -1, C]) x = self.norm(x) k = self.k(x) @@ -241,14 +241,16 @@ def call(self, x): attn = (q @ ops.transpose(k, [0, 1, 3, 2])) * self.scale attn = ops.nn.softmax(attn, axis=-1) + attn = self.dropout(attn) attn = attn @ v attn = ops.reshape( ops.transpose(attn, [0, 2, 1, 3]), - [input_shape[0], input_shape[1], input_shape[2]], + [B, N, C], ) x = self.proj(attn) + x = self.proj_drop(x) return x diff --git a/keras_hub/src/models/mit/mit_presets.py b/keras_hub/src/models/mit/mit_presets.py index 2408f5529b..9c2a5fe362 100644 --- a/keras_hub/src/models/mit/mit_presets.py +++ b/keras_hub/src/models/mit/mit_presets.py @@ -76,7 +76,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_ade20k_512/1", + "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_ade20k_640/1", }, "mit_b0_cityscapes_1024": { "metadata": { diff --git a/keras_hub/src/models/mit/mix_transformer_backbone_test.py b/keras_hub/src/models/mit/mix_transformer_backbone_test.py index 553a266e5b..88c58e96a2 100644 --- a/keras_hub/src/models/mit/mix_transformer_backbone_test.py +++ b/keras_hub/src/models/mit/mix_transformer_backbone_test.py @@ -9,7 +9,7 @@ class MiTBackboneTest(TestCase): def setUp(self): self.init_kwargs = { "depths": [2, 2], - "image_shape": (16, 16, 3), + "image_shape": (32, 32, 3), "hidden_dims": [4, 8], "num_layers": 2, "blockwise_num_heads": [1, 2], @@ -18,7 +18,7 @@ def setUp(self): "patch_sizes": [7, 3], "strides": [4, 2], } - self.input_size = 16 + self.input_size = 32 self.input_data = np.ones( (2, self.input_size, self.input_size, 3), dtype="float32" ) @@ -28,9 +28,9 @@ def test_backbone_basics(self): cls=MiTBackbone, init_kwargs=self.init_kwargs, input_data=self.input_data, - expected_output_shape=(2, 2, 2, 8), + 
expected_output_shape=(2, 4, 4, 8),
            expected_pyramid_output_keys=["P1", "P2"],
-            expected_pyramid_image_sizes=[(4, 4), (2, 2)],
+            expected_pyramid_image_sizes=[(8, 8), (4, 4)],
            run_quantization_check=False,
            run_mixed_precision_check=False,
            run_data_format_check=False,
diff --git a/keras_hub/src/models/segformer/__init__.py b/keras_hub/src/models/segformer/__init__.py
new file mode 100644
index 0000000000..3a95690dba
--- /dev/null
+++ b/keras_hub/src/models/segformer/__init__.py
@@ -0,0 +1,8 @@
+from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone
+from keras_hub.src.models.segformer.segformer_image_segmenter import (
+    SegFormerImageSegmenter,
+)
+from keras_hub.src.models.segformer.segformer_presets import presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(presets, SegFormerImageSegmenter)
diff --git a/keras_hub/src/models/segformer/segformer_backbone.py b/keras_hub/src/models/segformer/segformer_backbone.py
new file mode 100644
index 0000000000..f5563b4c02
--- /dev/null
+++ b/keras_hub/src/models/segformer/segformer_backbone.py
@@ -0,0 +1,163 @@
+import keras
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+
+
+@keras_hub_export("keras_hub.models.SegFormerBackbone")
+class SegFormerBackbone(Backbone):
+    """A Keras model implementing the SegFormer architecture for semantic segmentation.
+
+    This class implements the majority of the SegFormer architecture described in
+    [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers]
+    (https://arxiv.org/abs/2105.15203) and is [based on the TensorFlow implementation from DeepVision]
+    (https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/segmentation/segformer).
+
+    SegFormers are meant to be used with the MixTransformer (MiT) encoder family
+    and use a very lightweight all-MLP decoder head.
+
+    The MiT encoder uses a hierarchical transformer which outputs features at
+    multiple scales, similar to the hierarchical outputs typically associated
+    with CNNs.
+
+    Args:
+        image_encoder: `keras.Model`. The backbone network for the model that is
+            used as a feature extractor for the SegFormer encoder.
+            Should be used with the MiT backbone model
+            (`keras_hub.models.MiTBackbone`), which was created
+            specifically for SegFormers.
+        projection_filters: int, number of filters in the
+            convolution layer projecting the concatenated features into
+            a segmentation map. Defaults to 256.
+ + Example: + + Using the class with a custom `backbone`: + + ```python + import keras_hub + + backbone = keras_hub.models.MiTBackbone( + depths=[2, 2, 2, 2], + image_shape=(224, 224, 3), + hidden_dims=[32, 64, 160, 256], + num_layers=4, + blockwise_num_heads=[1, 2, 5, 8], + blockwise_sr_ratios=[8, 4, 2, 1], + max_drop_path_rate=0.1, + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + ) + + segformer_backbone = keras_hub.models.SegFormerBackbone(image_encoder=backbone, projection_filters=256) + ``` + + Using the class with a preset `backbone`: + + ```python + import keras_hub + + backbone = keras_hub.models.MiTBackbone.from_preset("mit_b0_ade20k_512") + segformer_backbone = keras_hub.models.SegFormerBackbone(image_encoder=backbone, projection_filters=256) + ``` + + """ + + def __init__( + self, + image_encoder, + projection_filters, + **kwargs, + ): + if not isinstance(image_encoder, keras.layers.Layer) or not isinstance( + image_encoder, keras.Model + ): + raise ValueError( + "Argument `image_encoder` must be a `keras.layers.Layer` instance " + f" or `keras.Model`. Received instead " + f"image_encoder={image_encoder} (of type {type(image_encoder)})." + ) + + # === Layers === + inputs = keras.layers.Input(shape=image_encoder.input.shape[1:]) + + self.feature_extractor = keras.Model( + image_encoder.inputs, image_encoder.pyramid_outputs + ) + + features = self.feature_extractor(inputs) + # Get height and width of level one output + _, height, width, _ = features["P1"].shape + + self.mlp_blocks = [] + + for feature_dim, feature in zip(image_encoder.hidden_dims, features): + self.mlp_blocks.append( + keras.layers.Dense( + projection_filters, name=f"linear_{feature_dim}" + ) + ) + + self.resizing = keras.layers.Resizing( + height, width, interpolation="bilinear" + ) + self.concat = keras.layers.Concatenate(axis=-1) + self.linear_fuse = keras.Sequential( + [ + keras.layers.Conv2D( + filters=projection_filters, kernel_size=1, use_bias=False + ), + keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9), + keras.layers.Activation("relu"), + ] + ) + + # === Functional Model === + # Project all multi-level outputs onto + # the same dimensionality and feature map shape + multi_layer_outs = [] + for index, (feature_dim, feature) in enumerate( + zip(image_encoder.hidden_dims, features) + ): + out = self.mlp_blocks[index](features[feature]) + out = self.resizing(out) + multi_layer_outs.append(out) + + # Concat now-equal feature maps + concatenated_outs = self.concat(multi_layer_outs[::-1]) + + # Fuse concatenated features into a segmentation map + seg = self.linear_fuse(concatenated_outs) + + super().__init__( + inputs=inputs, + outputs=seg, + **kwargs, + ) + + # === Config === + self.projection_filters = projection_filters + self.image_encoder = image_encoder + + def get_config(self): + config = super().get_config() + config.update( + { + "projection_filters": self.projection_filters, + "image_encoder": keras.saving.serialize_keras_object( + self.image_encoder + ), + } + ) + return config + + @classmethod + def from_config(cls, config): + if "image_encoder" in config and isinstance( + config["image_encoder"], dict + ): + config["image_encoder"] = keras.layers.deserialize( + config["image_encoder"] + ) + return super().from_config(config) diff --git a/keras_hub/src/models/segformer/segformer_backbone_tests.py b/keras_hub/src/models/segformer/segformer_backbone_tests.py new file mode 100644 index 0000000000..22133763e7 --- /dev/null +++ b/keras_hub/src/models/segformer/segformer_backbone_tests.py 
@@ -0,0 +1,76 @@ +import numpy as np +import pytest +from keras import ops + +from keras_hub.api.models import MiTBackbone +from keras_hub.api.models import SegFormerBackbone +from keras_hub.src.tests.test_case import TestCase + + +class SegFormerTest(TestCase): + def setUp(self): + image_encoder = MiTBackbone( + depths=[2, 2], + image_shape=(224, 224, 3), + hidden_dims=[32, 64], + num_layers=2, + blockwise_num_heads=[1, 2], + blockwise_sr_ratios=[8, 4], + max_drop_path_rate=0.1, + patch_sizes=[7, 3], + strides=[4, 2], + ) + projection_filters = 256 + self.input_size = 224 + self.input_data = ops.ones((2, self.input_size, self.input_size, 3)) + + self.init_kwargs = { + "projection_filters": projection_filters, + "image_encoder": image_encoder, + } + + def test_segformer_backbone_construction(self): + + SegFormerBackbone( + image_encoder=self.init_kwargs["image_encoder"], + projection_filters=self.init_kwargs["projection_filters"], + ) + + @pytest.mark.large + def test_segformer_call(self): + segformer_backbone = SegFormerBackbone( + image_encoder=self.init_kwargs["image_encoder"], + projection_filters=self.init_kwargs["projection_filters"], + ) + + images = np.random.uniform(size=(2, 224, 224, 3)) + segformer_output = segformer_backbone(images) + segformer_predict = segformer_backbone.predict(images) + + assert segformer_output.shape == (2, 56, 56, 256) + assert segformer_predict.shape == (2, 56, 56, 256) + + def test_backbone_basics(self): + + self.run_vision_backbone_test( + cls=SegFormerBackbone, + init_kwargs={**self.init_kwargs}, + input_data=self.input_data, + expected_output_shape=(2, 56, 56, 256), + ) + + def test_task(self): + self.run_task_test( + cls=SegFormerBackbone, + init_kwargs={**self.init_kwargs}, + train_data=self.input_data, + expected_output_shape=(2, 56, 56, 256), + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=SegFormerBackbone, + init_kwargs={**self.init_kwargs}, + input_data=self.input_data, + ) diff --git a/keras_hub/src/models/segformer/segformer_image_converter.py b/keras_hub/src/models/segformer/segformer_image_converter.py new file mode 100644 index 0000000000..44febd6833 --- /dev/null +++ b/keras_hub/src/models/segformer/segformer_image_converter.py @@ -0,0 +1,8 @@ +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.layers.preprocessing.image_converter import ImageConverter +from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone + + +@keras_hub_export("keras_hub.layers.SegFormerImageConverter") +class SegFormerImageConverter(ImageConverter): + backbone_cls = SegFormerBackbone diff --git a/keras_hub/src/models/segformer/segformer_image_segmenter.py b/keras_hub/src/models/segformer/segformer_image_segmenter.py new file mode 100644 index 0000000000..1b00c7a754 --- /dev/null +++ b/keras_hub/src/models/segformer/segformer_image_segmenter.py @@ -0,0 +1,171 @@ +import keras + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.image_segmenter import ImageSegmenter +from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone +from keras_hub.src.models.segformer.segformer_image_segmenter_preprocessor import ( + SegFormerImageSegmenterPreprocessor, +) + + +@keras_hub_export("keras_hub.models.SegFormerImageSegmenter") +class SegFormerImageSegmenter(ImageSegmenter): + """A Keras model implementing the SegFormer architecture for semantic segmentation. 
+
+    This class implements the segmentation head of the SegFormer architecture described in
+    [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers]
+    (https://arxiv.org/abs/2105.15203) and is [based on the TensorFlow implementation from DeepVision]
+    (https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/segmentation/segformer).
+
+    SegFormers are meant to be used with the MixTransformer (MiT) encoder family
+    and use a very lightweight all-MLP decoder head.
+
+    The MiT encoder uses a hierarchical transformer which outputs features at
+    multiple scales, similar to the hierarchical outputs typically associated
+    with CNNs.
+
+    Args:
+        backbone: `keras.Model`. A `keras_hub.models.SegFormerBackbone` used as
+            the feature extractor and decoder for the segmenter. It is
+            *intended* to wrap the MiT backbone model
+            (`keras_hub.models.MiTBackbone`), which was created specifically
+            for SegFormers.
+        num_classes: int, the number of classes for the segmentation model,
+            including the background class.
+        preprocessor: optional `keras_hub.models.SegFormerImageSegmenterPreprocessor`,
+            used to preprocess inputs before they are passed to the model.
+            Defaults to `None`.
+
+    Example:
+
+    Using presets:
+
+    ```python
+    import keras_hub
+    import numpy as np
+
+    segmenter = keras_hub.models.SegFormerImageSegmenter.from_preset("segformer_b0_ade20k_512")
+
+    images = np.random.rand(1, 512, 512, 3)
+    segmenter(images)
+    ```
+
+    Using the SegFormer backbone:
+
+    ```python
+    encoder = keras_hub.models.MiTBackbone.from_preset("mit_b0_ade20k_512")
+    backbone = keras_hub.models.SegFormerBackbone(image_encoder=encoder, projection_filters=256)
+    ```
+
+    Using the SegFormer backbone with a custom encoder:
+
+    ```python
+    import keras
+    import keras_hub
+    import numpy as np
+
+    images = np.ones(shape=(1, 96, 96, 3))
+    labels = np.zeros(shape=(1, 96, 96, 1))
+
+    encoder = keras_hub.models.MiTBackbone(
+        depths=[2, 2, 2, 2],
+        image_shape=(96, 96, 3),
+        hidden_dims=[32, 64, 160, 256],
+        num_layers=4,
+        blockwise_num_heads=[1, 2, 5, 8],
+        blockwise_sr_ratios=[8, 4, 2, 1],
+        max_drop_path_rate=0.1,
+        patch_sizes=[7, 3, 3, 3],
+        strides=[4, 2, 2, 2],
+    )
+
+    backbone = keras_hub.models.SegFormerBackbone(image_encoder=encoder, projection_filters=256)
+    segformer = keras_hub.models.SegFormerImageSegmenter(backbone=backbone, num_classes=4)
+
+    segformer(images)
+    ```
+
+    Using the segmenter class with a preset backbone:
+
+    ```python
+    import keras_hub
+
+    image_encoder = keras_hub.models.MiTBackbone.from_preset("mit_b0_ade20k_512")
+    backbone = keras_hub.models.SegFormerBackbone(image_encoder=image_encoder, projection_filters=256)
+    segformer = keras_hub.models.SegFormerImageSegmenter(backbone=backbone, num_classes=4)
+    ```
+    """
+
+    backbone_cls = SegFormerBackbone
+    preprocessor_cls = SegFormerImageSegmenterPreprocessor
+
+    def __init__(
+        self,
+        backbone,
+        num_classes,
+        preprocessor=None,
+        **kwargs,
+    ):
+        if not isinstance(backbone, keras.layers.Layer) or not isinstance(
+            backbone, keras.Model
+        ):
+            raise ValueError(
+                "Argument `backbone` must be a `keras.layers.Layer` instance "
+                f"or `keras.Model`. Received instead "
+                f"backbone={backbone} (of type {type(backbone)})."
+ ) + + # === Layers === + inputs = backbone.input + + self.backbone = backbone + self.preprocessor = preprocessor + self.dropout = keras.layers.Dropout(0.1) + self.output_segmentation_head = keras.layers.Conv2D( + filters=num_classes, kernel_size=1, strides=1 + ) + self.resizing = keras.layers.Resizing( + height=inputs.shape[1], + width=inputs.shape[2], + interpolation="bilinear", + ) + + # === Functional Model === + x = self.backbone(inputs) + x = self.dropout(x) + x = self.output_segmentation_head(x) + output = self.resizing(x) + + super().__init__( + inputs=inputs, + outputs=output, + **kwargs, + ) + + # === Config === + self.num_classes = num_classes + self.backbone = backbone + + def get_config(self): + config = super().get_config() + config.update( + { + "num_classes": self.num_classes, + "backbone": keras.saving.serialize_keras_object(self.backbone), + } + ) + return config + + @classmethod + def from_config(cls, config): + if "image_encoder" in config and isinstance( + config["image_encoder"], dict + ): + config["image_encoder"] = keras.layers.deserialize( + config["image_encoder"] + ) + return super().from_config(config) diff --git a/keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py b/keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py new file mode 100644 index 0000000000..fd8c5fba35 --- /dev/null +++ b/keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py @@ -0,0 +1,31 @@ +import keras + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.image_segmenter_preprocessor import ( + ImageSegmenterPreprocessor, +) +from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone +from keras_hub.src.models.segformer.segformer_image_converter import ( + SegFormerImageConverter, +) +from keras_hub.src.utils.tensor_utils import preprocessing_function + +IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] +IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] + + +@keras_hub_export("keras_hub.models.SegFormerImageSegmenterPreprocessor") +class SegFormerImageSegmenterPreprocessor(ImageSegmenterPreprocessor): + backbone_cls = SegFormerBackbone + image_converter_cls = SegFormerImageConverter + + @preprocessing_function + def call(self, x, y=None, sample_weight=None): + if self.image_converter: + x = self.image_converter(x) + y = self.image_converter(y) + + x = x / 255 + x = (x - IMAGENET_DEFAULT_MEAN) / IMAGENET_DEFAULT_STD + + return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) diff --git a/keras_hub/src/models/segformer/segformer_image_segmenter_tests.py b/keras_hub/src/models/segformer/segformer_image_segmenter_tests.py new file mode 100644 index 0000000000..4ad2e8bc6f --- /dev/null +++ b/keras_hub/src/models/segformer/segformer_image_segmenter_tests.py @@ -0,0 +1,65 @@ +import numpy as np +import pytest +from keras import ops + +from keras_hub.api.models import MiTBackbone +from keras_hub.api.models import SegFormerBackbone +from keras_hub.api.models import SegFormerImageSegmenter +from keras_hub.src.tests.test_case import TestCase + + +class SegFormerTest(TestCase): + def setUp(self): + image_encoder = MiTBackbone( + depths=[2, 2], + image_shape=(224, 224, 3), + hidden_dims=[32, 64], + num_layers=2, + blockwise_num_heads=[1, 2], + blockwise_sr_ratios=[8, 4], + max_drop_path_rate=0.1, + patch_sizes=[7, 3], + strides=[4, 2], + ) + projection_filters = 256 + self.backbone = SegFormerBackbone( + image_encoder=image_encoder, projection_filters=projection_filters + ) + + self.input_size = 
224
+        self.input_data = ops.ones((2, self.input_size, self.input_size, 3))
+
+        self.init_kwargs = {"backbone": self.backbone, "num_classes": 4}
+
+    def test_segformer_segmenter_construction(self):
+        SegFormerImageSegmenter(backbone=self.backbone, num_classes=4)
+
+    @pytest.mark.large
+    def test_segformer_call(self):
+        segformer = SegFormerImageSegmenter(
+            backbone=self.backbone, num_classes=4
+        )
+
+        images = np.random.uniform(size=(2, 224, 224, 3))
+        segformer_output = segformer(images)
+        segformer_predict = segformer.predict(images)
+
+        assert segformer_output.shape == (2, 224, 224, 4)
+        assert segformer_predict.shape == (2, 224, 224, 4)
+
+    def test_task(self):
+        self.run_task_test(
+            cls=SegFormerImageSegmenter,
+            init_kwargs={**self.init_kwargs},
+            train_data=self.input_data,
+            expected_output_shape=(2, 224, 224, 4),
+        )
+
+    @pytest.mark.large
+    def test_saved_model(self):
+        self.run_model_saving_test(
+            cls=SegFormerImageSegmenter,
+            init_kwargs={**self.init_kwargs},
+            input_data=self.input_data,
+        )
diff --git a/keras_hub/src/models/segformer/segformer_presets.py b/keras_hub/src/models/segformer/segformer_presets.py
new file mode 100644
index 0000000000..2c0fff0a50
--- /dev/null
+++ b/keras_hub/src/models/segformer/segformer_presets.py
@@ -0,0 +1,136 @@
+"""SegFormer model preset configurations."""
+
+presets = {
+    "segformer_b0_ade20k_512": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB0 backbone fine-tuned on ADE20k in 512x512 resolution."
+            ),
+            "params": 3719027,
+            "official_name": "SegFormerB0",
+            "path": "segformer_b0",
+        },
+        "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b0_ade20k_512",
+    },
+    "segformer_b1_ade20k_512": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB1 backbone fine-tuned on ADE20k in 512x512 resolution."
+            ),
+            "params": 13682643,
+            "official_name": "SegFormerB1",
+            "path": "segformer_b1",
+        },
+        "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b1_ade20k_512",
+    },
+    "segformer_b2_ade20k_512": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB2 backbone fine-tuned on ADE20k in 512x512 resolution."
+            ),
+            "params": 24727507,
+            "official_name": "SegFormerB2",
+            "path": "segformer_b2",
+        },
+        "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b2_ade20k_512",
+    },
+    "segformer_b3_ade20k_512": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB3 backbone fine-tuned on ADE20k in 512x512 resolution."
+            ),
+            "params": 44603347,
+            "official_name": "SegFormerB3",
+            "path": "segformer_b3",
+        },
+        "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b3_ade20k_512",
+    },
+    "segformer_b4_ade20k_512": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB4 backbone fine-tuned on ADE20k in 512x512 resolution."
+            ),
+            "params": 61373907,
+            "official_name": "SegFormerB4",
+            "path": "segformer_b4",
+        },
+        "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b4_ade20k_512",
+    },
+    "segformer_b5_ade20k_640": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB5 backbone fine-tuned on ADE20k in 640x640 resolution."
+            ),
+            "params": 81974227,
+            "official_name": "SegFormerB5",
+            "path": "segformer_b5",
+        },
+        "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b5_ade20k_640",
+    },
+    "segformer_b0_cityscapes_1024": {
+        "metadata": {
+            "description": (
+                "SegFormer model with MiTB0 backbone fine-tuned on Cityscapes in 1024x1024 resolution."
+ ), + "params": 3719027, + "official_name": "SegFormerB0", + "path": "segformer_b0", + }, + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b0_cityscapes_1024", + }, + "segformer_b1_cityscapes_1024": { + "metadata": { + "description": ( + "SegFormer model with MiTB1 backbone fine-tuned on Cityscapes in 1024x1024 resolution." + ), + "params": 13682643, + "official_name": "SegFormerB1", + "path": "segformer_b1", + }, + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b1_ade20k_512", + }, + "segformer_b2_cityscapes_1024": { + "metadata": { + "description": ( + "SegFormer model with MiTB2 backbone fine-tuned on Cityscapes in 1024x1024 resolution." + ), + "params": 24727507, + "official_name": "SegFormerB2", + "path": "segformer_b2", + }, + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b2_cityscapes_1024", + }, + "segformer_b3_cityscapes_1024": { + "metadata": { + "description": ( + "SegFormer model with MiTB3 backbone fine-tuned on Cityscapes in 1024x1024 resolution." + ), + "params": 44603347, + "official_name": "SegFormerB3", + "path": "segformer_b3", + }, + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b3_cityscapes_1024", + }, + "segformer_b4_cityscapes_1024": { + "metadata": { + "description": ( + "SegFormer model with MiTB4 backbone fine-tuned on Cityscapes in 1024x1024 resolution." + ), + "params": 61373907, + "official_name": "SegFormerB4", + "path": "segformer_b4", + }, + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b4_cityscapes_1024", + }, + "segformer_b5_cityscapes_1024": { + "metadata": { + "description": ( + "SegFormer model with MiTB5 backbone fine-tuned on Cityscapes in 1024x1024 resolution." + ), + "params": 81974227, + "official_name": "SegFormerB5", + "path": "segformer_b5", + }, + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b5_cityscapes_1024", + }, +} diff --git a/tools/checkpoint_conversion/convert_segformer_checkpoints.py b/tools/checkpoint_conversion/convert_segformer_checkpoints.py new file mode 100644 index 0000000000..230cf5227d --- /dev/null +++ b/tools/checkpoint_conversion/convert_segformer_checkpoints.py @@ -0,0 +1,143 @@ +# Usage example +# python tools/checkpoint_conversion/convert_mix_transformer.py --preset "B0_ade_512" + +import numpy as np +from absl import app +from absl import flags +from transformers import SegformerForSemanticSegmentation + +import keras_hub +from keras_hub.src.models.segformer.segformer_image_segmenter_preprocessor import ( + SegFormerImageSegmenterPreprocessor, +) + +FLAGS = flags.FLAGS + +PROJECTION_FILTERS = { + "b0_ade20k_512": 256, + "b1_ade20k_512": 256, + "b2_ade20k_512": 768, + "b3_ade20k_512": 768, + "b4_ade20k_512": 768, + "b5_ade20k_640": 768, + "b0_cityscapes_1024": 256, + "b1_cityscapes_1024": 256, + "b2_cityscapes_1024": 768, + "b3_cityscapes_1024": 768, + "b4_cityscapes_1024": 768, + "b5_cityscapes_1024": 768, +} + + +DOWNLOAD_URLS = { + "b0_ade20k_512": "nvidia/segformer-b0-finetuned-ade-512-512", + "b1_ade20k_512": "nvidia/segformer-b1-finetuned-ade-512-512", + "b2_ade20k_512": "nvidia/segformer-b2-finetuned-ade-512-512", + "b3_ade20k_512": "nvidia/segformer-b3-finetuned-ade-512-512", + "b4_ade20k_512": "nvidia/segformer-b4-finetuned-ade-512-512", + "b5_ade20k_640": "nvidia/segformer-b5-finetuned-ade-640-640", + "b0_cityscapes_1024": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", + "b1_cityscapes_1024": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", + "b2_cityscapes_1024": 
"nvidia/segformer-b2-finetuned-cityscapes-1024-1024", + "b3_cityscapes_1024": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", + "b4_cityscapes_1024": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", + "b5_cityscapes_1024": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", +} + +flags.DEFINE_string( + "preset", None, f'Must be one of {",".join(DOWNLOAD_URLS.keys())}' +) + + +def set_conv_weights(conv_layer, state_dict): + conv_weights = state_dict["weight"].numpy().transpose(2, 3, 1, 0) + bias = None + if "bias" in state_dict.keys(): + bias = state_dict["bias"].numpy() + conv_layer.set_weights([conv_weights, bias]) + else: + conv_layer.set_weights([conv_weights]) + + +def set_dense_weights(dense_layer, state_dict): + weight = state_dict["weight"].numpy().T + bias = state_dict["bias"].numpy() + dense_layer.set_weights([weight, bias]) + + +def set_batchnorm_weights(bn_layer, state_dict): + gamma = state_dict["weight"].numpy() + beta = state_dict["bias"].numpy() + running_mean = state_dict["running_mean"].numpy() + running_var = state_dict["running_var"].numpy() + + bn_layer.set_weights([gamma, beta, running_mean, running_var]) + + +def main(_): + print("\n-> Loading HuggingFace model") + original_segformer = SegformerForSemanticSegmentation.from_pretrained( + DOWNLOAD_URLS[FLAGS.preset] + ) + + print("\n-> Instantiating KerasHub Model") + + resolution = int(FLAGS.preset.split("_")[-1]) + + encoder = keras_hub.models.MiTBackbone.from_preset( + "mit_" + FLAGS.preset, image_shape=(resolution, resolution, 3) + ) + segformer_backbone = keras_hub.models.SegFormerBackbone( + image_encoder=encoder, + projection_filters=PROJECTION_FILTERS[FLAGS.preset], + ) + num_classes = 150 if "ade20k" in FLAGS.preset else 19 + + preprocessor = SegFormerImageSegmenterPreprocessor() + segformer_segmenter = keras_hub.models.SegFormerImageSegmenter( + backbone=segformer_backbone, + num_classes=num_classes, + preprocessor=preprocessor, + ) + segformer_backbone(np.random.rand(1, resolution, resolution, 3)) + + set_dense_weights( + segformer_backbone.layers[5], + original_segformer.decode_head.linear_c[0].proj.state_dict(), + ) + set_dense_weights( + segformer_backbone.layers[4], + original_segformer.decode_head.linear_c[1].proj.state_dict(), + ) + set_dense_weights( + segformer_backbone.layers[3], + original_segformer.decode_head.linear_c[2].proj.state_dict(), + ) + set_dense_weights( + segformer_backbone.layers[2], + original_segformer.decode_head.linear_c[3].proj.state_dict(), + ) + set_conv_weights( + segformer_backbone.layers[-1].layers[0], + original_segformer.decode_head.linear_fuse.state_dict(), + ) + set_batchnorm_weights( + segformer_backbone.layers[-1].layers[1], + original_segformer.decode_head.batch_norm.state_dict(), + ) + + set_conv_weights( + segformer_segmenter.layers[-2], + original_segformer.decode_head.classifier.state_dict(), + ) + + print("\n-> Converting weights...") + + directory = f"SegFormer_{FLAGS.preset}" + print(f"\n-> Saving converted KerasHub model in {directory}") + segformer_segmenter.save_to_preset(directory) + + +if __name__ == "__main__": + flags.mark_flag_as_required("preset") + app.run(main)