Skip to content

Commit

Permalink
Add fixed resize and pad strategy for object detection (#30742)
Browse files Browse the repository at this point in the history
* Add resize and pad strategy

* Merge get_size functions

* Add pad_size + tests to object detection models

* Fixup

* Update docstrings

* Fixup
  • Loading branch information
qubvel committed May 17, 2024
1 parent e9a8041 commit bf646fb
Show file tree
Hide file tree
Showing 13 changed files with 929 additions and 89 deletions.
8 changes: 7 additions & 1 deletion src/transformers/image_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,13 @@ def center_crop(
)


VALID_SIZE_DICT_KEYS = ({"height", "width"}, {"shortest_edge"}, {"shortest_edge", "longest_edge"}, {"longest_edge"})
VALID_SIZE_DICT_KEYS = (
{"height", "width"},
{"shortest_edge"},
{"shortest_edge", "longest_edge"},
{"longest_edge"},
{"max_height", "max_width"},
)


def is_valid_size_dict(size_dict):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)


# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
Important, even if image_height < max_height and image_width < max_width, the image will be resized
to at least one of the edges be equal to max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width


# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Expand Down Expand Up @@ -768,8 +804,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Expand All @@ -793,8 +837,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""

model_input_names = ["pixel_values", "pixel_mask"]
Expand All @@ -813,6 +862,7 @@ def __init__(
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
Expand Down Expand Up @@ -846,6 +896,7 @@ def __init__(
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
Expand All @@ -861,6 +912,7 @@ def __init__(
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format",
"return_tensors",
"data_format",
Expand Down Expand Up @@ -933,8 +985,15 @@ def resize(
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
Expand All @@ -953,18 +1012,27 @@ def resize(
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image

Expand Down Expand Up @@ -1108,6 +1176,7 @@ def pad(
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
Expand Down Expand Up @@ -1137,16 +1206,24 @@ def pad(
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)

annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
Expand All @@ -1160,7 +1237,7 @@ def pad(

if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
Expand Down Expand Up @@ -1195,6 +1272,7 @@ def preprocess(
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
Expand Down Expand Up @@ -1222,7 +1300,15 @@ def preprocess(
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
Expand All @@ -1240,8 +1326,9 @@ def preprocess(
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
Expand All @@ -1257,6 +1344,10 @@ def preprocess(
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
Expand Down Expand Up @@ -1286,6 +1377,7 @@ def preprocess(
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format

images = make_list_of_images(images)
Expand Down Expand Up @@ -1410,6 +1502,7 @@ def preprocess(
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [
Expand Down

0 comments on commit bf646fb

Please sign in to comment.