From 8b7788b1b1d57de609147f7f21ac3e347294f341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Thu, 17 Jul 2025 22:07:07 +0200 Subject: [PATCH 1/2] add testing for builtin_types --- tests/test_process_file.py | 39 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/tests/test_process_file.py b/tests/test_process_file.py index ebc4ff0..920ba29 100644 --- a/tests/test_process_file.py +++ b/tests/test_process_file.py @@ -27,13 +27,16 @@ def test_corrupted_pass(): assert len(tags) == 69 +@pytest.mark.parametrize("builtin_types", (True, False)) @pytest.mark.parametrize( "stop_tag, tag_count", (("ColorSpace", 39), (DEFAULT_STOP_TAG, 51)) ) -def test_stop_at_tag(stop_tag, tag_count): +def test_stop_at_tag(builtin_types, stop_tag, tag_count): file_path = RESOURCES_ROOT / "jpg/Canon_40D.jpg" with open(file_path, "rb") as fh: - tags = exifread.process_file(fh=fh, stop_tag=stop_tag) + tags = exifread.process_file( + fh=fh, stop_tag=stop_tag, builtin_types=builtin_types + ) assert len(tags) == tag_count @@ -118,3 +121,35 @@ def test_stop_tag_with_thumbnail_extract(): with open(file_path, "rb") as fh: tags = exifread.process_file(fh=fh, details=False, stop_tag="Orientation") assert tags + + +@pytest.mark.parametrize("details", (True, False)) +@pytest.mark.parametrize("truncate_tags", (True, False)) +@pytest.mark.parametrize("stop_tag", ("WhiteBalance", DEFAULT_STOP_TAG)) +def test_builtin_types(stop_tag, details, truncate_tags): + """ + When ``builtin_types=True``, always convert to Python types. + Test with various other options to make sure they don't interfere. + The "WhiteBalance" tag is after all the tags tested so must not have an impact. 
+ """ + file_path = RESOURCES_ROOT / "jpg/Canon_DIGITAL_IXUS_400.jpg" + with open(file_path, "rb") as fh: + tags = exifread.process_file( + fh=fh, + builtin_types=True, + stop_tag=stop_tag, + details=details, + truncate_tags=truncate_tags, + ) + # Short mapped to string value + assert tags["EXIF ColorSpace"] == "sRGB" + # Short + assert isinstance(tags["EXIF ExifImageLength"], int) + assert tags["EXIF ExifImageLength"] == 75 + # Ratio + assert isinstance(tags["EXIF ExposureTime"], float) + assert tags["EXIF ExposureTime"] == 0.005 + # ASCII + assert tags["Image Make"] == "Canon" + # Unknown / Undefined + assert tags["EXIF FlashPixVersion"] == "0100" From 79bc5bd9610911f64a953c803a840d3227b17415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 18 Jul 2025 22:26:27 +0200 Subject: [PATCH 2/2] document main function, refactor xmp stuff --- README.rst | 4 ++- exifread/__init__.py | 51 +++++++++++++------------------- exifread/core/exif_header.py | 22 ++------------ exifread/core/xmp.py | 57 ++++++++++++++++++++++++++++++++++++ tests/test_process_file.py | 12 ++++++++ 5 files changed, 95 insertions(+), 51 deletions(-) create mode 100644 exifread/core/xmp.py diff --git a/README.rst b/README.rst index 12534b3..11e21fe 100644 --- a/README.rst +++ b/README.rst @@ -198,7 +198,9 @@ For direct JSON serialization, combine this option with ``details=False`` to avo .. 
code-block:: python - json.dumps(exifread.process_file(file_handle, details=False, builtin_types=True)) + json.dumps( + exifread.process_file(file_handle, details=False, builtin_types=True) + ) Usage Example ============= diff --git a/exifread/__init__.py b/exifread/__init__.py index 30834ed..7a0eb98 100644 --- a/exifread/__init__.py +++ b/exifread/__init__.py @@ -8,6 +8,7 @@ from exifread.core.exceptions import ExifNotFound, InvalidExif from exifread.core.exif_header import ExifHeader from exifread.core.find_exif import determine_type, get_endian_str +from exifread.core.xmp import find_xmp_data from exifread.exif_log import get_logger from exifread.serialize import convert_types from exifread.tags import DEFAULT_STOP_TAG @@ -17,36 +18,9 @@ logger = get_logger() -def _get_xmp(fh: BinaryIO) -> bytes: - xmp_bytes = b"" - logger.debug("XMP not in Exif, searching file for XMP info...") - xml_started = False - xml_finished = False - for line in fh: - open_tag = line.find(b"<x:xmpmeta") - close_tag = line.find(b"</x:xmpmeta>") - if open_tag != -1: - xml_started = True - line = line[open_tag:] - logger.debug("XMP found opening tag at line position %s", open_tag) - if close_tag != -1: - logger.debug("XMP found closing tag at line position %s", close_tag) - line_offset = 0 - if open_tag != -1: - line_offset = open_tag - line = line[: (close_tag - line_offset) + 12] - xml_finished = True - if xml_started: - xmp_bytes += line - if xml_finished: - break - logger.debug("XMP Finished searching for info") - return xmp_bytes - - def process_file( fh: BinaryIO, - stop_tag=DEFAULT_STOP_TAG, + stop_tag: str = DEFAULT_STOP_TAG, details=True, strict=False, debug=False, @@ -56,10 +30,27 @@ def process_file( builtin_types=False, ) -> Dict[str, Any]: """ - Process an image file (expects an open file object). + Process an image file to extract EXIF metadata. This is the function that has to deal with all the arbitrary nasty bits of the EXIF standard. + + :param fh: the file to process, must be opened in binary mode. 
+ :param stop_tag: Stop processing when the given tag is retrieved. + :param details: If `True`, process MakerNotes. + :param strict: If `True`, raise exceptions on errors. + :param debug: Output debug information. + :param truncate_tags: If `True`, truncate the `printable` tag output. + There is no effect on tag `values`. + :param auto_seek: If `True`, automatically `seek` to the start of the file. + :param extract_thumbnail: If `True`, extract the JPEG thumbnail. + The thumbnail is not always present in the EXIF metadata. + :param builtin_types: If `True`, convert tags to standard Python types. + + :returns: A `dict` containing the EXIF metadata. + The keys are strings in the format `"IFD_NAME TAG_NAME"`. + If `builtin_types` is `False`, the value will be an `IfdTag` instance, or bytes. + If `builtin_types` is `True`, the value will be a standard Python type. """ if auto_seek: @@ -121,7 +112,7 @@ def process_file( xmp_bytes = bytes(xmp_tag.values) # We need to look in the entire file for the XML else: - xmp_bytes = _get_xmp(fh) + xmp_bytes = find_xmp_data(fh) if xmp_bytes: hdr.parse_xmp(xmp_bytes) diff --git a/exifread/core/exif_header.py b/exifread/core/exif_header.py index 24da23b..0a6aca5 100644 --- a/exifread/core/exif_header.py +++ b/exifread/core/exif_header.py @@ -5,11 +5,10 @@ import re import struct from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union -from xml.dom.minidom import parseString -from xml.parsers.expat import ExpatError from exifread.core.exceptions import ExifError from exifread.core.ifd_tag import IfdTag +from exifread.core.xmp import xmp_bytes_to_str from exifread.exif_log import get_logger from exifread.tags import ( DEFAULT_STOP_TAG, @@ -700,23 +699,6 @@ def parse_xmp(self, xmp_bytes: bytes): """Adobe's Extensible Metadata Platform, just dump the pretty XML.""" logger.debug("XMP cleaning data") - - # Pray that it's encoded in UTF-8 - # TODO: allow user to specify encoding - xmp_string = xmp_bytes.decode("utf-8") - - try: - 
pretty = parseString(xmp_string).toprettyxml() - except ExpatError: - logger.warning("XMP: XML is not well formed") - self.tags["Image ApplicationNotes"] = IfdTag( - xmp_string, 0, FieldType.BYTE, xmp_bytes, 0, 0 - ) - return - cleaned = [] - for line in pretty.splitlines(): - if line.strip(): - cleaned.append(line) self.tags["Image ApplicationNotes"] = IfdTag( - "\n".join(cleaned), 0, FieldType.BYTE, xmp_bytes, 0, 0 + xmp_bytes_to_str(xmp_bytes), 0, FieldType.BYTE, xmp_bytes, 0, 0 ) diff --git a/exifread/core/xmp.py b/exifread/core/xmp.py new file mode 100644 index 0000000..db301cb --- /dev/null +++ b/exifread/core/xmp.py @@ -0,0 +1,57 @@ +"""XMP related utilities..""" + +from pyexpat import ExpatError +from typing import BinaryIO +from xml.dom.minidom import parseString + +from exifread.exif_log import get_logger + +logger = get_logger() + + +def find_xmp_data(fh: BinaryIO) -> bytes: + xmp_bytes = b"" + logger.debug("XMP not in Exif, searching file for XMP info...") + xml_started = False + xml_finished = False + for line in fh: + open_tag = line.find(b"<x:xmpmeta") + close_tag = line.find(b"</x:xmpmeta>") + if open_tag != -1: + xml_started = True + line = line[open_tag:] + logger.debug("XMP found opening tag at line position %s", open_tag) + if close_tag != -1: + logger.debug("XMP found closing tag at line position %s", close_tag) + line_offset = 0 + if open_tag != -1: + line_offset = open_tag + line = line[: (close_tag - line_offset) + 12] + xml_finished = True + if xml_started: + xmp_bytes += line + if xml_finished: + break + logger.debug("Found %s XMP bytes", len(xmp_bytes)) + return xmp_bytes + + +def xmp_bytes_to_str(xmp_bytes: bytes) -> str: + """Adobe's Extensible Metadata Platform, just dump the pretty XML.""" + + logger.debug("Cleaning XMP data ...") + + # Pray that it's encoded in UTF-8 + # TODO: allow user to specify encoding + xmp_string = xmp_bytes.decode("utf-8") + + try: + pretty = parseString(xmp_string).toprettyxml() + except ExpatError: + logger.warning("XMP: XML is not well formed") + return 
xmp_string + cleaned = [] + for line in pretty.splitlines(): + if line.strip(): + cleaned.append(line) + return "\n".join(cleaned) diff --git a/tests/test_process_file.py b/tests/test_process_file.py index 920ba29..713afaf 100644 --- a/tests/test_process_file.py +++ b/tests/test_process_file.py @@ -153,3 +153,15 @@ def test_builtin_types(stop_tag, details, truncate_tags): assert tags["Image Make"] == "Canon" # Unknown / Undefined assert tags["EXIF FlashPixVersion"] == "0100" + + +def test_xmp_no_tag(): + """Read XMP data not in an Exif tag.""" + + file_path = RESOURCES_ROOT / "tiff/Arbitro.tiff" + with open(file_path, "rb") as fh: + tags = exifread.process_file( + fh=fh, + builtin_types=True, + ) + assert len(tags["Image ApplicationNotes"]) == 323