From 8b7788b1b1d57de609147f7f21ac3e347294f341 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= <ianare@gmail.com>
Date: Thu, 17 Jul 2025 22:07:07 +0200
Subject: [PATCH 1/2] add testing for builtin_types

---
 tests/test_process_file.py | 39 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/tests/test_process_file.py b/tests/test_process_file.py
index ebc4ff0..920ba29 100644
--- a/tests/test_process_file.py
+++ b/tests/test_process_file.py
@@ -27,13 +27,16 @@ def test_corrupted_pass():
     assert len(tags) == 69
 
 
+@pytest.mark.parametrize("builtin_types", (True, False))
 @pytest.mark.parametrize(
     "stop_tag, tag_count", (("ColorSpace", 39), (DEFAULT_STOP_TAG, 51))
 )
-def test_stop_at_tag(stop_tag, tag_count):
+def test_stop_at_tag(builtin_types, stop_tag, tag_count):
     file_path = RESOURCES_ROOT / "jpg/Canon_40D.jpg"
     with open(file_path, "rb") as fh:
-        tags = exifread.process_file(fh=fh, stop_tag=stop_tag)
+        tags = exifread.process_file(
+            fh=fh, stop_tag=stop_tag, builtin_types=builtin_types
+        )
     assert len(tags) == tag_count
 
 
@@ -118,3 +121,35 @@ def test_stop_tag_with_thumbnail_extract():
     with open(file_path, "rb") as fh:
         tags = exifread.process_file(fh=fh, details=False, stop_tag="Orientation")
     assert tags
+
+
+@pytest.mark.parametrize("details", (True, False))
+@pytest.mark.parametrize("truncate_tags", (True, False))
+@pytest.mark.parametrize("stop_tag", ("WhiteBalance", DEFAULT_STOP_TAG))
+def test_builtin_types(stop_tag, details, truncate_tags):
+    """
+    When ``builtin_types=True``, always convert to Python types.
+    Test with various other options to make sure they don't interfere.
+    The "WhiteBalance" tag is after all the tags tested so must not have an impact.
+    """
+    file_path = RESOURCES_ROOT / "jpg/Canon_DIGITAL_IXUS_400.jpg"
+    with open(file_path, "rb") as fh:
+        tags = exifread.process_file(
+            fh=fh,
+            builtin_types=True,
+            stop_tag=stop_tag,
+            details=details,
+            truncate_tags=truncate_tags,
+        )
+    # Short mapped to string value
+    assert tags["EXIF ColorSpace"] == "sRGB"
+    # Short
+    assert isinstance(tags["EXIF ExifImageLength"], int)
+    assert tags["EXIF ExifImageLength"] == 75
+    # Ratio
+    assert isinstance(tags["EXIF ExposureTime"], float)
+    assert tags["EXIF ExposureTime"] == 0.005
+    # ASCII
+    assert tags["Image Make"] == "Canon"
+    # Unknown / Undefined
+    assert tags["EXIF FlashPixVersion"] == "0100"

From 79bc5bd9610911f64a953c803a840d3227b17415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= <ianare@gmail.com>
Date: Fri, 18 Jul 2025 22:26:27 +0200
Subject: [PATCH 2/2] document main function, refactor xmp stuff

---
 README.rst                   |  4 ++-
 exifread/__init__.py         | 51 +++++++++++++-------------------
 exifread/core/exif_header.py | 22 ++------------
 exifread/core/xmp.py         | 57 ++++++++++++++++++++++++++++++++++++
 tests/test_process_file.py   | 12 ++++++++
 5 files changed, 95 insertions(+), 51 deletions(-)
 create mode 100644 exifread/core/xmp.py

diff --git a/README.rst b/README.rst
index 12534b3..11e21fe 100644
--- a/README.rst
+++ b/README.rst
@@ -198,7 +198,9 @@ For direct JSON serialization, combine this option with ``details=False`` to avo
 
 .. code-block:: python
 
-    json.dumps(exifread.process_file(file_handle, details=False, builtin_types=True))
+    json.dumps(
+        exifread.process_file(file_handle, details=False, builtin_types=True)
+    )
 
 Usage Example
 =============
diff --git a/exifread/__init__.py b/exifread/__init__.py
index 30834ed..7a0eb98 100644
--- a/exifread/__init__.py
+++ b/exifread/__init__.py
@@ -8,6 +8,7 @@
 from exifread.core.exceptions import ExifNotFound, InvalidExif
 from exifread.core.exif_header import ExifHeader
 from exifread.core.find_exif import determine_type, get_endian_str
+from exifread.core.xmp import find_xmp_data
 from exifread.exif_log import get_logger
 from exifread.serialize import convert_types
 from exifread.tags import DEFAULT_STOP_TAG
@@ -17,36 +18,9 @@
 logger = get_logger()
 
 
-def _get_xmp(fh: BinaryIO) -> bytes:
-    xmp_bytes = b""
-    logger.debug("XMP not in Exif, searching file for XMP info...")
-    xml_started = False
-    xml_finished = False
-    for line in fh:
-        open_tag = line.find(b"<x:xmpmeta")
-        close_tag = line.find(b"</x:xmpmeta>")
-        if open_tag != -1:
-            xml_started = True
-            line = line[open_tag:]
-            logger.debug("XMP found opening tag at line position %s", open_tag)
-        if close_tag != -1:
-            logger.debug("XMP found closing tag at line position %s", close_tag)
-            line_offset = 0
-            if open_tag != -1:
-                line_offset = open_tag
-            line = line[: (close_tag - line_offset) + 12]
-            xml_finished = True
-        if xml_started:
-            xmp_bytes += line
-        if xml_finished:
-            break
-    logger.debug("XMP Finished searching for info")
-    return xmp_bytes
-
-
 def process_file(
     fh: BinaryIO,
-    stop_tag=DEFAULT_STOP_TAG,
+    stop_tag: str = DEFAULT_STOP_TAG,
     details=True,
     strict=False,
     debug=False,
@@ -56,10 +30,27 @@ def process_file(
     builtin_types=False,
 ) -> Dict[str, Any]:
     """
-    Process an image file (expects an open file object).
+    Process an image file to extract EXIF metadata.
 
     This is the function that has to deal with all the arbitrary nasty bits
     of the EXIF standard.
+
+    :param fh: the file to process, must be opened in binary mode.
+    :param stop_tag: Stop processing when the given tag is retrieved.
+    :param details: If `True`, process MakerNotes.
+    :param strict: If `True`, raise exceptions on errors.
+    :param debug: Output debug information.
+    :param truncate_tags: If `True`, truncate the `printable` tag output.
+        There is no effect on tag `values`.
+    :param auto_seek: If `True`, automatically `seek` to the start of the file.
+    :param extract_thumbnail: If `True`, extract the JPEG thumbnail.
+        The thumbnail is not always present in the EXIF metadata.
+    :param builtin_types: If `True`, convert tags to standard Python types.
+
+    :returns: A `dict` containing the EXIF metadata.
+        The keys are a string in the format `"IFD_NAME TAG_NAME"`.
+        If `builtin_types` is `False`, the value will be an `IfdTag` instance, or bytes.
+        If `builtin_types` is `True`, the value will be a standard Python type.
     """
 
     if auto_seek:
@@ -121,7 +112,7 @@ def process_file(
             xmp_bytes = bytes(xmp_tag.values)
         # We need to look in the entire file for the XML
         else:
-            xmp_bytes = _get_xmp(fh)
+            xmp_bytes = find_xmp_data(fh)
         if xmp_bytes:
             hdr.parse_xmp(xmp_bytes)
 
diff --git a/exifread/core/exif_header.py b/exifread/core/exif_header.py
index 24da23b..0a6aca5 100644
--- a/exifread/core/exif_header.py
+++ b/exifread/core/exif_header.py
@@ -5,11 +5,10 @@
 import re
 import struct
 from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union
-from xml.dom.minidom import parseString
-from xml.parsers.expat import ExpatError
 
 from exifread.core.exceptions import ExifError
 from exifread.core.ifd_tag import IfdTag
+from exifread.core.xmp import xmp_bytes_to_str
 from exifread.exif_log import get_logger
 from exifread.tags import (
     DEFAULT_STOP_TAG,
@@ -700,23 +699,6 @@ def parse_xmp(self, xmp_bytes: bytes):
         """Adobe's Extensible Metadata Platform, just dump the pretty XML."""
 
         logger.debug("XMP cleaning data")
-
-        # Pray that it's encoded in UTF-8
-        # TODO: allow user to specify encoding
-        xmp_string = xmp_bytes.decode("utf-8")
-
-        try:
-            pretty = parseString(xmp_string).toprettyxml()
-        except ExpatError:
-            logger.warning("XMP: XML is not well formed")
-            self.tags["Image ApplicationNotes"] = IfdTag(
-                xmp_string, 0, FieldType.BYTE, xmp_bytes, 0, 0
-            )
-            return
-        cleaned = []
-        for line in pretty.splitlines():
-            if line.strip():
-                cleaned.append(line)
         self.tags["Image ApplicationNotes"] = IfdTag(
-            "\n".join(cleaned), 0, FieldType.BYTE, xmp_bytes, 0, 0
+            xmp_bytes_to_str(xmp_bytes), 0, FieldType.BYTE, xmp_bytes, 0, 0
         )
diff --git a/exifread/core/xmp.py b/exifread/core/xmp.py
new file mode 100644
index 0000000..db301cb
--- /dev/null
+++ b/exifread/core/xmp.py
@@ -0,0 +1,57 @@
+"""XMP related utilities."""
+
+from pyexpat import ExpatError
+from typing import BinaryIO
+from xml.dom.minidom import parseString
+
+from exifread.exif_log import get_logger
+
+logger = get_logger()
+
+
+def find_xmp_data(fh: BinaryIO) -> bytes:
+    xmp_bytes = b""
+    logger.debug("XMP not in Exif, searching file for XMP info...")
+    xml_started = False
+    xml_finished = False
+    for line in fh:
+        open_tag = line.find(b"<x:xmpmeta")
+        close_tag = line.find(b"</x:xmpmeta>")
+        if open_tag != -1:
+            xml_started = True
+            line = line[open_tag:]
+            logger.debug("XMP found opening tag at line position %s", open_tag)
+        if close_tag != -1:
+            logger.debug("XMP found closing tag at line position %s", close_tag)
+            line_offset = 0
+            if open_tag != -1:
+                line_offset = open_tag
+            line = line[: (close_tag - line_offset) + 12]
+            xml_finished = True
+        if xml_started:
+            xmp_bytes += line
+        if xml_finished:
+            break
+    logger.debug("Found %s XMP bytes", len(xmp_bytes))
+    return xmp_bytes
+
+
+def xmp_bytes_to_str(xmp_bytes: bytes) -> str:
+    """Adobe's Extensible Metadata Platform, just dump the pretty XML."""
+
+    logger.debug("Cleaning XMP data ...")
+
+    # Pray that it's encoded in UTF-8
+    # TODO: allow user to specify encoding
+    xmp_string = xmp_bytes.decode("utf-8")
+
+    try:
+        pretty = parseString(xmp_string).toprettyxml()
+    except ExpatError:
+        logger.warning("XMP: XML is not well formed")
+        return xmp_string
+    cleaned = []
+    for line in pretty.splitlines():
+        if line.strip():
+            cleaned.append(line)
+    return "\n".join(cleaned)
diff --git a/tests/test_process_file.py b/tests/test_process_file.py
index 920ba29..713afaf 100644
--- a/tests/test_process_file.py
+++ b/tests/test_process_file.py
@@ -153,3 +153,15 @@ def test_builtin_types(stop_tag, details, truncate_tags):
     assert tags["Image Make"] == "Canon"
     # Unknown / Undefined
     assert tags["EXIF FlashPixVersion"] == "0100"
+
+
+def test_xmp_no_tag():
+    """Read XMP data not in an Exif tag."""
+
+    file_path = RESOURCES_ROOT / "tiff/Arbitro.tiff"
+    with open(file_path, "rb") as fh:
+        tags = exifread.process_file(
+            fh=fh,
+            builtin_types=True,
+        )
+    assert len(tags["Image ApplicationNotes"]) == 323