Add ExtractAttachmentsPreprocessor (#1978)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steven Silvester <steven.silvester@ieee.org>
jupyter · May 6, 2023 · 27804f1 · 27804f1
1 parent b56c272
commit 27804f1
Show file tree

Hide file tree

Showing 10 changed files with 258 additions and 20 deletions.
diff --git a/nbconvert/exporters/exporter.py b/nbconvert/exporters/exporter.py
@@ -95,6 +95,7 @@ class Exporter(LoggingConfigurable):
             "nbconvert.preprocessors.LatexPreprocessor",
             "nbconvert.preprocessors.HighlightMagicsPreprocessor",
             "nbconvert.preprocessors.ExtractOutputPreprocessor",
+            "nbconvert.preprocessors.ExtractAttachmentsPreprocessor",
             "nbconvert.preprocessors.ClearMetadataPreprocessor",
         ],
         help="""List of preprocessors available by default, by name, namespace,

diff --git a/nbconvert/exporters/latex.py b/nbconvert/exporters/latex.py
@@ -54,6 +54,7 @@ def default_config(self):
                         "text/plain",
                     ]
                 },
+                "ExtractAttachmentsPreprocessor": {"enabled": True},
                 "ExtractOutputPreprocessor": {"enabled": True},
                 "SVG2PDFPreprocessor": {"enabled": True},
                 "LatexPreprocessor": {"enabled": True},

diff --git a/nbconvert/exporters/markdown.py b/nbconvert/exporters/markdown.py
@@ -34,6 +34,7 @@ def _raw_mimetypes_default(self):
     def default_config(self):
         c = Config(
             {
+                "ExtractAttachmentsPreprocessor": {"enabled": True},
                 "ExtractOutputPreprocessor": {"enabled": True},
                 "NbConvertBase": {
                     "display_data_priority": [

diff --git a/nbconvert/exporters/pdf.py b/nbconvert/exporters/pdf.py
@@ -210,8 +210,9 @@ def from_notebook_node(self, nb, resources=None, **kw):
         # convert output extension to pdf
         # the writer above required it to be tex
         resources["output_extension"] = ".pdf"
-        # clear figure outputs, extracted by latex export,
+        # clear figure outputs and attachments, extracted by latex export,
         # so we don't claim to be a multi-file export.
         resources.pop("outputs", None)
+        resources.pop("attachments", None)
 
         return pdf_data, resources
diff --git a/nbconvert/preprocessors/__init__.py b/nbconvert/preprocessors/__init__.py
@@ -11,6 +11,7 @@
 from .convertfigures import ConvertFiguresPreprocessor
 from .csshtmlheader import CSSHTMLHeaderPreprocessor
 from .execute import ExecutePreprocessor
+from .extractattachments import ExtractAttachmentsPreprocessor
 from .extractoutput import ExtractOutputPreprocessor
 from .highlightmagics import HighlightMagicsPreprocessor
 from .latex import LatexPreprocessor

diff --git a/nbconvert/preprocessors/extractattachments.py b/nbconvert/preprocessors/extractattachments.py
@@ -0,0 +1,108 @@
+"""
+Module that extracts attachments from notebooks into their own files
+"""
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+import os
+from base64 import b64decode
+
+from traitlets import Bool, Unicode
+
+from .base import Preprocessor
+
+
+class ExtractAttachmentsPreprocessor(Preprocessor):
+    """
+    Extracts attachments from all (markdown and raw) cells in a notebook.
+    The extracted attachments are stored in a directory ('attachments' by default).
+    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
+    """
+
+    attachments_directory_template = Unicode(
+        "{notebook_name}_attachments",
+        help="Directory to place attachments if use_separate_dir is True",
+    ).tag(config=True)
+
+    use_separate_dir = Bool(
+        False,
+        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
+        "create a separate directory for attachments",
+    ).tag(config=True)
+
+    def __init__(self, **kw):
+        """
+        Public constructor
+        """
+        super().__init__(**kw)
+        # directory path,
+        self.path_name = ""  # will be set in self.preprocess, needs resources
+        # Where extracted attachments are stored in resources
+        self.resources_item_key = (
+            "attachments"  # Here as a default, in case someone doesn't want to call preprocess
+        )
+
+    # Add condition and configurability here
+    def preprocess(self, nb, resources):
+        """
+        Determine some settings and apply preprocessor to notebook
+        """
+        if self.use_separate_dir:
+            self.path_name = self.attachments_directory_template.format(
+                notebook_name=resources["unique_key"]
+            )
+            # Initialize resources for attachments
+            resources["attachment_files_dir"] = self.path_name
+            resources["attachments"] = {}
+            self.resources_item_key = "attachments"
+        else:
+            # Use same resources as ExtractOutput
+            self.path_name = resources["output_files_dir"]
+            self.resources_item_key = "outputs"
+
+        # Make sure key exists
+        if not isinstance(resources[self.resources_item_key], dict):
+            resources[self.resources_item_key] = {}
+
+        nb, resources = super().preprocess(nb, resources)
+        return nb, resources
+
+    def preprocess_cell(self, cell, resources, index):
+        """
+        Extract attachments to individual files and
+        change references to them.
+        E.g.
+        '![image.png](attachment:021fdd80.png)'
+        becomes
+        '![image.png]({path_name}/021fdd80.png)'
+        Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).
+        """
+        if "attachments" in cell:
+            for fname in cell.attachments:
+                self.log.debug(f"Encountered attachment {fname}")
+
+                # Add file for writer
+
+                # Right now I don't know of a situation where there would be multiple
+                # mime types under same filename, and I can't index into it without the mimetype.
+                # So I only read the first one.
+                for mimetype in cell.attachments[fname]:
+                    # convert to bytes and decode
+                    data = cell.attachments[fname][mimetype].encode("utf-8")
+                    decoded = b64decode(data)
+                    break
+
+                # FilesWriter wants path to be in attachment filename here
+                new_filename = os.path.join(self.path_name, fname)
+                resources[self.resources_item_key][new_filename] = decoded
+
+                # Edit the reference to the attachment
+
+                # os.path.join on windows uses "\\" separator,
+                # but files like markdown still want "/"
+                if os.path.sep != "/":
+                    new_filename = new_filename.replace(os.path.sep, "/")
+                cell.source = cell.source.replace("attachment:" + fname, new_filename)
+
+        return cell, resources
diff --git a/nbconvert/preprocessors/tests/base.py b/nbconvert/preprocessors/tests/base.py
@@ -3,6 +3,8 @@
 # Copyright (c) IPython Development Team.
 # Distributed under the terms of the Modified BSD License.
 
+from base64 import b64encode
+
 from nbformat import v4 as nbformat
 
 from ...exporters.exporter import ResourcesDict
@@ -12,7 +14,7 @@
 class PreprocessorTestsBase(TestsBase):
     """Contains test functions preprocessor tests"""
 
-    def build_notebook(self, with_json_outputs=False):
+    def build_notebook(self, with_json_outputs=False, with_attachment=False):
         """Build a notebook in memory for use with preprocessor tests"""
 
         outputs = [
@@ -42,6 +44,19 @@ def build_notebook(self, with_json_outputs=False):
             nbformat.new_markdown_cell(source="$ e $"),
         ]
 
+        if with_attachment:
+            data = b"test"
+            encoded_data = b64encode(data)
+            # this is conversion of bytes to string, not base64 decoding
+            attachments = {"image.png": {"image/png": encoded_data.decode()}}
+            cells.extend(
+                [
+                    nbformat.new_markdown_cell(
+                        source="![image.png](attachment:image.png)", attachments=attachments
+                    )
+                ]
+            )
+
         return nbformat.new_notebook(cells=cells)
 
     def build_resources(self):

diff --git a/nbconvert/preprocessors/tests/test_extractattachments.py b/nbconvert/preprocessors/tests/test_extractattachments.py
@@ -0,0 +1,87 @@
+"""Tests for the ExtractAttachments preprocessor"""
+
+# Copyright (c) IPython Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+import os
+from base64 import b64decode
+
+from ..extractattachments import ExtractAttachmentsPreprocessor
+from .base import PreprocessorTestsBase
+
+
+class TestExtractAttachments(PreprocessorTestsBase):
+    """Contains test functions for extractattachments.py"""
+
+    def build_preprocessor(self):
+        """Make an instance of a preprocessor"""
+        preprocessor = ExtractAttachmentsPreprocessor()
+        preprocessor.enabled = True
+        return preprocessor
+
+    def test_constructor(self):
+        """Can an ExtractAttachmentsPreprocessor be constructed?"""
+        self.build_preprocessor()
+
+    def test_attachment(self):
+        """Test the output of the ExtractAttachmentsPreprocessor"""
+        nb = self.build_notebook(with_attachment=True)
+        res = self.build_resources()
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # The attachment data must still be present in the cell after extraction.
+        attachments = nb.cells[-1].attachments
+        self.assertIn("image.png", attachments)
+        self.assertIn("image/png", attachments["image.png"])
+        data = attachments["image.png"]["image/png"]
+        # convert to bytes, then base64-decode (the result stays bytes)
+        data = b64decode(data.encode("utf-8"))
+        self.assertEqual(data, b"test")
+
+        # Verify the extracted file was registered in resources;
+        # the key carries no directory prefix in this test.
+        self.assertIn("image.png", res["outputs"])
+        self.assertEqual(res["outputs"]["image.png"], b"test")
+
+        # Verify cell source changed appropriately
+        src = nb.cells[-1].source
+        self.assertEqual(src, "![image.png](image.png)")
+
+    def test_attachment_with_directory(self):
+        """Test that cell source modifications are correct when files are put in a directory"""
+        nb = self.build_notebook(with_attachment=True)
+        res = self.build_resources()
+        output_dir = "outputs"
+        res["output_files_dir"] = output_dir
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Verify attachment
+        # This can have "\\" separator on Windows
+        file_path = os.path.join("outputs", "image.png")
+        self.assertIn(file_path, res["outputs"])
+
+        # Verify cell source changed appropriately
+        src = nb.cells[-1].source
+        # This shouldn't change on Windows
+        self.assertEqual(src, "![image.png](outputs/image.png)")
+
+    def test_use_separate_dir_config(self):
+        """Test that use_separate_dir and attachments_directory_template work properly"""
+        nb = self.build_notebook(with_attachment=True)
+        res = self.build_resources()
+        res["unique_key"] = "notebook1"  # add notebook name for the folder
+        preprocessor = self.build_preprocessor()
+        preprocessor.use_separate_dir = True
+        preprocessor.attachments_directory_template = "{notebook_name}_custom"
+        nb, res = preprocessor(nb, res)
+
+        # Verify attachment
+        # This can have "\\" separator on Windows
+        file_path = os.path.join("notebook1_custom", "image.png")
+        self.assertIn(file_path, res["attachments"])
+
+        # Verify cell source changed appropriately
+        src = nb.cells[-1].source
+        # This shouldn't change on Windows
+        self.assertEqual(src, "![image.png](notebook1_custom/image.png)")
diff --git a/nbconvert/writers/files.py b/nbconvert/writers/files.py
@@ -51,6 +51,19 @@ def _makedir(self, path):
             self.log.info("Making directory %s", path)
             ensure_dir_exists(path)
 
+    def _write_items(self, items, build_dir):
+        """Write a dict containing filename->binary data"""
+        for filename, data in items:
+            # Determine where to write the file to
+            dest = os.path.join(build_dir, filename)
+            path = os.path.dirname(dest)
+            self._makedir(path)
+
+            # Write file
+            self.log.debug("Writing %i bytes to %s", len(data), dest)
+            with open(dest, "wb") as f:
+                f.write(data)
+
     def write(self, output, resources, notebook_name=None, **kw):
         """
         Consume and write Jinja output to the file system.  Output directory
@@ -73,7 +86,7 @@ def write(self, output, resources, notebook_name=None, **kw):
         relpath = self.relpath or resource_path
         build_directory = self.build_directory or resource_path
 
-        # Write all of the extracted resources to the destination directory.
+        # Write the extracted outputs to the destination directory.
         # NOTE: WE WRITE EVERYTHING AS-IF IT'S BINARY.  THE EXTRACT FIG
         # PREPROCESSOR SHOULD HANDLE UNIX/WINDOWS LINE ENDINGS...
 
@@ -83,16 +96,17 @@ def write(self, output, resources, notebook_name=None, **kw):
                 "Support files will be in %s",
                 os.path.join(resources.get("output_files_dir", ""), ""),
             )
-        for filename, data in items:
-            # Determine where to write the file to
-            dest = os.path.join(build_directory, filename)
-            path = os.path.dirname(dest)
-            self._makedir(path)
+            self._write_items(items, build_directory)
 
-            # Write file
-            self.log.debug("Writing %i bytes to support file %s", len(data), dest)
-            with open(dest, "wb") as f:
-                f.write(data)
+        # Write the extracted attachments
+        # if ExtractAttachmentsPreprocessor specified a separate directory
+        attachs = resources.get("attachments", {}).items()
+        if attachs:
+            self.log.info(
+                "Attachments will be in %s",
+                os.path.join(resources.get("attachment_files_dir", ""), ""),
+            )
+            self._write_items(attachs, build_directory)
 
         # Copy referenced files to output directory
         if build_directory: