Skip to content

Commit

Permalink
Add ExtractAttachmentsPreprocessor (#1978)
Browse files Browse the repository at this point in the history
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Steven Silvester <steven.silvester@ieee.org>
  • Loading branch information
3 people committed May 6, 2023
1 parent b56c272 commit 27804f1
Show file tree
Hide file tree
Showing 10 changed files with 258 additions and 20 deletions.
1 change: 1 addition & 0 deletions nbconvert/exporters/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class Exporter(LoggingConfigurable):
"nbconvert.preprocessors.LatexPreprocessor",
"nbconvert.preprocessors.HighlightMagicsPreprocessor",
"nbconvert.preprocessors.ExtractOutputPreprocessor",
"nbconvert.preprocessors.ExtractAttachmentsPreprocessor",
"nbconvert.preprocessors.ClearMetadataPreprocessor",
],
help="""List of preprocessors available by default, by name, namespace,
Expand Down
1 change: 1 addition & 0 deletions nbconvert/exporters/latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def default_config(self):
"text/plain",
]
},
"ExtractAttachmentsPreprocessor": {"enabled": True},
"ExtractOutputPreprocessor": {"enabled": True},
"SVG2PDFPreprocessor": {"enabled": True},
"LatexPreprocessor": {"enabled": True},
Expand Down
1 change: 1 addition & 0 deletions nbconvert/exporters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def _raw_mimetypes_default(self):
def default_config(self):
c = Config(
{
"ExtractAttachmentsPreprocessor": {"enabled": True},
"ExtractOutputPreprocessor": {"enabled": True},
"NbConvertBase": {
"display_data_priority": [
Expand Down
3 changes: 2 additions & 1 deletion nbconvert/exporters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,9 @@ def from_notebook_node(self, nb, resources=None, **kw):
# convert output extension to pdf
# the writer above required it to be tex
resources["output_extension"] = ".pdf"
# clear figure outputs, extracted by latex export,
# clear figure outputs and attachments, extracted by latex export,
# so we don't claim to be a multi-file export.
resources.pop("outputs", None)
resources.pop("attachments", None)

return pdf_data, resources
1 change: 1 addition & 0 deletions nbconvert/preprocessors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .convertfigures import ConvertFiguresPreprocessor
from .csshtmlheader import CSSHTMLHeaderPreprocessor
from .execute import ExecutePreprocessor
from .extractattachments import ExtractAttachmentsPreprocessor
from .extractoutput import ExtractOutputPreprocessor
from .highlightmagics import HighlightMagicsPreprocessor
from .latex import LatexPreprocessor
Expand Down
108 changes: 108 additions & 0 deletions nbconvert/preprocessors/extractattachments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
Module that extracts attachments from notebooks into their own files
"""

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import os
from base64 import b64decode

from traitlets import Bool, Unicode

from .base import Preprocessor


class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts cell attachments from a notebook into their own files.

    Every cell that carries an ``attachments`` mapping is processed: the
    base64 payload of each attachment is decoded and registered in
    ``resources`` so a writer can put it on disk, and ``attachment:<name>``
    references in the cell source are rewritten to point at the extracted file.

    By default (``use_separate_dir`` is False) the files are stored next to the
    extracted outputs, i.e. under ``resources["output_files_dir"]`` and in
    ``resources["outputs"]``. When ``use_separate_dir`` is True they go to a
    directory named after ``attachments_directory_template`` and are stored in
    ``resources["attachments"]``.

    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
    """

    attachments_directory_template = Unicode(
        "{notebook_name}_attachments",
        help="Directory to place attachments if use_separate_dir is True",
    ).tag(config=True)

    use_separate_dir = Bool(
        False,
        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
        "create a separate directory for attachments",
    ).tag(config=True)

    def __init__(self, **kw):
        """
        Public constructor
        """
        super().__init__(**kw)
        # Directory path; will be set in self.preprocess, which needs resources
        self.path_name = ""
        # Key under which extracted attachments are stored in resources.
        # Set here as a default, in case someone doesn't want to call preprocess
        self.resources_item_key = "attachments"

    def preprocess(self, nb, resources):
        """
        Determine the target directory and resources key, then process the notebook.

        Parameters
        ----------
        nb : NotebookNode
            Notebook being converted.
        resources : dict
            Conversion resources. Reads ``unique_key`` (separate directory mode)
            or ``output_files_dir`` (shared mode); the extracted files are added
            under ``attachments`` or ``outputs`` respectively.

        Returns
        -------
        tuple
            The ``(nb, resources)`` pair returned by the base class ``preprocess``.
        """
        if self.use_separate_dir:
            self.path_name = self.attachments_directory_template.format(
                notebook_name=resources["unique_key"]
            )
            # Initialize resources for attachments
            resources["attachment_files_dir"] = self.path_name
            resources["attachments"] = {}
            self.resources_item_key = "attachments"
        else:
            # Use same resources as ExtractOutput
            self.path_name = resources.get("output_files_dir", "")
            self.resources_item_key = "outputs"

        # Make sure the key exists. ``.get`` is used so that a missing key is
        # handled for plain dicts too, instead of raising KeyError.
        if not isinstance(resources.get(self.resources_item_key), dict):
            resources[self.resources_item_key] = {}

        nb, resources = super().preprocess(nb, resources)
        return nb, resources

    def preprocess_cell(self, cell, resources, index):
        """
        Extract attachments to individual files and
        change references to them.
        E.g.
        '![image.png](attachment:021fdd80.png)'
        becomes
        '![image.png]({path_name}/021fdd80.png)'
        Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).

        Parameters
        ----------
        cell : NotebookNode
            Cell to process; its ``source`` is modified in place.
        resources : dict
            Conversion resources; decoded files are stored in
            ``resources[self.resources_item_key]`` keyed by their path.
        index : int
            Index of the cell in the notebook (unused here).

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair.
        """
        if "attachments" not in cell:
            return cell, resources

        for fname, bundle in cell.attachments.items():
            self.log.debug(f"Encountered attachment {fname}")

            # Right now I don't know of a situation where there would be multiple
            # mime types under same filename, so only the first one is read.
            encoded = next(iter(bundle.values()), None)
            if encoded is None:
                # An empty bundle has nothing to extract. Skip it rather than
                # reusing the data of a previous attachment (or failing on an
                # unbound variable) and leave the reference untouched.
                self.log.warning("Attachment %s has no data, skipping", fname)
                continue

            # convert to bytes and decode
            decoded = b64decode(encoded.encode("utf-8"))

            # FilesWriter wants path to be in attachment filename here
            # NOTE(review): fname comes straight from the notebook; a name with
            # path separators or ".." would resolve outside path_name. Confirm
            # whether the writer guards against that.
            new_filename = os.path.join(self.path_name, fname)
            resources[self.resources_item_key][new_filename] = decoded

            # Edit the reference to the attachment.
            # os.path.join on windows uses "\\" separator,
            # but files like markdown still want "/"
            if os.path.sep != "/":
                new_filename = new_filename.replace(os.path.sep, "/")
            cell.source = cell.source.replace("attachment:" + fname, new_filename)

        return cell, resources
17 changes: 16 additions & 1 deletion nbconvert/preprocessors/tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.

from base64 import b64encode

from nbformat import v4 as nbformat

from ...exporters.exporter import ResourcesDict
Expand All @@ -12,7 +14,7 @@
class PreprocessorTestsBase(TestsBase):
"""Contains test functions preprocessor tests"""

def build_notebook(self, with_json_outputs=False):
def build_notebook(self, with_json_outputs=False, with_attachment=False):
"""Build a notebook in memory for use with preprocessor tests"""

outputs = [
Expand Down Expand Up @@ -42,6 +44,19 @@ def build_notebook(self, with_json_outputs=False):
nbformat.new_markdown_cell(source="$ e $"),
]

if with_attachment:
data = b"test"
encoded_data = b64encode(data)
# this is conversion of bytes to string, not base64 decoding
attachments = {"image.png": {"image/png": encoded_data.decode()}}
cells.extend(
[
nbformat.new_markdown_cell(
source="![image.png](attachment:image.png)", attachments=attachments
)
]
)

return nbformat.new_notebook(cells=cells)

def build_resources(self):
Expand Down
87 changes: 87 additions & 0 deletions nbconvert/preprocessors/tests/test_extractattachments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Tests for the ExtractAttachments preprocessor"""

# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.

import os
from base64 import b64decode

from ..extractattachments import ExtractAttachmentsPreprocessor
from .base import PreprocessorTestsBase


class TestExtractAttachments(PreprocessorTestsBase):
    """Test cases covering the ExtractAttachmentsPreprocessor"""

    def build_preprocessor(self):
        """Return an enabled ExtractAttachmentsPreprocessor"""
        instance = ExtractAttachmentsPreprocessor()
        instance.enabled = True
        return instance

    def test_constructor(self):
        """Constructing the preprocessor must not raise"""
        self.build_preprocessor()

    def test_attachment(self):
        """An attachment is stored in the outputs and the source points to it"""
        notebook = self.build_notebook(with_attachment=True)
        resources = self.build_resources()
        extractor = self.build_preprocessor()
        notebook, resources = extractor(notebook, resources)
        last_cell = notebook.cells[-1]

        # The attachment itself is still present on the cell and decodes
        # back to the original payload.
        cell_attachments = last_cell.attachments
        self.assertIn("image.png", cell_attachments)
        self.assertIn("image/png", cell_attachments["image.png"])
        encoded = cell_attachments["image.png"]["image/png"]
        # str -> bytes, then base64 decode
        self.assertEqual(b64decode(encoded.encode("utf-8")), b"test")

        # The decoded file was registered for the writer
        extracted = resources["outputs"]
        self.assertIn("image.png", extracted)
        self.assertEqual(extracted["image.png"], b"test")

        # The markdown reference now targets the extracted file
        self.assertEqual(last_cell.source, "![image.png](image.png)")

    def test_attachment_with_directory(self):
        """The source reference includes output_files_dir when one is set"""
        notebook = self.build_notebook(with_attachment=True)
        resources = self.build_resources()
        output_dir = "outputs"
        resources["output_files_dir"] = output_dir
        extractor = self.build_preprocessor()
        notebook, resources = extractor(notebook, resources)

        # The resources key uses the OS separator ("\\" on Windows)
        expected_key = os.path.join(output_dir, "image.png")
        self.assertIn(expected_key, resources["outputs"])

        # The reference in the source always uses "/", also on Windows
        self.assertEqual(notebook.cells[-1].source, "![image.png](outputs/image.png)")

    def test_use_separate_dir_config(self):
        """Test that use_separate_dir and attachments_directory_template work properly"""
        notebook = self.build_notebook(with_attachment=True)
        resources = self.build_resources()
        # The notebook name is used to build the directory name
        resources["unique_key"] = "notebook1"
        extractor = self.build_preprocessor()
        extractor.use_separate_dir = True
        extractor.attachments_directory_template = "{notebook_name}_custom"
        notebook, resources = extractor(notebook, resources)

        # The resources key uses the OS separator ("\\" on Windows)
        expected_key = os.path.join("notebook1_custom", "image.png")
        self.assertIn(expected_key, resources["attachments"])

        # The reference in the source always uses "/", also on Windows
        self.assertEqual(
            notebook.cells[-1].source, "![image.png](notebook1_custom/image.png)"
        )
34 changes: 24 additions & 10 deletions nbconvert/writers/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,19 @@ def _makedir(self, path):
self.log.info("Making directory %s", path)
ensure_dir_exists(path)

def _write_items(self, items, build_dir):
    """Write ``(filename, data)`` pairs as binary files below *build_dir*.

    *items* is iterated as pairs of a relative file name and the bytes to
    store (e.g. the ``.items()`` view of a filename->data dict). The parent
    directory of every destination is created through ``self._makedir``
    before the file is written.
    """
    for rel_name, payload in items:
        # Resolve the destination and make sure its directory exists
        target = os.path.join(build_dir, rel_name)
        self._makedir(os.path.dirname(target))

        # Dump the payload as-is, in binary mode
        self.log.debug("Writing %i bytes to %s", len(payload), target)
        with open(target, "wb") as handle:
            handle.write(payload)

def write(self, output, resources, notebook_name=None, **kw):
"""
Consume and write Jinja output to the file system. Output directory
Expand All @@ -73,7 +86,7 @@ def write(self, output, resources, notebook_name=None, **kw):
relpath = self.relpath or resource_path
build_directory = self.build_directory or resource_path

# Write all of the extracted resources to the destination directory.
# Write the extracted outputs to the destination directory.
# NOTE: WE WRITE EVERYTHING AS-IF IT'S BINARY. THE EXTRACT FIG
# PREPROCESSOR SHOULD HANDLE UNIX/WINDOWS LINE ENDINGS...

Expand All @@ -83,16 +96,17 @@ def write(self, output, resources, notebook_name=None, **kw):
"Support files will be in %s",
os.path.join(resources.get("output_files_dir", ""), ""),
)
for filename, data in items:
# Determine where to write the file to
dest = os.path.join(build_directory, filename)
path = os.path.dirname(dest)
self._makedir(path)
self._write_items(items, build_directory)

# Write file
self.log.debug("Writing %i bytes to support file %s", len(data), dest)
with open(dest, "wb") as f:
f.write(data)
# Write the extracted attachments
# if ExtractAttachmentsPreprocessor specified a separate directory
attachs = resources.get("attachments", {}).items()
if attachs:
self.log.info(
"Attachments will be in %s",
os.path.join(resources.get("attachment_files_dir", ""), ""),
)
self._write_items(attachs, build_directory)

# Copy referenced files to output directory
if build_directory:
Expand Down
Loading

0 comments on commit 27804f1

Please sign in to comment.