From 76d6cb091e9789c5f6f8b1bb0b411664266fe00e Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Wed, 28 Sep 2022 19:11:49 -0400 Subject: [PATCH 01/14] Set up a new config format --- n2y/config.py | 49 +++++++++++++++++++++++++++++++++++++ tests/test_config.py | 57 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/n2y/config.py b/n2y/config.py index b4914489..e0457212 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -1,5 +1,8 @@ import json import logging +import copy + +import yaml from n2y.utils import strip_hyphens @@ -7,6 +10,52 @@ logger = logging.getLogger(__name__) +MASTER_DEFAULTS = { + "id_property": None, + "content_property": None, + "url_property": None, + "filename_property": None, + "plugins": [], + "filter": None, +} + + +def load_config(path): + try: + with open(path, "r") as config_file: + config = yaml.safe_load(config_file) + except yaml.YAMLError as exc: + logger.error("Error parsing the config file: %s", exc) + return None + except FileNotFoundError: + logger.error("The config file '%s' does not exist", path) + return None + merged_exports = merge_config( + config.get("exports", []), + MASTER_DEFAULTS, + config.get("export_defaults", {}), + ) + config["exports"] = merged_exports + # TODO: update how validate_database_config works to handle the new format + # TODO: validate the database config using the updated validate_database_config + return config + + +def merge_config(config_items, master_defaults, defaults): + """ + For each config item, merge in both the user provided defaults and the + builtin master defaults for each key value pair." 
+ """ + merged_config_items = [] + for config_item in config_items: + master_defaults_copy = copy.deepcopy(master_defaults) + defaults_copy = copy.deepcopy(defaults) + config_item_copy = copy.deepcopy(config_item) + merged_config_item = {**master_defaults_copy, **defaults_copy, **config_item_copy} + merged_config_items.append(merged_config_item) + return merged_config_items + + def database_config_json_to_dict(config_json): try: config = json.loads(config_json) diff --git a/tests/test_config.py b/tests/test_config.py index eff52137..725b2bf0 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,62 @@ -from n2y.config import validate_database_config +import yaml + +from n2y.config import validate_database_config, merge_config, load_config from n2y.notion_mocks import mock_id +def test_load_config_basic(tmp_path): + # use a temporary file to test the config loading + config_path = tmp_path / "config.yaml" + export_id = mock_id() + with open(config_path, "w") as f: + f.write(yaml.dump({ + "export_defaults": { + "id_property": "id", + "url_property": "url", + }, + "exports": [ + { + "id": export_id, + "pandoc_format": "gfm", + } + ] + })) + config = load_config(config_path) + merged_export = config["exports"][0] + assert merged_export["id"] == export_id + assert merged_export["id_property"] == "id" + assert merged_export["url_property"] == "url" + assert merged_export["pandoc_format"] == "gfm" + + +def test_merge_config_no_defaults(): + master_defaults = {"a": "1"} + defaults = {} + config_items = [ + {"b": "1"}, + {"b": "2", "a": "2"}, + ] + assert merge_config(config_items, master_defaults, defaults) == [ + {"a": "1", "b": "1"}, + {"b": "2", "a": "2"}, + ] + + +def test_merge_config_defaults(): + master_defaults = {"a": "1", "b": "1"} + defaults = {"a": "3"} + config_items = [ + {}, + {"a": "2"}, + {"b": "2"}, + ] + assert merge_config(config_items, master_defaults, defaults) == [ + {"a": "3", "b": "1"}, + {"a": "2", "b": "1"}, + {"a": "3", "b": 
"2"}, + ] + + def test_validate_database_config_empty(): assert validate_database_config({}) From ec5f4ec533495c92351948fe69aad49fd4996bcb Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Thu, 29 Sep 2022 15:43:34 -0400 Subject: [PATCH 02/14] Add some config validation and documentation --- README.md | 26 ++++++++++++ n2y/config.py | 99 ++++++++++++++++++++++++++++++-------------- tests/test_config.py | 73 ++++++++++++++++++++------------ 3 files changed, 140 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 162d9248..6d31ec60 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,32 @@ To do this, go to the "Settings and Members" page in Notion. You should see an " Finally, in Notion you'll need to share the relevant pages with your internal integration---just like you'd share a page with another person. +## Configuration + +N2y is configured using a single YAML file. This file contains a few top-level keys: + +The `exports` key contains a list of pages or databases to be exported. Each export config item is an object with the following keys: + +| Export key | Description | +| --- | --- | +| id | The notion database or page id, taken from the "share URL". | +| node_type | Either "database_as_yaml", "database_as_files", or "page". | +| pandoc_format | The [pandoc format](https://pandoc.org/MANUAL.html#general-options) that we're generating. | +| pandoc_options | A list of strings that are [writer options](https://pandoc.org/MANUAL.html#general-writer-options) for pandoc. | +| content_property | When set, it indicates the property name that will contain the content of the notion pages in that databse. If set to `None`, then only the page's properties will be included in the export. (Only applies to the `database_as_files` node type.) | +| id_property | When set, this indicates the property name in which to place the page's underlying notion ID. 
| +| url_property | When set, this indicates the property name in which to place the page's underlying notion url. | +| filename_property | This key is required for the "database_as_files" node type; when set, it indicates which property to use when generating the file name. | +| plugins | A list of python modules to use as plugins. | +| notion_filter | A [notion filter object](https://developers.notion.com/reference/post-database-query-filter) to be applied to the database. | +| notion_sort | A [notion sort object](https://developers.notion.com/reference/post-database-query-sort) to be applied to the database. | + +Each export entry can set these arguments differently. Default values for all of these keys, except for `id` and `node_type`, can be set using the `export_defaults` key. + +The `media_url` key sets the base URL for all downloaded media files (e.g., images, videos, PDFs, etc.). + +The `media_path` key sets the directory where media files should be downloaded to. + ## Example Usage ### Convert a Database to YAML diff --git a/n2y/config.py b/n2y/config.py index e0457212..df034a7d 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -1,4 +1,3 @@ -import json import logging import copy @@ -14,9 +13,14 @@ "id_property": None, "content_property": None, "url_property": None, - "filename_property": None, "plugins": [], - "filter": None, + "notion_filter": [], + "notion_sort": [], + "pandoc_formation": "gfm+tex_math_dollars+raw_attribute", + "pandoc_options": [ + '--wrap', 'none', # don't hard line-wrap + '--eol', 'lf', # use linux-style line endings + ], } @@ -30,14 +34,16 @@ def load_config(path): except FileNotFoundError: logger.error("The config file '%s' does not exist", path) return None + if not validate_config(config): + logger.error("Invalid config file: %s", path) + return None + merged_exports = merge_config( config.get("exports", []), MASTER_DEFAULTS, config.get("export_defaults", {}), ) config["exports"] = merged_exports - # TODO: update how 
validate_database_config works to handle the new format - # TODO: validate the database config using the updated validate_database_config return config @@ -56,35 +62,66 @@ def merge_config(config_items, master_defaults, defaults): return merged_config_items -def database_config_json_to_dict(config_json): - try: - config = json.loads(config_json) - except json.JSONDecodeError as exc: - logger.error("Error parsing the data config JSON: %s", exc.msg) - return None - if not validate_database_config(config): - return None - return config +def validate_config(config): + if "media_root" not in config: + logger.error("Config missing the 'media_root' key") + return False + if "media_root" not in config: + logger.error("Config missing the 'media_root' key") + return False + if "exports" not in config: + logger.error("Config missing the 'exports' key") + return False + if not isinstance(config["exports"], list) and len(config["exports"]) > 0: + logger.error("Config 'exports' key must be a non-empty list") + return False + for export in config["exports"]: + if not _validate_config_item(export): + return False + # TODO: validate the export defaults key + return True -def validate_database_config(config): - try: - for database_id, config_values in config.items(): - if not _valid_id(database_id): - logger.error("Invalid database id in database config: %s", database_id) - return False - for key, values in config_values.items(): - if key not in ["sorts", "filter"]: - logger.error("Invalid key in database config: %s", key) - return False - if not isinstance(values, dict) and not isinstance(values, list): - logger.error( - "Invalid value of type '%s' for key '%s' in database config, " - "expected dict or list", type(values), key, - ) - return False - except AttributeError: +def _validate_config_item(config_item): + if "id" not in config_item: + logger.error("Export config item missing the 'id' key") + return False + if not _valid_id(config_item["id"]): + logger.error("Invalid id in 
export config item: %s", config_item["id"]) + if "node_type" not in config_item: + logger.error("Export config item missing the 'node_type' key") + return False + if config_item["node_type"] not in ["page", "database_as_yaml", "database_as_files"]: + logger.error("Invalid node_type in export config item: %s", config_item["node_type"]) + return False + if config_item["node_type"] == "database_as_files" and "filename_property" not in config_item: + logger.error("Missing the 'filename_property' key when node_type is 'database_as_files'") + return False + if "notion_filter" in config_item: + if not _valid_notion_filter(config_item["notion_filter"]): + return False + if "notion_sort" in config_item: + if not _valid_notion_sort(config_item["notion_sort"]): + return False + # TODO: validate pandoc_formation + # TODO: validate pandoc_options + # TODO: validate plugins + return True + + +def _valid_notion_filter(notion_filter): + if not (isinstance(notion_filter, list) or isinstance(notion_filter, dict)): + logger.error("notion_sort must be a list or dict") + return False + # TODO validate keys and values + return True + + +def _valid_notion_sort(notion_sort): + if not (isinstance(notion_sort, list) or isinstance(notion_sort, dict)): + logger.error("notion_sort must be a list or dict") return False + # TODO validate keys and values return True diff --git a/tests/test_config.py b/tests/test_config.py index 725b2bf0..125e6b5a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,15 +1,28 @@ +import copy import yaml -from n2y.config import validate_database_config, merge_config, load_config +from n2y.config import ( + _valid_id, validate_config, merge_config, load_config, _valid_notion_filter, + _validate_config_item, MASTER_DEFAULTS +) from n2y.notion_mocks import mock_id +def mock_config_item(node_type): + config_item = copy.deepcopy(MASTER_DEFAULTS) + config_item["id"] = mock_id() + config_item["node_type"] = node_type + return config_item + + def 
test_load_config_basic(tmp_path): # use a temporary file to test the config loading config_path = tmp_path / "config.yaml" export_id = mock_id() with open(config_path, "w") as f: f.write(yaml.dump({ + "media_root": "media", + "media_url": "https://example.com/media", "export_defaults": { "id_property": "id", "url_property": "url", @@ -17,6 +30,7 @@ def test_load_config_basic(tmp_path): "exports": [ { "id": export_id, + "node_type": "page", "pandoc_format": "gfm", } ] @@ -24,6 +38,7 @@ def test_load_config_basic(tmp_path): config = load_config(config_path) merged_export = config["exports"][0] assert merged_export["id"] == export_id + assert merged_export["node_type"] == "page" assert merged_export["id_property"] == "id" assert merged_export["url_property"] == "url" assert merged_export["pandoc_format"] == "gfm" @@ -57,42 +72,46 @@ def test_merge_config_defaults(): ] -def test_validate_database_config_empty(): - assert validate_database_config({}) +def test_valid_id_valid(): + assert _valid_id(mock_id()) -def test_validate_database_config_no_props(): - assert validate_database_config({ - mock_id(): {}, - }) +def test_valid_id_invalid(): + assert not _valid_id(mock_id() + 'a') -def test_validate_database_config_invalid_id(): - invalid_id = mock_id() + 'a' - assert not validate_database_config({ - invalid_id: {}, +def test_valid_notion_filter_simple(): + assert _valid_notion_filter({ + "property": "title", + "direction": "ascending", }) -def test_validate_database_config_invalid_props(): - assert not validate_database_config({ - mock_id(): {'invalid': 'thing'}, - }) +def test_valid_notion_filter_complex(): + assert _valid_notion_filter([{ + "property": "title", + "direction": "ascending", + }]) -def test_validate_database_config_invalid_value(): - assert not validate_database_config({ - mock_id(): {'filter': 'invalid'}, - }) +def test_valid_config_item_missing_id(): + config_item = mock_config_item("page") + del config_item["id"] + assert not 
_validate_config_item(config_item) -def test_validate_database_config_valid_dict(): - assert validate_database_config({ - mock_id(): {'filter': {}}, - }) +def test_valid_config_item_missing_node_type(): + config_item = mock_config_item("page") + del config_item["node_type"] + assert not _validate_config_item(config_item) -def test_validate_database_config_valid_list(): - assert validate_database_config({ - mock_id(): {'filter': []}, - }) +def test_valid_config_item_invalid_node_type(): + config_item = mock_config_item("page") + config_item["node_type"] = "invalid" + assert not _validate_config_item(config_item) + + +def test_valid_config_item_missing_filename_property(): + config_item = mock_config_item("database_as_files") + assert not _validate_config_item(config_item) From 712a4d7f8cb598f892c92dde01558352006dec9c Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Thu, 29 Sep 2022 22:24:28 -0400 Subject: [PATCH 03/14] Allow multiple exports using a configuration file See the README changelog for more context. In addition, note that the code that is related to exporting notion objects to files is now placed in `export.py`. Previously it had been shared between `notion.py`, `page.py`, `main.py`, and `database.py`. This refactor simplifies the `Client` class and greatly reduces the number of configuration items we need to pass into it. It also makes the separation of concerns cleaner. 
--- README.md | 59 ++++--- n2y/blocks.py | 7 +- n2y/config.py | 47 +++--- n2y/database.py | 79 +-------- n2y/export.py | 125 ++++++++++++++ n2y/main.py | 247 ++++++++-------------------- n2y/notion.py | 36 ++-- n2y/page.py | 71 +------- n2y/utils.py | 2 + tests/test_audit_end_to_end.py | 8 +- tests/test_config.py | 8 +- tests/test_end_to_end.py | 290 ++++++++++++++------------------- 12 files changed, 410 insertions(+), 569 deletions(-) create mode 100644 n2y/export.py diff --git a/README.md b/README.md index 6d31ec60..08e790bf 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,20 @@ Finally, in Notion you'll need to share the relevant pages with your internal in N2y is configured using a single YAML file. This file contains a few top-level keys: -The `exports` key contains a list of pages or databases to be exported. Each export config item is an object with the following keys: +| Top-level key | Description | +| --- | --- | +| media_url | Sets the base URL for all downloaded media files (e.g., images, videos, PDFs, etc.) | +| media_root | The directory where media files should be downloaded to | +| exports | A list of export configuration items, indicating how a notion page or database is to be exported. See below for the keys. | +| export_defaults | Default values for the export configuration items. | + +The export configuration items may contain the following keys: | Export key | Description | | --- | --- | | id | The notion database or page id, taken from the "share URL". | | node_type | Either "database_as_yaml", "database_as_files", or "page". | +| output | The path the output file, or directory, where the data will be written. | | pandoc_format | The [pandoc format](https://pandoc.org/MANUAL.html#general-options) that we're generating. | | pandoc_options | A list of strings that are [writer options](https://pandoc.org/MANUAL.html#general-writer-options) for pandoc. 
| | content_property | When set, it indicates the property name that will contain the content of the notion pages in that databse. If set to `None`, then only the page's properties will be included in the export. (Only applies to the `database_as_files` node type.) | @@ -44,38 +52,46 @@ The `exports` key contains a list of pages or databases to be exported. Each exp | filename_property | This key is required for the "database_as_files" node type; when set, it indicates which property to use when generating the file name. | | plugins | A list of python modules to use as plugins. | | notion_filter | A [notion filter object](https://developers.notion.com/reference/post-database-query-filter) to be applied to the database. | -| notion_sort | A [notion sort object](https://developers.notion.com/reference/post-database-query-sort) to be applied to the database. | - -Each export entry can set these arguments differently. Default values for all of these keys, except for `id` and `node_type`, can be set using the `export_defaults` key. - -The `media_url` key sets the base URL for all downloaded media files (e.g., images, videos, PDFs, etc.). +| notion_sorts | A [notion sorts object](https://developers.notion.com/reference/post-database-query-sort) to be applied to the database. | -The `media_path` key sets the directory where media files should be downloaded to. +## Example Configuration Files -## Example Usage +The command is run using `n2y configuration.yaml`. ### Convert a Database to YAML -Copy the link for the database you'd like to export to YAML. Note that linked databases aren't supported. 
Then run: +A notion database (e.g., with a share URL like this https://www.notion.so/176fa24d4b7f4256877e60a1035b45a4?v=130ffd3224fd4512871bb45dbceaa7b2) could be exported into a YAML file using this minimal configuration file: ``` -n2y DATABASE_LINK > database.yml +exports: +- id: 176fa24d4b7f4256877e60a1035b45a4 + node_type: database_as_yaml + output: database.yml ``` ### Convert a Database to a set of Markdown Files +The same database could be exported into a set of markdown files as follows: + ``` -n2y -f markdown DATABASE_LINK +exports: +- id: 176fa24d4b7f4256877e60a1035b45a4 + node_type: database_as_files + output: directory + filename_property: "Name" ``` -This process will automatically skip untitled pages or pages with duplicate names. +Each page in the database will generate a single markdown file, named according to the `filename_property`. This process will automatically skip pages whose "Name" property is empty. ### Convert a Page to a Markdown File -If the page is in a database, then it's properties will be included in the YAML front matter. If the page is not in a database, then the title of the page will be included in the YAML front matter. +An individual notion page (e.g., with a share URL like this https://www.notion.so/All-Blocks-Test-Page-5f18c7d7eda44986ae7d938a12817cc0) could be exported to markdown with this minimal configuration file: ``` -n2y PAGE_LINK > page.md +exports: +- id: 5f18c7d7eda44986ae7d938a12817cc0 + node_type: page + output: page.md ``` ### Audit a Page and it's Children For External Links @@ -201,12 +217,12 @@ Note that any link to a page that the integration doesn't have access to will be ## Architecture -N2y's architecture is divided into four main steps: +An n2y run is divided into four stages: -1. Configuration +1. Loading the configuration (mostly in `config.py`) 2. Retrieve data from Notion (by instantiating various Notion object instances, e.g., `Page`, `Block`, `RichText`, etc.) 3. 
Convert to the pandoc AST (by calling `block.to_pandoc()`) -4. Writing the pandoc AST into markdown or YAML +4. Writing the pandoc AST into one of the various output formats (mostly in `export.py`) Every page object has a `parent` property, which may be a page, a database, or a workspace. @@ -245,12 +261,15 @@ Here are some features we're planning to add in the future: - Add support for recursively dumping sets of pages and preserving links between them - Add some sort of Notion API caching mechanism - Add more examples to the documentation -- Make it so that plugins and other configuration can be set for only a sub-set - of the exported pages, that way multiple configurations can be applied in a - single export ## Changelog +### v0.6.0 + +- The export is now configured using a single YAML file instead of the growing list of commandline arguments. Using a configuration file allows multiple page and database exports to be made in a single run, which in turn improves caching and will enable future improvements, like preserving links between generated HTML or markdown pages. +- Added the `pandoc_format` and `pandoc_options` fields, making it possible to output to any format that pandoc supports. +- Removed the ability to export a set of related databases (this is less useful now that we have a configuration file). + ### v0.5.0 - Add support for dumping the notion urls using `--url-property`. diff --git a/n2y/blocks.py b/n2y/blocks.py index 856ced95..e6586507 100644 --- a/n2y/blocks.py +++ b/n2y/blocks.py @@ -582,8 +582,13 @@ def __init__(self, client, notion_data, page, get_children=True): def to_pandoc(self): # TODO: in the future, if we are exporting the linked page too, then add # a link to the page. For now, we just display the text of the page. 
+ if self.link_type == "page_id": + node = self.client.get_page(self.linked_page_id) + elif self.link_type == "database_id": + node = self.client.get_database(self.linked_page_id) + else: + raise NotImplementedError(f"Unknown link type: {self.link_type}") - node = self.client.get_page_or_database(self.linked_page_id) if node is None: msg = "Permission denied when attempting to access linked node [%s]" logger.warning(msg, self.notion_url) diff --git a/n2y/config.py b/n2y/config.py index df034a7d..2c123ed3 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -9,14 +9,20 @@ logger = logging.getLogger(__name__) -MASTER_DEFAULTS = { +DEFAULTS = { + "media_root": "media", + "media_url": "./media/", + "plugins": [], +} + + +EXPORT_DEFAULTS = { "id_property": None, "content_property": None, "url_property": None, - "plugins": [], "notion_filter": [], - "notion_sort": [], - "pandoc_formation": "gfm+tex_math_dollars+raw_attribute", + "notion_sorts": [], + "pandoc_format": "gfm+tex_math_dollars+raw_attribute", "pandoc_options": [ '--wrap', 'none', # don't hard line-wrap '--eol', 'lf', # use linux-style line endings @@ -38,23 +44,26 @@ def load_config(path): logger.error("Invalid config file: %s", path) return None + defaults_copy = copy.deepcopy(DEFAULTS) + config = {**defaults_copy, **config} + merged_exports = merge_config( config.get("exports", []), - MASTER_DEFAULTS, + EXPORT_DEFAULTS, config.get("export_defaults", {}), ) config["exports"] = merged_exports return config -def merge_config(config_items, master_defaults, defaults): +def merge_config(config_items, builtin_defaults, defaults): """ For each config item, merge in both the user provided defaults and the - builtin master defaults for each key value pair." + builtin defaults for each key value pair." 
""" merged_config_items = [] for config_item in config_items: - master_defaults_copy = copy.deepcopy(master_defaults) + master_defaults_copy = copy.deepcopy(builtin_defaults) defaults_copy = copy.deepcopy(defaults) config_item_copy = copy.deepcopy(config_item) merged_config_item = {**master_defaults_copy, **defaults_copy, **config_item_copy} @@ -63,12 +72,6 @@ def merge_config(config_items, master_defaults, defaults): def validate_config(config): - if "media_root" not in config: - logger.error("Config missing the 'media_root' key") - return False - if "media_root" not in config: - logger.error("Config missing the 'media_root' key") - return False if "exports" not in config: logger.error("Config missing the 'exports' key") return False @@ -97,29 +100,31 @@ def _validate_config_item(config_item): if config_item["node_type"] == "database_as_files" and "filename_property" not in config_item: logger.error("Missing the 'filename_property' key when node_type is 'database_as_files'") return False + if "output" not in config_item: + logger.error("Export config item missing the 'output' key") + return False if "notion_filter" in config_item: if not _valid_notion_filter(config_item["notion_filter"]): return False - if "notion_sort" in config_item: - if not _valid_notion_sort(config_item["notion_sort"]): + if "notion_sorts" in config_item: + if not _valid_notion_sort(config_item["notion_sorts"]): return False # TODO: validate pandoc_formation # TODO: validate pandoc_options - # TODO: validate plugins return True def _valid_notion_filter(notion_filter): if not (isinstance(notion_filter, list) or isinstance(notion_filter, dict)): - logger.error("notion_sort must be a list or dict") + logger.error("notion_filter must be a list or dict") return False # TODO validate keys and values return True -def _valid_notion_sort(notion_sort): - if not (isinstance(notion_sort, list) or isinstance(notion_sort, dict)): - logger.error("notion_sort must be a list or dict") +def 
_valid_notion_sort(notion_sorts): + if not (isinstance(notion_sorts, list) or isinstance(notion_sorts, dict)): + logger.error("notion_sorts must be a list or dict") return False # TODO validate keys and values return True diff --git a/n2y/database.py b/n2y/database.py index 508962c8..8c4812f8 100644 --- a/n2y/database.py +++ b/n2y/database.py @@ -1,8 +1,5 @@ import logging -import yaml - -from n2y.property_values import RelationPropertyValue from n2y.utils import fromisoformat, sanitize_filename @@ -43,6 +40,11 @@ def children(self): self._children = self.client.get_database_pages(self.notion_id) return self._children + def children_filtered(self, filter, sort=None): + if self._children is None: + self._children = self.client.get_database_pages(self.notion_id, filter, sort) + return self._children + @property def parent(self): if self.notion_parent["type"] == "workspace": @@ -50,76 +52,5 @@ def parent(self): else: return self.client.get_page(self.notion_parent["page_id"]) - @property - def related_database_ids(self): - """ - This method is much more complicated than it should be due to - limitations of the Notion API. - - First, one would expect that the RelationProperty objects would be - present in the databases's properties features, however they do not - show up _unless_ the relationship is back to the same database. - - Secondly, one would expect that the page property endpoint - (https://developers.notion.com/reference/retrieve-a-page-property) - would enable one to retrieve the related database id from the property - directly, however, the database id doesn't appear to be returned there - either. - - As a last result, this method will first get the first page in a - database (raising an error if there are no pages). Then, it will loop - through the properties of the page to find any relationship properties. - Then, it will loop through all pages in the database to find one that - actually has a value of a page in the related database. 
Finally, we - retrieve the related page and get the database ID from the parent. - """ - ids = [] - database_title = self.title.to_plain_text() - if len(self.children) == 0: - logger.error( - 'Unable to identify relationships for empty database "%s"', - database_title - ) - return ids - first_page = self.children[0] - for prop_name, prop in first_page.properties.items(): - if isinstance(prop, RelationPropertyValue): - related_page_id = None - for page in self.children: - related_page_ids = page.properties[prop_name].ids - if len(related_page_ids) > 0: - related_page_id = related_page_ids[0] - break - if related_page_id is None: - logger.error( - 'Unable to identify related database for relationship "%s" ' - 'property in the "%s" database because there are no values ' - 'in the entire database (%s)', - prop_name, - database_title, - self.notion_url, - ) - else: - related_page = self.client.get_page(related_page_id) - assert related_page.notion_parent["type"] == "database_id" - ids.append(related_page.notion_parent["database_id"]) - return ids - def to_pandoc(self): return self.block.to_pandoc() - - def to_yaml(self): - content_property = self.client.content_property - if content_property in self.schema: - logger.warning( - 'The content property "%s" is shadowing an existing ' - 'property with the same name', content_property, - ) - results = [] - for page in self.children: - result = page.properties_to_values() - if content_property: - content = page.content_to_markdown() - result[content_property] = content - results.append(result) - return yaml.dump(results, sort_keys=False) diff --git a/n2y/export.py b/n2y/export.py new file mode 100644 index 00000000..3207fef4 --- /dev/null +++ b/n2y/export.py @@ -0,0 +1,125 @@ +""" +This module contains all the code responsible for exporting `page.Page` and +`database.Database` objects into the various supported file formats. 
+""" +import os +import logging + +import yaml + +from n2y.utils import pandoc_write_or_log_errors, sanitize_filename + +logger = logging.getLogger(__name__) + + +def _page_properties(page, id_property, url_property): + properties = page.properties_to_values() + if id_property in properties: + logger.warning( + 'The id property "%s" is shadowing an existing ' + 'property with the same name', id_property, + ) + if id_property: + properties[id_property] = page.notion_id + + if url_property in properties: + logger.warning( + 'The url property "%s" is shadowing an existing ' + 'property with the same name', url_property, + ) + if url_property: + properties[url_property] = page.notion_url + return properties + + +def export_page(page, pandoc_format, pandoc_options, id_property=None, url_property=None): + page_properties = _page_properties(page, id_property, url_property) + pandoc_ast = page.to_pandoc() + page_content = pandoc_write_or_log_errors(pandoc_ast, pandoc_format, pandoc_options) + return '\n'.join([ + '---', + yaml.dump(page_properties) + '---', + page_content, + ]) + + +def database_to_yaml( + database, + pandoc_format, + pandoc_options, + id_property=None, + url_property=None, + content_property=None, + notion_filter=None, + notion_sorts=None, +): + if content_property in database.schema: + logger.warning( + 'The content property "%s" is shadowing an existing ' + 'property with the same name', content_property, + ) + results = [] + for page in database.children_filtered(notion_filter, notion_sorts): + result = _page_properties(page, id_property, url_property) + if content_property: + pandoc_ast = page.to_pandoc() + if pandoc_ast: + result[content_property] = pandoc_write_or_log_errors( + pandoc_ast, pandoc_format, pandoc_options, + ) + else: + result[content_property] = None + results.append(result) + return yaml.dump(results, sort_keys=False) + + +def database_to_markdown_files( + database, + directory, + pandoc_format, + pandoc_options, + 
filename_property=None, + notion_filter=None, + notion_sorts=None, + id_property=None, + url_property=None, +): + os.makedirs(directory, exist_ok=True) + seen_file_names = set() + counts = {'unnamed': 0, 'duplicate': 0} + for page in database.children_filtered(notion_filter, notion_sorts): + page_filename = _page_filename(page, filename_property) + if page_filename: + if page_filename not in seen_file_names: + seen_file_names.add(page_filename) + with open(os.path.join(directory, f"{page_filename}.md"), 'w') as f: + document = export_page( + page, + pandoc_format, + pandoc_options, + id_property, + url_property, + ) + f.write(document) + else: + logger.warning('Skipping page named "%s" since it has been used', page_filename) + counts['duplicate'] += 1 + else: + counts['unnamed'] += 1 + for key, count in counts.items(): + if count > 0: + logger.info("%d %s page(s) skipped", count, key) + + +def _page_filename(page, filename_property): + # TODO: switch to using the database's natural keys as the file names + if filename_property is None: + return sanitize_filename(page.title.to_plain_text()) + elif filename_property in page.properties: + return sanitize_filename(page.properties[filename_property].to_value()) + else: + logger.warning( + 'Invalid filename property, "%s". 
Valid options are %s', + filename_property, ", ".join(page.properties.keys()), + ) + return sanitize_filename(page.title.to_plain_text()) diff --git a/n2y/main.py b/n2y/main.py index 708f482b..5f415bba 100644 --- a/n2y/main.py +++ b/n2y/main.py @@ -4,11 +4,8 @@ import argparse from n2y import notion -from n2y.database import Database -from n2y.page import Page -from n2y.errors import APIErrorCode, APIResponseError -from n2y.utils import id_from_share_link -from n2y.config import database_config_json_to_dict +from n2y.export import export_page, database_to_yaml, database_to_markdown_files +from n2y.config import load_config logger = None @@ -24,76 +21,11 @@ def main(raw_args, access_token): description="Move data from Notion into YAML/markdown", formatter_class=argparse.RawTextHelpFormatter, ) - parser.add_argument("object_id", help="The id or url for a Notion database or page") - parser.add_argument( - "--format", '-f', - choices=["yaml", "yaml-related", "markdown", "html"], default="yaml", - help=( - "Select output type (only applies to databases)\n" - " yaml - log yaml to stdout\n" - " yaml-related - save all related databases to a set of YAML files\n" - " markdown - create a markdown file for each page" - " html - create an html file for each page" - ) - ) - parser.add_argument( - "--content-property", default='', - help=( - "Store each database page's content in this property. " - "The page's content isn't exported if it's set to a blank string. " - "Only applies when dumping a database to YAML." - ) - ) - parser.add_argument( - "--id-property", default='id', - help=( - "Store each database page's id in this property. " - "The page's id isn't exported if it's set to a blank string. " - ) - ) - parser.add_argument( - "--url-property", default='url', - help=( - "Store each database page's url in this property. " - "The page's id isn't exported if it's set to a blank string. 
" - ) - ) - parser.add_argument( - "--filename-property", default=None, - help=( - "The database property used to generate the filename for its pages. " - "Only applies when dumping a database to markdown files." - ) - ) - parser.add_argument( - "--media-root", help="Filesystem path to directory where images and media are saved" - ) - parser.add_argument("--media-url", help="URL for media root; must end in slash if non-empty") - parser.add_argument( - "--plugin", '-p', action='append', - help="Plugin module location, e.g. ('n2y.plugins.deepheaders')", - ) - parser.add_argument( - "--output", '-o', default='./', - help="Relative path to output directory", - ) + parser.add_argument("config", help="The path to the config file") parser.add_argument( "--verbosity", '-v', default='INFO', help="Level to set the root logging module to", ) - parser.add_argument( - "--logging-format", default='%(asctime)s - %(levelname)s: %(message)s', - help="Default format used when logging", - ) - parser.add_argument( - "--database-config", default='{}', - help=( - "A JSON string in the format {database_id: {sorts: {...}, filter: {...}}}. " - "These can be used to filter and sort databases. 
See " - "https://developers.notion.com/reference/post-database-query-filter and " - "https://developers.notion.com/reference/post-database-query-sort" - ) - ) # TODO: Add the ability to dump out a "schema" file that contains the schema # for a set of databases @@ -103,7 +35,7 @@ def main(raw_args, access_token): args = parser.parse_args(raw_args) logging_level = logging.__dict__[args.verbosity] - logging.basicConfig(format=args.logging_format, level=logging_level) + logging.basicConfig(level=logging_level) global logger logger = logging.getLogger(__name__) @@ -111,122 +43,75 @@ def main(raw_args, access_token): logger.critical('No NOTION_ACCESS_TOKEN environment variable is set') return 1 - object_id = id_from_share_link(args.object_id) - media_root = args.media_root or args.output - - database_config = database_config_json_to_dict(args.database_config) - valid_database_config = database_config is not None - if not valid_database_config: - logger.critical( - 'Database config validation failed. 
Please make sure you pass in ' - 'a JSON string with the format {database_id: {sorts: {...}, filter: {...}}}' - ) - return 1 + config = load_config(args.config) + if config is None: + return 2 client = notion.Client( access_token, - media_root, - args.media_url, - plugins=args.plugin, - content_property=args.content_property, - id_property=args.id_property, - url_property=args.url_property, - filename_property=args.filename_property, - database_config=database_config, + config["media_root"], + config["media_url"], + config["plugins"], ) - node = client.get_page_or_database(object_id) - - if isinstance(node, Database) and args.format == 'markdown': - export_database_as_markdown_files(node, options=args) - if isinstance(node, Database) and args.format == 'html': - export_database_as_html_files(node, options=args) - elif isinstance(node, Database) and args.format == 'yaml': - print(node.to_yaml()) - elif isinstance(node, Database) and args.format == 'yaml-related': - export_related_databases(node, options=args) - elif isinstance(node, Page): - print(node.to_markdown()) - elif node is None: - msg = ( - "Unable to find database or page with id %s. " - "Perhaps its not shared with the integration?" 
- ) - logger.error(msg, object_id) - return 2 - - return 0 - - -def export_database_as_markdown_files(database, options): - os.makedirs(options.output, exist_ok=True) - seen_file_names = set() - counts = {'unnamed': 0, 'duplicate': 0} - for page in database.children: - if page.filename: - if page.filename not in seen_file_names: - seen_file_names.add(page.filename) - with open(os.path.join(options.output, f"{page.filename}.md"), 'w') as f: - f.write(page.to_markdown()) - else: - logger.warning('Skipping page named "%s" since it has been used', page.filename) - counts['duplicate'] += 1 + for export in config['exports']: + # TODO: swap out plugins + node_type = export["node_type"] + if node_type == "page": + page = client.get_page(export['id']) + if page is None: + msg = ( + "Unable to find page with id '%s'. " + "Perhaps the integration doesn't have permission to access this page?" + ) + logger.error(msg, export['id']) + continue + result = export_page( + page, + export["pandoc_format"], + export["pandoc_options"], + export["id_property"], + export["url_property"], + ) + with open(export["output"], "w") as f: + f.write(result) else: - counts['unnamed'] += 1 - for key, count in counts.items(): - if count > 0: - logger.info("%d %s page(s) skipped", count, key) - - -# Note these two functions are quite similar; if a third copy is needed, find a -# way to de-duplicate -def export_database_as_html_files(database, options): - os.makedirs(options.output, exist_ok=True) - seen_file_names = set() - counts = {'unnamed': 0, 'duplicate': 0} - for page in database.children: - if page.filename: - if page.filename not in seen_file_names: - seen_file_names.add(page.filename) - with open(os.path.join(options.output, f"{page.filename}.html"), 'w') as f: - f.write(page.to_html()) + database = client.get_database(export['id']) + if database is None: + msg = ( + "Unable to find database with id '%s'. " + "Perhaps the integration doesn't have permission to access this page?" 
+ ) + logger.error(msg, export['id']) + continue + if node_type == "database_as_yaml": + result = database_to_yaml( + database=database, + pandoc_format=export["pandoc_format"], + pandoc_options=export["pandoc_options"], + id_property=export["id_property"], + url_property=export["url_property"], + content_property=export["content_property"], + notion_filter=export["notion_filter"], + notion_sorts=export["notion_sorts"], + ) + with open(export["output"], "w") as f: + f.write(result) + elif node_type == "database_as_files": + database_to_markdown_files( + database=database, + directory=export["output"], + pandoc_format=export["pandoc_format"], + pandoc_options=export["pandoc_options"], + filename_property=export["filename_property"], + notion_filter=export["notion_filter"], + notion_sorts=export["notion_sorts"], + id_property=export["id_property"], + url_property=export["url_property"], + ) else: - logger.warning('Skipping page named "%s" since it has been used', page.filename) - counts['duplicate'] += 1 - else: - counts['unnamed'] += 1 - for key, count in counts.items(): - if count > 0: - logger.info("%d %s page(s) skipped", count, key) - - -def export_related_databases(seed_database, options): - os.makedirs(options.output, exist_ok=True) - - seen_database_ids = set() - seen_file_names = set() - - def _export_related_databases(database): - seen_database_ids.add(database.notion_id) - if database.filename not in seen_file_names: - seen_file_names.add(database.filename) - with open(os.path.join(options.output, f"{database.filename}.yml"), 'w') as f: - f.write(database.to_yaml()) - else: - logger.warning('Database name "%s" has been used', database.filename) - for database_id in database.related_database_ids: - if database_id not in seen_database_ids: - try: - related_database = database.client.get_database(database_id) - _export_related_databases(related_database) - except APIResponseError as err: - if err.code == APIErrorCode.ObjectNotFound: - msg = 'Skipping database 
with id "%s" due to lack of permissions' - logger.warning(msg, database_id) - else: - raise err - - _export_related_databases(seed_database) + logger.error("Unknown node_type '%s'", node_type) + return 0 if __name__ == "__main__": diff --git a/n2y/notion.py b/n2y/notion.py index a163d3e3..a87f722b 100644 --- a/n2y/notion.py +++ b/n2y/notion.py @@ -23,7 +23,6 @@ from n2y.user import User from n2y.rich_text import DEFAULT_RICH_TEXTS, RichTextArray from n2y.mentions import DEFAULT_MENTIONS -from n2y.utils import strip_hyphens DEFAULT_NOTION_CLASSES = { @@ -49,10 +48,9 @@ class Client: """ An instance of the client class has a few purposes: - 1. To store configuration - 2. To retrieve data from Notion - 3. To determine what classes to use to wrap this notion data, based on the configuration - 4. To act as a shared global store for all of the objects that are pulled + 1. To retrieve data from Notion + 2. To determine what classes to use to wrap this notion data + 3. To act as a shared global store for all of the objects that are pulled from Notion. 
In particular there is a cache of all pages and databases which ensure that @@ -66,20 +64,10 @@ def __init__( media_root='.', media_url='', plugins=None, - content_property=None, - id_property=None, - url_property=None, - filename_property=None, - database_config=None, ): self.access_token = access_token self.media_root = media_root self.media_url = media_url - self.content_property = content_property - self.id_property = id_property - self.url_property = url_property - self.filename_property = filename_property - self.database_config = database_config if database_config is not None else {} self.base_url = "https://api.notion.com/v1/" self.headers = { @@ -258,14 +246,18 @@ def get_database(self, database_id): self.databases_cache[database_id] = database return database - def get_database_pages(self, database_id): - notion_pages = self.get_database_notion_pages(database_id) + def get_database_pages(self, database_id, filter=None, sorts=None): + notion_pages = self.get_database_notion_pages(database_id, filter, sorts) return [self._wrap_notion_page(np) for np in notion_pages] - def get_database_notion_pages(self, database_id): + def get_database_notion_pages(self, database_id, filter, sorts): results = [] url = f"{self.base_url}databases/{database_id}/query" - request_data = self._create_database_request_data(database_id) + request_data = {} + if filter: + request_data["filter"] = filter + if sorts: + request_data["sorts"] = sorts while True: data = self._post_url(url, request_data) results.extend(data["results"]) @@ -274,10 +266,6 @@ def get_database_notion_pages(self, database_id): else: request_data["start_cursor"] = data["next_cursor"] - def _create_database_request_data(self, database_id): - stripped_database_id = strip_hyphens(database_id) - return self.database_config.get(stripped_database_id, {}) - def get_page(self, page_id): """ Retrieve the page if its not in the cache. 
@@ -387,7 +375,7 @@ def save_file(self, content_iterator, page, extension): num_hash_characters = 8 # just long enough to avoid collisions hash = hash_md5.hexdigest()[:num_hash_characters] - relative_filepath = "".join([page.filename, "-", hash, extension]) + relative_filepath = "".join([hash, extension]) full_filepath = path.join(self.media_root, relative_filepath) makedirs(path.dirname(full_filepath), exist_ok=True) diff --git a/n2y/page.py b/n2y/page.py index 1118bb4c..bc6e7a88 100644 --- a/n2y/page.py +++ b/n2y/page.py @@ -1,9 +1,8 @@ import logging -import yaml from .blocks import ChildDatabaseBlock, ChildPageBlock -from n2y.utils import pandoc_ast_to_html, pandoc_ast_to_markdown, fromisoformat, sanitize_filename +from n2y.utils import fromisoformat from n2y.property_values import TitlePropertyValue @@ -83,74 +82,8 @@ def parent(self): assert parent_type == "database_id" return self.client.get_database(self.notion_parent["database_id"]) - @property - def filename(self): - # TODO: switch to using the database's natural keys as the file names - filename_property = self.client.filename_property - if filename_property is None: - return sanitize_filename(self.title.to_plain_text()) - elif filename_property in self.properties: - return sanitize_filename(self.properties[filename_property].to_value()) - else: - logger.warning( - 'Invalid filename property, "%s". 
Valid options are %s', - filename_property, ", ".join(self.properties.keys()), - ) - return sanitize_filename(self.title.to_plain_text()) - def to_pandoc(self): return self.block.to_pandoc() - def content_to_markdown(self): - pandoc_ast = self.to_pandoc() - if pandoc_ast is not None: - return pandoc_ast_to_markdown(pandoc_ast) - else: - return None - def properties_to_values(self): - properties = {k: v.to_value() for k, v in self.properties.items()} - - id_property = self.client.id_property - if id_property in properties: - logger.warning( - 'The id property "%s" is shadowing an existing ' - 'property with the same name', id_property, - ) - if id_property: - notion_id = self.notion_id - properties[id_property] = notion_id - - url_property = self.client.url_property - if url_property in properties: - logger.warning( - 'The url property "%s" is shadowing an existing ' - 'property with the same name', url_property, - ) - if url_property: - properties[url_property] = self.notion_url - return properties - - def to_markdown(self): - return '\n'.join([ - '---', - yaml.dump(self.properties_to_values()) + '---', - self.content_to_markdown() or '', - ]) - - def content_to_html(self): - pandoc_ast = self.to_pandoc() - if pandoc_ast is not None: - return pandoc_ast_to_html(pandoc_ast) - else: - return '' - - def to_html(self): - # currently, the html output is generated for jekyll sites, hence the - # inclusion of the YAML front matter - # if someone needs just the HTML we should generalize - return '\n'.join([ - '---', - yaml.dump(self.properties_to_values()) + '---', - self.content_to_html() or '', - ]) + return {k: v.to_value() for k, v in self.properties.items()} diff --git a/n2y/utils.py b/n2y/utils.py index 0ef08f17..f37ffd29 100644 --- a/n2y/utils.py +++ b/n2y/utils.py @@ -69,6 +69,8 @@ def pandoc_ast_to_html(pandoc_ast): def pandoc_write_or_log_errors(pandoc_ast, format, options): + if pandoc_ast is None or pandoc_ast == []: + return "" try: # TODO: add a mechanism to 
customize this return pandoc.write(pandoc_ast, format=format, options=options) diff --git a/tests/test_audit_end_to_end.py b/tests/test_audit_end_to_end.py index fe89657b..c5e87d5d 100644 --- a/tests/test_audit_end_to_end.py +++ b/tests/test_audit_end_to_end.py @@ -7,17 +7,13 @@ def run_n2yaudit(arguments): old_stdout = sys.stdout - old_stderr = sys.stderr sys.stdout = StringIO() - sys.stderr = StringIO() try: status = main(arguments, NOTION_ACCESS_TOKEN) stdout = sys.stdout.getvalue() - stderr = sys.stderr.getvalue() finally: sys.stdout = old_stdout - sys.stderr = old_stderr - return status, stdout, stderr + return status, stdout def test_audit(): @@ -26,7 +22,7 @@ def test_audit(): https://fresh-pencil-9f3.notion.site/Audited-cfa8ff07bba244c8b967c9b6a7a954c1 ''' object_id = 'cfa8ff07bba244c8b967c9b6a7a954c1' - status, stdoutput, _ = run_n2yaudit([object_id]) + status, stdoutput = run_n2yaudit([object_id]) assert status == 3 external_mention_in_top_page = \ diff --git a/tests/test_config.py b/tests/test_config.py index 125e6b5a..a30f21b8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,14 +2,14 @@ import yaml from n2y.config import ( - _valid_id, validate_config, merge_config, load_config, _valid_notion_filter, - _validate_config_item, MASTER_DEFAULTS + _valid_id, merge_config, load_config, _valid_notion_filter, + _validate_config_item, EXPORT_DEFAULTS ) from n2y.notion_mocks import mock_id def mock_config_item(node_type): - config_item = copy.deepcopy(MASTER_DEFAULTS) + config_item = copy.deepcopy(EXPORT_DEFAULTS) config_item["id"] = mock_id() config_item["node_type"] = node_type return config_item @@ -31,11 +31,13 @@ def test_load_config_basic(tmp_path): { "id": export_id, "node_type": "page", + "output": "output.md", "pandoc_format": "gfm", } ] })) config = load_config(config_path) + assert config is not None, "The config is invalid" merged_export = config["exports"][0] assert merged_export["id"] == export_id assert merged_export["node_type"] 
== "page" diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index 90355c36..c364b975 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -1,11 +1,9 @@ -import sys -import json from os import listdir import os.path from os.path import isfile, join -from io import StringIO import yaml +import pytest try: from yaml import CLoader as Loader @@ -16,56 +14,92 @@ from n2y.main import main -def run_n2y(arguments): - old_stdout = sys.stdout - old_stderr = sys.stderr - sys.stdout = StringIO() - sys.stderr = StringIO() +def run_n2y(temp_dir, config): + config_path = os.path.join(temp_dir, "config.yaml") + with open(config_path, "w") as f: + yaml.dump(config, f) + old_cwd = os.getcwd() + os.chdir(temp_dir) try: - status = main(arguments, NOTION_ACCESS_TOKEN) - stdout = sys.stdout.getvalue() - stderr = sys.stderr.getvalue() + status = main([config_path], NOTION_ACCESS_TOKEN) finally: - sys.stdout = old_stdout - sys.stderr = old_stderr - return status, stdout, stderr + os.chdir(old_cwd) + return status + + +def run_n2y_page(temp_dir, page_id, **export_config_keys): + config = { + "exports": [ + { + "id": page_id, + "node_type": "page", + "output": "page.md", + **export_config_keys, + } + ] + } + status = run_n2y(temp_dir, config) + assert status == 0 + with open(str(temp_dir / "page.md"), "r") as f: + page_as_markdown = f.read() + return page_as_markdown + + +def run_n2y_database_as_yaml(temp_dir, database_id, **export_config_keys): + config = { + "exports": [ + { + "id": database_id, + "node_type": "database_as_yaml", + "output": "database.yml", + **export_config_keys, + } + ] + } + status = run_n2y(temp_dir, config) + assert status == 0 + with open(str(temp_dir / "database.yml"), "r") as f: + unsorted_database = yaml.load(f, Loader=Loader) + return unsorted_database + + +def run_n2y_database_as_files(temp_dir, database_id, **export_config_keys): + config = { + "exports": [ + { + "id": database_id, + "node_type": "database_as_files", + 
"output": "database", + **export_config_keys, + } + ] + } + status = run_n2y(temp_dir, config) + assert status == 0 + return os.path.join(temp_dir, "database") -def test_simple_database_to_yaml(): +def test_simple_database_to_yaml(tmpdir): """ The database can be seen here: https://fresh-pencil-9f3.notion.site/176fa24d4b7f4256877e60a1035b45a4 """ object_id = "176fa24d4b7f4256877e60a1035b45a4" - status, stdoutput, _ = run_n2y( - [ - object_id, - "--output", - "yaml", - "--content-property", - "Content", - ] - ) - assert status == 0 - unsorted_database = yaml.load(stdoutput, Loader=Loader) + unsorted_database = run_n2y_database_as_yaml(tmpdir, object_id, content_property="Content") database = sorted(unsorted_database, key=lambda row: row["Name"]) assert len(database) == 3 assert database[0]["Name"] == "A" assert database[0]["Tags"] == ["a", "b"] assert database[0]["Content"] is None - assert database[0]["id"] is not None - assert database[0]["url"] is not None -def test_big_database_to_yaml(): +def test_big_database_to_yaml(tmpdir): """ The database can be seen here: https://fresh-pencil-9f3.notion.site/9341a0ddf7d4442d94ad74e5100f72af """ object_id = "9341a0ddf7d4442d94ad74e5100f72af" - status, stdoutput, _ = run_n2y([object_id, "--output", "yaml"]) - assert status == 0 - database = yaml.load(stdoutput, Loader=Loader) + database = run_n2y_database_as_yaml(tmpdir, object_id) assert len(database) == 101 @@ -75,138 +109,73 @@ def test_simple_database_to_markdown_files(tmpdir): https://fresh-pencil-9f3.notion.site/176fa24d4b7f4256877e60a1035b45a4 """ object_id = "176fa24d4b7f4256877e60a1035b45a4" - status, _, _ = run_n2y( - [ - object_id, - "--format", - "markdown", - "--output", - str(tmpdir), - ] - ) - assert status == 0 - generated_files = {f for f in listdir(tmpdir) if isfile(join(tmpdir, f))} + output_directory = run_n2y_database_as_files(tmpdir, object_id, filename_property="Name") + generated_files = {f for f in listdir(output_directory) if 
isfile(join(output_directory, f))} assert generated_files == {"A.md", "B.md", "C.md"} - document_as_markdown = open(join(tmpdir, "A.md"), "r").read() - metadata = parse_yaml_front_matter(document_as_markdown) + document = open(join(output_directory, "A.md"), "r").read() + metadata = parse_yaml_front_matter(document) assert metadata["Name"] == "A" assert metadata["Tags"] == ["a", "b"] assert "content" not in metadata -def test_simple_database_config(): +def test_simple_database_config(tmpdir): """ The database can be seen here: https://fresh-pencil-9f3.notion.site/176fa24d4b7f4256877e60a1035b45a4 """ database_id = "176fa24d4b7f4256877e60a1035b45a4" - database_config = { - database_id: { - "sorts": [ - { - "property": "Name", - "direction": "descending", - } - ], - "filter": { - "or": [ - {"property": "Name", "rich_text": {"contains": "A"}}, - {"property": "Name", "rich_text": {"contains": "C"}}, - ] - }, + notion_sorts = [ + { + "property": "Name", + "direction": "descending", } - } - status, stdoutput, _ = run_n2y( - [ - database_id, - "--database-config", - json.dumps(database_config), + ] + notion_filter = { + "or": [ + {"property": "Name", "rich_text": {"contains": "A"}}, + {"property": "Name", "rich_text": {"contains": "C"}}, ] + } + database = run_n2y_database_as_yaml( + tmpdir, database_id, + notion_sort=notion_sorts, notion_filter=notion_filter, ) - assert status == 0 - database = yaml.load(stdoutput, Loader=Loader) assert len(database) == 2 assert database[0]["Name"] == "C" assert database[1]["Name"] == "A" -def test_simple_related_databases(tmpdir): - """ - The page can be seen here: - https://fresh-pencil-9f3.notion.site/Simple-Related-Databases-7737303365434ee6b699786c110830a2 - """ - object_id = "6cc54e2b49994787927c24a9ac3d4676" - status, _, _ = run_n2y( - [ - object_id, - "--format", - "yaml-related", - "--output", - str(tmpdir), - ] - ) - assert status == 0 - generated_files = {f for f in listdir(tmpdir) if isfile(join(tmpdir, f))} - assert 
generated_files == {"A.yml", "B.yml", "C.yml"} - - -def test_unshared_related_databases(tmpdir): - """ - The page can be seen here: - https://fresh-pencil-9f3.notion.site/bc86b1692c2e4b7d991d7e6f6cacac54?v=cb6887a78ddd41f1a8a75385f7a40d47 - """ - object_id = "bc86b1692c2e4b7d991d7e6f6cacac54" - status, _, stderr = run_n2y( - [ - object_id, - "--format", - "yaml-related", - "--output", - str(tmpdir), - ] - ) - assert status == 0 - generated_files = {f for f in listdir(tmpdir) if isfile(join(tmpdir, f))} - assert generated_files == {"Database_with_Relationship_to_Unshared_Database.yml"} - # TODO: add an assertion that checks that warnings were displayed in stderr - # (at the moment, they don't appear to be because the related pages simply - # don't show up at all) - - -def test_all_properties_database(): +def test_all_properties_database(tmpdir): """ The page can be seen here: https://fresh-pencil-9f3.notion.site/53b9fa3da3f348e7ba3346254f1c722f """ object_id = "53b9fa3da3f348e7ba3346254f1c722f" - status, stdoutput, _ = run_n2y([object_id, "--output", "yaml"]) - assert status == 0 - unsorted_database = yaml.load(stdoutput, Loader=Loader) - assert len(unsorted_database) == 4 + database = run_n2y_database_as_yaml(tmpdir, object_id) + assert len(database) == 4 -def test_mention_in_simple_table(tmp_path): +def test_mention_in_simple_table(tmpdir): ''' The page can be seen here: https://fresh-pencil-9f3.notion.site/Simple-Table-with-Mention-Test-e12497428b0e43c3b14e016de6c5a2cf ''' object_id = 'e12497428b0e43c3b14e016de6c5a2cf' - _, document_as_markdown, _ = run_n2y([object_id, '--media-root', str(tmp_path)]) - assert "In Table: Simple Test Page" in document_as_markdown - assert "Out of Table: Simple Test Page" in document_as_markdown + document = run_n2y_page(tmpdir, object_id) + assert "In Table: Simple Test Page" in document + assert "Out of Table: Simple Test Page" in document -def test_all_blocks_page_to_markdown(tmp_path): +def 
test_all_blocks_page_to_markdown(tmpdir): """ The page can be seen here: https://fresh-pencil-9f3.notion.site/Test-Page-5f18c7d7eda44986ae7d938a12817cc0 """ object_id = "5f18c7d7eda44986ae7d938a12817cc0" - status, document_as_markdown, stderr = run_n2y( - [object_id, "--media-root", str(tmp_path)] - ) - lines = document_as_markdown.split("\n") - metadata = parse_yaml_front_matter(document_as_markdown) + document = run_n2y_page(tmpdir, object_id) + lines = document.split("\n") + metadata = parse_yaml_front_matter(document) assert metadata["title"] == "All Blocks Test Page" column_string = ( '

Column 1

' @@ -220,8 +189,6 @@ def test_all_blocks_page_to_markdown(tmp_path): "" in lines, ] - # TODO: look into why there's extra space in between the list entries - assert status == 0 assert "Text block" in lines assert "Text *italics* too" in lines assert "- [ ] To do list block" in lines @@ -236,7 +203,7 @@ def test_all_blocks_page_to_markdown(tmp_path): assert "---" in lines assert "Callout block" in lines assert "$e^{-i \\pi} = -1$" in lines - assert "``` javascript\nCode Block\n```" in document_as_markdown + assert "``` javascript\nCode Block\n```" in document assert lines.count("This is a synced block.") == 2 assert "This is a synced block from another page." in lines @@ -251,13 +218,13 @@ def test_all_blocks_page_to_markdown(tmp_path): assert "[Bookmark caption](https://innolitics.com)" in lines # the word "caption" is bolded - assert "![Image **caption**](All_Blocks_Test_Page-5c264631.jpeg)" in lines + assert "![Image **caption**](media/5c264631.jpeg)" in lines # from a file block in the Notion page - assert os.path.exists(tmp_path / "All_Blocks_Test_Page-5c264631.jpeg") + assert os.path.exists(tmpdir / "media" / "5c264631.jpeg") -def test_page_in_database_to_markdown(): +def test_page_in_database_to_markdown(tmpdir): """ This test exports a single page, or "row", that is in a database. 
Unlike pages that are not in a database, who only have a single "Title" property, @@ -268,53 +235,41 @@ def test_page_in_database_to_markdown(): https://fresh-pencil-9f3.notion.site/C-7e967a44893f4b25917965896e81c137 """ object_id = "7e967a44893f4b25917965896e81c137" - _, document_as_markdown, _ = run_n2y([object_id]) - lines = document_as_markdown.split("\n") - metadata = parse_yaml_front_matter(document_as_markdown) + document = run_n2y_page(tmpdir, object_id) + lines = document.split("\n") + metadata = parse_yaml_front_matter(document) assert metadata["Name"] == "C" assert metadata["Tags"] == ["d", "a", "b", "c"] assert "content" not in metadata assert "Has some basic content" in lines -def test_simple_page_to_markdown(): +def test_simple_page_to_markdown(tmpdir): """ The page can be seen here: https://fresh-pencil-9f3.notion.site/Simple-Test-Page-6670dc17a7bc4426b91bca4cf3ac5623 """ object_id = "6670dc17a7bc4426b91bca4cf3ac5623" - status, document_as_markdown, _ = run_n2y([object_id]) - assert status == 0 - assert "Page content" in document_as_markdown + document = run_n2y_page(tmpdir, object_id) + assert "Page content" in document -def test_builtin_plugins(tmp_path): +@pytest.mark.xfail(reason="Need to make it possible to enable/disable plugins") +def test_builtin_plugins(tmpdir): """ The page can be seen here: https://fresh-pencil-9f3.notion.site/Plugins-Test-96d71e2876eb47b285833582e8cf27eb """ object_id = "96d71e2876eb47b285833582e8cf27eb" - status, document_as_markdown, _ = run_n2y( - [ - object_id, - "--plugin", - "n2y.plugins.deepheaders", - "--plugin", - "n2y.plugins.removecallouts", - "--plugin", - "n2y.plugins.rawcodeblocks", - "--plugin", - "n2y.plugins.mermaid", - "--plugin", - "n2y.plugins.footnotes", - "--plugin", - "n2y.plugins.expandlinktopages", - "--media-root", - str(tmp_path), - ] - ) - assert status == 0 - lines = document_as_markdown.split("\n") + document = run_n2y_page(tmpdir, object_id, plugins=[ + "n2y.plugins.deepheaders", + 
"n2y.plugins.removecallouts", + "n2y.plugins.rawcodeblocks", + "n2y.plugins.mermaid", + "n2y.plugins.footnotes", + "n2y.plugins.expandlinktopages", + ]) + lines = document.split("\n") assert "#### H4" in lines assert "##### H5" in lines assert not any("should disappear" in l for l in lines) @@ -336,17 +291,12 @@ def test_builtin_plugins(tmp_path): # assert "[^2]: Second footnote" in lines # The word "Bulletlist" only shows up in the linked page that is expanded - assert "Bulletlist" in document_as_markdown + assert "Bulletlist" in document # Ensure a link to page to an unshared page doesn't get expanded; note that # Notion will actually represent these pages as an "UnsupportedBlock" (which # is odd). This will throw a warning and won't produce any content though, # which is the desired behavior. - assert "Untitled" not in document_as_markdown - assert "Unshared Page" not in document_as_markdown - assert "This page is not shared with the integration." not in document_as_markdown - - -def test_missing_object_exception(): - invalid_page_id = "11111111111111111111111111111111" - assert run_n2y([invalid_page_id]) != 0 + assert "Untitled" not in document + assert "Unshared Page" not in document + assert "This page is not shared with the integration." not in document From 773d38f99af9dcc35cb8dd23b8d9beeebb8c4b97 Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Fri, 30 Sep 2022 16:04:41 -0400 Subject: [PATCH 04/14] Reload all plugins for each export This solution is simple, although it may have performance issues. Note that the `importlib.import_module` uses the internal python module cache. --- README.md | 7 +++++-- n2y/config.py | 2 +- n2y/main.py | 9 ++------- n2y/notion.py | 4 ++-- tests/test_end_to_end.py | 2 -- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 08e790bf..6a315c5d 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ The default implementation of these classes can be modified using a plugin syste 1. 
Create a new Python module 2. Subclass the various notion classes, modifying their constructor or `to_pandoc` method as desired -3. Run n2y with the `--plugin` argument pointing to your python module +3. Set the `plugins` property in your export config to the module name (e.g., `n2y.plugins.deepheaders`) See the [builtin plugins](https://github.com/innolitics/n2y/tree/main/n2y/plugins) for examples. @@ -137,6 +137,10 @@ You can use multiple plugins. If two plugins provide classes for the same notion Often you'll want to use a different class only in certain situations. For example, you may want to use a different Page class with its own unique behavior only for pages in a particular database. To accomplish this you can use the `n2y.errors.UseNextClass` exception. If your plugin class raise the `n2y.errors.UseNextClass` exception in its constructor, then n2y will move on to the next class (which may be the builtin class if only one plugin was used). +### Different Plugins for Different Exports + +You may use different plugins for different export items, but keep in mind that the plugin module is imported only once. Also, if you export the same `Page` or `Database` multiple times with different plugins, due to an internal cache, the plugins that were enabled during the first run will be used. + ### Default Block Class's Here are the default block classes that can be extended: @@ -174,7 +178,6 @@ Here are the default block classes that can be extended: | ToggleBlock | Convert the toggles into a bulleted list. | | VideoBlock | Acts the same way as the Image block | - Most of the Notion blocks can generate their pandoc AST from _only_ their own data. The one exception is the list item blocks; pandoc, unlike Notion, has an encompassing node in the AST for the entire list. The `ListItemBlock.list_to_pandoc` class method is responsible for generating this top-level node. 
## Built-in Plugins diff --git a/n2y/config.py b/n2y/config.py index 2c123ed3..23256fd6 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -12,7 +12,6 @@ DEFAULTS = { "media_root": "media", "media_url": "./media/", - "plugins": [], } @@ -27,6 +26,7 @@ '--wrap', 'none', # don't hard line-wrap '--eol', 'lf', # use linux-style line endings ], + "plugins": [], } diff --git a/n2y/main.py b/n2y/main.py index 5f415bba..95cd83f4 100644 --- a/n2y/main.py +++ b/n2y/main.py @@ -47,15 +47,10 @@ def main(raw_args, access_token): if config is None: return 2 - client = notion.Client( - access_token, - config["media_root"], - config["media_url"], - config["plugins"], - ) + client = notion.Client(access_token, config["media_root"], config["media_url"]) for export in config['exports']: - # TODO: swap out plugins + client.load_plugins(export["plugins"]) node_type = export["node_type"] if node_type == "page": page = client.get_page(export['id']) diff --git a/n2y/notion.py b/n2y/notion.py index a87f722b..8f8e85d3 100644 --- a/n2y/notion.py +++ b/n2y/notion.py @@ -79,7 +79,6 @@ def __init__( self.databases_cache = {} self.pages_cache = {} - self.notion_classes = self.get_default_classes() self.load_plugins(plugins) self.plugin_data = {} @@ -93,6 +92,7 @@ def get_default_classes(self): return notion_classes def load_plugins(self, plugins): + self.notion_classes = self.get_default_classes() if plugins is not None: for plugin in plugins: plugin_module = importlib.import_module(plugin) @@ -276,7 +276,7 @@ def get_page(self, page_id): try: notion_page = self._get_url(f"{self.base_url}pages/{page_id}") except ObjectNotFound: - self.pages_cache[page_id] = None + self.pages_cache[(page_id, self.active_plugins)] = None return # _wrap_notion_page will add the page to the cache page = self._wrap_notion_page(notion_page) diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index c364b975..fb8d377f 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -3,7 +3,6 @@ from 
os.path import isfile, join import yaml -import pytest try: from yaml import CLoader as Loader @@ -254,7 +253,6 @@ def test_simple_page_to_markdown(tmpdir): assert "Page content" in document -@pytest.mark.xfail(reason="Need to make it possible to enable/disable plugins") def test_builtin_plugins(tmpdir): """ The page can be seen here: From f974c7b8f4961eb5b07286b7c71d38828f4b524c Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Fri, 30 Sep 2022 16:42:32 -0400 Subject: [PATCH 05/14] Add support for remapping property names --- README.md | 2 ++ n2y/config.py | 2 ++ n2y/export.py | 30 ++++++++++++++++++++++++------ n2y/main.py | 3 +++ n2y/notion_mocks.py | 21 ++++++++++++++------- tests/test_export.py | 33 +++++++++++++++++++++++++++++++++ 6 files changed, 78 insertions(+), 13 deletions(-) create mode 100644 tests/test_export.py diff --git a/README.md b/README.md index 6a315c5d..86f010ba 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ The export configuration items may contain the following keys: | plugins | A list of python modules to use as plugins. | | notion_filter | A [notion filter object](https://developers.notion.com/reference/post-database-query-filter) to be applied to the database. | | notion_sorts | A [notion sorts object](https://developers.notion.com/reference/post-database-query-sort) to be applied to the database. | +| property_map | A mapping between the name of properties in Notion, and the name of the properties in the exported files. | ## Example Configuration Files @@ -272,6 +273,7 @@ Here are some features we're planning to add in the future: - The export is now configured using a single YAML file instead of the growing list of commandline arguments. Using a configuration file allows multiple page and database exports to be made in a single run, which in turn improves caching and will enable future improvements, like preserving links between generated HTML or markdown pages. 
- Added the `pandoc_format` and `pandoc_options` fields, making it possible to output to any format that pandoc supports. - Removed the ability to export a set of related databases (this is less useful now that we have a configuration file). +- Add support for remapping property names in the exports using the `property_map` option ### v0.5.0 diff --git a/n2y/config.py b/n2y/config.py index 23256fd6..c0db2928 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -27,6 +27,7 @@ '--eol', 'lf', # use linux-style line endings ], "plugins": [], + "property_map": {}, } @@ -111,6 +112,7 @@ def _validate_config_item(config_item): return False # TODO: validate pandoc_formation # TODO: validate pandoc_options + # TODO: property map return True diff --git a/n2y/export.py b/n2y/export.py index 3207fef4..ad01551f 100644 --- a/n2y/export.py +++ b/n2y/export.py @@ -12,7 +12,9 @@ logger = logging.getLogger(__name__) -def _page_properties(page, id_property, url_property): +def _page_properties(page, id_property=None, url_property=None, property_map=None): + if property_map is None: + property_map = {} properties = page.properties_to_values() if id_property in properties: logger.warning( @@ -29,11 +31,24 @@ def _page_properties(page, id_property, url_property): ) if url_property: properties[url_property] = page.notion_url + for original, new in property_map.items(): + if original in properties: + properties[new] = properties.pop(original) + else: + msg = "Property %s not found in page %s; skipping remapping from %s to %s" + logger.warning(msg, original, page.notion_url, original, new) return properties -def export_page(page, pandoc_format, pandoc_options, id_property=None, url_property=None): - page_properties = _page_properties(page, id_property, url_property) +def export_page( + page, + pandoc_format, + pandoc_options, + id_property=None, + url_property=None, + property_map=None, +): + page_properties = _page_properties(page, id_property, url_property, property_map) pandoc_ast = 
page.to_pandoc() page_content = pandoc_write_or_log_errors(pandoc_ast, pandoc_format, pandoc_options) return '\n'.join([ @@ -47,11 +62,12 @@ def database_to_yaml( database, pandoc_format, pandoc_options, + notion_filter=None, + notion_sorts=None, id_property=None, url_property=None, content_property=None, - notion_filter=None, - notion_sorts=None, + property_map=None, ): if content_property in database.schema: logger.warning( @@ -60,7 +76,7 @@ def database_to_yaml( ) results = [] for page in database.children_filtered(notion_filter, notion_sorts): - result = _page_properties(page, id_property, url_property) + result = _page_properties(page, id_property, url_property, property_map) if content_property: pandoc_ast = page.to_pandoc() if pandoc_ast: @@ -83,6 +99,7 @@ def database_to_markdown_files( notion_sorts=None, id_property=None, url_property=None, + property_map=None, ): os.makedirs(directory, exist_ok=True) seen_file_names = set() @@ -99,6 +116,7 @@ def database_to_markdown_files( pandoc_options, id_property, url_property, + property_map, ) f.write(document) else: diff --git a/n2y/main.py b/n2y/main.py index 95cd83f4..0174a4a0 100644 --- a/n2y/main.py +++ b/n2y/main.py @@ -67,6 +67,7 @@ def main(raw_args, access_token): export["pandoc_options"], export["id_property"], export["url_property"], + export["property_map"], ) with open(export["output"], "w") as f: f.write(result) @@ -89,6 +90,7 @@ def main(raw_args, access_token): content_property=export["content_property"], notion_filter=export["notion_filter"], notion_sorts=export["notion_sorts"], + property_map=export["property_map"], ) with open(export["output"], "w") as f: f.write(result) @@ -103,6 +105,7 @@ def main(raw_args, access_token): notion_sorts=export["notion_sorts"], id_property=export["id_property"], url_property=export["url_property"], + property_map=export["property_map"], ) else: logger.error("Unknown node_type '%s'", node_type) diff --git a/n2y/notion_mocks.py b/n2y/notion_mocks.py index 
fa1efd93..677ad3ec 100644 --- a/n2y/notion_mocks.py +++ b/n2y/notion_mocks.py @@ -17,7 +17,10 @@ def mock_person_user(name, email): def mock_rich_text_array(text_blocks_descriptors): - return [mock_rich_text(t, a) for t, a in text_blocks_descriptors] + if isinstance(text_blocks_descriptors, str): + return [mock_rich_text(text_blocks_descriptors, [])] + else: + return [mock_rich_text(t, a) for t, a in text_blocks_descriptors] def mock_rich_text(text, annotations=None, href=None, mention=None): @@ -99,7 +102,7 @@ def mock_block(block_type, content, has_children=False, **kwargs): def mock_paragraph_block(text_blocks_descriptors, **kwargs): return mock_block('paragraph', { 'color': 'default', - 'rich_text': [mock_rich_text(t, a) for t, a in text_blocks_descriptors], + 'rich_text': mock_rich_text_array(text_blocks_descriptors), }, **kwargs) @@ -118,6 +121,10 @@ def mock_property_value(property_value_type, content): } +def mock_rich_text_property_value(text_blocks_descriptors): + return mock_property_value("rich_text", mock_rich_text_array(text_blocks_descriptors)) + + def mock_formula_property_value(formula_type, content): return mock_property_value("formula", { "type": formula_type, @@ -146,7 +153,9 @@ def mock_relation_value(): return {"id": mock_id()} -def mock_page(title="Mock Title"): +def mock_page(title="Mock Title", extra_properties=None): + if extra_properties is None: + extra_properties = {} user = mock_user() created_time = datetime.now().isoformat() notion_id = mock_id() @@ -166,10 +175,8 @@ def mock_page(title="Mock Title"): 'title': { 'id': 'title', 'type': 'title', - 'title': mock_rich_text_array([ - (title, []), - ]), - } + 'title': mock_rich_text_array(title) + }, **extra_properties, }, 'url': f'https://www.notion.so/{hyphenated_title}-{notion_id}', } diff --git a/tests/test_export.py b/tests/test_export.py new file mode 100644 index 00000000..d0a8cb44 --- /dev/null +++ b/tests/test_export.py @@ -0,0 +1,33 @@ +import pytest + +from n2y.export import 
_page_properties +from n2y.notion_mocks import mock_page, mock_rich_text_property_value +from n2y import notion + + +@pytest.fixture +def page(): + property = mock_rich_text_property_value("P") + notion_page = mock_page(title="T", extra_properties={"property": property}) + client = notion.Client('') + return client._wrap_notion_page(notion_page) + + +def test_page_properties_basic(page): + properties = _page_properties(page) + assert properties == {"title": "T", "property": "P"} + + +def test_page_properties_id(page): + properties = _page_properties(page, id_property="id") + assert "id" in properties + + +def test_page_properties_url(page): + properties = _page_properties(page, url_property="url") + assert "url" in properties + + +def test_page_properties_mapping(page): + properties = _page_properties(page, property_map={"property": "p"}) + assert properties == {"title": "T", "p": "P"} From 879c411605a6683323c6b557f18ad71fc938f2ef Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Fri, 30 Sep 2022 17:07:54 -0400 Subject: [PATCH 06/14] Fix bugs in merge --- n2y/notion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/n2y/notion.py b/n2y/notion.py index 638e11be..3cefc67d 100644 --- a/n2y/notion.py +++ b/n2y/notion.py @@ -20,7 +20,7 @@ from n2y.user import User from n2y.rich_text import DEFAULT_RICH_TEXTS, RichTextArray from n2y.mentions import DEFAULT_MENTIONS -from n2y.utils import sanitize_filename +from n2y.utils import sanitize_filename, strip_hyphens DEFAULT_NOTION_CLASSES = { @@ -250,7 +250,6 @@ def get_database_pages(self, database_id, filter=None, sorts=None): return [self._wrap_notion_page(np) for np in notion_pages] def get_database_notion_pages(self, database_id, filter, sorts): - results = [] url = f"{self.base_url}databases/{database_id}/query" request_data = {} if filter: @@ -363,7 +362,7 @@ def download_file(self, url, page): def save_file(self, content, page, extension): page_id_chars = strip_hyphens(page.notion_id) - 
page_title = sanitize_filename(page.title) + page_title = sanitize_filename(page.title.to_plain_text()) relative_filepath = f"{page_title}-{page_id_chars[:11]}{extension}" full_filepath = path.join(self.media_root, relative_filepath) makedirs(self.media_root, exist_ok=True) From 59ce791c5884f67470f34a5c983731ce368abbb1 Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Fri, 30 Sep 2022 21:06:32 -0400 Subject: [PATCH 07/14] Fix accidentally committed code I'm not sure how the tests are passing --- n2y/notion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n2y/notion.py b/n2y/notion.py index 3cefc67d..147bd272 100644 --- a/n2y/notion.py +++ b/n2y/notion.py @@ -268,7 +268,7 @@ def get_page(self, page_id): try: notion_page = self._get_url(f"{self.base_url}pages/{page_id}") except ObjectNotFound: - self.pages_cache[(page_id, self.active_plugins)] = None + self.pages_cache[page_id] = None return # _wrap_notion_page will add the page to the cache page = self._wrap_notion_page(notion_page) From 0f6f4a58ea879431154eaa88f0dd3b8f5d6b34ee Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Fri, 30 Sep 2022 21:25:14 -0400 Subject: [PATCH 08/14] Add support for emojis --- README.md | 2 ++ n2y/emoji.py | 14 ++++++++++++++ n2y/notion.py | 5 +++++ n2y/page.py | 13 ++++++++++++- 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 n2y/emoji.py diff --git a/README.md b/README.md index 86f010ba..59c31e5c 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ At the core of n2y are a set of python classes that represent the various parts | Mention | A reference to another Notion object (e.g., a page, database, block, user, etc. ) | User | A notion user; used in property values and in page, block, and database metadata | | File | A file | +| Emoji | An emoji | The `Property`, `PropertyValue`, `Block`, `RichText`, and `Mention` classes have subclasses that represent the various subtypes. 
E.g., there is a `ParagraphBlock` that represents paragraph. @@ -274,6 +275,7 @@ Here are some features we're planning to add in the future: - Added the `pandoc_format` and `pandoc_options` fields, making it possible to output to any format that pandoc supports. - Removed the ability to export a set of related databases (this is less useful now that we have a configuration file). - Add support for remapping property names in the exports using the `property_map` option +- Add basic support for emoji icons for pages. ### v0.5.0 diff --git a/n2y/emoji.py b/n2y/emoji.py new file mode 100644 index 00000000..ba58fe09 --- /dev/null +++ b/n2y/emoji.py @@ -0,0 +1,14 @@ +import logging + + +logger = logging.getLogger(__name__) + + +class Emoji: + """ + See https://developers.notion.com/reference/emoji-object + """ + + def __init__(self, client, notion_data): + self.client = client + self.emoji = notion_data['emoji'] diff --git a/n2y/notion.py b/n2y/notion.py index 147bd272..6f180f98 100644 --- a/n2y/notion.py +++ b/n2y/notion.py @@ -11,6 +11,7 @@ UseNextClass, is_api_error_code, APIErrorCode ) from n2y.file import File +from n2y.emoji import Emoji from n2y.page import Page from n2y.database import Database from n2y.comment import Comment @@ -31,6 +32,7 @@ "property_values": DEFAULT_PROPERTY_VALUES, "user": User, "file": File, + "emoji": Emoji, "rich_text_array": RichTextArray, "rich_texts": DEFAULT_RICH_TEXTS, "mentions": DEFAULT_MENTIONS, @@ -195,6 +197,9 @@ def wrap_notion_user(self, notion_data): def wrap_notion_file(self, notion_data): return self.instantiate_class("file", None, self, notion_data) + def wrap_notion_emoji(self, notion_data): + return self.instantiate_class("emoji", None, self, notion_data) + def wrap_notion_rich_text_array(self, notion_data, block=None): return self.instantiate_class("rich_text_array", None, self, notion_data, block) diff --git a/n2y/page.py b/n2y/page.py index bc6e7a88..ab54bf28 100644 --- a/n2y/page.py +++ b/n2y/page.py @@ -20,7 +20,7 
@@ def __init__(self, client, notion_data): self.last_edited_time = fromisoformat(notion_data['last_edited_time']) self.last_edited_by = client.wrap_notion_user(notion_data['last_edited_by']) self.archived = notion_data['archived'] - self.icon = notion_data['icon'] and client.wrap_notion_file(notion_data['icon']) + self.emoji = self._init_icon(notion_data['icon']) self.cover = notion_data['cover'] and client.wrap_notion_file(notion_data['cover']) self.archived = notion_data['archived'] self.properties = { @@ -35,6 +35,17 @@ def __init__(self, client, notion_data): self.plugin_data = {} + def _init_icon(self, icon_notion_data): + """ + The icon property is unique in that it can be either an emoji or a file. + """ + if icon_notion_data is None: + return None + elif icon_notion_data["type"] == "emoji": + return self.client.wrap_notion_emoji(icon_notion_data) + else: + return self.client.wrap_notion_file(icon_notion_data) + @property def title(self): for property_value in self.properties.values(): From b3093742369cd79dc258f9a8f4d5497758301dc8 Mon Sep 17 00:00:00 2001 From: "J. 
David Giese" Date: Mon, 3 Oct 2022 10:22:13 -0400 Subject: [PATCH 09/14] Small logger update --- n2y/main.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/n2y/main.py b/n2y/main.py index 0174a4a0..7e997db6 100644 --- a/n2y/main.py +++ b/n2y/main.py @@ -27,11 +27,6 @@ def main(raw_args, access_token): help="Level to set the root logging module to", ) - # TODO: Add the ability to dump out a "schema" file that contains the schema - # for a set of databases - - # TODO: Add the ability to export everything as a sqlite file - args = parser.parse_args(raw_args) logging_level = logging.__dict__[args.verbosity] @@ -50,6 +45,7 @@ def main(raw_args, access_token): client = notion.Client(access_token, config["media_root"], config["media_url"]) for export in config['exports']: + logger.info("Exporting to %s", export['output']) client.load_plugins(export["plugins"]) node_type = export["node_type"] if node_type == "page": From 5f048fc9e27188f3e6e1e231188b16466bd3bef8 Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Tue, 4 Oct 2022 10:46:03 -0400 Subject: [PATCH 10/14] Small improvements to error messages - Provide a link back to the database that you don't have permission to - Suggest switching nested databases into a simple table --- n2y/blocks.py | 9 ++++++--- n2y/main.py | 5 +++-- n2y/utils.py | 8 ++++++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/n2y/blocks.py b/n2y/blocks.py index e6586507..b58fe00c 100644 --- a/n2y/blocks.py +++ b/n2y/blocks.py @@ -491,8 +491,11 @@ def to_pandoc(self): return None -class ChildDatabaseBlock(WarningBlock): - pass +class ChildDatabaseBlock(NoopBlock): + def to_pandoc(self): + msg = 'Skipping unsupported "%s" block (%s). Perhaps you can convert the database into a simple table?' 
+ logger.warning(msg, self.notion_type, self.notion_url) + return None class EmbedBlock(WarningBlock): @@ -518,7 +521,7 @@ def to_pandoc(self): return Para(content_ast) -class PdfBlock(WarningBlock): +class PdfBlock(Block): def __init__(self, client, notion_data, page, get_children=True): super().__init__(client, notion_data, page, get_children) self.pdf = client.wrap_notion_file(notion_data['pdf']) diff --git a/n2y/main.py b/n2y/main.py index 7e997db6..0403d326 100644 --- a/n2y/main.py +++ b/n2y/main.py @@ -6,6 +6,7 @@ from n2y import notion from n2y.export import export_page, database_to_yaml, database_to_markdown_files from n2y.config import load_config +from n2y.utils import share_link_from_id logger = None @@ -52,10 +53,10 @@ def main(raw_args, access_token): page = client.get_page(export['id']) if page is None: msg = ( - "Unable to find page with id '%s'. " + "Unable to find page with id '%s' (%s). " "Perhaps the integration doesn't have permission to access this page?" ) - logger.error(msg, export['id']) + logger.error(msg, export['id'], share_link_from_id(export['id'])) continue result = export_page( page, diff --git a/n2y/utils.py b/n2y/utils.py index f37ffd29..2658da60 100644 --- a/n2y/utils.py +++ b/n2y/utils.py @@ -125,5 +125,13 @@ def id_from_share_link(share_link): return query_removed[-32:] +def share_link_from_id(id): + # Note that ordinarily page links include a hyphenated title, but + # fortunately they will redirect to the canonical page URL including the + # hyphenated title if you visit the link with only the UUID. Similarly, + # database urls often have a version parameter, but we can omit that too. + return f"https://www.notion.so/{id}" + + def strip_hyphens(string): return string.replace("-", "") From ba1b96a0fda8cdcf9ad1d0fa38249568b6200ee6 Mon Sep 17 00:00:00 2001 From: "J. 
David Giese" Date: Tue, 4 Oct 2022 10:50:29 -0400 Subject: [PATCH 11/14] Break up long string literal --- n2y/blocks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/n2y/blocks.py b/n2y/blocks.py index b58fe00c..e990d307 100644 --- a/n2y/blocks.py +++ b/n2y/blocks.py @@ -493,7 +493,10 @@ def to_pandoc(self): class ChildDatabaseBlock(NoopBlock): def to_pandoc(self): - msg = 'Skipping unsupported "%s" block (%s). Perhaps you can convert the database into a simple table?' + msg = ( + 'Skipping unsupported "%s" block (%s). ' + 'Perhaps you can convert the database into a simple table?' + ) logger.warning(msg, self.notion_type, self.notion_url) return None From a0a2b14ff90cfba83a86c80761c43abfac76cc3d Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Tue, 4 Oct 2022 14:04:54 -0400 Subject: [PATCH 12/14] Address PR comments --- n2y/config.py | 28 +++++++----- n2y/main.py | 119 +++++++++++++++++++++++++++----------------------- 2 files changed, 81 insertions(+), 66 deletions(-) diff --git a/n2y/config.py b/n2y/config.py index c0db2928..afcbc4c1 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -32,17 +32,8 @@ def load_config(path): - try: - with open(path, "r") as config_file: - config = yaml.safe_load(config_file) - except yaml.YAMLError as exc: - logger.error("Error parsing the config file: %s", exc) - return None - except FileNotFoundError: - logger.error("The config file '%s' does not exist", path) - return None - if not validate_config(config): - logger.error("Invalid config file: %s", path) + config = _load_config_from_yaml(path) + if config is None: return None defaults_copy = copy.deepcopy(DEFAULTS) @@ -57,6 +48,21 @@ def load_config(path): return config +def _load_config_from_yaml(path): + try: + with open(path, "r") as config_file: + config = yaml.safe_load(config_file) + except yaml.YAMLError as exc: + logger.error("Error parsing the config file: %s", exc) + return None + except FileNotFoundError: + logger.error("The config file '%s' 
does not exist", path) + return None + if not validate_config(config): + logger.error("Invalid config file: %s", path) + return None + + def merge_config(config_items, builtin_defaults, defaults): """ For each config item, merge in both the user provided defaults and the diff --git a/n2y/main.py b/n2y/main.py index 0403d326..00f5bf50 100644 --- a/n2y/main.py +++ b/n2y/main.py @@ -45,68 +45,77 @@ def main(raw_args, access_token): client = notion.Client(access_token, config["media_root"], config["media_url"]) + error_occurred = False for export in config['exports']: logger.info("Exporting to %s", export['output']) client.load_plugins(export["plugins"]) - node_type = export["node_type"] - if node_type == "page": - page = client.get_page(export['id']) - if page is None: - msg = ( - "Unable to find page with id '%s' (%s). " - "Perhaps the integration doesn't have permission to access this page?" - ) - logger.error(msg, export['id'], share_link_from_id(export['id'])) - continue - result = export_page( - page, - export["pandoc_format"], - export["pandoc_options"], - export["id_property"], - export["url_property"], - export["property_map"], + export_completed = _export_node_from_config(client, export) + if not export_completed: + error_occurred = True + return 0 if not error_occurred else 3 + + +def _export_node_from_config(client, export): + node_type = export["node_type"] + if node_type == "page": + page = client.get_page(export['id']) + if page is None: + msg = ( + "Unable to find page with id '%s' (%s). " + "Perhaps the integration doesn't have permission to access this page?" 
+ ) + logger.error(msg, export['id'], share_link_from_id(export['id'])) + return False + result = export_page( + page, + export["pandoc_format"], + export["pandoc_options"], + export["id_property"], + export["url_property"], + export["property_map"], + ) + with open(export["output"], "w") as f: + f.write(result) + else: + database = client.get_database(export['id']) + if database is None: + msg = ( + "Unable to find database with id '%s' (%s). " + "Perhaps the integration doesn't have permission to access this database?" + ) + logger.error(msg, export['id'], share_link_from_id(export['id'])) + return False + if node_type == "database_as_yaml": + result = database_to_yaml( + database=database, + pandoc_format=export["pandoc_format"], + pandoc_options=export["pandoc_options"], + id_property=export["id_property"], + url_property=export["url_property"], + content_property=export["content_property"], + notion_filter=export["notion_filter"], + notion_sorts=export["notion_sorts"], + property_map=export["property_map"], ) with open(export["output"], "w") as f: f.write(result) + elif node_type == "database_as_files": + database_to_markdown_files( + database=database, + directory=export["output"], + pandoc_format=export["pandoc_format"], + pandoc_options=export["pandoc_options"], + filename_property=export["filename_property"], + notion_filter=export["notion_filter"], + notion_sorts=export["notion_sorts"], + id_property=export["id_property"], + url_property=export["url_property"], + property_map=export["property_map"], + ) else: - database = client.get_database(export['id']) - if database is None: - msg = ( - "Unable to find database with id '%s'. " - "Perhaps the integration doesn't have permission to access this page?" 
- ) - logger.error(msg, export['id']) - continue - if node_type == "database_as_yaml": - result = database_to_yaml( - database=database, - pandoc_format=export["pandoc_format"], - pandoc_options=export["pandoc_options"], - id_property=export["id_property"], - url_property=export["url_property"], - content_property=export["content_property"], - notion_filter=export["notion_filter"], - notion_sorts=export["notion_sorts"], - property_map=export["property_map"], - ) - with open(export["output"], "w") as f: - f.write(result) - elif node_type == "database_as_files": - database_to_markdown_files( - database=database, - directory=export["output"], - pandoc_format=export["pandoc_format"], - pandoc_options=export["pandoc_options"], - filename_property=export["filename_property"], - notion_filter=export["notion_filter"], - notion_sorts=export["notion_sorts"], - id_property=export["id_property"], - url_property=export["url_property"], - property_map=export["property_map"], - ) - else: - logger.error("Unknown node_type '%s'", node_type) - return 0 + logger.error("Unknown node_type '%s'", node_type) + return False + return True if __name__ == "__main__": From 3362f9746094d1438e1921c4927a67ba1ef6492b Mon Sep 17 00:00:00 2001 From: "J. David Giese" Date: Tue, 4 Oct 2022 14:09:16 -0400 Subject: [PATCH 13/14] Add a bigger example to the README Note that the UUIDs are fake --- README.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/README.md b/README.md index 59c31e5c..2bc32d16 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,63 @@ Sometimes it is useful to ensure that a root Notion page, and it's child-pages, n2yaudit PAGE_LINK ``` +### Bigger Example + +This example shows how you can use the `export_defaults` property to avoid duplicated configuration between export items. It also shows how you can use notion filters to export pages from the same database into two different directories. 
+ +``` +media_root: "media" +media_url: "./media/" +export_defaults: + plugins: + - "n2y.plugins.mermaid" + - "n2y.plugins.rawcodeblocks" + - "n2y.plugins.removecallouts" + - "n2y.plugins.deepheaders" + - "n2y.plugins.expandlinktopages" + content_property: null + id_property: id + url_property: url +exports: + - output: "documents/dhf" + node_type: "database_as_files" + filename_property: "Name" + id: e24f839e724848d69342d43c07cb5f3e + plugins: + - "n2y.plugins.mermaid" + - "n2y.plugins.rawcodeblocks" + - "n2y.plugins.removecallouts" + - "n2y.plugins.deepheaders" + - "n2y.plugins.expandlinktopages" + - "plugins.page" + - "plugins.idmentions" + notion_filter: + property: "Tags" + multi_select: { "contains": "DHF" } + - output: "documents/510k" + id: e24f839e724848d69342d43c07cb5f3e + filename_property: "Name" + node_type: "database_as_files" + plugins: + - "n2y.plugins.mermaid" + - "n2y.plugins.rawcodeblocks" + - "n2y.plugins.removecallouts" + - "n2y.plugins.deepheaders" + - "n2y.plugins.expandlinktopages" + - "plugins.page" + - "plugins.idmentions" + notion_filter: + property: "Tags" + multi_select: { "contains": "510(k)" } + - output: "data/Roles.yml" + id: b47a694953714222810152736d9dc66c + node_type: "database_as_yaml" + content_property: "Description" + - output: "data/Glossary.yml" + id: df6bef74e2372118becd93e321de2c69 + node_type: "database_as_yaml" +``` + ## Plugins At the core of n2y are a set of python classes that represent the various parts of a Notion workspace: From cdfcda5b23f3d1d8685931672ac94bb460936ff1 Mon Sep 17 00:00:00 2001 From: "J. 
David Giese" Date: Tue, 4 Oct 2022 14:16:13 -0400 Subject: [PATCH 14/14] Fix bug introduced in refactor --- n2y/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/n2y/config.py b/n2y/config.py index afcbc4c1..1079de87 100644 --- a/n2y/config.py +++ b/n2y/config.py @@ -61,6 +61,7 @@ def _load_config_from_yaml(path): if not validate_config(config): logger.error("Invalid config file: %s", path) return None + return config def merge_config(config_items, builtin_defaults, defaults):

Column 2