Added no-links config option to skip links scraping

leoncvlt · Feb 7, 2021 · 4c4c506
1 parent 71fbbb2
commit 4c4c506
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -184,13 +184,20 @@ theme = "dark"
     [pages.d2fa06f244e64f66880bb0491f58223d.fonts]
     title = 'DM Mono' 
 
-  # set up pretty slugs for the other database pages
+  # set up pretty slugs and options for the other database pages
   [pages.54dab6011e604430a21dc477cb8e4e3a]
     slug = "film-gallery"
+
   [pages.2604ce45890645c79f67d92833083fee]
     slug = "books-table"
-  [pages.ae0a85c527824a3a855b7f4d31f4e0fc]
+
+    # don't follow any links on this page, i.e. skip parsing the sub-pages it links to;
+    # useful for large tables where we don't want an individual page for each item
+    no-links = true
+
+  [pages.a28dba2e7a67448da52f2cd2c641407b]
     slug = "random-board"
+    no-links = true
 ```
 
 On top of this, the script can take these optional arguments:

diff --git a/example/example_site.toml b/example/example_site.toml
@@ -90,10 +90,17 @@ page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a
     [pages.d2fa06f244e64f66880bb0491f58223d.fonts]
     body = 'DM Mono' 
 
-  # set up pretty slugs for the other database pages
+  # set up pretty slugs and options for the other database pages
   [pages.54dab6011e604430a21dc477cb8e4e3a]
     slug = "film-gallery"
+
   [pages.2604ce45890645c79f67d92833083fee]
     slug = "books-table"
-  [pages.ae0a85c527824a3a855b7f4d31f4e0fc]
-    slug = "random-board"
+
+    # don't follow any links on this page, i.e. skip parsing the sub-pages it links to;
+    # useful for large tables where we don't want an individual page for each item
+    no-links = true
+
+  [pages.a28dba2e7a67448da52f2cd2c641407b]
+    slug = "random-board"
+    no-links = true
diff --git a/loconotion/notionparser.py b/loconotion/notionparser.py
@@ -576,36 +576,52 @@ def injects_custom_tags(section):
 
         # find sub-pages and clean slugs / links
         sub_pages = []
+        parse_links = not self.get_page_config(url).get("no-links", False)
         for a in soup.find_all('a', href=True):
             sub_page_href = a["href"]
             if sub_page_href.startswith("/"):
                 sub_page_href = "https://www.notion.so" + a["href"]
             if sub_page_href.startswith("https://www.notion.so/"):
-                # if the link is an anchor link,
-                # check if the page hasn't already been parsed
-                if "#" in sub_page_href:
-                    sub_page_href_tokens = sub_page_href.split("#")
-                    sub_page_href = sub_page_href_tokens[0]
-                    a["href"] = "#" + sub_page_href_tokens[-1]
-                    a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
-                    if (
-                            sub_page_href in processed_pages.keys()
-                            or sub_page_href in sub_pages
-                    ):
-                        log.debug(
-                            f"Original page for anchor link {sub_page_href}"
-                            " already parsed / pending parsing, skipping"
+                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
+                    # if the link is an anchor link,
+                    # check if the page hasn't already been parsed
+                    if "#" in sub_page_href:
+                        sub_page_href_tokens = sub_page_href.split("#")
+                        sub_page_href = sub_page_href_tokens[0]
+                        a["href"] = "#" + sub_page_href_tokens[-1]
+                        a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
+                        if (
+                                sub_page_href in processed_pages.keys()
+                                or sub_page_href in sub_pages
+                        ):
+                            log.debug(
+                                f"Original page for anchor link {sub_page_href}"
+                                " already parsed / pending parsing, skipping"
+                            )
+                            continue
+                    else:
+                        a["href"] = (
+                            self.get_page_slug(sub_page_href)
+                            if sub_page_href != index
+                            else "index.html"
                         )
-                        continue
+                    sub_pages.append(sub_page_href)
+                    log.debug(f"Found link to page {a['href']}")
                 else:
-                    a["href"] = (
-                        self.get_page_slug(sub_page_href)
-                        if sub_page_href != index
-                        else "index.html"
-                    )
-                sub_pages.append(sub_page_href)
-                log.debug(f"Found link to page {a['href']}")
-
+                    # if the page is set not to follow any links, strip the href
+                    # do this only on children of .notion-scroller, we don't want
+                    # to strip the links from the top nav bar
+                    log.debug(f"Stripping link for {a['href']}")
+                    del a["href"]
+                    a.name = "span"
+                    # remove pointer cursor styling on the link and all children
+                    for child in ([a] + a.find_all()):
+                        if (child.has_attr("style")):
+                            style = cssutils.parseStyle(child['style'])
+                            style['cursor'] = "default"
+                            child['style'] = style.cssText
+
+
         # exports the parsed page
         html_str = str(soup)
         html_file = self.get_page_slug(url) if url != index else "index.html"