Skip to content

Commit

Permalink
Added no-links config option to skip links scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
leoncvlt committed Feb 7, 2021
1 parent 71fbbb2 commit 4c4c506
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 28 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,13 +184,20 @@ theme = "dark"
[pages.d2fa06f244e64f66880bb0491f58223d.fonts]
title = 'DM Mono'

# set up pretty slugs for the other database pages
# set up pretty slugs and options for the other database pages
[pages.54dab6011e604430a21dc477cb8e4e3a]
slug = "film-gallery"

[pages.2604ce45890645c79f67d92833083fee]
slug = "books-table"
[pages.ae0a85c527824a3a855b7f4d31f4e0fc]

# don't follow any links on this page, so sub-pages linked from it are not parsed
# useful for large tables where we don't want an individual page for each item
no-links = true

[pages.a28dba2e7a67448da52f2cd2c641407b]
slug = "random-board"
no-links = true
```

On top of this, the script can take these optional arguments:
Expand Down
13 changes: 10 additions & 3 deletions example/example_site.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,17 @@ page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a
[pages.d2fa06f244e64f66880bb0491f58223d.fonts]
body = 'DM Mono'

# set up pretty slugs for the other database pages
# set up pretty slugs and options for the other database pages
[pages.54dab6011e604430a21dc477cb8e4e3a]
slug = "film-gallery"

[pages.2604ce45890645c79f67d92833083fee]
slug = "books-table"
[pages.ae0a85c527824a3a855b7f4d31f4e0fc]
slug = "random-board"

# don't follow any links on this page, so sub-pages linked from it are not parsed
# useful for large tables where we don't want an individual page for each item
no-links = true

[pages.a28dba2e7a67448da52f2cd2c641407b]
slug = "random-board"
no-links = true
62 changes: 39 additions & 23 deletions loconotion/notionparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,36 +576,52 @@ def injects_custom_tags(section):

# find sub-pages and clean slugs / links
sub_pages = []
parse_links = not self.get_page_config(url).get("no-links", False)
for a in soup.find_all('a', href=True):
sub_page_href = a["href"]
if sub_page_href.startswith("/"):
sub_page_href = "https://www.notion.so" + a["href"]
if sub_page_href.startswith("https://www.notion.so/"):
# if the link is an anchor link,
# check if the page hasn't already been parsed
if "#" in sub_page_href:
sub_page_href_tokens = sub_page_href.split("#")
sub_page_href = sub_page_href_tokens[0]
a["href"] = "#" + sub_page_href_tokens[-1]
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
if (
sub_page_href in processed_pages.keys()
or sub_page_href in sub_pages
):
log.debug(
f"Original page for anchor link {sub_page_href}"
" already parsed / pending parsing, skipping"
if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
# if the link is an anchor link,
# check if the page hasn't already been parsed
if "#" in sub_page_href:
sub_page_href_tokens = sub_page_href.split("#")
sub_page_href = sub_page_href_tokens[0]
a["href"] = "#" + sub_page_href_tokens[-1]
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
if (
sub_page_href in processed_pages.keys()
or sub_page_href in sub_pages
):
log.debug(
f"Original page for anchor link {sub_page_href}"
" already parsed / pending parsing, skipping"
)
continue
else:
a["href"] = (
self.get_page_slug(sub_page_href)
if sub_page_href != index
else "index.html"
)
continue
sub_pages.append(sub_page_href)
log.debug(f"Found link to page {a['href']}")
else:
a["href"] = (
self.get_page_slug(sub_page_href)
if sub_page_href != index
else "index.html"
)
sub_pages.append(sub_page_href)
log.debug(f"Found link to page {a['href']}")

# if the page is set not to follow any links, strip the href
# do this only on children of .notion-scroller, we don't want
# to strip the links from the top nav bar
log.debug(f"Stripping link for {a['href']}")
del a["href"]
a.name = "span"
# remove pointer cursor styling on the link and all children
for child in ([a] + a.find_all()):
if (child.has_attr("style")):
style = cssutils.parseStyle(child['style'])
style['cursor'] = "default"
child['style'] = style.cssText


# exports the parsed page
html_str = str(soup)
html_file = self.get_page_slug(url) if url != index else "index.html"
Expand Down

0 comments on commit 4c4c506

Please sign in to comment.