Skip to content

Commit

Permalink
Merge pull request #12 from hsci-r/hs_fetch
Browse files: browse the repository at this point in the history
Error handling for HS
  • Loading branch information
UMTti committed Dec 22, 2021
2 parents 696afa2 + 1f9fbe4 commit b3a4736
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
9 changes: 7 additions & 2 deletions finnish_media_scrapers/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

from pyppeteer.browser import Page

from pyppeteer.errors import NetworkError

async def prepare_session_hs(
session: Page,
Expand Down Expand Up @@ -62,7 +62,12 @@ async def fetch_article_hs(
str: the HTML of the article
"""
max_web_driver_wait = 1000 * max_web_driver_wait
await session.goto(url, timeout=max_web_driver_wait)
try:
await session.goto(url, timeout=max_web_driver_wait)
except NetworkError as network_exception:
raise ValueError(
f"The page doesn't exist for {url}."
) from network_exception
try:
main_content = await session.waitForXPath("//div[@id='page-main-content']/following-sibling::*", timeout=max_web_driver_wait)
tag_name = await (await main_content.getProperty('tagName')).jsonValue()
Expand Down
6 changes: 5 additions & 1 deletion finnish_media_scrapers/scripts/fetch_hs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,12 @@ async def _amain():
url = article['url']
file = os.path.join(args.output, str(article['id'])+".html")
if not os.path.exists(file):
article = await fetch_article_hs(
try:
article = await fetch_article_hs(
session, url, args.max_web_driver_wait)
except Exception as e:
logging.info(e)
continue
with open(file, "w") as article_file:
article_file.write(
"<!DOCTYPE html><head><meta charset='utf-8'></head>" + article + "</html>")
Expand Down

0 comments on commit b3a4736

Please sign in to comment.