Merge pull request #12 from hsci-r/hs_fetch

Add error handling for HS article fetching
hsci-r · Dec 22, 2021 · b3a4736
2 parents 696afa2 + 1f9fbe4
commit b3a4736
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 3 deletions.
diff --git a/finnish_media_scrapers/fetch.py b/finnish_media_scrapers/fetch.py
@@ -5,7 +5,7 @@
 """
 
 from pyppeteer.browser import Page
-
+from pyppeteer.errors import NetworkError
 
 async def prepare_session_hs(
         session: Page,
@@ -62,7 +62,12 @@ async def fetch_article_hs(
         str: the HTML of the article
     """
     max_web_driver_wait = 1000 * max_web_driver_wait
-    await session.goto(url, timeout=max_web_driver_wait)
+    try:
+        await session.goto(url, timeout=max_web_driver_wait)
+    except NetworkError as network_exception:
+        raise ValueError(
+            f"The page doesn't exist for {url}."
+        ) from network_exception
     try:
         main_content = await session.waitForXPath("//div[@id='page-main-content']/following-sibling::*", timeout=max_web_driver_wait)
         tag_name = await (await main_content.getProperty('tagName')).jsonValue()

diff --git a/finnish_media_scrapers/scripts/fetch_hs.py b/finnish_media_scrapers/scripts/fetch_hs.py
@@ -57,8 +57,12 @@ async def _amain():
                     url = article['url']
                     file = os.path.join(args.output, str(article['id'])+".html")
                     if not os.path.exists(file):
-                        article = await fetch_article_hs(
+                        try:
+                            article = await fetch_article_hs(
                             session, url, args.max_web_driver_wait)
+                        except Exception as e:
+                            logging.info(e)
+                            continue
                         with open(file, "w") as article_file:
                             article_file.write(
                                 "<!DOCTYPE html><head><meta charset='utf-8'></head>" + article + "</html>")