From 60707b8bee70def90f1165b7b968ede878d5e399 Mon Sep 17 00:00:00 2001 From: Benjamin Adams Date: Thu, 2 Aug 2018 17:24:35 -0400 Subject: [PATCH 1/3] Ignores links with no href attr --- catalog_harvesting/waf_parser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/catalog_harvesting/waf_parser.py b/catalog_harvesting/waf_parser.py index 72c8b44..b3e80af 100644 --- a/catalog_harvesting/waf_parser.py +++ b/catalog_harvesting/waf_parser.py @@ -61,6 +61,9 @@ def _parse(self, url, documents, depth, maxdepth): links = self.get_links(response.content) follow = [] for link, text in links: + # Some links might not have href. Skip them. + if link is None: + continue # Deal with relative links if link.startswith('..'): continue From f449f3e2a4adeddd0164bfe059409c161671c91d Mon Sep 17 00:00:00 2001 From: Benjamin Adams Date: Thu, 2 Aug 2018 17:24:52 -0400 Subject: [PATCH 2/3] Conditional handling for ERDDAP WAFs Determines the ERDDAP version running by scraping the HTML, then adds conditional handling to fetch ERDDAP WAF links depending on the version running: version < 1.82: Fetches links from
 <pre> element
 version >= 1.82: Fetches links from <a> elements inside <table>
element --- catalog_harvesting/erddap_waf_parser.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/catalog_harvesting/erddap_waf_parser.py b/catalog_harvesting/erddap_waf_parser.py index e7903cf..54640e6 100644 --- a/catalog_harvesting/erddap_waf_parser.py +++ b/catalog_harvesting/erddap_waf_parser.py @@ -5,6 +5,9 @@ from catalog_harvesting.waf_parser import WAFParser from bs4 import BeautifulSoup +import six +import re +from distutils.version import LooseVersion class ERDDAPWAFParser(WAFParser): @@ -15,7 +18,24 @@ def get_links(self, content): ''' retval = [] soup = BeautifulSoup(content, 'html.parser') - for link in soup.find('pre').find_all('a'): + raw_ver = soup.find(text=re.compile('ERDDAP, Version .*$')) + # could likely equivalently check for None here + if not isinstance(raw_ver, six.string_types): + ver_full = None + else: + try: + ver_full = LooseVersion(raw_ver.strip().rsplit()[-1]) + except: + # TODO: add warnings + ver_full = None + + if ver_full is None or ver_full < LooseVersion('1.82'): + # if the ERDDAP version is less than 1.82, the attributes are stored in a <pre> element
+            link_container = soup.find('pre')
+        else:
+            link_container = soup.find('div', {'class': 'standard_width'}).find('table')
+        
+        for link in link_container.find_all('a'):
             if link.text.endswith('.xml'):
                 retval.append((link.get('href'), link.text))
         return retval

From a39883cd52fe2674ad3de27a335aa77aabee4e08 Mon Sep 17 00:00:00 2001
From: Benjamin Adams 
Date: Fri, 3 Aug 2018 11:55:51 -0400
Subject: [PATCH 3/3] Fixup for error conditions, remove re

Removes re module in favor of using str `in` to detect ERDDAP version.

Adds function to prevent failure if text is None when scanning for text
nodes ending in '.xml'

Fixes a couple indentation issues.
---
 catalog_harvesting/erddap_waf_parser.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/catalog_harvesting/erddap_waf_parser.py b/catalog_harvesting/erddap_waf_parser.py
index 54640e6..ec7b976 100644
--- a/catalog_harvesting/erddap_waf_parser.py
+++ b/catalog_harvesting/erddap_waf_parser.py
@@ -6,7 +6,6 @@
 from catalog_harvesting.waf_parser import WAFParser
 from bs4 import BeautifulSoup
 import six
-import re
 from distutils.version import LooseVersion
 
 
@@ -16,17 +15,16 @@ def get_links(self, content):
         '''
         Returns a list of tuples href, text for each anchor in the document
         '''
-        retval = []
         soup = BeautifulSoup(content, 'html.parser')
-	raw_ver = soup.find(text=re.compile('ERDDAP, Version .*$'))
+        raw_ver = soup.find(text=lambda t: 'ERDDAP, Version ' in t)
         # could likely equivalently check for None here
         if not isinstance(raw_ver, six.string_types):
-           ver_full = None
+            ver_full = None
         else:
-	   try:
-	       ver_full = LooseVersion(raw_ver.strip().rsplit()[-1])
-           except:
-           # TODO: add warnings
+            try:
+               ver_full = LooseVersion(raw_ver.strip().rsplit()[-1])
+            except:
+            # TODO: add warnings
                ver_full = None
 
         if ver_full is None or ver_full < LooseVersion('1.82'):
@@ -34,9 +32,6 @@ def get_links(self, content):
             link_container = soup.find('pre')
         else:
             link_container = soup.find('div', {'class': 'standard_width'}).find('table')
-        
-        for link in link_container.find_all('a'):
-            if link.text.endswith('.xml'):
-                retval.append((link.get('href'), link.text))
-        return retval
 
+        return [(link.get('href'), link.text) for link in
+                link_container.find_all('a', text=lambda t: t.endswith('.xml'))]