This repository has been archived by the owner on Dec 4, 2023. It is now read-only.

Commit

Merge pull request #51 from benjwadams/waf_link_fixes
WAF link fixes
benjwadams committed Aug 3, 2018
2 parents 9d942a9 + a39883c commit 6d7780c
Showing 2 changed files with 23 additions and 5 deletions.
catalog_harvesting/erddap_waf_parser.py (20 additions, 5 deletions)
@@ -5,6 +5,8 @@
 
 from catalog_harvesting.waf_parser import WAFParser
 from bs4 import BeautifulSoup
+import six
+from distutils.version import LooseVersion
 
 
 class ERDDAPWAFParser(WAFParser):
@@ -13,10 +15,23 @@ def get_links(self, content):
         '''
         Returns a list of tuples href, text for each anchor in the document
         '''
-        retval = []
         soup = BeautifulSoup(content, 'html.parser')
-        for link in soup.find('pre').find_all('a'):
-            if link.text.endswith('.xml'):
-                retval.append((link.get('href'), link.text))
-        return retval
+        raw_ver = soup.find(text=lambda t: 'ERDDAP, Version ' in t)
+        # could likely equivalently check for None here
+        if not isinstance(raw_ver, six.string_types):
+            ver_full = None
+        else:
+            try:
+                ver_full = LooseVersion(raw_ver.strip().rsplit()[-1])
+            except:
+                # TODO: add warnings
+                ver_full = None
+
+        if ver_full is None or ver_full < LooseVersion('1.82'):
+            # if the ERDDAP version is less than 1.82, the attributes are stored in a <pre>
+            link_container = soup.find('pre')
+        else:
+            link_container = soup.find('div', {'class': 'standard_width'}).find('table')
+
+        return [(link.get('href'), link.text) for link in
+                link_container.find_all('a', text=lambda t: t.endswith('.xml'))]
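
For context, here is a minimal standalone sketch (not part of the commit) of the container-selection logic the new get_links() relies on. The xml_links helper and both HTML strings are invented for illustration; they only approximate the pre-1.82 (anchors inside a <pre>) and 1.82+ (table inside div.standard_width) ERDDAP listing layouts.

# Standalone sketch, not part of the commit: the HTML strings are invented
# approximations of the two ERDDAP directory-listing layouts.
from bs4 import BeautifulSoup
from distutils.version import LooseVersion

OLD_STYLE = ('<html><body>ERDDAP, Version 1.80'
             '<pre><a href="a.xml">a.xml</a> <a href="readme.txt">readme.txt</a></pre>'
             '</body></html>')

NEW_STYLE = ('<html><body>ERDDAP, Version 1.82'
             '<div class="standard_width"><table><tr><td>'
             '<a href="b.xml">b.xml</a></td></tr></table></div>'
             '</body></html>')

def xml_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    raw_ver = soup.find(text=lambda t: 'ERDDAP, Version ' in t)
    ver = LooseVersion(raw_ver.strip().rsplit()[-1]) if raw_ver else None
    if ver is None or ver < LooseVersion('1.82'):
        container = soup.find('pre')  # old layout: anchors live in a <pre>
    else:
        container = soup.find('div', {'class': 'standard_width'}).find('table')
    return [(a.get('href'), a.text)
            for a in container.find_all('a', text=lambda t: t.endswith('.xml'))]

print(xml_links(OLD_STYLE))  # [('a.xml', 'a.xml')]
print(xml_links(NEW_STYLE))  # [('b.xml', 'b.xml')]

LooseVersion compares dotted components numerically, so '1.80' sorts below '1.82' while '1.100' would sort above it, which a plain string comparison would get wrong.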
catalog_harvesting/waf_parser.py (3 additions, 0 deletions)
@@ -61,6 +61,9 @@ def _parse(self, url, documents, depth, maxdepth):
         links = self.get_links(response.content)
         follow = []
         for link, text in links:
+            # Some links might not have href. Skip them.
+            if link is None:
+                continue
             # Deal with relative links
             if link.startswith('..'):
                 continue
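
For context, a small illustration (not part of the commit) of why the guard matters: anchors without an href attribute, such as named anchors, return None from link.get('href'), and the link.startswith('..') call just below would then raise an AttributeError.

# Illustration only, not part of the commit: anchors without an href
# yield None from .get('href'), which the new guard now skips.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a name="top">top</a><a href="data.xml">data.xml</a>',
                     'html.parser')
hrefs = [a.get('href') for a in soup.find_all('a')]
print(hrefs)  # [None, 'data.xml'] -- None would break link.startswith('..')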
