This repository has been archived by the owner on Dec 4, 2023. It is now read-only.

WAF link fixes #51

Merged: 3 commits, Aug 3, 2018
25 changes: 20 additions & 5 deletions catalog_harvesting/erddap_waf_parser.py
@@ -5,6 +5,8 @@

 from catalog_harvesting.waf_parser import WAFParser
 from bs4 import BeautifulSoup
+import six
+from distutils.version import LooseVersion


 class ERDDAPWAFParser(WAFParser):
@@ -13,10 +15,23 @@ def get_links(self, content):
         '''
         Returns a list of tuples (href, text) for each anchor in the document
         '''
-        retval = []
         soup = BeautifulSoup(content, 'html.parser')
-        for link in soup.find('pre').find_all('a'):
-            if link.text.endswith('.xml'):
-                retval.append((link.get('href'), link.text))
-        return retval
+        raw_ver = soup.find(text=lambda t: 'ERDDAP, Version ' in t)
+        # could likely equivalently check for None here
+        if not isinstance(raw_ver, six.string_types):
+            ver_full = None
+        else:
+            try:
+                ver_full = LooseVersion(raw_ver.strip().rsplit()[-1])
+            except:
+                # TODO: add warnings
+                ver_full = None
+
+        if ver_full is None or ver_full < LooseVersion('1.82'):
+            # if the ERDDAP version is less than 1.82, the links are stored in a <pre>
+            link_container = soup.find('pre')
+        else:
+            link_container = soup.find('div', {'class': 'standard_width'}).find('table')

+        return [(link.get('href'), link.text) for link in
+                link_container.find_all('a', text=lambda t: t.endswith('.xml'))]
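
To make the new behavior concrete, here is a minimal standalone sketch (not part of the PR, run against simplified stand-in HTML rather than verbatim ERDDAP output) of the version-dependent parsing the diff introduces: ERDDAP releases before 1.82 list the files inside a <pre>, while 1.82 and later wrap them in a table inside <div class="standard_width">.

from bs4 import BeautifulSoup
from distutils.version import LooseVersion

# Simplified, assumed shapes for pre-1.82 and 1.82+ WAF pages.
OLD_WAF = '''<pre>ERDDAP, Version 1.80
<a href="dataset_a.xml">dataset_a.xml</a>
<a href="index.html">index.html</a></pre>'''

NEW_WAF = '''<p>ERDDAP, Version 1.82</p>
<div class="standard_width"><table>
<tr><td><a href="dataset_b.xml">dataset_b.xml</a></td></tr>
</table></div>'''

for html in (OLD_WAF, NEW_WAF):
    soup = BeautifulSoup(html, 'html.parser')
    # Sniff the version string the same way the parser does.
    raw_ver = soup.find(text=lambda t: 'ERDDAP, Version ' in t)
    ver = LooseVersion(raw_ver.strip().rsplit()[-1])
    if ver < LooseVersion('1.82'):
        container = soup.find('pre')
    else:
        container = soup.find('div', {'class': 'standard_width'}).find('table')
    print(ver, [(a.get('href'), a.text) for a in
                container.find_all('a', text=lambda t: t.endswith('.xml'))])
# prints:
#   1.80 [('dataset_a.xml', 'dataset_a.xml')]
#   1.82 [('dataset_b.xml', 'dataset_b.xml')]

LooseVersion is used rather than a plain string comparison so that, e.g., a hypothetical version '1.9' compares as less than '1.82' (lexically '9' > '8', but numerically 9 < 82).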
3 changes: 3 additions & 0 deletions catalog_harvesting/waf_parser.py
@@ -61,6 +61,9 @@ def _parse(self, url, documents, depth, maxdepth):
         links = self.get_links(response.content)
         follow = []
         for link, text in links:
+            # Some links might not have an href. Skip them.
+            if link is None:
+                continue
             # Deal with relative links
             if link.startswith('..'):
                 continue
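
For the waf_parser.py guard above, a tiny sketch (again with simplified, assumed HTML) of why link can be None: BeautifulSoup's Tag.get('href') returns None when an anchor carries no href attribute, so the subsequent link.startswith('..') would have raised AttributeError.

from bs4 import BeautifulSoup

# An anchor with a name but no href, as can appear in a WAF listing.
soup = BeautifulSoup('<a name="top">back to top</a>', 'html.parser')
link = soup.find('a').get('href')
print(link)  # None
# Without the new guard, link.startswith('..') would raise AttributeError.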