Skip to content

Commit

Permalink
Fixes for changed schema.org HTML
Browse files (browse the repository at this point in the history)
  • Loading branch information
cygri committed Dec 6, 2011
1 parent cbb04cf commit 9a890f0
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
2 changes: 1 addition & 1 deletion scrapers/run.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ def check_lines(file, expected)
output = `wc -l #{file}`
abort "#{file} was not created. Aborting!" if !output
abort "wc -l failed on #{file}. Aborting!" unless m = /\d+/.match(output)
abort "#{file} has less than #{expected} lines (#{m[0]}. Aborting!" if m[0].to_i < expected
abort "#{file} has less than #{expected} lines (#{m[0]}). Aborting!" if m[0].to_i < expected
end

abort "Usage: run.rb tmp_dir target_dir" if ARGV.length != 2
Expand Down
10 changes: 7 additions & 3 deletions scrapers/schema_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ def parse(url):
def get_all_type_ids():
root = parse(full_docs_url)
types = []
for a in root.cssselect("a[name]"):
types.append(a.getnext().text_content())
for a in root.cssselect("table.h a[href]"):
id = a.text_content()
if id[-1] == '*': continue
types.append(id)
return types

def get_inner_html(el):
Expand Down Expand Up @@ -58,6 +60,8 @@ def get_type_details(url):
el = el.getnext()
if type['comment'] == None:
print >> sys.stderr, 'WARNING: No comment in type ' + id
type['comment'] = type['comment'].strip()
type['comment_plain'] = type['comment_plain'].strip()
type['instances'] = []
type['subtypes'] = []
for section in root.cssselect("h3"):
Expand Down Expand Up @@ -89,7 +93,7 @@ def get_type_details(url):
'id': name,
'label': get_label(name),
'domains': [id],
'ranges': row.cssselect("td.prop-ect")[0].text_content().split(' or '),
'ranges': re.sub('\s+', ' ', row.cssselect("td.prop-ect")[0].text_content()).strip().split(' or '),
'comment': get_inner_html(comment),
'comment_plain': comment.text_content()
})
Expand Down

0 comments on commit 9a890f0

Please sign in to comment.