Commit fa9e534

With data saving - title only at this point and an MD5 to use as unique id

Ian Ibbotson committed Aug 16, 2015
1 parent f76e216
Showing 1 changed file with 87 additions and 67 deletions.
154 changes: 87 additions & 67 deletions scraper.py
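
In outline, the commit makes scrape_resource_page return a dict of scraped fields and derives a stable unique id by MD5-hashing the record title, so repeated runs upsert into the morph.io sqlite store instead of duplicating rows. A minimal sketch of the keying scheme (Python 2 to match the scraper; save_resource and the sample title are illustrative, not part of the commit):

    import hashlib
    import scraperwiki

    def save_resource(resource_properties):
        # Derive a stable key from the Title field (assumed present here)
        m = hashlib.md5()
        m.update(resource_properties['Title'])
        resource_properties['hashCode'] = m.hexdigest()
        # unique_keys tells scraperwiki to upsert on hashCode
        scraperwiki.sqlite.save(unique_keys=['hashCode'], data=resource_properties)

    save_resource({'Title': 'A sample title'})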
@@ -1,16 +1,6 @@
 # This is a template for a Python scraper on morph.io (https://morph.io)
 # including some code snippets below that you should find helpful
 
-# import scraperwiki
-# import lxml.html
-#
-# # Read in a page
-# html = scraperwiki.scrape("http://foo.com")
-#
-# # Find something on the page using css selectors
-# root = lxml.html.fromstring(html)
-# root.cssselect("div[align='left']")
-#
 # # Write out to the sqlite database using scraperwiki library
 # scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
 #
@@ -24,11 +14,19 @@
 # called "data".
 import scraperwiki
 import lxml.html
+import hashlib
 from splinter import Browser
 import sys, traceback, logging, shutil, platform
 
 dev_mode = False;
 
+marc_extract_rules = {
+    '245' : {
+        'targetColumn' : 'Title',
+        'allowRepeated' : False
+    }
+}
+
 
 def select_full_holidings_and_marc_tags(browser):
     print 'selecting full holdings and marc tags'
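
For context, marc_extract_rules maps a MARC tag to an output column; 245 is the MARC Title Statement field, hence 'Title'. A standalone sketch of the lookup it drives further down (apply_rule and the sample row are illustrative):

    marc_extract_rules = {
        '245' : { 'targetColumn' : 'Title', 'allowRepeated' : False }
    }

    def apply_rule(tag, value, resource_properties):
        # Copy the value into the configured column when a rule exists for the tag
        action = marc_extract_rules.get(tag)
        if action is not None:
            resource_properties[action['targetColumn']] = value

    props = {}
    apply_rule('245', 'Example title /', props)
    print props   # {'Title': 'Example title /'}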
@@ -49,13 +47,13 @@ def select_full_holidings_and_marc_tags(browser):
     print 'done'
     return
 
-def scrape_item_info(browser):
+def scrape_item_info(browser, resource_properties):
     print 'Getting item info'
     # bibinfo_div = driver.find_element_by_name('bibinfo') - It's class not name
     # title = bibinfo_div.find_element_by
     return
 
-def scrape_catalog_info(browser):
+def scrape_catalog_info(browser, resource_properties):
     print 'Getting catalog info'
 
     browser.find_by_id('tab3').first.click()
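
The scraping code below leans on a handful of splinter calls. Roughly, they behave as follows, assuming the phantomjs driver that morph.io provided at the time (the URL is a placeholder):

    from splinter import Browser

    browser = Browser('phantomjs')              # headless browser behind ghostdriver
    browser.visit('http://example.com')         # placeholder URL
    browser.fill('searchdata1', 'a$')           # fill a form field by its name
    if browser.is_element_present_by_id('VIEW1', wait_time=30):
        browser.find_by_id('VIEW1').first.click()   # finders return element lists
    browser.quit()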
@@ -69,25 +67,41 @@ def scrape_catalog_info(browser, resource_properties):
     trs = raw_data_table.find_by_tag("tr")
     for row in trs:
         marc_tag = row.find_by_xpath("./th")
         # print "processing tag %s" % marc_tag.text()
         indicators = row.find_by_xpath("./td[1]")
         tag_content = row.find_by_xpath("./td[2]")
-        print 'Handling content %s' % tag_content.text
+        # print 'Handling content %s' % tag_content.text
         inner_anchor = tag_content.find_by_xpath("./a")
         v = None
         if len(inner_anchor) == 1 :
-            value = inner_anchor[0].text
+            v = inner_anchor[0].text
         else:
             v = tag_content.text
 
-        print "Got tag %s indicators %s value %s" % ( marc_tag.text, indicators.text, v )
+        # print "Got tag %s indicators %s value %s" % ( marc_tag.text, indicators.text, v )
+
+        action = marc_extract_rules.get(marc_tag.text)
+
+        if action is not None :
+            print 'Processing', marc_tag.text, 'as ', action['targetColumn'], 'Set to', v
+            resource_properties[action['targetColumn']] = v
 
     return
 
 def scrape_resource_page(browser) :
     print 'scraping a resource'
-    scrape_item_info(browser)
-    scrape_catalog_info(browser)
-    return
+    resource_properties = {}
+    scrape_item_info(browser, resource_properties)
+    scrape_catalog_info(browser, resource_properties)
+
+    # Make a key from the title [And some other fields to make the md5 unique]
+    if resource_properties.get('Title') is not None :
+        m = hashlib.md5()
+        m.update(resource_properties.get('Title'))
+        resource_properties['hashCode'] = m.hexdigest()
+    else :
+        print 'No title - cannot md5 it'
+
+    return resource_properties
 
 def report_module(name):
     inf = sys.modules[name]
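
One hedged note on the hashing in scrape_resource_page: under Python 2, which this scraper targets, md5's update() accepts a plain str, but splinter can hand back unicode, and a non-ASCII title would then need encoding before hashing, e.g.:

    import hashlib

    title = u'Les misérables'
    m = hashlib.md5()
    m.update(title.encode('utf-8'))   # encode unicode before hashing
    print m.hexdigest()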
@@ -129,73 +143,79 @@ def scrape_ibistro() :
         # Say we want to search by title
         browser.select("srchfield1", "TI^TITLE^SERIES^Title Processing^title")
 
-        # find the search input text control
-        # Send the query a$ [Or the word above]
-        browser.fill("searchdata1", "a$")
-        # Send enter to cause the search to execute
-        button = browser.find_by_xpath(
-            '//input[@class="searchbutton" and @value="Search"]'
-        ).first
-        button.click()
+        scrape_a_letter(browser, 'a')
 
-        print 'Waiting for first item in results page to appear'
+    except:
+        print "Unexpected error:", sys.exc_info()
+        # logging.exception("Error")
+        # traceback.print_exc()
+        exc_type, exc_value, exc_traceback = sys.exc_info()
+        traceback.print_tb(exc_traceback)
 
-        # Wait for the search results page to finish loading
-        if not browser.is_element_present_by_id('VIEW1', wait_time=30):
-            raise Exception('Failed to find VIEW1')
+    # Eek this is __dirty__ - copy the ghostdriver.log to stdout so it appears on the morph.io screen for easy [easier] debugging
+    if dev_mode is True:
+        with open("ghostdriver.log", "r") as f:
+            shutil.copyfileobj(f, sys.stdout)
 
-        # Debugging
-        if dev_mode:
-            browser.save_screenshot('screen_0002.png') # save a screenshot to disk
+    return
 
-        print 'Clicking button with name VIEW^1'
 
-        # Now click the details button for search result 1
-        browser.find_by_name('VIEW^1').first.click()
-        # Currently blows up here due to problem with phantomjs 1.9.0
-
-        print 'Waiting for details page to finish loading'
+def scrape_a_letter(browser, letter) :
+    # find the search input text control
+    # Send the query a$ [Or the word above]
+    browser.fill("searchdata1", letter+"$")
+    # Send enter to cause the search to execute
+    button = browser.find_by_xpath(
+        '//input[@class="searchbutton" and @value="Search"]'
+    ).first
+    button.click()
 
-        # Wait for the details page to finish loading
-        if browser.is_element_present_by_name('VOPTIONS', wait_time=15):
-            print 'Got full details page'
+    print 'Waiting for first item in results page to appear'
 
-        print 'got form_type input control.. good to continue'
+    # Wait for the search results page to finish loading
+    if not browser.is_element_present_by_id('VIEW1', wait_time=30):
+        raise Exception('Failed to find VIEW1')
 
-        if dev_mode:
-            browser.save_screenshot('screen_0003.png') # save a screenshot to disk
+    # Debugging
+    if dev_mode:
+        browser.save_screenshot('screen_0002.png') # save a screenshot to disk
 
-        select_full_holidings_and_marc_tags(browser)
+    print 'Clicking button with name VIEW^1'
 
-        if dev_mode:
-            browser.save_screenshot('screen_0005.png') # save a screenshot to disk
+    # Now click the details button for search result 1
+    browser.find_by_name('VIEW^1').first.click()
 
-        scrape_resource_page(browser)
+    print 'Waiting for details page to finish loading'
 
-        while browser.is_element_present_by_name('SCROLL^F', wait_time=15):
-            print 'Moving to next record'
-            next_link = browser.find_by_name('SCROLL^F');
-            next_link.click()
-            scrape_resource_page(browser)
+    # Wait for the details page to finish loading
+    if browser.is_element_present_by_name('VOPTIONS', wait_time=15):
+        print 'Got full details page'
 
+    print 'got form_type input control.. good to continue'
 
+    if dev_mode:
+        browser.save_screenshot('screen_0003.png') # save a screenshot to disk
 
-    except:
-        print "Unexpected error:", sys.exc_info()
-        # logging.exception("Error")
-        # traceback.print_exc()
-        exc_type, exc_value, exc_traceback = sys.exc_info()
-        traceback.print_tb(exc_traceback)
+    select_full_holidings_and_marc_tags(browser)
 
-    # Eek this is __dirty__ - copy the ghostdriver.log to stdout so it appears on the morph.io screen for easy [easier] debugging
-    if dev_mode is True:
-        with open("ghostdriver.log", "r") as f:
-            shutil.copyfileobj(f, sys.stdout)
+    if dev_mode:
+        browser.save_screenshot('screen_0005.png') # save a screenshot to disk
 
+    data = scrape_resource_page(browser)
 
+    while browser.is_element_present_by_name('SCROLL^F', wait_time=15):
+        if data is not None :
+            scraperwiki.sqlite.save(unique_keys=['hashCode'], data=data)
+            print 'Processing data = ', data
+            print 'Moving to next record'
+        else :
+            print("** NO DATA **");
+
+        next_link = browser.find_by_name('SCROLL^F');
+        next_link.click()
+        data = scrape_resource_page(browser)
 
     # Extra notes
     # https://realpython.com/blog/python/headless-selenium-testing-with-python-and-phantomjs/
     return
 
 print 'DoIt'
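
One observation on the new save loop in scrape_a_letter: a record is written only while a SCROLL^F link is still present, so the last record of a result set is scraped but never saved. A hedged restructuring that saves before probing for the next page (walk_results is illustrative, not part of the commit):

    def walk_results(browser):
        data = scrape_resource_page(browser)
        while True:
            if data is not None:
                scraperwiki.sqlite.save(unique_keys=['hashCode'], data=data)
            if not browser.is_element_present_by_name('SCROLL^F', wait_time=15):
                break   # no further records
            browser.find_by_name('SCROLL^F').first.click()
            data = scrape_resource_page(browser)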
