scraper.py
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
#
# # Write out to the sqlite database using the scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
#
# You don't have to use the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory, which has at least a table
# called "data".
# For other datastore options see https://classic.scraperwiki.com/docs/python/python_datastore_guide/
import scraperwiki
import lxml.html
import hashlib
import re
from splinter import Browser
import sys, traceback, logging, shutil, platform
from string import ascii_lowercase

dev_mode = False

# This is used to create a version of the tag content with the awful pipe
# subfield indicator substitution stripped out
subfield_indicator_regex = re.compile(ur"\|.", re.UNICODE)
# subfield_indicator_regex = re.compile(r"(\\|b)")
# print 'testing regex', subfield_indicator_regex.sub('XXX','a |b c |s d |d')

# Maps a MARC tag to the output column it should populate
marc_extract_rules = {
    '245': {
        'targetColumn': 'Title',
        'allowRepeated': False
    }
}
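# The rules table above could be extended with further MARC tags if more columns
# are wanted in the output. A hypothetical example (the tags are standard MARC,
# but these columns are not part of the original rules) might look like:
#
# marc_extract_rules['100'] = {'targetColumn': 'Author', 'allowRepeated': False}
# marc_extract_rules['260'] = {'targetColumn': 'Publication', 'allowRepeated': False}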
def select_full_holdings_and_marc_tags(browser):
    print 'selecting full holdings and marc tags'
    browser.find_by_name('VOPTIONS').first.click()
    if not browser.is_element_present_by_name('SCROLL^S', wait_time=30):
        raise Exception("Failed to find SCROLL^S")
    browser.select("vopt_unformatted", "Y")
    browser.select("vopt_elst", "ALL")
    if dev_mode:
        browser.screenshot('screen_0004.png')
    browser.find_by_name('SCROLL^S').first.click()
    if browser.is_element_present_by_name("VOPTIONS", wait_time=15):
        print 'Got full details page'
    print 'done'
    return
def scrape_item_info(browser, resource_properties):
    print 'Getting item info'
    # bibinfo_div = driver.find_element_by_name('bibinfo') - it's a class, not a name
    # title = bibinfo_div.find_element_by
    return
def scrape_catalog_info(browser, resource_properties):
    print 'Getting catalog info'
    browser.find_by_id('tab3').first.click()
    # Do we need to wait for the content?
    print 'Get marc_data table'
    raw_data_table = browser.find_by_xpath(
        "//a[@name='marc_data']/following-sibling::table"
    )
    trs = raw_data_table.find_by_tag("tr")
    for row in trs:
        marc_tag = row.find_by_xpath("./th")
        indicators = row.find_by_xpath("./td[1]")
        tag_content = row.find_by_xpath("./td[2]")
        # print 'Handling content %s' % tag_content.text
        inner_anchor = tag_content.find_by_xpath("./a")
        v = None
        if len(inner_anchor) == 1:
            # v = inner_anchor[0].text.decode('windows-1252','replace')
            # v = unicode(inner_anchor[0].text, 'windows-1252', 'ignore')
            v = inner_anchor[0].text
        else:
            v = tag_content.text
        # decoded_v = unicode(v,'latin-1')
        decoded_v = v
        # print "Got tag %s indicators %s value %s" % (marc_tag.text, indicators.text, decoded_v)
        action = marc_extract_rules.get(marc_tag.text)
        if action is not None:
            # print 'Processing', marc_tag.text, 'as', action['targetColumn'], 'Set to', decoded_v
            # iBistro says it's sending us UTF-8 in the header, but then nicely passes
            # windows-1252. Attempt to work around by calling decode.
            resource_properties[action['targetColumn']] = subfield_indicator_regex.sub('', decoded_v)
    return
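# The commented-out decode calls above hint at the encoding problem: the page
# header claims UTF-8 but the bytes look like windows-1252. A minimal sketch of
# one possible workaround (assuming the raw value arrives as a byte string;
# untested against the live catalogue) would be:
#
# def decode_tag_content(raw):
#     if isinstance(raw, str):
#         return raw.decode('windows-1252', 'replace')
#     return raw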
def scrape_resource_page(browser):
    print 'scraping a resource'
    resource_properties = {}
    scrape_item_info(browser, resource_properties)
    scrape_catalog_info(browser, resource_properties)
    # Make a key from the title [and some other fields to make the md5 unique]
    if resource_properties.get('Title') is not None:
        m = hashlib.md5()
        # This only works on ascii characters -
        m.update(resource_properties.get('Title').encode('ascii', 'ignore'))
        resource_properties['hashCode'] = m.hexdigest()
    else:
        print "No title - can't md5 it"
    return resource_properties
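# The hashCode computed above is later used as the unique key when rows are
# written with scraperwiki.sqlite.save(unique_keys=['hashCode'], ...), so
# re-scraping the same title updates the existing row rather than duplicating it.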
def report_module(name):
    inf = sys.modules[name]
    if hasattr(inf, '__version__'):
        print name, inf.__version__
    else:
        print "No version info for ", name
def scrape_ibistro():
    try:
        print "platform %s" % platform.system()
        print "Python ", sys.version_info
        report_module('splinter')
        report_module('scraperwiki')
        with Browser('phantomjs') as browser:
            # Work through every one- to four-letter search prefix (a, ab, abc, abcd, ...)
            for a in ascii_lowercase:
                scrape_a_letter(browser, a)
                for b in ascii_lowercase:
                    scrape_a_letter(browser, a + b)
                    for c in ascii_lowercase:
                        scrape_a_letter(browser, a + b + c)
                        for d in ascii_lowercase:
                            scrape_a_letter(browser, a + b + c + d)
                            scraperwiki.sqlite.save_var('completed_prefix', a + b + c + d)
    except:
        print "Unexpected error:", sys.exc_info()
        # logging.exception("Error")
        # traceback.print_exc()
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_tb(exc_traceback)
        # Eek this is __dirty__ - copy the ghostdriver.log to stdout so it appears
        # on the morph.io screen for easy [easier] debugging
        if dev_mode is True:
            with open("ghostdriver.log", "r") as f:
                shutil.copyfileobj(f, sys.stdout)
    return
def scrape_a_letter(browser, letter):
    print 'scraping', letter
    browser.visit('http://library.sheffield.gov.uk/uhtbin/webcat')
    print 'starting'
    print "Looking for power search button"
    power_search_button = browser.find_by_xpath('//a[contains(text(),"Power Search")]').first
    power_search_button.click()
    browser.select("match_on", "PARTIAL")
    # match_on PARTIAL adds the wildcard for us [and never tells us] and terminates
    # the string at that point *bangs head*
    browser.fill("searchdata3", letter)
    # Click the search button to execute the search
    button = browser.find_by_xpath(
        '//input[@class="searchbutton" and @value="Search"]'
    ).first
    button.click()
    print 'Waiting for first item in results page to appear'
    try:
        # Wait for the search results page to finish loading
        if not browser.is_element_present_by_id('VIEW1', wait_time=100):
            raise Exception('Failed to find VIEW1')
        # Debugging
        if dev_mode:
            browser.screenshot('screen_0002.png')  # save a screenshot to disk
        print 'Clicking button with name VIEW^1'
        # Now click the details button for search result 1
        browser.find_by_name('VIEW^1').first.click()
        print 'Waiting for details page to finish loading'
        # Wait for the details page to finish loading
        if browser.is_element_present_by_name('VOPTIONS', wait_time=60):
            print 'Got full details page'
        print 'got form_type input control.. good to continue'
        if dev_mode:
            browser.screenshot('screen_0003.png')  # save a screenshot to disk
        select_full_holdings_and_marc_tags(browser)
        if dev_mode:
            browser.screenshot('screen_0005.png')  # save a screenshot to disk
        data = scrape_resource_page(browser)
        try:
            # Keep following the 'next record' (SCROLL^F) link until it disappears
            while browser.is_element_present_by_name('SCROLL^F', wait_time=60):
                if data is not None:
                    scraperwiki.sqlite.save(unique_keys=['hashCode'], data=data)
                    print 'Processing data = ', data
                    print 'Moving to next record'
                else:
                    print "** NO DATA **"
                next_link = browser.find_by_name('SCROLL^F')
                next_link.click()
                data = scrape_resource_page(browser)
        except:
            print "Exception looking for NEXT - looks like we reached the end of the results - on to the next prefix"
    except:
        print "Possibly the search returned no results.. continue"
    print "Looks like we reached the end of the next page links..."
    return
print 'DoIt'
scrape_ibistro()
print 'DoneIt'