/
scraper.py
48 lines (44 loc) · 2.2 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import scraperwiki
import urlparse
import lxml.html
# scrape_table function: gets passed an individual page to scrape
def scrape_table(parameter_a):
    """Build one record per search result on a page, print it and save it.

    parameter_a is the parsed page to scrape; it must offer cssselect()
    (the caller in this file passes the root element returned by
    lxml.html.fromstring).

    For every <li class="search-result"> element a record is built with:

      'School name' - text content of the first <h3> inside the result
      'School URN'  - text after the last ':' in the first
                      "ul.search-result__provider-info li" item, stripped
                      of surrounding whitespace (omitted when the result
                      has no such item)

    Each record is printed and then saved to the scraperwiki datastore
    with 'School name' as its unique key. Results without an <h3> are
    skipped, because a record lacking its unique key cannot be saved
    meaningfully. Returns None.
    """
    # cssselect() returns a list of every element matching the selector.
    rows = parameter_a.cssselect("li.search-result")
    print(rows)  # debug output: the matched elements

    for row in rows:
        name_cells = row.cssselect("h3")
        if not name_cells:
            # No heading means no value for the unique key - nothing to save.
            continue

        record = {'School name': name_cells[0].text_content()}

        info_cells = row.cssselect("ul.search-result__provider-info li")
        if info_cells:
            # Keep only what follows the last colon in the item's text.
            record['School URN'] = info_cells[0].text_content().split(":")[-1].strip()

        # Print out the data we've gathered (same text on Python 2 and 3).
        print("%s %s" % (record, '------------'))

        # Finally, save the record to the datastore - 'School name' is our
        # unique key.
        # NOTE(review): two results sharing a name would overwrite each other
        # under this key; consider keying on 'School URN' - confirm before
        # changing, since it alters the stored table's index.
        scraperwiki.sqlite.save(["School name"], record)
# scrape_and_look_for_next_link function: fetches the page at the given URL
# and calls the scrape_table function on it. Following the 'next' link is
# currently disabled, so the function does not call itself again.
def scrape_and_look_for_next_link(parameter_b):
    """Download the page at parameter_b, parse it and scrape its results.

    parameter_b is the URL of a single results page. The page is fetched
    with scraperwiki.scrape, parsed with lxml.html.fromstring, and the
    resulting root element is handed to scrape_table. Returns None.

    Despite the name, no 'next' link is followed: the lookup of
    "a.pagination__next" and the recursive call are disabled, and paging
    is driven by the caller building URLs with increasing 'start' offsets.
    """
    page_source = scraperwiki.scrape(parameter_b)
    document = lxml.html.fromstring(page_source)
    scrape_table(document)
# ---------------------------------------------------------------------------
# START HERE: define the search URL (everything up to the value of the
# 'start' offset), then scrape each page of results in turn.
# ---------------------------------------------------------------------------
# The query string must contain '&region%5B0%5D=...'. It previously read
# '®ion%5B0%5D=...' ('&reg' collapsed into a registered-trademark sign), so
# the region parameter was never sent under its own name and the source held
# a non-ASCII character.
starting_url = 'https://reports.ofsted.gov.uk/search?q=&location=&radius=&level_2_types%5B0%5D=2&region%5B0%5D=E12000007&status%5B0%5D=1&level_1_types=1&start='

# Request the results 100 rows at a time, at offsets 0, 100, ..., 700.
# NOTE(review): the 800-result ceiling is hard-coded - confirm it still covers
# the whole result set.
for i in range(0, 800, 100):
    url = starting_url + str(i) + '&rows=100'
    scrape_and_look_for_next_link(url)