
Commit 03f19ad

ch14 tutorial 2
1 parent: 5c29207

File tree

1 file changed: +39 -22


scraper.py

Lines changed: 39 additions & 22 deletions
@@ -1,24 +1,41 @@
-# This is a template for a Python scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
+###############################################################################
+# START HERE: Tutorial 2: Basic scraping and saving to the data store.
+# Follow the actions listed in BLOCK CAPITALS below.
+###############################################################################
 
-# import scraperwiki
-# import lxml.html
-#
-# # Read in a page
-# html = scraperwiki.scrape("http://foo.com")
-#
-# # Find something on the page using css selectors
-# root = lxml.html.fromstring(html)
-# root.cssselect("div[align='left']")
-#
-# # Write out to the sqlite database using scraperwiki library
-# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
-#
-# # An arbitrary query against the database
-# scraperwiki.sql.select("* from data where 'name'='peter'")
+import scraperwiki
+html = scraperwiki.scrape('https://inmo.ie/6022')
+print("Click on the ...more link to see the whole page")
+print(html)
 
-# You don't have to do things with the ScraperWiki and lxml libraries.
-# You can use whatever libraries you want: https://morph.io/documentation/python
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+# -----------------------------------------------------------------------------
+# 1. Parse the raw HTML to get the interesting bits - the part inside <td> tags.
+# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines)
+# -- CLICK THE 'RUN' BUTTON BELOW
+# Check the 'Console' tab again, and you'll see how we're extracting
+# the HTML that was inside <td></td> tags.
+# We use lxml, a Python library especially for parsing HTML.
+# -----------------------------------------------------------------------------
+
+import lxml.html
+root = lxml.html.fromstring(html)  # turn our HTML into an lxml object
+tds = root.cssselect('td')         # get all the <td> tags
+for td in tds:
+    print(lxml.html.tostring(td))  # the full HTML tag
+    print(td.text)                 # just the text inside the HTML tag
+
+# -----------------------------------------------------------------------------
+# 2. Save the data in the ScraperWiki datastore.
+# -- UNCOMMENT THE THREE LINES BELOW
+# -- CLICK THE 'RUN' BUTTON BELOW
+# Check the 'Data' tab - here you'll see the data saved in the ScraperWiki store.
+# -----------------------------------------------------------------------------
+
+for td in tds:
+    record = {"td": td.text}                 # column name and value
+    scraperwiki.sqlite.save(["td"], record)  # save the records one by one
+
+# -----------------------------------------------------------------------------
+# Go back to the Tutorials page and continue to Tutorial 3 to learn about
+# more complex scraping methods.
+# -----------------------------------------------------------------------------
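A note on the fetch step above: scraperwiki.scrape() returns the page source as one plain string, which is why the tutorial can print it directly. A minimal sketch of inspecting that string before parsing; the two checks are illustrative additions, not part of the commit:

import scraperwiki

html = scraperwiki.scrape('https://inmo.ie/6022')  # fetch the raw page source
print(len(html))      # how many characters came back
print('<td' in html)  # sanity-check that table cells are really there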
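A caveat when adapting the parsing step: in lxml, td.text returns only the text that appears before the element's first child tag, so a cell whose text is wrapped in another tag yields None. td.text_content() gathers all nested text instead. A self-contained sketch using a made-up HTML fragment:

import lxml.html

# One plain cell, and one cell whose text is nested inside <b>.
root = lxml.html.fromstring('<table><tr><td>Cork</td><td><b>Dublin</b></td></tr></table>')
for td in root.cssselect('td'):
    print(td.text)            # 'Cork', then None - text before the first child only
    print(td.text_content())  # 'Cork', then 'Dublin' - all nested text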
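On the save step: scraperwiki.sqlite.save(unique_keys, data) upserts, so re-running the scraper overwrites rows that share a unique-key value rather than duplicating them; by default rows land in a table called "data" inside data.sqlite, as the deleted template noted. The template's scraperwiki.sql.select can read them back. A sketch combining the two; the column values are illustrative only:

import scraperwiki

# Saving twice with the same unique key updates the row in place.
scraperwiki.sqlite.save(unique_keys=['name'], data={'name': 'susan', 'occupation': 'software developer'})
scraperwiki.sqlite.save(unique_keys=['name'], data={'name': 'susan', 'occupation': 'editor'})

for row in scraperwiki.sql.select('* from data'):
    print(row)  # expect a single row for susan, with the updated occupation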
