
Commit

Update scraper.py
hanzalot committed Aug 12, 2016
1 parent 20fb67b commit 85410fa
Showing 1 changed file with 13 additions and 17 deletions.
scraper.py (13 additions, 17 deletions)
@@ -26,25 +26,21 @@
 from scrapy.crawler import CrawlerProcess
 import scraperwiki
 
-class ToiletSpider(scrapy.Spider):
-    name = "toilet"
+class NYTBSSpider(scrapy.Spider):
+    name = "nytbs"
     def start_requests(self):
-        yield scrapy.Request('http://www.fehd.gov.hk/english/pleasant_environment/cleansing/list_of_public_toilets.php?district=NTIs')
-        yield scrapy.Request('http://www.fehd.gov.hk/english/pleasant_environment/cleansing/list_of_public_toilets.php?district=HK')
-        yield scrapy.Request('http://www.fehd.gov.hk/english/pleasant_environment/cleansing/list_of_public_toilets.php?district=KLN')
+        yield scrapy.Request('http://www.nytimes.com/books/best-sellers/')
 
     def parse(self, response):
-        tables = response.xpath("//table")
-        print len(tables)
-        for table in tables:
-            rows = table.xpath("tr")
-            district = rows[0].xpath("td/text()").extract()[0]
-            for row in rows[2:]:
-                texts = row.xpath("td/text()")
-                name = texts[1].extract().replace("*", "")
-                address = texts[2].extract()
-                scraperwiki.sqlite.save(unique_keys=[], data={"district": district, "name": name, "address": address})
-        scraperwiki.sqlite.execute("DROP table data")
+        headers = response.xpath("//*[@id='subnavigation']/form/div")
+        print len(headers)
+        for header in headers:
+            links = header.xpath("//*/option")
+            for link in links:
+                value = link.xpath("//@value").extract()
+                if value:
+                    print value
 
 process = CrawlerProcess()
-process.crawl(ToiletSpider)
+process.crawl(NYTBSSpider)
 process.start()
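
A note on the new parse logic: the XPaths inside the loops start with //, which Scrapy evaluates against the whole document rather than relative to header or link, so every iteration selects the same nodes again; the print statements are also Python 2 syntax, and unlike the old spider this one no longer persists anything via scraperwiki.sqlite. A minimal sketch of the same traversal with relative XPaths (assuming Scrapy >= 1.0, where extract_first() is available; this is not the committed code):

def parse(self, response):
    # Each <div> under the #subnavigation form wraps a <select> of options.
    headers = response.xpath("//*[@id='subnavigation']/form/div")
    for header in headers:
        # A leading ".//" keeps the query relative to this header node;
        # a bare "//" would re-scan the entire document on every pass.
        for link in header.xpath(".//option"):
            # "./@value" likewise reads only this <option>'s value attribute.
            value = link.xpath("./@value").extract_first()
            if value:
                print(value)

extract_first() returns None when nothing matches, so the if value: guard behaves as before without indexing into a list.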
