diff --git a/scraper.py b/scraper.py
index b5517ad..6159c32 100644
--- a/scraper.py
+++ b/scraper.py
@@ -26,25 +26,21 @@
 from scrapy.crawler import CrawlerProcess
 import scraperwiki
 
-class ToiletSpider(scrapy.Spider):
-    name = "toilet"
+class NYTBSSpider(scrapy.Spider):
+    name = "nytbs"
 
     def start_requests(self):
-        yield scrapy.Request('http://www.fehd.gov.hk/english/pleasant_environment/cleansing/list_of_public_toilets.php?district=NTIs')
-        yield scrapy.Request('http://www.fehd.gov.hk/english/pleasant_environment/cleansing/list_of_public_toilets.php?district=HK')
-        yield scrapy.Request('http://www.fehd.gov.hk/english/pleasant_environment/cleansing/list_of_public_toilets.php?district=KLN')
+        yield scrapy.Request('http://www.nytimes.com/books/best-sellers/')
 
     def parse(self, response):
-        tables = response.xpath("//table")
-        print len(tables)
-        for table in tables:
-            rows = table.xpath("tr")
-            district = rows[0].xpath("td/text()").extract()[0]
-            for row in rows[2:]:
-                texts = row.xpath("td/text()")
-                name = texts[1].extract().replace("*", "")
-                address = texts[2].extract()
-                scraperwiki.sqlite.save(unique_keys=[], data={"district": district, "name": name, "address": address})
-scraperwiki.sqlite.execute("DROP table data")
+        headers = response.xpath("//*[@id='subnavigation']/form/div")
+        print len(headers)
+        for header in headers:
+            links = header.xpath(".//option")
+            for link in links:
+                value = link.xpath("@value").extract()
+                if value:
+                    print value
 
 process = CrawlerProcess()
-process.crawl(ToiletSpider)
+process.crawl(NYTBSSpider)
 process.start()
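
A note on the XPath queries in the new parse(): inside a loop over selectors, an expression that starts with // searches the whole document from the root on every iteration, not just the current node, so the inner queries are written relative (.//option and @value). Below is a minimal standalone sketch of that extraction pattern using Scrapy's Selector. The markup and option values are placeholders chosen to mirror the structure the spider targets, not the live nytimes.com page; Python 2 print is kept to match the script above.

from scrapy.selector import Selector

# Hypothetical markup mirroring //*[@id='subnavigation']/form/div.
html = """
<div id="subnavigation"><form><div>
  <select>
    <option value="/books/best-sellers/hardcover-fiction/">Hardcover Fiction</option>
    <option value="/books/best-sellers/picture-books/">Picture Books</option>
  </select>
</div></form></div>
"""

sel = Selector(text=html)
for header in sel.xpath("//*[@id='subnavigation']/form/div"):
    for link in header.xpath(".//option"):       # relative: only options under this div
        value = link.xpath("@value").extract()   # relative: this option's value attribute
        if value:
            print value[0]

Running this prints the two placeholder paths once each; with the absolute forms (//option, //@value) every option in the document would be matched again on each pass through the loop.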