From 81f881180df6546529a3ad49efed3bea3098d822 Mon Sep 17 00:00:00 2001 From: hicham20201441 <70323150+hicham20201441@users.noreply.github.com> Date: Thu, 27 Aug 2020 13:36:40 +0000 Subject: [PATCH] Update scraper.py --- scraper.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/scraper.py b/scraper.py index a52820f..cc5b493 100644 --- a/scraper.py +++ b/scraper.py @@ -13,25 +13,15 @@ urls=[e.get("href") for e in root.cssselect("a")] nour=set(url) while(len(urls)>0): - if url in urls[0] and urls[0] not in nour: - ur=urls[0] - nour.add(ur) - print("scraping: "+ur) - html1= scraperwiki.scrape(ur) + print("scraping: "+urls[0]) + html1= scraperwiki.scrape(urls[0]) root1 = lxml.html.fromstring(html1) - urls.pop(0) newrls=[e.get("href") for e in root1.cssselect("a")] urls=urls+newrls - print(str(len(newrls))) - try: - if root1.cssselect("div[class='blog-col']"): - scraperwiki.sqlite.save(unique_keys=[ur], data={"link": ur, "blog":root1.cssselect("div[class='blog-col']") }) - print("got a blog!") - else: - print("no article for this link") - pass - except:pass - else:pass + print(str(len(newrls))+" new urls") + scraperwiki.sqlite.save(unique_keys=[urls[0]], data={"link": urls[0], "body":html1 }) + urls.pop(0) +