Add runner for morph.io
lhm committed Dec 10, 2019
1 parent fe7eae6 commit 0943b6f
Showing 3 changed files with 39 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,5 +3,6 @@ __pycache__/
.venv
*.json
*.jl
*.sqlite
.scrapy

1 change: 1 addition & 0 deletions requirements.txt
@@ -18,6 +18,7 @@ PyHamcrest==1.9.0
pyOpenSSL==19.0.0
queuelib==1.5.0
Scrapy==1.7.3
scraperwiki==0.5.1
service-identity==18.1.0
six==1.12.0
Twisted==19.7.0
37 changes: 37 additions & 0 deletions scraper.py
@@ -0,0 +1,37 @@
import os
# morph.io expects scraped data in data.sqlite; point scraperwiki at it before importing
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'
import scraperwiki

import scrapy
import scrapy.signals
from scrapy.crawler import CrawlerProcess
from allris.spiders.meetings import MeetingsSpider

def item_scraped(item, response, spider):
    # store each scraped meeting, keyed on its id
    record = dict((k, item[k]) for k in ('id', 'name', 'start', 'end', 'web'))
    scraperwiki.sql.save(['id'], record)

def spider_closed(spider, reason):
    # close pending transactions
    scraperwiki.sql.commit_transactions()
    # save the last url scraped so the next run can resume from it
    last_url = getattr(spider, 'last_url', '')
    scraperwiki.sql.save_var('last_url', last_url)

def get_last_url():
    # resume from the last scraped url, falling back to the first page of the OParl meetings endpoint
    last_url = scraperwiki.sql.get_var('last_url')
    if last_url is None:
        last_url = 'https://ratsinfo.leipzig.de/bi/oparl/1.0/meetings.asp?body=2387&p=100'
    return last_url

def setup():
    settings = {'HTTPCACHE_ENABLED': True, 'LOG_LEVEL': 'INFO'}
    process = CrawlerProcess(settings)
    process.crawl(MeetingsSpider, start_url=get_last_url())
    # hook the scraperwiki persistence into each crawler's signals
    for p in process.crawlers:
        p.signals.connect(item_scraped, signal=scrapy.signals.item_scraped)
        p.signals.connect(spider_closed, signal=scrapy.signals.spider_closed)
    return process

runner = setup()
runner.start()
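
For local testing outside morph.io, a quick sanity check after running "python scraper.py" could look like the sketch below. It is illustrative only and not part of this commit: it assumes the run produced data.sqlite in the working directory and that scraperwiki wrote rows to its default table, named data.

# sanity_check.py: hypothetical helper, not part of this commit
import os
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'
import scraperwiki

# print a few stored meetings (scraperwiki's default table is "data")
for row in scraperwiki.sql.select('* from data limit 5'):
    print(row['id'], row['name'], row['start'])

# print the resume point the next run would start from
print('last_url:', scraperwiki.sql.get_var('last_url'))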
