Skip to content

Commit

Permalink
Add Queue.
Browse files Browse the repository at this point in the history
  • Loading branch information
hwms committed Apr 8, 2016
1 parent 4711b99 commit f47e6e3
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 0 deletions.
1 change: 1 addition & 0 deletions noscrapy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .job import Job
from .queue import Queue
from .selector import Selector
from .selectors import *
from .sitemap import Sitemap
Expand Down
40 changes: 40 additions & 0 deletions noscrapy/queue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import re

# Matches URLs that point at office/PDF documents rather than HTML pages.
# The extension test is case-insensitive and tolerates a trailing query
# string or fragment, so ``report.pdf?download=1`` and ``a.docx#page=2``
# are recognised as documents as well as plain ``report.pdf``.
DOCUMENT_RE = re.compile(
    r'.*?\.(doc|docx|pdf|ppt|pptx|odt)(?:[?#].*)?$', re.IGNORECASE)

class Queue(object):
    """FIFO list of jobs plus a record of every URL that has been accepted.

    ``jobs`` holds the pending jobs in insertion order; ``scraped_urls``
    maps each accepted URL to ``True`` so the same URL is never queued twice.
    """

    def __init__(self):
        self.scraped_urls = {}
        self.jobs = []

    def add(self, job):
        """Queue *job* if it is eligible.

        Returns True when the job was appended (its URL is then recorded as
        scraped) and False when ``can_be_added`` rejected it.
        """
        if not self.can_be_added(job):
            return False
        self.jobs.append(job)
        self._set_url_scraped(job.url)
        return True

    def can_be_added(self, job):
        """Return False for already-recorded URLs and for document URLs."""
        if self.is_scraped(job.url):
            return False
        # reject documents: anything DOCUMENT_RE matches is not queued
        return DOCUMENT_RE.match(job.url) is None

    def get_queue_size(self):
        """Return the number of jobs still waiting in the queue."""
        pending = self.jobs
        return len(pending)

    def is_scraped(self, url):
        """Return True if *url* has been recorded by a previous ``add``."""
        return url in self.scraped_urls

    def _set_url_scraped(self, url):
        """Record *url* so later jobs with the same URL are refused."""
        self.scraped_urls[url] = True

    def get_next_job(self):
        """Remove and return the oldest queued job, or False if empty."""
        # TODO: test this
        return self.jobs.pop(0) if self.jobs else False
25 changes: 25 additions & 0 deletions noscrapy/tests/test_queue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from noscrapy import Job, Queue

def test_add_jobs():
    """Adding one job leaves exactly that job, with its URL, in the queue."""
    queue = Queue()
    queue.add(Job('http://test.lv/', {}))
    assert queue.get_queue_size() == 1
    first = queue.jobs[0]
    assert first.url == 'http://test.lv/'

def test_mark_urls_scraped():
    """A URL stays marked after its job is taken, so re-adding is a no-op."""
    queue = Queue()
    same_job = Job('http://test.lv/', {})

    queue.add(same_job)
    queue.get_next_job()
    assert queue.get_queue_size() == 0

    # the URL was recorded on the first add, so a second add must be refused
    queue.add(same_job)
    assert queue.get_queue_size() == 0

def test_reject_documents():
    """A job whose URL ends in a document extension is refused by add()."""
    queue = Queue()
    doc_job = Job('http://test.lv/test.doc')
    accepted = queue.add(doc_job)
    assert not accepted
    assert queue.get_queue_size() == 0

0 comments on commit f47e6e3

Please sign in to comment.