Skip to content

Commit

Permalink
added the --check-duplicate option
Browse files Browse the repository at this point in the history
  • Loading branch information
keul committed Dec 29, 2013
1 parent d12391b commit 506718a
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 9 deletions.
6 changes: 4 additions & 2 deletions docs/HISTORY.txt
Expand Up @@ -5,8 +5,10 @@ Changelog
----------------

- Do not crawl or download when on error pages
- Handle duplicate filename when downloading resources
- Application specific user agent header (configurable)
- Handle duplicate filename when downloading resources:
added the ``--check-duplicate`` option
- Application specific user agent header (configurable
through ``--user-agent`` option)

0.1 (2013-01-05)
----------------
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Expand Up @@ -4,8 +4,8 @@
setup(name='Allanon',
# scripts=['src/allanon',],
version=open(os.path.join("src", "allanon", "version.txt")).read(),
description="A crawler for visit a predictable set of URLs, "
"and download resources from them",
description="A Web crawler that visit a predictable set of URLs, "
"and automatically download resources you want from them",
long_description=open("README.rst").read() + "\n" +
open(os.path.join("docs", "HISTORY.txt")).read(),
classifiers=["Development Status :: 3 - Alpha",
Expand Down
9 changes: 7 additions & 2 deletions src/allanon/main.py
Expand Up @@ -51,7 +51,7 @@
parser.add_option('--directory', '-d', dest="destination_directory", default=os.getcwd(),
metavar="TARGET_DIR",
help="Directory where to store all resources that will be downloaded.\n"
"Default if the current directory")
"Default is the current directory")
parser.add_option('--filename', '-f', dest="filename_model", default=None, metavar="FILENAME",
help="Download resources with a custom, dynamic, filename.\n"
"You can use some marker for creating a dynamic content.\n"
Expand All @@ -65,6 +65,10 @@
"Use %EXTENSION for include the original file extensions.\n"
"Use %FULLNAME for include the original filename (with extension)\n"
"Default is \"%FULLNAME\"")
parser.add_option("--check-duplicate", action="store_true", dest="duplicate_check", default=False,
help="When finding a duplicate filename, check if both files are duplicate. "
"In this case, do not save the second file. Default action is to keep all "
"resources handling filename collision.")
parser.add_option('--user-agent', dest="user_agent", default=None, metavar="USER_AGENT",
help="Change the User-Agent header sent with every request.\n"
"Default is \"Allanon Crawler <version number>\".")
Expand Down Expand Up @@ -109,7 +113,8 @@ def main(options=None, *args):
rg = ResourceGrabber(url)
rg.download(options.destination_directory, options.filename_model, ids, index+1,
ids_digit_len=max_ids,
index_digit_len=index_digit_len)
index_digit_len=index_digit_len,
duplicate_check=options.duplicate_check)
except KeyboardInterrupt:
print "\nTerminated by user action"
sys.exit(1)
Expand Down
18 changes: 17 additions & 1 deletion src/allanon/resouce_grabber.py
@@ -1,6 +1,8 @@
# -*- coding: utf8 -*-

import re
import hashlib
import tempfile
import os.path
import urllib
from urlparse import urlparse
Expand Down Expand Up @@ -122,12 +124,25 @@ def _generate_filename_from_model(self, original, filename_model, ids=[], index=
return filename

def download(self, directory, filename_model=None, ids=[], index=0,
ids_digit_len=[], index_digit_len=0):
ids_digit_len=[], index_digit_len=0, duplicate_check=False):
self._open()
filename = self._get_filename(filename_model=filename_model, ids=ids, index=index,
ids_digit_len=ids_digit_len,
index_digit_len=index_digit_len)
path = os.path.join(directory, filename)
if duplicate_check and os.path.exists(path):
# Before trying to find a free filename, check if this file is a duplicate
with open(path, 'rb') as saved:
md5_saved = hashlib.md5(saved.read()).digest()
with tempfile.TemporaryFile() as tmp:
tmp.write(self.request.content)
tmp.seek(0)
md5_remote = hashlib.md5(tmp.read()).digest()
if md5_saved==md5_remote:
# same file
print "Resource at %s is a duplicate of %s" % (self.url,
path)
return
while os.path.exists(path):
# continue trying until we get a good filename
filename = _try_new_filename(filename)
Expand All @@ -136,6 +151,7 @@ def download(self, directory, filename_model=None, ids=[], index=0,
with open(path, 'wb') as f:
print "Writing resource to %s" % path
f.write(self.request.content)
return path

def download_resources(self, query, directory, filename_model=None, ids=[], index=0,
ids_digit_len=[], index_digit_len=0):
Expand Down
2 changes: 2 additions & 0 deletions src/allanon/tests/acceptance_tests.py
Expand Up @@ -20,6 +20,8 @@ def setUp(self):
self.options.search_queries = []
self.temp_dir = mkdtemp()
self.options.destination_directory = self.temp_dir
self.options.user_agent = None
self.options.duplicate_check = False
self.test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
HTTPretty.enable()

Expand Down
12 changes: 10 additions & 2 deletions src/allanon/tests/resource_grabber_tests.py
Expand Up @@ -91,12 +91,20 @@ def test_file_exists(self):
f.write('bar')
with open(os.path.join(self.directory, 'foo_1.pdf'), 'wb') as f:
f.write('baz')
rg.download(self.directory)
path = os.path.join(self.directory, 'foo_2.pdf')
self.assertEqual(rg.download(self.directory), path)
self.assertTrue(os.path.exists(path))
with open(path) as tfile:
self.assertEqual(tfile.read(), "foo")

def test_file_exists_no_duplicate(self):
    """With duplicate_check=True, downloading a resource whose identical
    copy already sits at the target path must return None (nothing saved)."""
    url = "http://foo.net/foo.pdf"
    HTTPretty.register_uri(HTTPretty.GET, url, body="foo")
    # Pre-seed the destination with a byte-identical file.
    existing_path = os.path.join(self.directory, 'foo.pdf')
    with open(existing_path, 'wb') as existing:
        existing.write('foo')
    grabber = ResourceGrabber(url)
    result = grabber.download(self.directory, duplicate_check=True)
    self.assertEqual(result, None)

def test_generate_filename_from_model(self):
HTTPretty.register_uri(HTTPretty.GET, "http://foo.net/foo.pdf")
rg = ResourceGrabber("http://foo.net/foo.pdf")
Expand Down Expand Up @@ -183,7 +191,7 @@ def test_get_internal_links(self):
body=self._read_file('page2.html'))
HTTPretty.register_uri(HTTPretty.GET, "http://recursive.org/page3.html",
body=self._read_file('notfound.html'), status=404)
# these are resourced inside pages defined above
# these are resources inside pages defined above
HTTPretty.register_uri(HTTPretty.GET, "http://recursive.org/text1.txt",
body=self._read_file('text1.txt'))
HTTPretty.register_uri(HTTPretty.GET, "http://recursive.org/text2.txt",
Expand Down

0 comments on commit 506718a

Please sign in to comment.