Skip to content

Commit

Permalink
added the --check-duplicate option
Browse files Browse the repository at this point in the history
  • Loading branch information
keul committed Dec 29, 2013
1 parent d12391b commit 506718a
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 9 deletions.
6 changes: 4 additions & 2 deletions docs/HISTORY.txt
Expand Up @@ -5,8 +5,10 @@ Changelog
----------------

- Do not crawl or download when on error pages
- Handle duplicate filename when downloading resources
- Application specific user agent header (configurable)
- Handle duplicate filename when downloading resources:
added the ``--check-duplicate`` option
- Application specific user agent header (configurable
through ``--user-agent`` option)

0.1 (2013-01-05)
----------------
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Expand Up @@ -4,8 +4,8 @@
setup(name='Allanon',
# scripts=['src/allanon',],
version=open(os.path.join("src", "allanon", "version.txt")).read(),
description="A crawler for visit a predictable set of URLs, "
"and download resources from them",
description="A Web crawler that visit a predictable set of URLs, "
"and automatically download resources you want from them",
long_description=open("README.rst").read() + "\n" +
open(os.path.join("docs", "HISTORY.txt")).read(),
classifiers=["Development Status :: 3 - Alpha",
Expand Down
9 changes: 7 additions & 2 deletions src/allanon/main.py
Expand Up @@ -51,7 +51,7 @@
parser.add_option('--directory', '-d', dest="destination_directory", default=os.getcwd(),
metavar="TARGET_DIR",
help="Directory where to store all resources that will be downloaded.\n"
"Default if the current directory")
"Default is the current directory")
parser.add_option('--filename', '-f', dest="filename_model", default=None, metavar="FILENAME",
help="Download resources with a custom, dynamic, filename.\n"
"You can use some marker for creating a dynamic content.\n"
Expand All @@ -65,6 +65,10 @@
"Use %EXTENSION for include the original file extensions.\n"
"Use %FULLNAME for include the original filename (with extension)\n"
"Default is \"%FULLNAME\"")
parser.add_option("--check-duplicate", action="store_true", dest="duplicate_check", default=False,
help="When finding a duplicate filename, check if both files are duplicate. "
"In this case, do not save the second file. Default action is to keep all "
"resources handling filename collision.")
parser.add_option('--user-agent', dest="user_agent", default=None, metavar="USER_AGENT",
help="Change the User-Agent header sent with every request.\n"
"Default is \"Allanon Crawler <version number>\".")
Expand Down Expand Up @@ -109,7 +113,8 @@ def main(options=None, *args):
rg = ResourceGrabber(url)
rg.download(options.destination_directory, options.filename_model, ids, index+1,
ids_digit_len=max_ids,
index_digit_len=index_digit_len)
index_digit_len=index_digit_len,
duplicate_check=options.duplicate_check)
except KeyboardInterrupt:
print "\nTerminated by user action"
sys.exit(1)
Expand Down
18 changes: 17 additions & 1 deletion src/allanon/resouce_grabber.py
@@ -1,6 +1,8 @@
# -*- coding: utf8 -*-

import re
import hashlib
import tempfile
import os.path
import urllib
from urlparse import urlparse
Expand Down Expand Up @@ -122,12 +124,25 @@ def _generate_filename_from_model(self, original, filename_model, ids=[], index=
return filename

def download(self, directory, filename_model=None, ids=[], index=0,
ids_digit_len=[], index_digit_len=0):
ids_digit_len=[], index_digit_len=0, duplicate_check=False):
self._open()
filename = self._get_filename(filename_model=filename_model, ids=ids, index=index,
ids_digit_len=ids_digit_len,
index_digit_len=index_digit_len)
path = os.path.join(directory, filename)
if duplicate_check and os.path.exists(path):
# Before trying to find a free filename, check if this file is a duplicate
with open(path, 'rb') as saved:
md5_saved = hashlib.md5(saved.read()).digest()
with tempfile.TemporaryFile() as tmp:
tmp.write(self.request.content)
tmp.seek(0)
md5_remote = hashlib.md5(tmp.read()).digest()
if md5_saved==md5_remote:
# same file
print "Resource at %s is a duplicate of %s" % (self.url,
path)
return
while os.path.exists(path):
# continue trying until we get a good filename
filename = _try_new_filename(filename)
Expand All @@ -136,6 +151,7 @@ def download(self, directory, filename_model=None, ids=[], index=0,
with open(path, 'wb') as f:
print "Writing resource to %s" % path
f.write(self.request.content)
return path

def download_resources(self, query, directory, filename_model=None, ids=[], index=0,
ids_digit_len=[], index_digit_len=0):
Expand Down
2 changes: 2 additions & 0 deletions src/allanon/tests/acceptance_tests.py
Expand Up @@ -20,6 +20,8 @@ def setUp(self):
self.options.search_queries = []
self.temp_dir = mkdtemp()
self.options.destination_directory = self.temp_dir
self.options.user_agent = None
self.options.duplicate_check = False
self.test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
HTTPretty.enable()

Expand Down
12 changes: 10 additions & 2 deletions src/allanon/tests/resource_grabber_tests.py
Expand Up @@ -91,12 +91,20 @@ def test_file_exists(self):
f.write('bar')
with open(os.path.join(self.directory, 'foo_1.pdf'), 'wb') as f:
f.write('baz')
rg.download(self.directory)
path = os.path.join(self.directory, 'foo_2.pdf')
self.assertEqual(rg.download(self.directory), path)
self.assertTrue(os.path.exists(path))
with open(path) as tfile:
self.assertEqual(tfile.read(), "foo")

def test_file_exists_no_duplicate(self):
    """With duplicate_check=True, downloading a resource whose identical
    copy already sits at the target path must return None (nothing saved)."""
    url = "http://foo.net/foo.pdf"
    HTTPretty.register_uri(HTTPretty.GET, url, body="foo")
    # Pre-seed the destination with a byte-identical file.
    existing_path = os.path.join(self.directory, 'foo.pdf')
    with open(existing_path, 'wb') as existing:
        existing.write('foo')
    grabber = ResourceGrabber(url)
    result = grabber.download(self.directory, duplicate_check=True)
    self.assertEqual(result, None)

def test_generate_filename_from_model(self):
HTTPretty.register_uri(HTTPretty.GET, "http://foo.net/foo.pdf")
rg = ResourceGrabber("http://foo.net/foo.pdf")
Expand Down Expand Up @@ -183,7 +191,7 @@ def test_get_internal_links(self):
body=self._read_file('page2.html'))
HTTPretty.register_uri(HTTPretty.GET, "http://recursive.org/page3.html",
body=self._read_file('notfound.html'), status=404)
# these are resourced inside pages defined above
# these are resources inside pages defined above
HTTPretty.register_uri(HTTPretty.GET, "http://recursive.org/text1.txt",
body=self._read_file('text1.txt'))
HTTPretty.register_uri(HTTPretty.GET, "http://recursive.org/text2.txt",
Expand Down

0 comments on commit 506718a

Please sign in to comment.