From 1dc8042d41bfd06e13d0fee4974c61780c612023 Mon Sep 17 00:00:00 2001 From: Luca Fabbri Date: Mon, 30 Dec 2013 20:20:41 +0100 Subject: [PATCH] --directory option now support markers --- README.rst | 28 +++++++++++---- docs/HISTORY.txt | 2 +- src/allanon/main.py | 19 ++++++++--- src/allanon/resouce_grabber.py | 38 +++++++++++++-------- src/allanon/tests/acceptance_tests.py | 19 +++++++++++ src/allanon/tests/resource_grabber_tests.py | 7 +++- 6 files changed, 84 insertions(+), 29 deletions(-) diff --git a/README.rst b/README.rst index 7989e5a..6ecba4e 100644 --- a/README.rst +++ b/README.rst @@ -123,20 +123,22 @@ Filters are applied in the given order, so: Potentially you can continue this way, providing a third level of filters, and so on. -Naming downloaded resources ---------------------------- +Naming and storing downloaded resources +--------------------------------------- -Allanon download all files in the current directory (or in the directory specified using the -``--directory`` option) so a filename conflict is possible. - -Instead of downloading resources "as is", you can change dynamically the filename using the -``--filename`` option. +Allanon downloads all files to the current directory by default, so a filename conflict is possible. +You can control how and where files are stored by changing the filename dynamically with the +``--filename`` option and/or by changing the destination directory with the +``--directory`` option. An example:: $ allanon --filename="%HOST-%INDEX-section%1-version%3-%FULLNAME" \ > "http://foo.org/pdf-repo-{1:10}/file{1:50}.pdf?version={0:3}" +As you have seen, ``--filename`` accepts some *markers* that can be used to better organize +resources: + ``%HOST`` Will be replaced with the hostname used in the URL. ``%INDEX`` @@ -153,6 +155,18 @@ An example:: You can also use the ``%NAME`` and ``%EXTENSION`` to get only the name of the file (without extension) or simply the extension. 
+The ``--directory`` option can be a simple directory name or a directory path (in unix-like +format, for example "``foo/bar/baz``"). + +An example:: + + $ allanon --directory="/home/keul/%HOST/%1" \ + > "http://foo.org/pdf-repo-{1:10}/file{1:50}.pdf" \ + > "http://baz.net/pdf-repo-{1:10}/file{1:50}.pdf" + +Also the ``--directory`` option supports some of the markers: you can use ``%HOST``, ``%INDEX`` and ``%X`` +with the same meaning given above. + TODO ==== diff --git a/docs/HISTORY.txt b/docs/HISTORY.txt index d919e7b..c450924 100644 --- a/docs/HISTORY.txt +++ b/docs/HISTORY.txt @@ -10,7 +10,7 @@ Changelog - Application specific user agent header (configurable through ``--user-agent`` option) - The ``--directory`` option can be a path and so create - intermediate directories + intermediate directories, and accept markers 0.1 (2013-01-05) ---------------- diff --git a/src/allanon/main.py b/src/allanon/main.py index 94aa356..2dc0baf 100644 --- a/src/allanon/main.py +++ b/src/allanon/main.py @@ -53,11 +53,19 @@ help="Directory where to store all resources that will be downloaded.\n" "Default is the current directory.\n" "Can be also a directory path string in nix format (like \"foo/bar\"), " - "in that case all intermediate directories will be created.") + "in that case all intermediate directories will be created.\n" + "You can use some markers for creating a dynamic name.\n" + "Use %x (%1, %2, ...) to include the current URLs range " + "(if any). Use %1 for the first range in the URL, %2 for " + "the second, and so on.\n" + "Use %HOST for include the original host where the resource has " + "been downloaded.\n" + "Use %INDEX for include a progressive number of downloaded resources.\n" + ) parser.add_option('--filename', '-f', dest="filename_model", default=None, metavar="FILENAME", help="Download resources with a custom, dynamic, filename.\n" - "You can use some marker for creating a dynamic content.\n" - "Use %x (%1, %2, ...) 
for include the current URLs range " + "You can use some markers for creating a dynamic name.\n" + "Use %x (%1, %2, ...) to include the current URLs range " "(if any). Use %1 for the first range in the URL, %2 for " "the second, and so on.\n" "Use %HOST for include the original host where the resource has " @@ -65,7 +73,7 @@ "Use %INDEX for include a progressive number of downloaded resources.\n" "Use %NAME for include the original filename (without extension).\n" "Use %EXTENSION for include the original file extensions.\n" - "Use %FULLNAME for include the original filename (with extension)\n" + "Use %FULLNAME for include the original filename (with extension).\n" "Default is \"%FULLNAME\"") parser.add_option("--check-duplicate", action="store_true", dest="duplicate_check", default=False, help="When finding a duplicate filename check they are duplicates. " @@ -73,7 +81,7 @@ "resources handling filename collision, without checking files content.") parser.add_option('--user-agent', dest="user_agent", default=None, metavar="USER_AGENT", help="Change the User-Agent header sent with every request.\n" - "Default is \"Allanon Crawler \".") + "Default is \"Allanon Crawler %s\"." 
% VERSION) def main(options=None, *args): @@ -88,6 +96,7 @@ def main(options=None, *args): result.append(parser.get_usage()) result.append(DESCRIPTION+"\n") result.append(parser.format_option_help(parser.formatter)) + result.append('By Luca Fabbri - lucakeul.it\n') result.append('See https://github.com/keul/Allanon for detailed documentation or ' 'provide bug report.') print "\n".join(result) diff --git a/src/allanon/resouce_grabber.py b/src/allanon/resouce_grabber.py index 5936fd3..5176556 100644 --- a/src/allanon/resouce_grabber.py +++ b/src/allanon/resouce_grabber.py @@ -93,25 +93,29 @@ def _get_filename(self, filename_model=None, ids=[], index=0, ids_digit_len=ids_digit_len, index_digit_len=index_digit_len) return filename - - def _generate_filename_from_model(self, original, filename_model, ids=[], index=0, - ids_digit_len=[], index_digit_len=0): - filename = filename_model + + def _string_interpolation(self, model, ids=[], index=0, + ids_digit_len=[], index_digit_len=0): # replace %x with proper ids cnt = 0 - while dynaid_re.search(filename): - match = dynaid_re.search(filename) + while dynaid_re.search(model): + match = dynaid_re.search(model) dynaid = match.group() - filename = filename.replace(dynaid, - _int_format(ids[cnt], - ids_digit_len[cnt]), 1) + model = model.replace(dynaid, _int_format(ids[cnt], + ids_digit_len[cnt]), 1) cnt+=1 # replace %INDEX with the progressive - if filename.find("%INDEX")>-1: - filename = filename.replace("%INDEX", _int_format(index, index_digit_len)) + if model.find("%INDEX")>-1: + model = model.replace("%INDEX", _int_format(index, index_digit_len)) # replace %HOST with current host - if filename.find("%HOST")>-1: - filename = filename.replace("%HOST", self.url_info.hostname) + if model.find("%HOST")>-1: + model = model.replace("%HOST", self.url_info.hostname) + return model + + def _generate_filename_from_model(self, original, filename_model, ids=[], index=0, + ids_digit_len=[], index_digit_len=0): + filename = 
self._string_interpolation(filename_model, ids, index, ids_digit_len, index_digit_len) + # *** Other interpolation (only file specific) *** # replace %NAME with original filename if filename.find("%NAME")>-1: filename = filename.replace("%NAME", original[:original.rfind('.')]) @@ -123,11 +127,13 @@ def _generate_filename_from_model(self, original, filename_model, ids=[], index= filename = filename.replace("%FULLNAME", original) return filename - def _create_subdirs(self, directory): + def _create_subdirs(self, directory, ids=[], index=0, + ids_digit_len=[], index_digit_len=0): """Given a directory name, or a directory path string in nix format (e.g: foo/bar), create all intermediate directories. Return the new (existing) final directory absolute path """ + directory = self._string_interpolation(directory, ids, index, ids_digit_len, index_digit_len) if not os.path.exists(directory): os.makedirs(directory) return directory @@ -136,10 +142,12 @@ def download(self, directory, filename_model=None, ids=[], index=0, ids_digit_len=[], index_digit_len=0, duplicate_check=False): """Download a remote resource. 
Return the new path or None if no resource has been created""" self._open() + directory = self._create_subdirs(directory, ids=ids, index=index, + ids_digit_len=ids_digit_len, + index_digit_len=index_digit_len) filename = self._get_filename(filename_model=filename_model, ids=ids, index=index, ids_digit_len=ids_digit_len, index_digit_len=index_digit_len) - directory = self._create_subdirs(directory) path = os.path.join(directory, filename) if duplicate_check and os.path.exists(path): # Before trying to find a free filename, check is this file is a duplicate diff --git a/src/allanon/tests/acceptance_tests.py b/src/allanon/tests/acceptance_tests.py index ed50c18..0c131fc 100644 --- a/src/allanon/tests/acceptance_tests.py +++ b/src/allanon/tests/acceptance_tests.py @@ -80,6 +80,25 @@ def dynamic_url_downloads_test(self): self.assertTrue(self._same_content('3-foo2.pdf', 'text2.txt')) self.assertTrue(self._same_content('4-foo2.pdf', 'text2.txt')) + def dynamic_directory_generation_test(self): + self.options.destination_directory = os.path.join(self.temp_dir, "%HOST/series-%1") + HTTPretty.register_uri(HTTPretty.GET, "http://foo.org/bar-1/file.pdf", + body=self._read_file('text1.txt')) + HTTPretty.register_uri(HTTPretty.GET, "http://foo.org/bar-2/file.pdf", + body=self._read_file('text1.txt')) + HTTPretty.register_uri(HTTPretty.GET, "http://baz.net/bar-1/file.pdf", + body=self._read_file('text1.txt')) + HTTPretty.register_uri(HTTPretty.GET, "http://baz.net/bar-2/file.pdf", + body=self._read_file('text1.txt')) + main(self.options, 'http://foo.org/bar-{1:2}/file.pdf', 'http://baz.net/bar-{1:2}/file.pdf') + self.assertEqual(self._get_downloaded_files(), ['baz.net', 'foo.org']) + self.assertEqual(listdir(os.path.join(self.temp_dir, 'baz.net')), ['series-1', 'series-2']) + self.assertEqual(listdir(os.path.join(self.temp_dir, 'foo.org')), ['series-1', 'series-2']) + self.assertTrue(self._same_content('foo.org/series-1/file.pdf', 'text1.txt')) + 
self.assertTrue(self._same_content('foo.org/series-2/file.pdf', 'text1.txt')) + self.assertTrue(self._same_content('baz.net/series-1/file.pdf', 'text1.txt')) + self.assertTrue(self._same_content('baz.net/series-2/file.pdf', 'text1.txt')) + # Step 2: really useful features def inner_resources_download_test(self): # main command line URLs set diff --git a/src/allanon/tests/resource_grabber_tests.py b/src/allanon/tests/resource_grabber_tests.py index d84adf3..9596a7a 100644 --- a/src/allanon/tests/resource_grabber_tests.py +++ b/src/allanon/tests/resource_grabber_tests.py @@ -1,7 +1,6 @@ # -*- coding: utf8 -*- from tempfile import mkdtemp -import os import os.path import unittest @@ -33,6 +32,12 @@ def test_create_multiple_dirs_some_exists(self): self.assertEqual(rg._create_subdirs(os.path.join(self.directory, 'foo/bar/baz')), os.path.join(self.directory, 'foo', 'bar', 'baz')) + def test_create_dirs_with_interpolation(self): + rg = ResourceGrabber('http://foo.com/part{1:4}/section-{10:20}/foo.pdf') + result = rg._create_subdirs(os.path.join(self.directory, '%HOST/section-%1/%2'), + ids=[2, 3], index=2, + ids_digit_len=[1, 1], index_digit_len=1) + self.assertEqual(result, os.path.join(self.directory, 'foo.com', 'section-2', '3')) class ResourceGrabberDirectDownloadTest(unittest.TestCase):