Skip to content

Commit

Permalink
--directory option now support markers
Browse files Browse the repository at this point in the history
  • Loading branch information
keul committed Dec 30, 2013
1 parent a60b438 commit 1dc8042
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 29 deletions.
28 changes: 21 additions & 7 deletions README.rst
Expand Up @@ -123,20 +123,22 @@ Filters are applied in the given order, so:

Potentially you can continue this way, providing a third level of filters, and so on.

Naming downloaded resources
---------------------------
Naming and storing downloaded resources
---------------------------------------

Allanon download all files in the current directory (or in the directory specified using the
``--directory`` option) so a filename conflict is possible.

Instead of downloading resources "as is", you can change dynamically the filename using the
``--filename`` option.
Allanon downloads all files into the current directory, so a filename conflict is possible.
You can control how and where resources are stored by dynamically changing the filename with the
``--filename`` option and/or changing the directory where files are stored with the
``--directory`` option.

An example::

$ allanon --filename="%HOST-%INDEX-section%1-version%3-%FULLNAME" \
> "http://foo.org/pdf-repo-{1:10}/file{1:50}.pdf?version={0:3}"

As you can see, ``--filename`` accepts some *markers* that can be used to better organize
resources:

``%HOST``
Will be replaced with the hostname used in the URL.
``%INDEX``
Expand All @@ -153,6 +155,18 @@ An example::
You can also use the ``%NAME`` and ``%EXTENSION`` to get only the name of the file
(without extension) or simply the extension.

The ``--directory`` option can be a simple directory name or a directory path (in unix-like
format, for example "``foo/bar/baz``").

An example::

$ allanon --directory="/home/keul/%HOST/%1" \
> "http://foo.org/pdf-repo-{1:10}/file{1:50}.pdf" \
> "http://baz.net/pdf-repo-{1:10}/file{1:50}.pdf"

The ``--directory`` option also supports some of the markers: you can use ``%HOST``, ``%INDEX`` and ``%X``
with the same meaning given above.

TODO
====

Expand Down
2 changes: 1 addition & 1 deletion docs/HISTORY.txt
Expand Up @@ -10,7 +10,7 @@ Changelog
- Application specific user agent header (configurable
through ``--user-agent`` option)
- The ``--directory`` option can be a path and so create
intermediate directories
intermediate directories, and accept markers

0.1 (2013-01-05)
----------------
Expand Down
19 changes: 14 additions & 5 deletions src/allanon/main.py
Expand Up @@ -53,27 +53,35 @@
help="Directory where to store all resources that will be downloaded.\n"
"Default is the current directory.\n"
"Can be also a directory path string in nix format (like \"foo/bar\"), "
"in that case all intermediate directories will be created.")
"in that case all intermediate directories will be created.\n"
"You can use some markers for creating a dynamic name.\n"
"Use %x (%1, %2, ...) to include the current URLs range "
"(if any). Use %1 for the first range in the URL, %2 for "
"the second, and so on.\n"
"Use %HOST for include the original host where the resource has "
"been downloaded.\n"
"Use %INDEX for include a progressive number of downloaded resources.\n"
)
parser.add_option('--filename', '-f', dest="filename_model", default=None, metavar="FILENAME",
help="Download resources with a custom, dynamic, filename.\n"
"You can use some marker for creating a dynamic content.\n"
"Use %x (%1, %2, ...) for include the current URLs range "
"You can use some markers for creating a dynamic name.\n"
"Use %x (%1, %2, ...) to include the current URLs range "
"(if any). Use %1 for the first range in the URL, %2 for "
"the second, and so on.\n"
"Use %HOST for include the original host where the resource has "
"been downloaded.\n"
"Use %INDEX for include a progressive number of downloaded resources.\n"
"Use %NAME for include the original filename (without extension).\n"
"Use %EXTENSION for include the original file extensions.\n"
"Use %FULLNAME for include the original filename (with extension)\n"
"Use %FULLNAME for include the original filename (with extension).\n"
"Default is \"%FULLNAME\"")
parser.add_option("--check-duplicate", action="store_true", dest="duplicate_check", default=False,
help="When finding a duplicate filename check they are duplicates. "
"In this case, do not save the new file. Default action is to keep all "
"resources handling filename collision, without checking files content.")
parser.add_option('--user-agent', dest="user_agent", default=None, metavar="USER_AGENT",
help="Change the User-Agent header sent with every request.\n"
"Default is \"Allanon Crawler <version number>\".")
"Default is \"Allanon Crawler %s\"." % VERSION)


def main(options=None, *args):
Expand All @@ -88,6 +96,7 @@ def main(options=None, *args):
result.append(parser.get_usage())
result.append(DESCRIPTION+"\n")
result.append(parser.format_option_help(parser.formatter))
result.append('By Luca Fabbri - luca<at>keul.it\n')
result.append('See https://github.com/keul/Allanon for detailed documentation or '
'provide bug report.')
print "\n".join(result)
Expand Down
38 changes: 23 additions & 15 deletions src/allanon/resouce_grabber.py
Expand Up @@ -93,25 +93,29 @@ def _get_filename(self, filename_model=None, ids=[], index=0,
ids_digit_len=ids_digit_len,
index_digit_len=index_digit_len)
return filename

def _generate_filename_from_model(self, original, filename_model, ids=[], index=0,
ids_digit_len=[], index_digit_len=0):
filename = filename_model

def _string_interpolation(self, model, ids=[], index=0,
                          ids_digit_len=[], index_digit_len=0):
    """Expand the markers shared by filename and directory models.

    Three kinds of markers are handled, in this order:

    * every match of ``dynaid_re`` (presumably ``%1``, ``%2``, ... --
      the pattern is defined elsewhere in the module) is replaced, left
      to right, by the next value of ``ids`` formatted with the matching
      entry of ``ids_digit_len``;
    * ``%INDEX`` is replaced by ``index`` formatted with
      ``index_digit_len``;
    * ``%HOST`` is replaced by ``self.url_info.hostname``.

    Return the interpolated string.  An IndexError is raised when the
    model holds more range markers than there are values in ``ids``.
    """
    # Range markers are consumed positionally: the n-th marker found in
    # the model receives ids[n], whatever number the marker carries.
    cnt = 0
    match = dynaid_re.search(model)
    while match:
        model = model.replace(match.group(),
                              _int_format(ids[cnt], ids_digit_len[cnt]), 1)
        cnt += 1
        # search again on the updated string (one search per iteration)
        match = dynaid_re.search(model)
    # replace %INDEX with the progressive
    if "%INDEX" in model:
        model = model.replace("%INDEX", _int_format(index, index_digit_len))
    # replace %HOST with current host; url_info is only read when needed
    if "%HOST" in model:
        model = model.replace("%HOST", self.url_info.hostname)
    return model

def _generate_filename_from_model(self, original, filename_model, ids=[], index=0,
ids_digit_len=[], index_digit_len=0):
filename = self._string_interpolation(filename_model, ids, index, ids_digit_len, index_digit_len)
# *** Other interpolation (only file specific) ***
# replace %NAME with original filename
if filename.find("%NAME")>-1:
filename = filename.replace("%NAME", original[:original.rfind('.')])
Expand All @@ -123,11 +127,13 @@ def _generate_filename_from_model(self, original, filename_model, ids=[], index=
filename = filename.replace("%FULLNAME", original)
return filename

def _create_subdirs(self, directory, ids=[], index=0,
                    ids_digit_len=[], index_digit_len=0):
    """Expand the markers found in ``directory`` and make sure it exists.

    ``directory`` can be a plain directory name or a path string in nix
    format (e.g: foo/bar); all missing intermediate directories are
    created.  The marker arguments are passed unchanged to
    ``_string_interpolation``.

    Return the interpolated directory path exactly as it was built (it
    is not converted to an absolute path).
    Raise OSError when the directory cannot be created.
    """
    import errno  # local import: only needed for the EEXIST check below

    directory = self._string_interpolation(directory, ids, index,
                                           ids_digit_len, index_digit_len)
    try:
        os.makedirs(directory)
    except OSError as err:
        # An already existing path is fine.  Trying first and inspecting
        # the error avoids the window between an "exists" check and the
        # actual creation.
        if err.errno != errno.EEXIST:
            raise
    return directory
Expand All @@ -136,10 +142,12 @@ def download(self, directory, filename_model=None, ids=[], index=0,
ids_digit_len=[], index_digit_len=0, duplicate_check=False):
"""Download a remote resource. Return the new path or None if no resource has been created"""
self._open()
directory = self._create_subdirs(directory, ids=ids, index=index,
ids_digit_len=ids_digit_len,
index_digit_len=index_digit_len)
filename = self._get_filename(filename_model=filename_model, ids=ids, index=index,
ids_digit_len=ids_digit_len,
index_digit_len=index_digit_len)
directory = self._create_subdirs(directory)
path = os.path.join(directory, filename)
if duplicate_check and os.path.exists(path):
# Before trying to find a free filename, check is this file is a duplicate
Expand Down
19 changes: 19 additions & 0 deletions src/allanon/tests/acceptance_tests.py
Expand Up @@ -80,6 +80,25 @@ def dynamic_url_downloads_test(self):
self.assertTrue(self._same_content('3-foo2.pdf', 'text2.txt'))
self.assertTrue(self._same_content('4-foo2.pdf', 'text2.txt'))

def dynamic_directory_generation_test(self):
    """Markers in the destination directory give one folder per host and range value."""
    self.options.destination_directory = os.path.join(self.temp_dir, "%HOST/series-%1")
    # Every URL of both hosts serves the same fixture content.
    for host in ('foo.org', 'baz.net'):
        for series in (1, 2):
            HTTPretty.register_uri(HTTPretty.GET,
                                   "http://%s/bar-%d/file.pdf" % (host, series),
                                   body=self._read_file('text1.txt'))
    main(self.options, 'http://foo.org/bar-{1:2}/file.pdf', 'http://baz.net/bar-{1:2}/file.pdf')
    self.assertEqual(self._get_downloaded_files(), ['baz.net', 'foo.org'])
    # os.listdir returns entries in arbitrary order: sort before comparing
    # so the check does not depend on the filesystem.
    self.assertEqual(sorted(listdir(os.path.join(self.temp_dir, 'baz.net'))),
                     ['series-1', 'series-2'])
    self.assertEqual(sorted(listdir(os.path.join(self.temp_dir, 'foo.org'))),
                     ['series-1', 'series-2'])
    self.assertTrue(self._same_content('foo.org/series-1/file.pdf', 'text1.txt'))
    self.assertTrue(self._same_content('foo.org/series-2/file.pdf', 'text1.txt'))
    self.assertTrue(self._same_content('baz.net/series-1/file.pdf', 'text1.txt'))
    self.assertTrue(self._same_content('baz.net/series-2/file.pdf', 'text1.txt'))

# Step 2: really useful features
def inner_resources_download_test(self):
# main command line URLs set
Expand Down
7 changes: 6 additions & 1 deletion src/allanon/tests/resource_grabber_tests.py
@@ -1,7 +1,6 @@
# -*- coding: utf8 -*-

from tempfile import mkdtemp
import os
import os.path

import unittest
Expand Down Expand Up @@ -33,6 +32,12 @@ def test_create_multiple_dirs_some_exists(self):
self.assertEqual(rg._create_subdirs(os.path.join(self.directory, 'foo/bar/baz')),
os.path.join(self.directory, 'foo', 'bar', 'baz'))

def test_create_dirs_with_interpolation(self):
    """_create_subdirs expands %HOST and range markers before creating the path."""
    grabber = ResourceGrabber('http://foo.com/part{1:4}/section-{10:20}/foo.pdf')
    model = os.path.join(self.directory, '%HOST/section-%1/%2')
    created = grabber._create_subdirs(model,
                                      ids=[2, 3], index=2,
                                      ids_digit_len=[1, 1], index_digit_len=1)
    expected = os.path.join(self.directory, 'foo.com', 'section-2', '3')
    self.assertEqual(created, expected)

class ResourceGrabberDirectDownloadTest(unittest.TestCase):

Expand Down

0 comments on commit 1dc8042

Please sign in to comment.