diff --git a/README.md b/README.md index 39bbc89..4f0acd2 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ Waybackpack is a command-line tool that lets you download the entire Wayback Machine archive for a given URL. -For instance, to download every copy of the Department of Labor's homepage before 1997, you'd run: +For instance, to download every copy of the Department of Labor's homepage through 1996 (which happens to be the first year the site was archived), you'd run: ```sh -waybackpack dol.gov -d ~/Downloads/dol-wayback --end 1997 +waybackpack dol.gov -d ~/Downloads/dol-wayback --to-date 1996 ``` Result: @@ -42,8 +42,9 @@ pip install waybackpack ``` usage: waybackpack [-h] (-d DIR | --list) [--raw] [--root ROOT] - [--start START] [--end END] [--user-agent USER_AGENT] - [--follow-redirects] [--uniques-only] [--quiet] + [--from-date FROM_DATE] [--to-date TO_DATE] + [--user-agent USER_AGENT] [--follow-redirects] + [--uniques-only] [--collapse COLLAPSE] [--quiet] url positional arguments: @@ -59,11 +60,12 @@ optional arguments: processing by the Wayback Machine or waybackpack. --root ROOT The root URL from which to serve snapshotted resources. Default: 'https://web.archive.org' - --start START Timestamp-string indicating the earliest snapshot to + --from-date FROM_DATE + Timestamp-string indicating the earliest snapshot to download. Should take the format YYYYMMDDhhss, though you can omit as many of the trailing digits as you like. E.g., '201501' is valid. - --end END Timestamp-string indicating the latest snapshot to + --to-date TO_DATE Timestamp-string indicating the latest snapshot to download. Should take the format YYYYMMDDhhss, though you can omit as many of the trailing digits as you like. E.g., '201604' is valid. @@ -75,6 +77,9 @@ optional arguments: to contact. Default: 'waybackpack'. --follow-redirects Follow redirects. --uniques-only Download only the first version of duplicate files. + --collapse COLLAPSE An archive.org `collapse` parameter. Cf.: + https://github.com/internetarchive/wayback/blob/master + /wayback-cdx-server/README.md#collapsing --quiet Don't log progress to stderr. ``` diff --git a/setup.py b/setup.py index 57d7993..107f6c0 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages import subprocess -version = "0.2.0" +version = "0.3.0" base_reqs = [ "requests" diff --git a/tests/test-cdx.py b/tests/test-cdx.py new file mode 100644 index 0000000..1b2400b --- /dev/null +++ b/tests/test-cdx.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import unittest +import waybackpack +import sys, os + +class Test(unittest.TestCase): + def test_snapshot_index(self): + url = "dol.gov" + snapshots = waybackpack.search(url) + assert(len(snapshots) > 0) + assert(snapshots[0]["timestamp"] == "19961102145216") + clipped = waybackpack.search( + url, + to_date="1996" + ) + assert(len(clipped) < len(snapshots)) + assert(len(clipped) == 4) + + def test_uniques(self): + url = "dol.gov" + uniques = waybackpack.search( + url, + to_date="1996", + uniques_only=True + ) + assert(len(uniques) == 2) + diff --git a/tests/test-dol.py b/tests/test-dol.py index c0a9e8e..494b0ea 100644 --- a/tests/test-dol.py +++ b/tests/test-dol.py @@ -6,8 +6,8 @@ class Test(unittest.TestCase): def test_basic(self): url = "dol.gov" - timemap = waybackpack.TimeMap(url) - timestamps = timemap.get_timestamps() + snapshots = waybackpack.search(url) + timestamps = [ snap["timestamp"] for snap in snapshots ] first = waybackpack.Asset(url, timestamps[0]) content = first.fetch() assert(b"Regulatory Information" in content) diff --git a/tests/test-download.py b/tests/test-download.py index f36f1dc..3e4274c 100644 --- a/tests/test-download.py +++ b/tests/test-download.py @@ -8,8 +8,8 @@ class Test(unittest.TestCase): def test_basic(self): url = "dol.gov" - timemap = waybackpack.TimeMap(url) - timestamps = timemap.get_timestamps().between(None, 1997) + snapshots = waybackpack.search(url, to_date=1996) + timestamps = [ snap["timestamp"] for snap in snapshots ] pack = waybackpack.Pack(url, timestamps) dirpath = tempfile.mkdtemp() pack.download_to(dirpath) diff --git a/tests/test-timemap.py b/tests/test-timemap.py deleted file mode 100644 index 221570f..0000000 --- a/tests/test-timemap.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python -import unittest -import waybackpack -import sys, os - -class Test(unittest.TestCase): - def test_snapshot_index(self): - tm = waybackpack.TimeMap("dol.gov") - ts = tm.get_timestamps() - assert(len(ts) > 0) - assert(ts[0] == "19961102145216") - clipped = ts.between("1996", "1997") - assert(len(clipped) < len(ts)) - assert(len(clipped) == 4) diff --git a/waybackpack/__init__.py b/waybackpack/__init__.py index 96e81da..b254ac1 100644 --- a/waybackpack/__init__.py +++ b/waybackpack/__init__.py @@ -1,5 +1,5 @@ from .session import Session -from .timemap import TimeMap from .asset import Asset from .pack import Pack -__version__ = "0.2.0" +from .cdx import search +__version__ = "0.3.0" diff --git a/waybackpack/cdx.py b/waybackpack/cdx.py new file mode 100644 index 0000000..7578849 --- /dev/null +++ b/waybackpack/cdx.py @@ -0,0 +1,27 @@ +from .session import Session + +SEARCH_URL = "https://web.archive.org/cdx/search/cdx" + +def search(url, + from_date=None, + to_date=None, + uniques_only=False, + collapse=None, + session=None): + + session = session or Session() + cdx = session.get(SEARCH_URL, params={ + "url": url, + "from": from_date, + "to": to_date, + "showDupeCount": "true", + "output": "json", + "collapse": collapse + }).json() + fields = cdx[0] + if len(cdx) < 2: return [] + snapshots = [ dict(zip(fields, row)) for row in cdx[1:] ] + if uniques_only: + return [ s for s in snapshots if int(s["dupecount"]) == 0 ] + else: + return snapshots diff --git a/waybackpack/cli.py b/waybackpack/cli.py index 20f0b68..bbcc439 100644 --- a/waybackpack/cli.py +++ b/waybackpack/cli.py @@ -1,7 +1,7 @@ #!/usr/bin/env python from .session import Session from .pack import Pack -from .timemap import TimeMap +from .cdx import search from .settings import DEFAULT_USER_AGENT, DEFAULT_ROOT import argparse import logging @@ -27,10 +27,10 @@ def parse_args(): parser.add_argument("--root", default=DEFAULT_ROOT, help="The root URL from which to serve snapshotted resources. Default: '{0}'".format(DEFAULT_ROOT)) - parser.add_argument("--start", + parser.add_argument("--from-date", help="Timestamp-string indicating the earliest snapshot to download. Should take the format YYYYMMDDhhss, though you can omit as many of the trailing digits as you like. E.g., '201501' is valid.") - parser.add_argument("--end", + parser.add_argument("--to-date", help="Timestamp-string indicating the latest snapshot to download. Should take the format YYYYMMDDhhss, though you can omit as many of the trailing digits as you like. E.g., '201604' is valid.") parser.add_argument("--user-agent", @@ -45,6 +45,9 @@ def parse_args(): help="Download only the first version of duplicate files.", action="store_true") + parser.add_argument("--collapse", + help="An archive.org `collapse` parameter. Cf.: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#collapsing") + parser.add_argument("--quiet", action="store_true", help="Don't log progress to stderr.") @@ -61,11 +64,16 @@ def main(): user_agent=args.user_agent, follow_redirects=args.follow_redirects ) - timemap = TimeMap(args.url) - timestamps = timemap.get_timestamps(session=session) - if args.start != None or args.end != None: - timestamps = timestamps.between(args.start, args.end) + snapshots = search(args.url, + session=session, + from_date=args.from_date, + to_date=args.to_date, + uniques_only=args.uniques_only, + collapse=args.collapse + ) + + timestamps = [ snap["timestamp"] for snap in snapshots ] pack = Pack( args.url, @@ -78,7 +86,6 @@ def main(): args.dir, raw=args.raw, root=args.root, - uniques_only=args.uniques_only ) else: flag = "id_" if args.raw else "" diff --git a/waybackpack/pack.py b/waybackpack/pack.py index fe9a9cf..9cecd47 100644 --- a/waybackpack/pack.py +++ b/waybackpack/pack.py @@ -1,7 +1,7 @@ from .settings import DEFAULT_ROOT -from .timemap import TimeMap from .session import Session from .asset import Asset +from .cdx import search import hashlib import sys, os import logging @@ -16,6 +16,7 @@ class Pack(object): def __init__(self, url, timestamps=None, + uniques_only=False, session=None): self.url = url @@ -24,16 +25,17 @@ def __init__(self, self.parsed_url = urlparse(self.full_url) self.session = session or Session() - self.timestamps = timestamps or TimeMap(url).get_timestamps(session=self.session) + + self.timestamps = timestamps or [ snap["timestamp"] for snap in search( + url, + uniques_only=uniques_only, + session=self.session + ) ] self.assets = [ Asset(self.url, ts) for ts in self.timestamps ] def download_to(self, directory, raw=False, - root=DEFAULT_ROOT, - uniques_only=False): - - if uniques_only: - file_hashes = set() + root=DEFAULT_ROOT): for asset in self.assets: path_head, path_tail = os.path.split(self.parsed_url.path) @@ -61,23 +63,6 @@ def download_to(self, directory, root=root ) - # Check for uniqueness - if uniques_only: - if raw: - content_to_hash = content - else: - content_to_hash = asset.fetch( - session=self.session, - raw=True - ) - file_hash = hashlib.sha256(content_to_hash).hexdigest() - file_hash_tuple = (asset.original_url, file_hash) - if file_hash_tuple in file_hashes: - logger.info("Duplicate file, skipping.\n") - continue - else: - file_hashes.add(file_hash_tuple) - try: os.makedirs(filedir) except OSError: diff --git a/waybackpack/session.py b/waybackpack/session.py index 3281db2..a46cbe8 100644 --- a/waybackpack/session.py +++ b/waybackpack/session.py @@ -9,14 +9,15 @@ def __init__(self, follow_redirects=False, user_agent=DEFAULT_USER_AGENT): self.follow_redirects = follow_redirects self.user_agent = user_agent - def get(self, url): + def get(self, url, **kwargs): headers = { "User-Agent": self.user_agent } response_is_final = False while (response_is_final == False): res = requests.get( url, allow_redirects=self.follow_redirects, - headers=headers + headers=headers, + **kwargs ) if res.status_code != 200: logger.info("HTTP status code: {0}".format(res.status_code)) diff --git a/waybackpack/timemap.py b/waybackpack/timemap.py deleted file mode 100644 index 6d6fe75..0000000 --- a/waybackpack/timemap.py +++ /dev/null @@ -1,50 +0,0 @@ -from .session import Session -from .asset import Asset -import re -import time -import datetime -import sys, os -import logging -logger = logging.getLogger(__name__) - -MEMENTO_TEMPLATE = "https://web.archive.org/web/timemap/link/{url}" -MEMENTO_TIMESTAMP_PAT = re.compile(r"^= str(start)) or start == None) and - ((t < str(end)) or end == None) - ) - - return self.__class__(filter(test_timestamp, self)) - - def soonest_after(self, timestamp): - after = self.between(start=timestamp) - if len(after) > 0: - return after[0] - else: - return None - -class TimeMap(object): - def __init__(self, url): - self.url = url - self._timestamps = None - - def get_timestamps(self, session=None): - if self._timestamps != None: return self._timestamps - session = session or Session() - url = MEMENTO_TEMPLATE.format(url=self.url) - memento = session.get(url).content.decode("utf-8") - lines = memento.split("\n") - matches_gen = (re.search(MEMENTO_TIMESTAMP_PAT, line) for line in lines) - matches = filter(None, matches_gen) - timestamps = TimestampList(m.group(1) for m in matches) - self._timestamps = timestamps - return self._timestamps