Skip to content

Commit

Permalink
Move from Memento to CDX search, bump to v0.3.0
Browse files Browse the repository at this point in the history
  • Loading branch information
jsvine committed May 9, 2016
1 parent d661c9f commit 9563253
Show file tree
Hide file tree
Showing 12 changed files with 99 additions and 111 deletions.
17 changes: 11 additions & 6 deletions README.md
Expand Up @@ -2,10 +2,10 @@

Waybackpack is a command-line tool that lets you download the entire Wayback Machine archive for a given URL.

For instance, to download every copy of the Department of Labor's homepage before 1997, you'd run:
For instance, to download every copy of the Department of Labor's homepage through 1996 (which happens to be the first year the site was archived), you'd run:

```sh
waybackpack dol.gov -d ~/Downloads/dol-wayback --end 1997
waybackpack dol.gov -d ~/Downloads/dol-wayback --to-date 1996
```

Result:
Expand Down Expand Up @@ -42,8 +42,9 @@ pip install waybackpack

```
usage: waybackpack [-h] (-d DIR | --list) [--raw] [--root ROOT]
[--start START] [--end END] [--user-agent USER_AGENT]
[--follow-redirects] [--uniques-only] [--quiet]
[--from-date FROM_DATE] [--to-date TO_DATE]
[--user-agent USER_AGENT] [--follow-redirects]
[--uniques-only] [--collapse COLLAPSE] [--quiet]
url
positional arguments:
Expand All @@ -59,11 +60,12 @@ optional arguments:
processing by the Wayback Machine or waybackpack.
--root ROOT The root URL from which to serve snapshotted
resources. Default: 'https://web.archive.org'
--start START Timestamp-string indicating the earliest snapshot to
--from-date FROM_DATE
Timestamp-string indicating the earliest snapshot to
download. Should take the format YYYYMMDDhhmmss, though
you can omit as many of the trailing digits as you
like. E.g., '201501' is valid.
--end END Timestamp-string indicating the latest snapshot to
--to-date TO_DATE Timestamp-string indicating the latest snapshot to
download. Should take the format YYYYMMDDhhmmss, though
you can omit as many of the trailing digits as you
like. E.g., '201604' is valid.
Expand All @@ -75,6 +77,9 @@ optional arguments:
to contact. Default: 'waybackpack'.
--follow-redirects Follow redirects.
--uniques-only Download only the first version of duplicate files.
--collapse COLLAPSE An archive.org `collapse` parameter. Cf.:
https://github.com/internetarchive/wayback/blob/master
/wayback-cdx-server/README.md#collapsing
--quiet Don't log progress to stderr.
```

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -2,7 +2,7 @@
from setuptools import setup, find_packages
import subprocess

version = "0.2.0"
version = "0.3.0"

base_reqs = [
"requests"
Expand Down
27 changes: 27 additions & 0 deletions tests/test-cdx.py
@@ -0,0 +1,27 @@
#!/usr/bin/env python
import unittest
import waybackpack
import sys, os

class Test(unittest.TestCase):
    """Integration tests for waybackpack.search (hits the live CDX API)."""

    def test_snapshot_index(self):
        # An unbounded search should find snapshots, beginning with the
        # earliest known capture of dol.gov.
        target = "dol.gov"
        all_snaps = waybackpack.search(target)
        assert len(all_snaps) > 0
        assert all_snaps[0]["timestamp"] == "19961102145216"
        # Bounding the search by date narrows the result set.
        through_1996 = waybackpack.search(target, to_date="1996")
        assert len(through_1996) < len(all_snaps)
        assert len(through_1996) == 4

    def test_uniques(self):
        # With uniques_only, duplicate captures are filtered out.
        distinct = waybackpack.search("dol.gov", to_date="1996",
                                      uniques_only=True)
        assert len(distinct) == 2

4 changes: 2 additions & 2 deletions tests/test-dol.py
Expand Up @@ -6,8 +6,8 @@
class Test(unittest.TestCase):
def test_basic(self):
url = "dol.gov"
timemap = waybackpack.TimeMap(url)
timestamps = timemap.get_timestamps()
snapshots = waybackpack.search(url)
timestamps = [ snap["timestamp"] for snap in snapshots ]
first = waybackpack.Asset(url, timestamps[0])
content = first.fetch()
assert(b"Regulatory Information" in content)
Expand Down
4 changes: 2 additions & 2 deletions tests/test-download.py
Expand Up @@ -8,8 +8,8 @@
class Test(unittest.TestCase):
def test_basic(self):
url = "dol.gov"
timemap = waybackpack.TimeMap(url)
timestamps = timemap.get_timestamps().between(None, 1997)
snapshots = waybackpack.search(url, to_date=1996)
timestamps = [ snap["timestamp"] for snap in snapshots ]
pack = waybackpack.Pack(url, timestamps)
dirpath = tempfile.mkdtemp()
pack.download_to(dirpath)
Expand Down
14 changes: 0 additions & 14 deletions tests/test-timemap.py

This file was deleted.

4 changes: 2 additions & 2 deletions waybackpack/__init__.py
@@ -1,5 +1,5 @@
from .session import Session
from .timemap import TimeMap
from .asset import Asset
from .pack import Pack
__version__ = "0.2.0"
from .cdx import search
__version__ = "0.3.0"
27 changes: 27 additions & 0 deletions waybackpack/cdx.py
@@ -0,0 +1,27 @@
from .session import Session

SEARCH_URL = "https://web.archive.org/cdx/search/cdx"

def search(url,
from_date=None,
to_date=None,
uniques_only=False,
collapse=None,
session=None):

session = session or Session()
cdx = session.get(SEARCH_URL, params={
"url": url,
"from": from_date,
"to": to_date,
"showDupeCount": "true",
"output": "json",
"collapse": collapse
}).json()
fields = cdx[0]
if len(cdx) < 2: return []
snapshots = [ dict(zip(fields, row)) for row in cdx[1:] ]
if uniques_only:
return [ s for s in snapshots if int(s["dupecount"]) == 0 ]
else:
return snapshots
23 changes: 15 additions & 8 deletions waybackpack/cli.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
from .session import Session
from .pack import Pack
from .timemap import TimeMap
from .cdx import search
from .settings import DEFAULT_USER_AGENT, DEFAULT_ROOT
import argparse
import logging
Expand All @@ -27,10 +27,10 @@ def parse_args():
parser.add_argument("--root", default=DEFAULT_ROOT,
help="The root URL from which to serve snapshotted resources. Default: '{0}'".format(DEFAULT_ROOT))

parser.add_argument("--start",
parser.add_argument("--from-date",
help="Timestamp-string indicating the earliest snapshot to download. Should take the format YYYYMMDDhhss, though you can omit as many of the trailing digits as you like. E.g., '201501' is valid.")

parser.add_argument("--end",
parser.add_argument("--to-date",
help="Timestamp-string indicating the latest snapshot to download. Should take the format YYYYMMDDhhss, though you can omit as many of the trailing digits as you like. E.g., '201604' is valid.")

parser.add_argument("--user-agent",
Expand All @@ -45,6 +45,9 @@ def parse_args():
help="Download only the first version of duplicate files.",
action="store_true")

parser.add_argument("--collapse",
help="An archive.org `collapse` parameter. Cf.: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#collapsing")

parser.add_argument("--quiet",
action="store_true",
help="Don't log progress to stderr.")
Expand All @@ -61,11 +64,16 @@ def main():
user_agent=args.user_agent,
follow_redirects=args.follow_redirects
)
timemap = TimeMap(args.url)
timestamps = timemap.get_timestamps(session=session)

if args.start != None or args.end != None:
timestamps = timestamps.between(args.start, args.end)
snapshots = search(args.url,
session=session,
from_date=args.from_date,
to_date=args.to_date,
uniques_only=args.uniques_only,
collapse=args.collapse
)

timestamps = [ snap["timestamp"] for snap in snapshots ]

pack = Pack(
args.url,
Expand All @@ -78,7 +86,6 @@ def main():
args.dir,
raw=args.raw,
root=args.root,
uniques_only=args.uniques_only
)
else:
flag = "id_" if args.raw else ""
Expand Down
33 changes: 9 additions & 24 deletions waybackpack/pack.py
@@ -1,7 +1,7 @@
from .settings import DEFAULT_ROOT
from .timemap import TimeMap
from .session import Session
from .asset import Asset
from .cdx import search
import hashlib
import sys, os
import logging
Expand All @@ -16,6 +16,7 @@ class Pack(object):
def __init__(self,
url,
timestamps=None,
uniques_only=False,
session=None):

self.url = url
Expand All @@ -24,16 +25,17 @@ def __init__(self,
self.parsed_url = urlparse(self.full_url)

self.session = session or Session()
self.timestamps = timestamps or TimeMap(url).get_timestamps(session=self.session)

self.timestamps = timestamps or [ snap["timestamp"] for snap in search(
url,
uniques_only=uniques_only,
session=self.session
) ]
self.assets = [ Asset(self.url, ts) for ts in self.timestamps ]

def download_to(self, directory,
raw=False,
root=DEFAULT_ROOT,
uniques_only=False):

if uniques_only:
file_hashes = set()
root=DEFAULT_ROOT):

for asset in self.assets:
path_head, path_tail = os.path.split(self.parsed_url.path)
Expand Down Expand Up @@ -61,23 +63,6 @@ def download_to(self, directory,
root=root
)

# Check for uniqueness
if uniques_only:
if raw:
content_to_hash = content
else:
content_to_hash = asset.fetch(
session=self.session,
raw=True
)
file_hash = hashlib.sha256(content_to_hash).hexdigest()
file_hash_tuple = (asset.original_url, file_hash)
if file_hash_tuple in file_hashes:
logger.info("Duplicate file, skipping.\n")
continue
else:
file_hashes.add(file_hash_tuple)

try:
os.makedirs(filedir)
except OSError:
Expand Down
5 changes: 3 additions & 2 deletions waybackpack/session.py
Expand Up @@ -9,14 +9,15 @@ def __init__(self, follow_redirects=False, user_agent=DEFAULT_USER_AGENT):
self.follow_redirects = follow_redirects
self.user_agent = user_agent

def get(self, url):
def get(self, url, **kwargs):
headers = { "User-Agent": self.user_agent }
response_is_final = False
while (response_is_final == False):
res = requests.get(
url,
allow_redirects=self.follow_redirects,
headers=headers
headers=headers,
**kwargs
)
if res.status_code != 200:
logger.info("HTTP status code: {0}".format(res.status_code))
Expand Down
50 changes: 0 additions & 50 deletions waybackpack/timemap.py

This file was deleted.

0 comments on commit 9563253

Please sign in to comment.