# CHOMP v2
__Internet Archive Collector__

__by Kate Gilleran__  
__Last updated November 30, 2021__  
[https://github.com/kwgws/chomp2](https://github.com/kwgws/chomp2)

## 1. Setup

These cells must be run before any other cells in the notebook.

### 1.1 Set Paths

In [None]:
# Text file listing the collection identifiers to scrape, one per line.
collections_file = "collections.txt"
# Folder that holds one JSON metadata file per collection.
metadata_path = "./meta"
# Folder that downloaded files are moved into.
download_path = "./downloads"

# URL templates filled in with str.format: "{id}" takes a collection or item
# identifier and "{file}" takes a file name.
search_url = "https://archive.org/advancedsearch.php?q=collection:{id}&fl[]=identifier&rows=999999&output=json"
meta_url = "https://archive.org/metadata/{id}"
file_url = "https://archive.org/download/{id}/{file}"

### 1.2 Set Collections

You can load these from a text file (or just type them in here as a `list`).

In [None]:
# Collection identifiers to scrape. Stays empty if the collections file is
# missing, in which case identifiers can be typed into this list by hand.
collections = []

try:
    with open(collections_file, "r", encoding="utf-8") as f:
        # Drop blank lines: an empty identifier would otherwise be searched
        # for and saved like a real collection.
        collections = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print("WRN: No collections file found.")

print(f"Loaded {len(collections)} collections.")

### 1.3 Set Download Formats

In [None]:
# File "format" labels to download; files whose format is not listed here are
# skipped when the download list is assembled.
dl_formats = ["DjVuTXT"]

### 1.4 Set Download Date Range

In [None]:
# (start, end) dates, both inclusive; files dated outside this range are
# skipped when the download list is assembled.
dl_date_range = ("1975-01-01", "2005-12-31")

## 2. Get Metadata

### 2.1 Find Items in Collections

In [None]:
import json
import os
import uuid

import requests

# Running totals for the summary printed at the end of the cell.
collection_count = 0
skip_count = 0
item_count = 0

# Seconds to wait on the search API before giving up, so a stalled connection
# cannot hang the notebook forever. Generous because a search may return a
# very large result set.
request_timeout = 300


# Create the metadata folder if it doesn't already exist. exist_ok avoids a
# crash if the folder appears between a separate check and the call.
os.makedirs(os.path.abspath(metadata_path), exist_ok=True)


for collection in collections:

    collection_path = os.path.join(metadata_path, f"{collection}.json")

    # Skip collection if we've already scraped it.
    if os.path.exists(collection_path):
        print(f"Already scraped {collection}, skipping.")
        skip_count += 1
        continue

    # Scrape a list of items from the collection.
    print(f"Getting items from {collection}...")
    collection_url = search_url.format(id=collection)
    try:
        r = requests.get(collection_url, timeout=request_timeout)
        # Treat HTTP error statuses as failures instead of parsing their body.
        r.raise_for_status()
        items = [doc["identifier"] for doc in r.json()["response"]["docs"]]
    except Exception as err:
        # Best effort: report why this collection failed and move on to the
        # next one rather than aborting the whole run.
        print(f"ERR: {collection_url} ({err})")
        skip_count += 1
        continue

    # Skip the collection if the search returned no items.
    if not items:
        print(f"WRN: No items found in {collection}!")
        skip_count += 1
        continue

    # Save the collection to JSON. Items are stored as bare identifier strings
    # at this stage.
    collection_meta = {
        "name": collection,
        "id": str(uuid.uuid5(uuid.NAMESPACE_DNS, collection)),
        "url": collection_url,
        "items": items,
    }
    with open(collection_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(collection_meta, sort_keys=True, indent=4))

    # Count items only once the collection is safely saved, so a response that
    # fails partway through never inflates the total.
    item_count += len(items)
    collection_count += 1

print("\n** DONE **")
print(f"Found {item_count} new items in {collection_count} collections ({skip_count} skipped).")

### 2.2 Get Item Metadata

#### 2.2.1 Load Items

In [None]:
import json
import os
from time import sleep
import uuid

import requests


# Load collections from JSON. A file that cannot be decoded or parsed is
# reported and skipped so one corrupt file does not abort the whole run.
collections = []
for file in [name for name in os.listdir(metadata_path) if name.endswith(".json")]:
    try:
        with open(os.path.join(metadata_path, file), "r", encoding="utf-8") as f:
            collections.append(json.load(f))
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"WRN: Could not load {file} ({err}), skipping.")

# Assemble item URLs to scrape.
scrapes = []
item_count = 0
items_skipped = 0
collections_skipped = 0

for collection in collections:

    # Only bare identifier strings still need scraping; any other entry is
    # counted as skipped.
    items = [item for item in collection["items"] if isinstance(item, str)]
    items_skipped += len(collection["items"]) - len(items)
    item_count += len(items)

    # Nothing left to scrape in this collection.
    if not items:
        collections_skipped += 1
        continue

    collection["items"] = items
    scrapes.append(collection)

print(
    f"Found {item_count} items ({items_skipped} skipped) "
    f"from {len(scrapes)} collections ({collections_skipped} skipped)."
)

#### 2.2.2 Get Metadata

In [None]:
# Running totals for the summary printed at the end of the cell.
item_count = 0
skip_count = 0
file_count = 0
file_skip_count = 0

for collection in scrapes:

    # Load the saved collection once per collection instead of once per item.
    # Scraped metadata is merged into it and written back after every item so
    # an interrupted run keeps its progress.
    collection_path = os.path.join(metadata_path, f"{collection['name']}.json")
    with open(collection_path, "r", encoding="utf-8") as f:
        collection_data = json.load(f)

    for item in collection["items"]:

        print(f"{collection['name']}/{item}")
        item_url = meta_url.format(id=item)
        try:
            # Time out rather than hang on a stalled connection, and treat
            # HTTP error statuses as failures.
            r = requests.get(item_url, timeout=60)
            r.raise_for_status()
            data = r.json()
        except Exception as err:
            # Best effort: report why this item failed and keep going.
            print(f"ERR: {item_url} ({err})")
            skip_count += 1
            continue

        # If there aren't any files we can move on. This also covers a
        # response that is not a JSON object at all.
        file_entries = data.get("files") if isinstance(data, dict) else None
        if not file_entries:
            print(f"WRN: No files found for {item}!")
            skip_count += 1
            continue

        # Store a list of files for scraping. Only .txt and .pdf files are
        # considered.
        files = []
        for file in file_entries:
            name = file.get("name", "")
            if not name.endswith((".txt", ".pdf")):
                continue
            # Optional fields default to "" so one incomplete file entry does
            # not abort the whole scrape with a KeyError.
            file_format = file.get("format", "")
            if file_format == "Metadata":
                file_skip_count += 1
                continue
            files.append({
                "id": str(uuid.uuid5(uuid.NAMESPACE_DNS, name)),
                "name": name,
                "ext": name[-3:],
                "format": file_format,
                "url": file_url.format(id=item, file=name),
                "size": file.get("size", ""),
                "md5": file.get("md5", ""),
                "crc32": file.get("crc32", ""),
                "sha1": file.get("sha1", ""),
            })
            file_count += 1

        # Grab the rest of the item metadata.
        item_meta = data.get("metadata") or {}
        metadata = {
            "id": str(uuid.uuid5(uuid.NAMESPACE_DNS, item)),
            "name": item,
            "title": item_meta.get("title", ""),
            "date": item_meta.get("date", ""),
            "language": item_meta.get("language", ""),
            "tags": item_meta.get("subject", []),
            "files": files,
        }

        # Save this all back to the file: swap the first entry that is still
        # the bare identifier for the full metadata record.
        for index, entry in enumerate(collection_data["items"]):
            if entry == item:
                collection_data["items"][index] = metadata
                break
        with open(collection_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(collection_data, sort_keys=True, indent=4))
        item_count += 1

print("\n** DONE! **")
print(
    f"Collected metadata for {item_count} items ({skip_count} skipped) "
    f"and {file_count} files ({file_skip_count} skipped)."
)

## 3. Download Files

### 3.1 Find Files in Metadata

In [None]:
import json
import os

import dateparser
import regex as re

# "YYYY-MM" stamps embedded in file names (years 1960-2019); group 1 is the year.
re_date_full = re.compile(r"(19[6-9]\d|20[0-1]\d)-[0-1]\d")
# Bare four-digit year in the same span; not referenced in this cell.
re_date_year = re.compile(r"19[6-9]\d|20[0-1]\d")

start_date = dateparser.parse(dl_date_range[0])
end_date = dateparser.parse(dl_date_range[1])
# Fail loudly on an unparseable range: comparing against None would otherwise
# silently disable every range check below.
if start_date is None or end_date is None:
    raise ValueError(f"Could not parse dl_date_range: {dl_date_range!r}")
# Compare naive datetimes throughout so aware and naive values never mix.
start_date = start_date.replace(tzinfo=None)
end_date = end_date.replace(tzinfo=None)
# The same bounds at month precision, for dates recovered from file names.
start_month = (start_date.year, start_date.month)
end_month = (end_date.year, end_date.month)


def _parse_item_date(raw):
    """Return *raw* as a naive datetime, or None if no month and year can be read from it."""
    if not isinstance(raw, str) or not raw:
        return None
    try:
        parsed = dateparser.parse(raw, settings=dict(REQUIRE_PARTS=["month", "year"]))
    except Exception:
        # A parser failure just means "no usable date"; the caller then falls
        # back to the file name.
        return None
    return parsed.replace(tzinfo=None) if parsed else None


def _month_from_filename(name):
    """Return the first "YYYY-MM" stamp in *name* as a (year, month) tuple, or None."""
    match = re_date_full.search(name)
    if match is None:
        return None
    year, month = (int(part) for part in match.group(0).split("-"))
    # The pattern admits months such as "00" or "13"; reject those.
    return (year, month) if 1 <= month <= 12 else None


# Load collections from JSON. A file that cannot be decoded or parsed is
# reported and skipped so one corrupt file does not abort the whole run.
collections = []
for file in [name for name in os.listdir(metadata_path) if name.endswith(".json")]:
    try:
        with open(os.path.join(metadata_path, file), "r", encoding="utf-8") as f:
            collections.append(json.load(f))
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"WRN: Could not load {file} ({err}), skipping.")

# Assemble download URLs.
downloads = []
skip_count = 0

for collection in collections:
    for item in collection["items"]:

        # Check for metadata. An item that is still a bare identifier string
        # has no metadata record yet.
        if isinstance(item, str):
            print(f"WRN: No metadata for {item}, skipping.")
            continue

        # Find date. An item dated outside the range rules out all its files.
        item_date = _parse_item_date(item.get("date", ""))
        item_stamp = None
        if item_date is not None:
            if item_date < start_date or item_date > end_date:
                skip_count += len(item["files"])
                continue
            item_stamp = item_date.strftime("%Y-%m")

        for file in item["files"]:

            # Check file type.
            if file["format"] not in dl_formats:
                skip_count += 1
                continue

            # Find date (if not stored in item metadata). This is resolved per
            # file so one file's stamp never leaks into the next file.
            date = item_stamp
            if date is None:
                month = _month_from_filename(file["name"])
                if month is None or not (start_month <= month <= end_month):
                    skip_count += 1
                    continue
                date = f"{month[0]:04d}-{month[1]:02d}"

            # Assemble a full filename.
            # NB: a handful of files have local path names boiled in somehow. These tend to
            # break things, so let's get rid of them.
            filename = file["name"][:-4].split("/")[-1].split("\\")[-1]
            filename = f"{date} - {filename} - {file['id']}.{file['ext']}"

            # Make sure we don't already have it.
            if os.path.exists(os.path.join(download_path, filename)):
                skip_count += 1
                continue

            # Add it to the pile.
            print(f"Adding {file['url']}")
            downloads.append((file["url"], filename))

print("\n** DONE! **")
print(f"Found {len(downloads)} files to download ({skip_count} skipped).")


### 3.2 Download Files

In [None]:
import wget


# Running totals for the summary printed at the end of the cell.
skip_count = 0
download_count = 0

# Create the destination folder once, and only when there is something to
# download. exist_ok makes re-running the cell safe.
if downloads:
    os.makedirs(os.path.abspath(download_path), exist_ok=True)

for url, filename in downloads:

    try:
        print(f"Downloading {url} ...")
        # The returned path is resolved against the working directory below,
        # then moved to its final name.
        newfile = wget.download(url)
    except Exception as err:
        # Best effort: report why this download failed and keep going.
        print(f"ERR: {url} ({err})")
        skip_count += 1
        continue

    target = os.path.abspath(os.path.join(download_path, filename))
    try:
        os.rename(os.path.abspath(newfile), target)
    except OSError as err:
        # Covers FileExistsError as well as other rename failures. Count it so
        # the summary adds up, and say where the download was left so it is
        # not silently orphaned.
        print(f"ERR: {filename} ({err}); download left at {newfile}")
        skip_count += 1
        continue

    download_count += 1


print("\n** DONE! **")
print(f"Downloaded {download_count} files ({skip_count} skipped).")