# 2. Get Metadata

__by Sean Gilleran__  
__Last updated November 29, 2021__  
[https://github.com/seangilleran/ia-compmag-collect](https://github.com/seangilleran/ia-compmag-collect)

This notebook gets metadata and a list of PDF and TXT files for each item in a collection.

## 2.1 Setup

### 2.1.1 Imports

In [None]:
from datetime import datetime
import json
import os
import uuid

import requests


def ts():
    """Return the current local time in square brackets, for prefixing log lines.

    The time is rendered with ``%X``, i.e. the locale's time representation.
    """
    now = datetime.now()
    return "[" + now.strftime("%X") + "]"

### 2.1.2 Set Paths

Set metadata path and tweak URLs if necessary.

In [None]:
# URL templates. `{id}` is filled with an item name and `{file}` with the name
# of one of that item's files.
file_url = "https://archive.org/download/{id}/{file}"
meta_url = "https://archive.org/metadata/{id}"

# Directory holding the per-collection JSON files that are read and updated.
metadata_path = "./meta"

## 2.2 Scrape Metadata

### 2.2.1 Find Items in Collections

Build the list of collections to scrape, ignoring any items whose metadata has already been collected.

In [None]:
# Load collections from JSON.
# Every `.json` file in `metadata_path` is parsed and appended to
# `collections`. A file that cannot be read or parsed is reported and skipped
# so that a single corrupt file does not abort the whole run.
collections = []
# Sorted so collections are processed in a stable order between runs;
# os.listdir() makes no ordering guarantee.
for filename in sorted(os.listdir(metadata_path)):
    if not filename.endswith(".json"):
        continue
    path = os.path.join(metadata_path, filename)
    try:
        with open(path, "r", encoding="utf-8") as f:
            collections.append(json.load(f))
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"ERR: Could not load {path}: {err}")

# Assemble item URLs to scrape.
# `scrapes` receives every collection that still has items to fetch, with its
# "items" list narrowed (in memory only) to those pending items.
scrapes = []
item_count = 0
items_skipped = 0
collections_skipped = 0

for collection in collections:

    # Only entries that are still plain strings are kept; the scrape step
    # replaces an item's name with a metadata dict when it saves, so anything
    # else is counted as skipped.
    items = [item for item in collection["items"] if isinstance(item, str)]
    items_skipped += len(collection["items"]) - len(items)
    item_count += len(items)

    if not items:
        # Nothing left to scrape in this collection.
        collections_skipped += 1
        continue

    collection["items"] = items
    scrapes.append(collection)

print(
    f"Found {item_count} items ({items_skipped} skipped) "
    f"from {len(scrapes)} collections ({collections_skipped} skipped)."
)

### 2.2.2 Scrape Metadata

In [None]:
# Fetch the archive.org metadata record for every pending item, keep its
# title/date/language/tags plus the list of its .txt and .pdf files, and
# write the result back into the item's collection file.
item_count = 0
skip_count = 0
file_count = 0
file_skip_count = 0

for collection in scrapes:
    # All items of a collection are saved into the same JSON file.
    collection_file = os.path.join(metadata_path, f"{collection['name']}.json")

    for item in collection["items"]:

        print(f"{ts()} {collection['name']}/{item}")
        item_url = meta_url.format(id=item)
        try:
            # Without a timeout a stalled connection would block the run
            # indefinitely.
            r = requests.get(item_url, timeout=60)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError):
            # Network failure, HTTP error status, or a body that is not JSON.
            print(f"{ts()} ERR: {item_url}")
            skip_count += 1
            continue

        # If there aren't any files we can move on.
        if not data.get("files"):
            print(f"{ts()} WRN: No files found in {collection['name']}/{item}!")
            skip_count += 1
            continue

        # Store a list of files for scraping.
        files = []
        for file in data["files"]:
            name = file["name"]
            if not name.endswith((".txt", ".pdf")):
                continue
            if file.get("format") == "Metadata":
                file_skip_count += 1
                continue
            # Optional fields fall back to "" so that one incomplete file
            # record cannot abort the whole scrape with a KeyError.
            files.append({
                "id": str(uuid.uuid5(uuid.NAMESPACE_DNS, name)),
                "name": name,
                # Extension without the leading dot, e.g. "pdf".
                "ext": os.path.splitext(name)[1][1:],
                "format": file.get("format", ""),
                "url": file_url.format(id=item, file=name),
                "size": file.get("size", ""),
                "md5": file.get("md5", ""),
                "crc32": file.get("crc32", ""),
                "sha1": file.get("sha1", ""),
            })
            file_count += 1

        # Grab the rest of the item metadata.
        item_meta = data.get("metadata", {})
        metadata = {
            "id": str(uuid.uuid5(uuid.NAMESPACE_DNS, item)),
            "name": item,
            "title": item_meta.get("title", ""),
            "date": item_meta.get("date", ""),
            "language": item_meta.get("language", ""),
            "tags": item_meta.get("subject", []),
            "files": files,
        }

        # Save this all back to the file. The file is re-read and rewritten
        # after every item, so finished items are already on disk if the run
        # is interrupted.
        with open(collection_file, "r", encoding="utf-8") as f:
            collection_data = json.load(f)
        for index, entry in enumerate(collection_data["items"]):
            if entry == item:
                # Replace the bare item name with its full metadata record.
                collection_data["items"][index] = metadata
                break
        with open(collection_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(collection_data, sort_keys=True, indent=4))
        item_count += 1

print(f"\n{ts()} ** DONE! **")
print(
    f"Collected metadata for {item_count} items ({skip_count} skipped) "
    f"and {file_count} files ({file_skip_count} skipped)."
)