# 1. Get Collections

__by Sean Gilleran__  
__Last updated November 29, 2021__  
[https://github.com/seangilleran/ia-compmag-collect](https://github.com/seangilleran/ia-compmag-collect)

This notebook gets a list of items from each collection.

## 1.1 Setup

### 1.1.1 Set Paths

Set the directory where metadata will be stored and the name of the optional collections file, and tweak the search URL if necessary.

In [None]:
# Directory that receives one JSON file per scraped collection.
metadata_path = "./meta"

# Optional text file listing collection identifiers, one per line.
collections_file = "collections.txt"

# Search endpoint template; "{collection}" is filled in with str.format()
# for each collection identifier. Only the "identifier" field is requested.
search_url = (
    "https://archive.org/advancedsearch.php"
    "?q=collection:{collection}"
    "&fl[]=identifier"
    "&rows=999999"
    "&output=json"
)

### 1.1.2 Set Collections

List of collection identifiers to parse. You can load these from a text file or just type them in here as a list.

In [None]:
# List: type collection identifiers here directly if you prefer.
collections = []

# File: append one identifier per non-blank line of `collections_file`.
try:
    with open(collections_file, "r", encoding="utf-8") as f:
        # Strip each line and drop the ones that end up empty. Build the full
        # list before extending so a read error midway adds nothing.
        loaded = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    # The file is optional; fall back to whatever is in the list above.
    print("No collections file found.")
except (OSError, UnicodeDecodeError) as err:
    # The file exists but could not be read; report why instead of
    # claiming it is missing.
    print(f"Could not read collections file {collections_file}: {err}")
else:
    collections.extend(loaded)

print("\n".join(collections))

## 1.2 Scrape & Store Collections

Once everything is set up, run this cell to scrape items from collections. With that done, we can move on to collecting item metadata in the next notebook.

In [None]:
from datetime import datetime
import json
import os
import uuid

import requests

# Scrape the item identifiers of every collection in `collections` and store
# them as one JSON file per collection under `metadata_path`.
#
# Reads `metadata_path`, `search_url`, and `collections` from earlier cells.
# Collections that already have a JSON file, that fail to download or parse,
# or that return no items are counted as skipped.

collection_count = 0
skip_count = 0
item_count = 0


# Create the metadata folder if it doesn't already exist. exist_ok=True makes
# this a single call with no separate existence check.
os.makedirs(os.path.abspath(metadata_path), exist_ok=True)


for collection in collections:

    timestamp = datetime.now().strftime("%X")
    collection_path = os.path.join(metadata_path, f"{collection}.json")

    # Skip collection if we've already scraped it.
    if os.path.exists(collection_path):
        print(f"[{timestamp}] Already scraped {collection}, skipping.")
        skip_count = skip_count + 1
        continue

    # Scrape a list of items from the collection.
    print(f"[{timestamp}] Getting items from {collection}...")
    collection_url = search_url.format(collection=collection)
    try:
        # A timeout keeps one stalled request from hanging the whole run.
        r = requests.get(collection_url, timeout=60)
        r.raise_for_status()
        items = [item["identifier"] for item in r.json()["response"]["docs"]]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Network/HTTP failure, a body that is not JSON, or JSON without the
        # expected response/docs/identifier structure: skip this collection
        # and carry on with the rest.
        print(f"[{timestamp}] Error parsing {collection}, skipping. ({err})")
        skip_count = skip_count + 1
        continue

    # Skip the collection if the search returned no items.
    if len(items) == 0:
        print(f"[{timestamp}] No items found in {collection}!")
        skip_count = skip_count + 1
        continue

    # Save the collection to JSON.
    collection_meta = {
        "name": collection,
        # uuid5 is deterministic: the same collection name always yields the
        # same id.
        "id": str(uuid.uuid5(uuid.NAMESPACE_DNS, collection)),
        "url": collection_url,
        "items": items
    }
    with open(collection_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(collection_meta, sort_keys=True, indent=4))

    # Only count items and the collection once they are safely on disk.
    item_count = item_count + len(items)
    collection_count = collection_count + 1

# Take a fresh timestamp: the loop variable is stale by now and is undefined
# when `collections` is empty.
timestamp = datetime.now().strftime("%X")
print(f"\n[{timestamp}] ** DONE **")
print(f"Found {item_count} new items in {collection_count} collections ({skip_count} skipped).")