# CDM IIIF Image Download

This script downloads images and metadata from Temple University Libraries' digital collections via the ContentDM IIIF API.

**Base URL:** `https://cdm16002.contentdm.oclc.org/iiif/`

# Setup

In [None]:
import urllib.request
import requests
import csv
import json
import os
import pandas as pd
from time import sleep

In [None]:
# Root of the ContentDM IIIF endpoint; the manifest URLs used below are built from it.
BASE_URL = "https://cdm16002.contentdm.oclc.org/iiif"
# Local folder that receives the CSVs and images written by this notebook.
OUTPUT_DIR = "downloads"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder already exists

# Helper Functions

In [None]:
def fetch_json(url, timeout=30):
    """Fetch a URL and return its body parsed as JSON.

    Redirects are followed (the default behaviour of ``requests.get``).

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole run.

    Returns:
        The decoded JSON document (a dict for IIIF manifests).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
        ValueError: The response body is not valid JSON.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


def get_metadata_value(metadata_list, label):
    """Look up a metadata field by label instead of index position.

    Args:
        metadata_list: Iterable of dicts with ``'label'`` and ``'value'``
            keys, as found under a manifest's ``'metadata'``. ``None`` is
            treated as an empty list.
        label: Field name to search for; compared case-insensitively.

    Returns:
        The ``'value'`` of the first entry whose label matches, or ``''``
        when nothing matches or the matching entry has no value.
    """
    wanted = label.lower()  # computed once rather than on every iteration
    for item in metadata_list or ():
        item_label = item.get('label', '')
        # A label that is present but not a string (e.g. None) cannot match;
        # skip it instead of crashing on .lower().
        if isinstance(item_label, str) and item_label.lower() == wanted:
            return item.get('value', '')
    return ''


def get_all_manifests(collection_url):
    """Get all item manifests from a collection, following pagination.

    Args:
        collection_url: URL of the collection document.

    Returns:
        A list of the entries found under ``'manifests'`` on every page.
        When the collection has no ``'first'`` page link, the entries
        listed directly on the collection itself are returned instead.

    Raises:
        Whatever ``fetch_json`` raises for the collection or any page.
    """
    def _as_url(ref):
        # A page link can be a plain URL string or a dict carrying '@id'.
        if isinstance(ref, dict):
            return ref.get('@id')
        return ref

    collection = fetch_json(collection_url)

    first_page = _as_url(collection.get('first'))
    if not first_page:
        # Unpaged collection: nothing to follow, use the inline listing.
        return list(collection.get('manifests', []))

    all_manifests = []
    visited = set()  # guards against a 'next' link that loops back
    page_url = first_page
    while page_url and page_url not in visited:
        visited.add(page_url)
        page = fetch_json(page_url)
        all_manifests.extend(page.get('manifests', []))
        # Normalise 'next' the same way as 'first'.
        page_url = _as_url(page.get('next'))

    return all_manifests

# 1) Browse Available Collections

Fetch the top-level manifest to see all collections available in Temple's ContentDM.

In [None]:
# Pull the top-level listing and keep only each collection's URL and name.
manifest = fetch_json(f"{BASE_URL}/manifest.json")

collections_df = (
    pd.json_normalize(manifest['collections'])[['@id', 'label']]
    .rename(columns={'@id': 'URL', 'label': 'Collection'})
)
collections_df

# 2) Download Metadata for a Collection

Set `collection_url` to the URL of the collection you want. The metadata fields available vary by collection, so this extracts all fields from each item into a flat row.

In [None]:
# Choose a collection URL from the table above
# (the metadata and image download cells below both read this variable).
collection_url = f"{BASE_URL}/p16002coll9/manifest.json"

In [None]:
# List every item in the chosen collection; each entry's '@id' is the URL
# of that item's own manifest, fetched in the next cell.
manifests = get_all_manifests(collection_url)
print(f"Found {len(manifests)} items in collection")

In [None]:
# Fetch each item's manifest and flatten its metadata list into one row.
rows = []
for i, item in enumerate(manifests):
    try:
        item_manifest = fetch_json(item['@id'])
        metadata = item_manifest.get('metadata', [])
        row = {m['label']: m['value'] for m in metadata}
        row['manifest_url'] = item['@id']
        rows.append(row)
    except Exception as e:
        # Best effort, as in download_all_metadata: one bad item must not
        # abort the run and throw away the rows already collected.
        print(f"  Skipping item {item.get('@id', '?')}: {e}")
    if (i + 1) % 100 == 0:
        print(f"  Processed {i + 1}/{len(manifests)} items...")
    sleep(0.05)  # brief pause between requests

# Items expose different fields; missing ones become empty cells.
metadata_df = pd.DataFrame(rows)
print(f"\nDone. {len(metadata_df)} items with {len(metadata_df.columns)} metadata fields.")
metadata_df.head()

In [None]:
# Write the collected metadata table to the output folder.
csv_path = os.path.join(OUTPUT_DIR, 'collection_metadata.csv')
metadata_df.to_csv(csv_path, index=False)  # index=False: omit the row-number column
print(f"Saved to {csv_path}")

# 3) Download Images from a Collection

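Downloads IIIF images for the collection selected above. Images are saved to `downloads/images/` as sequentially numbered JPEGs (`0.jpg`, `1.jpg`, …), together with an `image_manifest.csv` that maps each filename to its item's metadata.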

Set `max_images` to limit how many images to download (use `None` for all).

In [None]:
max_images = None  # set to a number to limit, or None for all
# Images go in their own subfolder of the output directory.
image_dir = os.path.join(OUTPUT_DIR, 'images')
os.makedirs(image_dir, exist_ok=True)  # no error if the folder already exists

In [None]:
def _image_service_ids(item_manifest):
    """Yield the image service '@id' for each canvas of the first sequence.

    Canvases with no image, no service, or no service id are skipped.
    """
    sequences = item_manifest.get('sequences') or [{}]
    for canvas in sequences[0].get('canvases', []):
        images = canvas.get('images') or [{}]
        service = images[0].get('resource', {}).get('service', {})
        # NOTE(review): some manifests may give a list of services; take the
        # first one in that case - confirm against real data.
        if isinstance(service, list):
            service = service[0] if service else {}
        image_id = service.get('@id', '') if isinstance(service, dict) else ''
        if image_id:
            yield image_id


def _download_image(image_url, filepath):
    """Download image_url to filepath without leaving a partial file behind."""
    partial_path = filepath + '.part'
    try:
        # NOTE(review): urlretrieve takes no timeout argument; a stalled
        # server can block here.
        urllib.request.urlretrieve(image_url, partial_path)
        # Only a complete download ever appears under the final name.
        os.replace(partial_path, filepath)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_collection_images(collection_url, image_dir, max_images=None):
    """Download IIIF images and metadata from a collection.

    Saves images as numbered JPEGs (``0.jpg``, ``1.jpg``, ...) and a CSV
    spreadsheet (``image_manifest.csv``) mapping each image filename to its
    item's full metadata. Items or images that fail are reported and
    skipped so that the rest of the collection is still processed.

    Args:
        collection_url: URL of the collection document.
        image_dir: Folder for the images and the CSV; created if missing.
        max_images: Stop after this many successful downloads. ``None``
            means no limit; ``0`` downloads nothing.

    Returns:
        A DataFrame with one row per downloaded image.
    """
    os.makedirs(image_dir, exist_ok=True)
    manifests = get_all_manifests(collection_url)
    count = 0
    image_records = []
    # Compare against None rather than relying on truthiness, so that a
    # limit of 0 is honoured instead of being treated as "unlimited".
    limit_reached = max_images is not None and count >= max_images

    for item in manifests:
        if limit_reached:
            break

        manifest_url = item.get('@id', '?')
        try:
            item_manifest = fetch_json(item['@id'])
            metadata = item_manifest.get('metadata', [])
            row = {m['label']: m['value'] for m in metadata}
        except Exception as e:
            print(f"  Skipping {manifest_url}: {e}")
            continue

        for image_id in _image_service_ids(item_manifest):
            image_url = f"{image_id}/full/pct:100/0/default.jpg"
            image_filename = f"{count}.jpg"
            filepath = os.path.join(image_dir, image_filename)

            try:
                _download_image(image_url, filepath)
            except Exception as e:
                print(f"  Failed to download {image_url}: {e}")
                continue

            record = {'filename': image_filename, 'manifest_url': manifest_url}
            # setdefault: a metadata field that happens to be called
            # 'filename' or 'manifest_url' must not overwrite our columns.
            for key, value in row.items():
                record.setdefault(key, value)
            image_records.append(record)
            count += 1
            if count % 10 == 0:
                print(f"  Downloaded {count} images...")

            if max_images is not None and count >= max_images:
                print(f"Reached limit of {max_images} images.")
                limit_reached = True
                break

            sleep(0.1)  # brief pause between image requests

    # Save metadata spreadsheet alongside images
    df = pd.DataFrame(image_records)
    csv_path = os.path.join(image_dir, 'image_manifest.csv')
    df.to_csv(csv_path, index=False)
    print(f"Done. Downloaded {count} images to {image_dir}/")
    print(f"Metadata saved to {csv_path}")
    return df

In [None]:
# Run the download; the returned DataFrame has one row per saved image.
results_df = download_collection_images(collection_url, image_dir, max_images=max_images)
results_df.head()  # preview the first rows

# 4) Download Metadata for All Collections

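Iterates through every collection in Temple's ContentDM and saves a separate CSV for each under `downloads/metadata/`. Collections whose CSV already exists are skipped, so an interrupted run can be resumed. This takes a long time for the full set.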

In [None]:
def download_all_metadata(collections_df, output_dir):
    """Download metadata CSVs for all collections.

    One CSV per collection is written to ``<output_dir>/metadata``. A
    collection whose CSV already exists is skipped, so the function can be
    re-run to resume after an interruption. Collections and items that
    fail are reported and skipped.

    Args:
        collections_df: DataFrame with a ``'Collection'`` column (display
            name) and a ``'URL'`` column (collection manifest URL).
        output_dir: Parent folder; the ``metadata`` subfolder is created
            inside it if missing.
    """
    metadata_dir = os.path.join(output_dir, 'metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    # '/' -> '-' and ' ' -> '_' as before, plus the other characters that
    # Windows rejects in file names.
    unsafe_chars = str.maketrans(
        {**dict.fromkeys('\\/:*?"<>|', '-'), ' ': '_'}
    )

    for _, row in collections_df.iterrows():
        collection_name = row['Collection']
        # NOTE(review): names that share their first 50 characters map to
        # the same file, so the later collection is skipped as "exists".
        safe_name = collection_name.translate(unsafe_chars)[:50]
        csv_path = os.path.join(metadata_dir, f"{safe_name}.csv")

        if os.path.exists(csv_path):
            print(f"  Skipping {collection_name} (already exists)")
            continue

        print(f"Processing: {collection_name}")
        try:
            manifests = get_all_manifests(row['URL'])
        except Exception as e:
            print(f"  Failed to get manifests: {e}")
            continue

        rows = []
        for item in manifests:
            try:
                item_manifest = fetch_json(item['@id'])
                metadata = item_manifest.get('metadata', [])
                item_row = {m['label']: m['value'] for m in metadata}
                item_row['manifest_url'] = item['@id']
                rows.append(item_row)
            except Exception as e:
                print(f"  Skipping item {item.get('@id', '?')}: {e}")
            sleep(0.05)  # brief pause between requests

        if rows:
            # Write under a temporary name and rename afterwards, so an
            # interrupted write never leaves a truncated CSV that the
            # "already exists" check would skip on the next run.
            partial_path = csv_path + '.part'
            pd.DataFrame(rows).to_csv(partial_path, index=False)
            os.replace(partial_path, csv_path)
            print(f"  Saved {len(rows)} items to {csv_path}")
        else:
            print("  No items found")

In [None]:
# Uncomment to run (this takes a long time for all collections)
# download_all_metadata(collections_df, OUTPUT_DIR)