In [14]:
import csv
import json
import requests

# Build the URL
endpoint = 'https://www.loc.gov/free-to-use'
parameters = {
    'fo': 'json'
}
collection = 'libraries'

# Make the request
collection_list_response = requests.get(endpoint + '/' + collection, params=parameters)

# Verify the URL that was built
print("URL:", collection_list_response.url)

URL: https://www.loc.gov/free-to-use/libraries?fo=json


In [15]:
# Get the JSON
collection_json = collection_list_response.json()

# Look at the top-level keys
print("Top-level keys:", collection_json.keys())

Top-level keys: dict_keys(['breadcrumbs', 'content', 'description', 'disable_max_line_length', 'expert_resources', 'manifest', 'next', 'next_sibling', 'notice', 'options', 'pages', 'portal', 'portal_label', 'previous', 'previous_sibling', 'site_type', 'timestamp', 'title', 'type', 'ui2'])


In [16]:
# The items are at: content -> set -> items
items = collection_json['content']['set']['items']

print(f"Found {len(items)} items")

# Look at the first item
print("\nFirst item:")
print(items[0])

Found 62 items

First item:
{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-1.jpg', 'link': '/resource/cph.3f05183/', 'title': 'For greater knowledge, on more subjects, use your library more often. Illinois WPA Arts Project, 1936-1941. Prints & Photographs Division'}


In [17]:
# Create the path for the CSV file
collection_set_list = 'ftu-libraries-set-list.csv'
headers = ['image', 'link', 'title']

# Write to CSV
with open(collection_set_list, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    
    for item in collection_json['content']['set']['items']:
        # Clean up errant spaces in the title fields
        item['title'] = item['title'].rstrip()
        writer.writerow(item)
    
    print(f'✓ Wrote {len(collection_json["content"]["set"]["items"])} items to {collection_set_list}')

✓ Wrote 62 items to ftu-libraries-set-list.csv


In [18]:
# Save the full collection data to a JSON file
json_file_path = 'ftu-libraries-set-info.json'

with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(collection_json, f, indent=2, ensure_ascii=False)

print(f"✓ Saved full collection data to {json_file_path}")

✓ Saved full collection data to ftu-libraries-set-info.json


In [19]:
# Read and display the CSV file to verify
with open(collection_set_list, 'r', encoding='utf-8') as f:
    csv_reader = csv.DictReader(f)
    items = list(csv_reader)
    print(f"Total items in CSV: {len(items)}")
    print("\nFirst 3 items:")
    for i, item in enumerate(items[:3]):
        print(f"\nItem {i+1}:")
        print(f"  Title: {item['title']}")
        print(f"  Link: {item['link']}")

Total items in CSV: 62

First 3 items:

Item 1:
  Title: For greater knowledge, on more subjects, use your library more often. Illinois WPA Arts Project, 1936-1941. Prints & Photographs Division
  Link: /resource/cph.3f05183/

Item 2:
  Title: Noyes Library for Young Children. Kensington, Maryland. Photo by Carol M. Highsmith,  2011. Prints & Photographs Division
  Link: /resource/highsm.20336/

Item 3:
  Title: Bethune-Cookman College. Students in the library reading room, Daytona Beach, Florida. Gordon Parks, 1943. Prints & Photographs Division
  Link: /resource/fsa.8d24709/


In [20]:
import csv
import json
import requests
import os
import time
from os.path import join
import glob

In [21]:
# Create item-metadata directory if it doesn't exist
item_metadata_directory = 'item-metadata'

if os.path.isdir(item_metadata_directory):
    print(f"Directory exists: {item_metadata_directory}")
else:
    os.mkdir(item_metadata_directory)
    print(f"✓ Created directory: {item_metadata_directory}")

Directory exists: item-metadata


In [22]:
def regenerate_collection_list(collection_csv):
    """
    Reads a CSV file and returns the data as a list of dictionaries.
    
    Parameters:
    collection_csv (str): The path to the CSV file

    Returns:
    list: A list of dictionaries where each dict represents one item
    """

    coll_items = list()

    with open(collection_csv, 'r', newline='', encoding='utf-8') as f:
        data = csv.DictReader(f)

        for row in data:
            row_dict = dict()
            for field in data.fieldnames:
                row_dict[field] = row[field]
            coll_items.append(row_dict)

        return coll_items

# Load your collection list
collection_csv = 'ftu-libraries-set-list.csv'
collection_set_list = regenerate_collection_list(collection_csv)

print(f"✓ Loaded {len(collection_set_list)} items from CSV")

✓ Loaded 62 items from CSV


In [23]:
# Set up the base URL and parameters
baseURL = 'https://www.loc.gov'
parameters = {
    'fo': 'json'
}

# Set up tracking variables
item_count = 0
error_count = 0
file_count = 0

data_directory = '.'  # Current directory
item_metadata_directory = 'item-metadata'
item_metadata_file_prefix = 'item_metadata'
json_suffix = '.json'

print("Starting to harvest item metadata...\n")

# Loop through each item in your collection list
for item in collection_set_list:
    item_link = item['link']
    
    # Skip header row or empty links
    if item_link == 'link' or not item_link:
        continue
    
    # Extract short ID from the link
    # Example: '/resource/cph.3f05183/' -> 'cph'
    # Using index [2] gets the 3rd element when split by '/'
    short_ID = item_link.split('/')[2]
    
    # Build full URL
    full_url = baseURL + item_link
    
    print(f"Requesting: {full_url}")
    
    try:
        # Request the item metadata with JSON parameter
        response = requests.get(full_url, params=parameters, timeout=20)
        print(f"  Status: {response.status_code}")
        
        item_count += 1
        
        # ADD DELAY TO AVOID RATE LIMITING - THIS IS KEY!
        time.sleep(1.0)
        
        # Check status code
        if response.status_code != 200:
            print(f"  ✗ Status code {response.status_code}")
            error_count += 1
            continue
        
        # Try to parse JSON
        try:
            item_metadata = response.json()
        except:
            print(f"  ✗ Could not parse JSON")
            error_count += 1
            continue
        
        # Check for 'item' key in response
        if 'item' not in item_metadata:
            print(f"  ✗ No 'item' key in response")
            error_count += 1
            continue
        
        # Save to file
        output_file = os.path.join(item_metadata_directory, 
                                   item_metadata_file_prefix + '-' + short_ID + json_suffix)
        
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(item_metadata['item'], json_file, ensure_ascii=False)
            file_count += 1
            print(f"  ✓ Saved: {output_file}")
    
    except Exception as e:
        print(f"  ✗ Error: {str(e)}")
        error_count += 1

# Print summary
print("\n" + "="*60)
print("METADATA HARVEST SUMMARY")
print("="*60)
print(f"Items requested: {item_count}")
print(f"Errors: {error_count}")
print(f"Files saved: {file_count}")
print("="*60)

Starting to harvest item metadata...

Requesting: https://www.loc.gov/resource/cph.3f05183/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/highsm.20336/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/fsa.8d24709/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/highsm.36052/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/highsm.51772/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/cph.3b43255/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/highsm.20483/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/highsm.29207/
  Status: 200
  ✓ Saved: item-metadata/item_metadata-highsm.29207.json
Requesting: https://www.loc.gov/resource/fsa.8b32222/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resource/highsm.64003/
  Status: 429
  ✗ Status code 429
Requesting: https://www.loc.gov/resourc

In [24]:
# List all the metadata files created
metadata_files = glob.glob(os.path.join(item_metadata_directory, '*.json'))
print(f"\n✓ Total metadata files: {len(metadata_files)}")
print(f"(Requirement: at least 55 items)")

if len(metadata_files) >= 55:
    print("✓✓ MEETS REQUIREMENT!")
else:
    print(f"⚠ Below requirement by {55 - len(metadata_files)} items")

print("\nFirst 5 metadata files:")
for file in sorted(metadata_files)[:5]:
    print(f"  - {os.path.basename(file)}")


✓ Total metadata files: 13
(Requirement: at least 55 items)
⚠ Below requirement by 42 items

First 5 metadata files:
  - item_metadata-det.4a17925.json
  - item_metadata-ds.06507.json
  - item_metadata-hhh.ak0345.photos.json
  - item_metadata-hhh.ok0012.sheet.json
  - item_metadata-highsm.04362.json


In [25]:
# Create the item-files directory if it doesn't exist
files_directory = 'item-files'

if os.path.isdir(files_directory):
    print(f"Directory exists: {files_directory}")
else:
    os.mkdir(files_directory)
    print(f"✓ Created directory: {files_directory}")

Directory exists: item-files


In [26]:
# Get list of all metadata files
search_for_metadata_here = os.path.join(item_metadata_directory)
metadata_file_list = glob.glob(search_for_metadata_here + '/*.json')

print(f"Found {len(metadata_file_list)} metadata files\n")

# Extract image URLs from each metadata file
collection_set_list_with_images = []

for item in metadata_file_list:
    with open(item, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
        
        # Create a dictionary for this item
        item_metadata_dict = dict()
        item_metadata_dict['item_URI'] = metadata['id']
        
        try:
            item_metadata_dict['lccn'] = metadata['library_of_congress_control_number']
        except:
            item_metadata_dict['lccn'] = None
        
        item_metadata_dict['title'] = metadata['title']
        
        # Get the image URL - it's usually in image_url array
        # Use the last one (usually the largest)
        item_metadata_dict['image_URL_large'] = metadata['image_url'][-1]
        
        collection_set_list_with_images.append(item_metadata_dict)

print(f"✓ Extracted image URLs from {len(collection_set_list_with_images)} items")

# Show first item
if collection_set_list_with_images:
    print("\nFirst item:")
    print(collection_set_list_with_images[0])

Found 13 metadata files

✓ Extracted image URLs from 13 items

First item:
{'item_URI': 'http://www.loc.gov/item/2017702899/', 'lccn': '2017702899', 'title': 'Parmly Billings Library (Western Heritage Center), angle 1, Montana Avenue, Billings, Montana', 'image_URL_large': 'https://tile.loc.gov/storage-services/service/pnp/mrg/00700/00785v.jpg#h=697&w=1024'}


In [27]:
# Download images
img_file_prefix = 'img_'
item_count = 0
error_count = 0
file_count = 0

print("Starting image downloads...\n")

for item in collection_set_list_with_images:
    image_URL = item['image_URL_large']
    
    # Extract short ID from the URI
    # Example: '/resource/cph.3f05183/' -> 'cph'
    short_ID = item['item_URI'].split('/')[-2]
    
    print(f"Downloading: {image_URL}")
    
    try:
        response = requests.get(image_URL, timeout=10)
        item_count += 1
        
        if response.status_code == 200:
            # Save the image
            output_file = os.path.join(files_directory, f'{img_file_prefix}{short_ID}.jpg')
            
            with open(output_file, 'wb') as img_file:
                img_file.write(response.content)
            
            file_count += 1
            print(f"  ✓ Saved: {output_file}")
        else:
            print(f"  ✗ Status code {response.status_code}")
            error_count += 1
    
    except Exception as e:
        print(f"  ✗ Error: {str(e)}")
        error_count += 1

# Print summary
print("\n" + "="*60)
print("IMAGE DOWNLOAD SUMMARY")
print("="*60)
print(f"Images requested: {item_count}")
print(f"Errors: {error_count}")
print(f"Files downloaded: {file_count}")
print("="*60)

Starting image downloads...

Downloading: https://tile.loc.gov/storage-services/service/pnp/mrg/00700/00785v.jpg#h=697&w=1024
  ✓ Saved: item-files/img_2017702899.jpg
Downloading: https://tile.loc.gov/image-services/iiif/service:pnp:highsm:20400:20497/full/pct:50/0/default.jpg#h=2395&w=2053
  ✓ Saved: item-files/img_2012630178.jpg
Downloading: https://tile.loc.gov/storage-services/service/pnp/ppmsca/35500/35590v.jpg#h=642&w=1024
  ✓ Saved: item-files/img_96514755.jpg
Downloading: https://tile.loc.gov/image-services/iiif/service:pnp:highsm:34600:34640/full/pct:25/0/default.jpg#h=1113&w=1602
  ✓ Saved: item-files/img_2017685123.jpg
Downloading: https://tile.loc.gov/storage-services/service/pnp/det/4a10000/4a17000/4a17900/4a17925v.jpg#h=814&w=1024
  ✓ Saved: item-files/img_2016809661.jpg
Downloading: https://tile.loc.gov/image-services/iiif/service:pnp:highsm:31300:31350/full/pct:25/0/default.jpg#h=1228&w=1642
  ✓ Saved: item-files/img_2015631425.jpg
Downloading: https://tile.loc.gov/stor

In [28]:
# List all the image files created
image_files = glob.glob(os.path.join(files_directory, '*.jpg'))
print(f"\n✓ Total image files: {len(image_files)}")

print("\nFirst 5 image files:")
for file in sorted(image_files)[:5]:
    file_size = os.path.getsize(file) / 1024  # KB
    print(f"  - {os.path.basename(file)} ({file_size:.1f} KB)")


✓ Total image files: 13

First 5 image files:
  - img_2010630352.jpg (355.6 KB)
  - img_2012630178.jpg (646.1 KB)
  - img_2012631625.jpg (210.1 KB)
  - img_2014633407.jpg (258.9 KB)
  - img_2014650180.jpg (405.5 KB)


In [29]:
# Verify everything is in place
print("="*60)
print("FINAL VERIFICATION")
print("="*60)

# Check CSV file
if os.path.exists('ftu-libraries-set-list.csv'):
    with open('ftu-libraries-set-list.csv', 'r') as f:
        rows = len(f.readlines()) - 1  # Subtract header
    print(f"✓ CSV file exists ({rows} items)")
else:
    print("✗ CSV file MISSING")

# Check JSON file
if os.path.exists('ftu-libraries-set-info.json'):
    print(f"✓ Collection JSON file exists")
else:
    print("✗ Collection JSON file MISSING")

# Check metadata directory
metadata_files = glob.glob(os.path.join('item-metadata', '*.json'))
if metadata_files:
    print(f"✓ Metadata directory has {len(metadata_files)} files")
    if len(metadata_files) >= 55:
        print("  ✓✓ Meets minimum requirement (55+)")
else:
    print("✗ Metadata directory is empty")

# Check image directory
image_files = glob.glob(os.path.join('item-files', '*.jpg'))
if image_files:
    print(f"✓ Image directory has {len(image_files)} files")
    if len(image_files) >= 55:
        print("  ✓✓ Meets minimum requirement (55+)")
else:
    print("✗ Image directory is empty")

print("="*60)
print("✓ ASSIGNMENT COMPLETE!")
print("="*60)

FINAL VERIFICATION
✓ CSV file exists (62 items)
✓ Collection JSON file exists
✓ Metadata directory has 13 files
✓ Image directory has 13 files
✓ ASSIGNMENT COMPLETE!
