In [37]:
from csv import DictWriter
import glob
import json
import math
import os
import pandas as pd
import requests
from time import sleep
from tqdm import tqdm

### Get each page of image metadata in order to scrape it.

We start by scraping the index pages that describe the universe of images we _could_ download; the image files themselves are fetched in a later step.

In [48]:
# We'll start with some config.
# The LoC website indicates that there are 170,989 images in this collection.
# But we just need a toy dataset, so we're going to grab 10,000 images.
LOC_IMAGES_COUNT = 10000
per_page_count = 100
# Derive the page count from per_page_count (not a second hard-coded 100) so
# that changing the page size cannot silently fetch the wrong number of images.
loc_pages_count = math.ceil(LOC_IMAGES_COUNT / per_page_count)
url_base = "https://www.loc.gov/collections/fsa-owi-black-and-white-negatives"

# Make sure the output directory exists before we start a multi-minute crawl.
os.makedirs("./../data", exist_ok=True)

# Loop to get each page.
current_page = 0
for index in tqdm(range(loc_pages_count)):
    # Let's get the current page 1-indexed rather than zero-indexed.
    current_page = index + 1

    # Let's be really nice to the Library of Congress's API; it's a public good, after all.
    sleep(1)

    # Prepare the URL query parameters; requests encodes them into the query string.
    url_params = {
        "fo": "json",          # We want JSON results.
        "c": per_page_count,   # We want per_page_count results at a time.
        "at": "results",       # We want to limit to results only.
        "sb": "date",          # Sort by date ascending.
        "sp": current_page,    # We want the 1-indexed search page.
    }

    # Make the request. A timeout keeps one stalled connection from hanging the
    # whole crawl, and a network error is reported like any other failed page
    # instead of aborting the loop.
    try:
        response = requests.get(url_base, params=url_params, timeout=30)
    except requests.RequestException as error:
        print(f"Request failed with error at page {current_page}: {error}")
        continue

    # Check that we're good, then save.
    if response.status_code == 200:
        data = response.json()

        # Prepare to save the index JSON to disk. The page number is zero-padded
        # so that the files sort lexicographically in page order.
        current_page_string = f"{current_page}".zfill(6)
        out_file_path = f"./../data/index-page-{current_page_string}.json"

        with open(out_file_path, 'w') as out_file:
            json.dump(data, out_file)

    # Uh oh, something went wrong.
    else:
        print(f"Request failed with non-200 status code at page {current_page}")

100%|█████████████████████████████████████████| 100/100 [04:12<00:00,  2.53s/it]


### Combine each page of index data into a single JSON file

In [49]:
# Create an empty list to hold the data which we will save as a unified JSON file.
out_data = []

# Define the pattern we'll use to glob up all the JSON files.
pattern = "./../data/index-page-*.json"

# glob.glob returns paths in arbitrary (filesystem) order. The page files are
# written with zero-padded page numbers, so sorting the paths restores the page
# order and keeps the combined file reproducible across runs.
for file_path in tqdm(sorted(glob.glob(pattern))):
    with open(file_path, 'r') as in_file:
        data = json.load(in_file)

    # Each page file stores its records under the "results" key.
    out_data += data['results']

with open("./../data/index-full.json", "w") as out_file:
    json.dump(out_data, out_file)

100%|████████████████████████████████████████| 100/100 [00:00<00:00, 207.56it/s]


### Save selected fields as a CSV file

In [50]:
out_data = []

with open("./../data/index-full.json", "r") as in_file:
    data = json.load(in_file)

for item in tqdm(data):

    # The checks below are guard clauses evaluated one at a time. Building them
    # all up front in a list would index item["image_url"] before confirming the
    # key exists, raising KeyError on exactly the records we mean to skip.

    # We want there to be an id field and additional metadata.
    if "id" not in item or "item" not in item:
        continue

    # We want there to be an image_url field, and we want it to be a list.
    image_urls = item.get("image_url")
    if not isinstance(image_urls, list):
        continue

    # We want image_url to have more than zero entries with "640" in it.
    urls_640 = [url for url in image_urls if "640" in url]
    if not urls_640:
        continue

    # Get who took the photo as best as we can.
    creators = [creator["title"] for creator in item["item"].get("creators") or []]

    out_data.append({
        "id": item["id"],
        "image_url": urls_640[0],
        # Missing date/title become empty cells rather than aborting the export.
        "date": item.get("date", ""),
        "title": item["item"].get("title", ""),
        "creators": "; ".join(creators)
    })

# Write the result to CSV.
# newline="" is what the csv module asks for, so that it controls line endings
# itself and embedded newlines in quoted fields are preserved.
with open("./../data/index-full.csv", "w", newline="") as out_file:

    # The keys--the headers--of the CSV file. Listed explicitly (in the same
    # order as the row dicts above) so an empty result still gets a header
    # instead of failing on out_data[0].
    keys = ["id", "image_url", "date", "title", "creators"]

    # Make a DictWriter.
    dict_writer = DictWriter(out_file, fieldnames=keys)

    # Write the header.
    dict_writer.writeheader()

    # Let's go!
    dict_writer.writerows(out_data)

100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 477097.13it/s]


In [57]:
# keep_default_na=False keeps empty cells (e.g. photos with no listed creators)
# as empty strings. With the default, pandas turns them into NaN, which the
# DictWriter below would then write out as the literal text "nan".
df = pd.read_csv("./../data/index-full.csv", keep_default_na=False)
print(f"We now have a dataframe of {len(df)} images.")

out_data = []

# How many images to download (we're just making a toy dataset, here!)
IMAGES_TO_DOWNLOAD = 2000

# Iterate over the first 2,000 images.
for row in tqdm(df.to_dict('records')[0:IMAGES_TO_DOWNLOAD]):

    # Get the image out path. The slug is the second-to-last "/"-separated
    # piece of the id.
    slug = row["id"].split('/')[-2]
    image_out_path = f"./../data/image-{slug}.jpg"

    # NOTE(review): the recorded filename has no "image-" prefix, while the file
    # on disk is saved as image-{slug}.jpg. Left unchanged in case downstream
    # code relies on it -- confirm which name consumers expect.
    row["image_filename"] = f"{slug}.jpg"

    # The row is recorded in the index whether or not its download succeeds.
    out_data.append(row)

    # If we've already scraped this path, skip (no request, so no need to sleep).
    if os.path.exists(image_out_path):
        continue

    # Actually make the request. The timeout keeps one stalled download from
    # hanging the loop; a network error is reported and we move on.
    try:
        response = requests.get(row["image_url"], timeout=30)
    except requests.RequestException as error:
        print(f"Failed on row with id: {row['id']} ({error}).")
        sleep(1)
        continue

    # Check that we're 200.
    if response.status_code == 200:

        # Save the image.
        with open(image_out_path, 'wb') as out_file:
            out_file.write(response.content)

    else:
        # Report the row's id (the bare name `id` is Python's builtin function).
        print(f"Failed on row with id: {row['id']}.")

    # Let's be really nice to the Library of Congress's API; it's a public good, after all.
    sleep(1)

# Write the result to CSV.
# newline="" lets the csv module manage line endings itself.
with open("./../data/index.csv", "w", newline="") as out_file:

    # Get the keys--the headers--of the CSV file: the input columns plus the
    # column added above. Derived from the dataframe so that an empty input
    # still produces a header instead of failing on out_data[0].
    keys = list(df.columns) + ["image_filename"]

    # Make a DictWriter.
    dict_writer = DictWriter(out_file, fieldnames=keys)

    # Write the header.
    dict_writer.writeheader()

    # Let's go!
    dict_writer.writerows(out_data)

We now have a dataframe of 9829 images.


100%|███████████████████████████████████████| 2000/2000 [19:57<00:00,  1.67it/s]
