In [41]:
import csv
import json
import requests
import os

In [42]:
# Base URL of the Library of Congress "free to use" portal.
endpoint = 'https://www.loc.gov/free-to-use'

# Query-string options sent with the request; they show up as '?fo=json' in the URL.
parameters = dict(fo='json')

In [43]:
# Name of the free-to-use set to fetch; it is appended to the endpoint as a path segment.
collection = 'libraries'

In [44]:
# Fetch the set page; `parameters` is passed as the query string.
collection_list_response = requests.get(
    f'{endpoint}/{collection}',
    params=parameters,
)

In [45]:
# Show the URL that was actually requested (path plus encoded query string).
collection_list_response.url

'https://www.loc.gov/free-to-use/libraries?fo=json'

In [46]:
# Parse the response body once; later cells reuse this dict.
collection_json = collection_list_response.json()

# Build the output path from the home directory (as the later cells do)
# rather than hard-coding one user's absolute path.
local_json_collection = os.path.join(
    os.path.expanduser('~/si676-2024-data'),
    'collection_project',
    'collection_list_response.json',
)

# Make sure the project folder exists so open() cannot fail on a fresh machine.
os.makedirs(os.path.dirname(local_json_collection), exist_ok=True)

# Keep a local copy of the raw response; ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes.
with open(local_json_collection, 'w', encoding='utf-8', newline='') as f:
    json.dump(collection_json, f, ensure_ascii=False, indent=4)
print('JSON data saved')

JSON data saved


In [47]:
# List the top-level keys of the parsed response.
collection_json.keys()

dict_keys(['breadcrumbs', 'content', 'content_is_post', 'description', 'disable_max_line_length', 'expert_resources', 'manifest', 'next', 'next_sibling', 'options', 'pages', 'portal', 'previous', 'previous_sibling', 'site_type', 'timestamp', 'title', 'type'])

In [48]:
# Print each entry of the set on its own line so its fields can be inspected.
for set_item in collection_json['content']['set']['items']:
    print(set_item)

{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-1.jpg', 'link': '/resource/cph.3f05183/', 'title': 'For greater knowledge, on more subjects, use your library more often. Illinois WPA Arts Project, 1936-1941. Prints & Photographs Division'}
{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-2.jpg', 'link': '/resource/highsm.20336/', 'title': 'Noyes Library for Young Children. Kensington, Maryland. Photo by Carol M. Highsmith,  2011. Prints & Photographs Division'}
{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-3.jpg', 'link': '/resource/fsa.8d24709/', 'title': 'Bethune-Cookman College. Students in the library reading room, Daytona Beach, Florida. Gordon Parks, 1943. Prints & Photographs Division'}
{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-4.jpg', 'link': '/resource/highsm.36052/', 'title': 'Public library in Antonito,  Colorado, near the New Mexico border. Photo by Carol M. Highsmith,

In [49]:
# Count the items in the set.
len(collection_json['content']['set']['items'])

62

In [50]:
# Inspect the field names carried by the first item of the set.
collection_json['content']['set']['items'][0].keys()

dict_keys(['image', 'link', 'title'])

In [51]:
# Destination CSV, built from the home directory (as the later cells do)
# rather than hard-coding one user's absolute path.
collection_set_list = os.path.join(
    os.path.expanduser('~/si676-2024-data'),
    'collection_project',
    'ftu-libraries-set-list.csv',
)
headers = ['image', 'link', 'title']

# Make sure the project folder exists so open() cannot fail on a fresh machine.
os.makedirs(os.path.dirname(collection_set_list), exist_ok=True)

# newline='' leaves line-ending handling to the csv writer.
with open(collection_set_list, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    for item in collection_json['content']['set']['items']:
        # Drop trailing whitespace from the title (the item dict is updated in place).
        item['title'] = item['title'].rstrip()
        writer.writerow(item)

# Report only after the file has been closed and flushed.
print('wrote', collection_set_list)

wrote /Users/julia.cave.arbanas/si676-2024-data/collection_project/ftu-libraries-set-list.csv


Part 2: Getting Metadata and Content

In [52]:
import csv
import json
import requests

import os
from os.path import join

In [53]:
# Show the kernel's current working directory.
print(os.getcwd())

/Users/julia.cave.arbanas/si676-2024-data


In [54]:
def regenerate_collection_list(collection_csv):
    """
    Read a CSV file and return its rows as a list of dictionaries.

    Parameters:
    collection_csv (str): The path to the CSV file

    Returns:
    list: One dict per data row, in file order. Each dict maps every
    column header to that row's value (None where a row has fewer
    values than there are headers). Empty if the file has no data rows.
    """
    with open(collection_csv, 'r', newline='', encoding='utf-8') as f:
        data = csv.DictReader(f)
        # Copy only the named header columns, in header order, into plain dicts.
        return [
            {field: row[field] for field in data.fieldnames}
            for row in data
        ]

In [55]:
# Anchor the CSV path at the home directory so this cell does not depend on
# whichever working directory the kernel happened to start in.
collection_csv = os.path.join(
    os.path.expanduser('~/si676-2024-data'),
    'collection_project',
    'ftu-libraries-set-list.csv',
)

# Load the rows back in; `collection_set_list` holds a list of row dicts from here on.
collection_set_list = regenerate_collection_list(collection_csv)

In [56]:
# Spot-check the first row that was read back from the CSV.
collection_set_list[0]

{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-1.jpg',
 'link': '/resource/cph.3f05183/',
 'title': 'For greater knowledge, on more subjects, use your library more often. Illinois WPA Arts Project, 1936-1941. Prints & Photographs Division'}

In [57]:
# Site root; item links such as '/resource/<id>/' are appended to it.
baseURL = 'https://www.loc.gov'

# Query-string options for the requests that follow.
parameters = dict(fo='json')

In [58]:
import os
from os.path import join

# Folder that will hold one metadata JSON file per item.
base_directory = os.path.expanduser('~/si676-2024-data')
item_metadata_directory = join(base_directory, 'collection_project', 'item_metadata')

if os.path.isdir(item_metadata_directory):
    print(item_metadata_directory,'exists')
else:
    # makedirs (unlike mkdir) also creates a missing 'collection_project' parent.
    os.makedirs(item_metadata_directory)
    print('created',item_metadata_directory)

/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata exists


In [59]:
# Counters for the summary printed at the end of this cell.
item_count = 0    # requests sent
error_count = 0   # requests that did not yield a usable item record
file_count = 0    # metadata files written

# Path components for the output files.
base_directory = os.path.expanduser('~/si676-2024-data')
data_directory = 'collection_project'
item_metadata_directory = 'item_metadata'
item_metadata_file_prefix = 'item_metadata'
json_suffix = '.json'

# Absolute output folder, kept under its own name so the relative folder
# name above is not overwritten part-way through the cell.
item_metadata_path = os.path.join(base_directory, data_directory, item_metadata_directory)

if not os.path.exists(item_metadata_path):
    os.makedirs(item_metadata_path)
    print('Created directory:', item_metadata_path)

for item in collection_set_list:
    # Skip a header row if one was read in as data.
    if item['link'] == 'link':
        continue

    # Links with and without a '?' are handled the same way: requests merges
    # the 'fo' parameter into any query string already present.
    resource_ID = item['link']
    # For a link like '/resource/cph.3f05183/' the third '/'-separated piece is the ID.
    short_ID = resource_ID.split('/')[2]

    item_metadata = requests.get(baseURL + resource_ID, params={'fo':'json'})
    item_count += 1
    print('requested',item_metadata.url,item_metadata.status_code)
    if item_metadata.status_code != 200:
        error_count += 1
        continue

    # Parse the body once. A body that is not JSON raises a ValueError subclass;
    # a JSON body without an 'item' record raises KeyError/TypeError.
    try:
        item_record = item_metadata.json()['item']
    except ValueError:
        error_count += 1
        print('no json found')
        continue
    except (KeyError, TypeError):
        error_count += 1
        print('no item record found')
        continue

    fout = os.path.join(item_metadata_path, item_metadata_file_prefix + '-' + short_ID + json_suffix)
    with open(fout, 'w', encoding='utf-8') as json_file:
        json_file.write(json.dumps(item_record))
    file_count += 1
    print('wrote', fout)

print('--- mini LOG ---')
print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/cph.3f05183/?fo=json 200
wrote /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-cph.3f05183.json
requested https://www.loc.gov/resource/highsm.20336/?fo=json 200
wrote /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-highsm.20336.json
requested https://www.loc.gov/resource/fsa.8d24709/?fo=json 200
wrote /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-fsa.8d24709.json
requested https://www.loc.gov/resource/highsm.36052/?fo=json 200
wrote /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-highsm.36052.json
requested https://www.loc.gov/resource/highsm.51772/?fo=json 200
wrote /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-highsm.51772.json
requested https://www.loc.gov/resource/cph.3b43255/?fo=json 200
wrote /Users/julia.cave.arbanas/si676-2024-data/co

In [61]:
# Path components shared by the cells that follow.
base_directory = os.path.expanduser('~/si676-2024-data')
project_dir = 'collection_project'
files_dir = 'item_files'
metadata_dir = 'item_metadata'

# Folder that will receive the downloaded image files.
files_loc = os.path.join(base_directory,project_dir,files_dir)
print('Checking for',files_loc)

# check directory
if os.path.isdir(files_loc):
    print('Files directory exists')
else:
    # makedirs (unlike mkdir) also creates a missing project folder.
    os.makedirs(files_loc)
    print('Created file directory:',files_loc)

Checking for /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_files
Files directory exists


In [62]:
import glob

In [63]:
base_directory = os.path.expanduser('~/si676-2024-data')

# Folder that holds the per-item metadata JSON files.
search_for_metadata_here = os.path.join(base_directory, project_dir, metadata_dir)
print(search_for_metadata_here)

# Collect every .json file in that folder, in whatever order glob returns them.
metadata_file_list = glob.glob(f'{search_for_metadata_here}/*.json')
print(metadata_file_list)


/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata
['/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-cph.3c18157.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-ppbd.00600.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-mrg.00785.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-cph.3f05183.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-g3851e.ct006252.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-highsm.43863.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-ppmsca.18016.json', '/Users/julia.cave.arbanas/si676-2024-data/collection_project/item_metadata/item_metadata-highsm.20497.json', '/Users/julia.cave.arbanas/si676-2024-data/colle

In [64]:
# One image URL per metadata file that has any.
item_image_urls = list()
count = 0

for item in metadata_file_list:
    # Read the file and close it before working with its contents.
    with open(item, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # noted this resource for working out index out of range errors: https://rollbar.com/blog/how-to-fix-python-list-index-out-of-range-error-in-for-loops/
    # A missing or empty 'image_url' list would make [-1] raise, so skip such
    # files and say which one was skipped.
    image_url_no = len(metadata.get('image_url') or [])
    if image_url_no == 0:
        print('no image URL in', item)
        continue

    # Take the last URL in the list -- presumably the largest rendition
    # (a later cell stores this same entry as 'image_URL_large'); TODO confirm.
    image_url = metadata['image_url'][-1]
    item_image_urls.append(image_url)
    count += 1

print(f'Identified { str(count) } image URLs')

Identified 59 image URLs


In [65]:
# Display the image URLs collected above.
item_image_urls

['https://tile.loc.gov/storage-services/service/pnp/cph/3c10000/3c18000/3c18100/3c18157v.jpg#h=824&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/ppbd/00600/00600v.jpg#h=1024&w=765',
 'https://tile.loc.gov/storage-services/service/pnp/mrg/00700/00785v.jpg#h=697&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05183v.jpg#h=1024&w=705',
 'https://tile.loc.gov/image-services/iiif/service:gmd:gmd385:g3851:g3851e:ct006252/full/pct:25/0/default.jpg#h=1205&w=1684',
 'https://tile.loc.gov/image-services/iiif/service:pnp:highsm:43800:43863/full/pct:25/0/default.jpg#h=2117&w=2822',
 'https://tile.loc.gov/storage-services/service/pnp/ppmsca/18000/18016v.jpg#h=755&w=1024',
 'https://tile.loc.gov/image-services/iiif/service:pnp:highsm:20400:20497/full/pct:50/0/default.jpg#h=2395&w=2053',
 'https://tile.loc.gov/storage-services/service/pnp/fsa/8b14000/8b14100/8b14169v.jpg#h=783&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/ppmsca

In [66]:
# One summary dict per metadata file: item URI, LCCN, title and image URL.
collection_set_list_with_images = list()

for item in metadata_file_list:
    # Read the file and close it before working with its contents.
    with open(item, 'r', encoding='utf-8') as item_info:
        item_metadata = json.load(item_info)

    # Without an image URL there is nothing to download later; skip the file
    # instead of letting [-1] raise on an empty list.
    if not item_metadata.get('image_url'):
        print('no image URL in', item)
        continue

    item_metadata_dict = dict()
    item_metadata_dict['item_URI'] = item_metadata['id']
    # Not every record has a control number; .get() gives None when it is absent.
    item_metadata_dict['lccn'] = item_metadata.get('library_of_congress_control_number')
    item_metadata_dict['title'] = item_metadata['title']
    # The last URL in the list is the one stored for download.
    item_metadata_dict['image_URL_large'] = item_metadata['image_url'][-1]

    collection_set_list_with_images.append(item_metadata_dict)

# Show the first record as a spot check (nothing to show if no files were found).
if collection_set_list_with_images:
    print(collection_set_list_with_images[0])

{'item_URI': 'http://www.loc.gov/item/97511671/', 'lccn': '97511671', 'title': 'Carnegie Library, Sheldon, Iowa', 'image_URL_large': 'https://tile.loc.gov/storage-services/service/pnp/cph/3c10000/3c18000/3c18100/3c18157v.jpg#h=824&w=1024'}


In [68]:
# Counters for the summary printed at the end of this cell.
item_count = 0    # image requests sent
error_count = 0   # requests that did not return HTTP 200
file_count = 0    # image files written

img_file_prefix = 'img_'

base_directory = os.path.expanduser('~/si676-2024-data')

for item in collection_set_list_with_images:
    image_URL = item['image_URL_large']
    # For a URI like 'http://www.loc.gov/item/97511671/' the piece before the
    # trailing '/' is the item ID, which is used in the file name.
    short_ID = item['item_URI'].split('/')[-2]
    print('... requesting',image_URL)
    item_count += 1

    r = requests.get(image_URL)
    if r.status_code != 200:
        # Record and report the failure so the log below reflects it.
        error_count += 1
        print('Failed',image_URL,r.status_code)
        continue

    img_out = os.path.join(base_directory,project_dir,files_dir,img_file_prefix + short_ID + '.jpg')
    # Binary mode: r.content is the raw bytes of the image.
    with open(img_out, 'wb') as file:
        file.write(r.content)
    print('Saved',img_out)
    file_count += 1


print('--- mini LOG ---')
print('files requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

... requesting https://tile.loc.gov/storage-services/service/pnp/cph/3c10000/3c18000/3c18100/3c18157v.jpg#h=824&w=1024
Saved /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_files/img_97511671.jpg
... requesting https://tile.loc.gov/storage-services/service/pnp/ppbd/00600/00600v.jpg#h=1024&w=765
Saved /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_files/img_2015647967.jpg
... requesting https://tile.loc.gov/storage-services/service/pnp/mrg/00700/00785v.jpg#h=697&w=1024
Saved /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_files/img_2017702899.jpg
... requesting https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05183v.jpg#h=1024&w=705
Saved /Users/julia.cave.arbanas/si676-2024-data/collection_project/item_files/img_98508155.jpg
... requesting https://tile.loc.gov/image-services/iiif/service:gmd:gmd385:g3851:g3851e:ct006252/full/pct:25/0/default.jpg#h=1205&w=1684
Saved /Users/julia.cave.arbanas/si676-2024