Import Code

In [1]:
import csv
import json
import requests
import os
from os.path import join
import glob


Collection List URL Build


In [2]:
url = "https://www.loc.gov/free-to-use"
page = "libraries"
parameters = {"fo" : "json"}

In [3]:
cl = requests.get(url + "/" + page, params=parameters)

In [4]:
cl.url

'https://www.loc.gov/free-to-use/libraries?fo=json'

Generating / Finding JSON Information

In [5]:
cl_json = cl.json()
cl_json.keys()

dict_keys(['breadcrumbs', 'content', 'content_is_post', 'description', 'disable_max_line_length', 'expert_resources', 'manifest', 'next', 'next_sibling', 'options', 'pages', 'portal', 'previous', 'previous_sibling', 'site_type', 'timestamp', 'title', 'type'])

In [6]:
print(cl_json["content"].keys())

dict_keys(['active', 'description', 'disable_navigation', 'image_url', 'link', 'markup', 'meta_description', 'pagination', 'partof', 'set', 'slug', 'status', 'title', 'url'])


In [7]:
print(cl_json["content"]["set"].keys())

dict_keys(['items', 'layout'])


In [8]:
print(cl_json["content"]["set"]["items"])

[{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-1.jpg', 'link': '/resource/cph.3f05183/', 'title': 'For greater knowledge, on more subjects, use your library more often. Illinois WPA Arts Project, 1936-1941. Prints & Photographs Division'}, {'image': '/static/portals/free-to-use/public-domain/libraries/libraries-2.jpg', 'link': '/resource/highsm.20336/', 'title': 'Noyes Library for Young Children. Kensington, Maryland. Photo by Carol M. Highsmith,  2011. Prints & Photographs Division'}, {'image': '/static/portals/free-to-use/public-domain/libraries/libraries-3.jpg', 'link': '/resource/fsa.8d24709/', 'title': 'Bethune-Cookman College. Students in the library reading room, Daytona Beach, Florida. Gordon Parks, 1943. Prints & Photographs Division'}, {'image': '/static/portals/free-to-use/public-domain/libraries/libraries-4.jpg', 'link': '/resource/highsm.36052/', 'title': 'Public library in Antonito,  Colorado, near the New Mexico border. Photo by Carol M. Highsm

In [9]:
cl_json["content"]["set"]["items"][0].keys()

dict_keys(['image', 'link', 'title'])

In [10]:
len(cl_json['content']['set']['items'])

62

Writing CSV File

In [11]:
cl_set_list = "../Jordan/cp/collection_set_list.csv"
headers = ['image', 'link', 'title']

with open(cl_set_list, mode='w') as file:
    writer = csv.writer(file)
    writer.writerow(headers)
    for item in cl_json['content']['set']['items']:
        writer.writerow([item['image'], item['link'], item['title']])

Harvesting Metadata

In [12]:
def regenerate_cl_csv(collection_set_list_csv):
    cl_items = list()
    with open(collection_set_list_csv, mode='r') as file:
        content = csv.DictReader(file)
        for row in content:
            row_dict = dict()
            for field in content.fieldnames:
                row_dict[field] = row[field]
            cl_items.append(row_dict)
        return cl_items


In [13]:
collection_set_list_csv = "../Jordan/cp/collection_set_list.csv"
cl_set_list = regenerate_cl_csv(collection_set_list_csv)

In [14]:
cl_set_list[0]

{'image': '/static/portals/free-to-use/public-domain/libraries/libraries-1.jpg',
 'link': '/resource/cph.3f05183/',
 'title': 'For greater knowledge, on more subjects, use your library more often. Illinois WPA Arts Project, 1936-1941. Prints & Photographs Division'}

Harvest Individual Item Metadata

In [15]:
baseURL= "https://www.loc.gov"
parameters = {"fo" : "json"}

In [16]:
# run this cell to confirm that you have a location for the JSON files
item_metadata_directory = os.path.join('..','Jordan','cp','item-metadata')

if os.path.isdir(item_metadata_directory):
    print(item_metadata_directory,'exists')
else:
    os.mkdir(item_metadata_directory)
    print('created',item_metadata_directory)

../Jordan/cp/item-metadata exists


In [17]:
len(cl_set_list)

62

In [18]:
item_count = 0
error_count = 0
file_count = 0

parent_directory = 'Jordan'
data_directory = 'cp'
item_metadata_directory = 'item-metadata'
item_metadata_file_prefix = 'item-metadata'
json_suffix = '.json'

for item in cl_set_list:
    if item['link'] == 'link':
        continue
    if '?' in item ['link']:
        item_id = item['link']
        short_id = item['link'].split('/')[2]
        item_metadata = requests.get(baseURL + item_id, params=parameters)
        print('requested', item_metadata.url, item_metadata.status_code)
        if item_metadata.status_code != 200:
            print('requested', item_metadata.url, item_metadata.status_code)
            error_count += 1
            continue
        try:
            item_metadata.json()
        except:
            error_count += 1
            print ('json not found')
            continue
        file_out = os.path.join('..', parent_directory, data_directory, item_metadata_directory, str(item_metadata_file_prefix + '-' + short_id + json_suffix))
        with open(file_out, 'w', encoding='utf-8') as json_file:
            json_file.write(json.dumps(item_metadata.json()['item']))
            file_count += 1
            print('wrote', file_out)
        item_count += 1
    else:
        item_id = item['link']
        short_id = item['link'].split('/')[2]
        item_metadata = requests.get(baseURL + item_id, params=parameters)
        print('requested', item_metadata.url, item_metadata.status_code)
        if item_metadata.status_code != 200:
            print('requested', item_metadata.url, item_metadata.status_code)
            error_count += 1
            continue
        try:
            item_metadata.json()
        except:
            error_count += 1
            print ('json not found')
            continue
        file_out = os.path.join('..', parent_directory, data_directory, item_metadata_directory, str(item_metadata_file_prefix + '-' + short_id + json_suffix))
        with open(file_out, 'w', encoding='utf-8') as json_file:
            json_file.write(json.dumps(item_metadata.json()['item']))
            file_count += 1
            print('wrote', file_out)
        item_count += 1


print('--- mini LOG ---')
print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)



requested https://www.loc.gov/resource/cph.3f05183/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-cph.3f05183.json
requested https://www.loc.gov/resource/highsm.20336/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-highsm.20336.json
requested https://www.loc.gov/resource/fsa.8d24709/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-fsa.8d24709.json
requested https://www.loc.gov/resource/highsm.36052/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-highsm.36052.json
requested https://www.loc.gov/resource/highsm.51772/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-highsm.51772.json
requested https://www.loc.gov/resource/cph.3b43255/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-cph.3b43255.json
requested https://www.loc.gov/resource/highsm.20483/?fo=json 200
wrote ../Jordan/cp/item-metadata/item-metadata-highsm.20483.json
requested https://www.loc.gov/resource/highsm.29207/?fo=json 200
wrote ../Jordan/cp/item-metadat

Harvest Individual Item Images

In [19]:
key_dir = os.path.join('/','Users','jrhym','Desktop','si676-2024-data', 'data')
parent_directory = 'Jordan'
project_dir = 'cp'
image_dir = 'img-files'
metadata_dir = 'item-metadata'

img_loc = os.path.join(key_dir, parent_directory, project_dir, image_dir)
print('Checking for', img_loc)

if os.path.isdir(img_loc):
    print('Image directory exists')
else:
    os.mkdir(img_loc)
    print('Created directory for:', img_loc)


Checking for /Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files


FileNotFoundError: [Errno 2] No such file or directory: '/Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files'

In [None]:
metadata_hotspot = os.path.join('..', parent_directory, project_dir, metadata_dir)
print(metadata_hotspot)

metadata_file_list = glob.glob(metadata_hotspot + '/*.json')
print(metadata_file_list)


../Jordan/cp/item-metadata
['../Jordan/cp/item-metadata/item-metadata-cph.3f05168.json', '../Jordan/cp/item-metadata/item-metadata-highsm.36052.json', '../Jordan/cp/item-metadata/item-metadata-highsm.60215.json', '../Jordan/cp/item-metadata/item-metadata-cph.3b43255.json', '../Jordan/cp/item-metadata/item-metadata-mrg.00788.json', '../Jordan/cp/item-metadata/item-metadata-ds.06560.json', '../Jordan/cp/item-metadata/item-metadata-highsm.51772.json', '../Jordan/cp/item-metadata/item-metadata-highsm.49335.json', '../Jordan/cp/item-metadata/item-metadata-highsm.20216.json', '../Jordan/cp/item-metadata/item-metadata-ppmsca.15375.json', '../Jordan/cp/item-metadata/item-metadata-highsm.18402.json', '../Jordan/cp/item-metadata/item-metadata-mrg.00785.json', '../Jordan/cp/item-metadata/item-metadata-highsm.04362.json', '../Jordan/cp/item-metadata/item-metadata-det.4a23603.json', '../Jordan/cp/item-metadata/item-metadata-highsm.29207.json', '../Jordan/cp/item-metadata/item-metadata-hhh.ok0012.sh

In [None]:
item_img_url = list()
count = 0

for file in metadata_file_list:
    with open(file, 'r', encoding='utf-8') as json_file:
        metadata = json.load(json_file)
        img_url_id = len(metadata['image_url'])
        img_url = metadata['image_url'][-1]
        item_img_url.append(img_url)
        count += 1
print(f'Identified {str(count)} image URLs')

Identified 59 image URLs


In [None]:
item_img_url

['https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05168v.jpg#h=1024&w=708',
 'https://tile.loc.gov/image-services/iiif/service:pnp:highsm:36000:36052/full/pct:25/0/default.jpg#h=1448&w=2172',
 'https://tile.loc.gov/storage-services/service/pnp/highsm/60200/60215v.jpg#h=741&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/cph/3b40000/3b43000/3b43200/3b43255r.jpg#h=422&w=640',
 'https://tile.loc.gov/storage-services/service/pnp/mrg/00700/00788v.jpg#h=699&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/ds/06500/06560v.jpg#h=727&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/highsm/51700/51772v.jpg#h=570&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/highsm/49300/49335v.jpg#h=618&w=1024',
 'https://tile.loc.gov/image-services/iiif/service:pnp:highsm:20200:20216/full/pct:25/0/default.jpg#h=772&w=1551',
 'https://tile.loc.gov/storage-services/service/pnp/ppmsca/15300/15375v.jpg#h=1024&w=823',
 'https://tile.

In [None]:
cp_set_list_with_imgs = list()

for item in metadata_file_list:
    with open(item, 'r', encoding='utf-8') as item_data:
        item_metadata = json.load(item_data)

        item_metadata_dict = dict()
        item_metadata_dict['item_URI'] = item_metadata['id']
        try:
            item_metadata_dict['lccn'] = item_metadata['library_of_congress_control_number']
        except:
            item_metadata_dict['lccn'] = None
        item_metadata_dict['title'] = item_metadata['title']
        item_metadata_dict['image_URL_large'] = item_metadata['image_url'][-1]

        cp_set_list_with_imgs.append(item_metadata_dict)
print(cp_set_list_with_imgs[0])

{'item_URI': 'http://www.loc.gov/item/98508385/', 'lccn': '98508385', 'title': 'Curb service 10,000 current books - convenient, free, time saving : Chicago Public Library, Randolph St. corridor.', 'image_URL_large': 'https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05168v.jpg#h=1024&w=708'}


In [None]:
item_count = 0
error_count = 0
file_count = 0

img_file_prefix = 'img_'

for item in cp_set_list_with_imgs:
    img_url = item['image_URL_large']
    short_id = item['item_URI'].split('/')[-2]
    print('... fetching', img_url)
    item_count += 1

    r = requests.get(img_url)
    if r.status_code == 200:
        img_out = os.path.join(img_loc,str(img_file_prefix + short_id + '.jpg'))
        with open(img_out, 'wb') as img_file:
            img_file.write(r.content)
            print('Saved', img_out)
            file_count += 1

print('--- mini LOG ---')
print('files requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)


... fetching https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05168v.jpg#h=1024&w=708
Saved /Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files/img_98508385.jpg
... fetching https://tile.loc.gov/image-services/iiif/service:pnp:highsm:36000:36052/full/pct:25/0/default.jpg#h=1448&w=2172
Saved /Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files/img_2017686535.jpg
... fetching https://tile.loc.gov/storage-services/service/pnp/highsm/60200/60215v.jpg#h=741&w=1024
Saved /Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files/img_2020723718.jpg
... fetching https://tile.loc.gov/storage-services/service/pnp/cph/3b40000/3b43000/3b43200/3b43255r.jpg#h=422&w=640
Saved /Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files/img_89710983.jpg
... fetching https://tile.loc.gov/storage-services/service/pnp/mrg/00700/00788v.jpg#h=699&w=1024
Saved /Users/jrhym/Desktop/si676-2024-data/data/Jordan/cp/img-files/img_2017702902.jpg
... fetching h