In [1]:
import csv
import json
import requests
import os
from os.path import join
import glob

In [34]:
url = "https://www.loc.gov/free-to-use"
page = "fish-and-fishing"
parameters = {"fo" : "json"}

In [35]:
cl = requests.get(url + "/" + page, params=parameters)

cl.url

'https://www.loc.gov/free-to-use/fish-and-fishing?fo=json'

In [36]:
cl_json = cl.json()
cl_json.keys()

dict_keys(['breadcrumbs', 'content', 'content_is_post', 'description', 'disable_max_line_length', 'expert_resources', 'manifest', 'next', 'next_sibling', 'options', 'pages', 'portal', 'previous', 'previous_sibling', 'site_type', 'timestamp', 'title', 'type'])

In [37]:
print(cl_json["content"].keys())

dict_keys(['active', 'description', 'disable_navigation', 'image_url', 'link', 'markup', 'meta_description', 'pagination', 'partof', 'set', 'slug', 'status', 'title', 'url'])


In [38]:
print(cl_json["content"]["set"].keys())

dict_keys(['items', 'layout'])


In [39]:
print(cl_json["content"]["set"]["items"])

[{'alt': 'Smiling  woman sitting on the ground with a fishing pole and her bare feet dangling over the water.', 'image': '/static/portals/free-to-use/public-domain/fish-and-fishing/fishing-1.jpg', 'link': '/resource/ppmsca.01947/', 'title': 'African American woman fishing at the Tidal Basin, Washington, D.C. Photo by Toni Frissell, 1957.'}, {'alt': 'Pairs of men in several canoes that face into the water rapids.', 'image': '/static/portals/free-to-use/public-domain/fish-and-fishing/fishing-2.jpg', 'link': '/resource/det.4a31696/', 'title': 'Native Americans fishing at the "Soo," St. Mary\'s Rapids, Sault Ste. Marie. Color print, 1901.\n'}, {'alt': 'Boy leaning over water fishing with a home-made fishing pole.', 'image': '/static/portals/free-to-use/public-domain/fish-and-fishing/fishing-3.jpg', 'link': '/resource/fsa.8e03362/', 'title': 'Boy fishing. Photo, 1936.'}, {'alt': 'Naked men dive off single-mast sail boats (several with American flags) and retrieve sponges from the sea floor.

In [40]:
cl_json["content"]["set"]["items"][0].keys()

dict_keys(['alt', 'image', 'link', 'title'])

In [41]:
len(cl_json['content']['set']['items'])

50

Write CSV File

In [45]:
cl_set_list = "../Jordan/cp2/collection2_set_list.csv"
headers = ['image', 'link', 'title']

with open(cl_set_list, mode='w') as file:
    writer = csv.writer(file)
    writer.writerow(headers)
    for item in cl_json['content']['set']['items']:
        writer.writerow([item['image'], item['link'], item['title']])

Harvesting Metadata

In [43]:
def regenerate_cl_csv(collection2_set_list_csv):
    cl_items = list()
    with open(collection2_set_list_csv, mode='r') as file:
        content = csv.DictReader(file)
        for row in content:
            row_dict = dict()
            for field in content.fieldnames:
                row_dict[field] = row[field]
            cl_items.append(row_dict)
        return cl_items

In [46]:
collection2_set_list_csv = "../Jordan/cp2/collection2_set_list.csv"
cl_set_list = regenerate_cl_csv(collection2_set_list_csv)

In [47]:
cl_set_list[0]

{'image': '/static/portals/free-to-use/public-domain/fish-and-fishing/fishing-1.jpg',
 'link': '/resource/ppmsca.01947/',
 'title': 'African American woman fishing at the Tidal Basin, Washington, D.C. Photo by Toni Frissell, 1957.'}

Harvest Individual Item Metadata

In [48]:
baseURL= "https://www.loc.gov"
parameters = {"fo" : "json"}

In [49]:
# run this cell to confirm that you have a location for the JSON files
item_metadata_directory = os.path.join('..','Jordan','cp2','item-metadata')

if os.path.isdir(item_metadata_directory):
    print(item_metadata_directory,'exists')
else:
    os.mkdir(item_metadata_directory)
    print('created',item_metadata_directory)

../Jordan/cp2/item-metadata exists


In [50]:
len(cl_set_list)

50

In [51]:
item_count = 0
error_count = 0
file_count = 0

parent_directory = 'Jordan'
data_directory = 'cp2'
item_metadata_directory = 'item-metadata'
item_metadata_file_prefix = 'item-metadata'
json_suffix = '.json'

for item in cl_set_list:
    if item['link'] == 'link':
        continue
    if '?' in item ['link']:
        item_id = item['link']
        short_id = item['link'].split('/')[2]
        item_metadata = requests.get(baseURL + item_id, params=parameters)
        print('requested', item_metadata.url, item_metadata.status_code)
        if item_metadata.status_code != 200:
            print('requested', item_metadata.url, item_metadata.status_code)
            error_count += 1
            continue
        try:
            item_metadata.json()
        except:
            error_count += 1
            print ('json not found')
            continue
        file_out = os.path.join('..', parent_directory, data_directory, item_metadata_directory, str(item_metadata_file_prefix + '-' + short_id + json_suffix))
        with open(file_out, 'w', encoding='utf-8') as json_file:
            json_file.write(json.dumps(item_metadata.json()['item']))
            file_count += 1
            print('wrote', file_out)
        item_count += 1
    else:
        item_id = item['link']
        short_id = item['link'].split('/')[2]
        item_metadata = requests.get(baseURL + item_id, params=parameters)
        print('requested', item_metadata.url, item_metadata.status_code)
        if item_metadata.status_code != 200:
            print('requested', item_metadata.url, item_metadata.status_code)
            error_count += 1
            continue
        try:
            item_metadata.json()
        except:
            error_count += 1
            print ('json not found')
            continue
        file_out = os.path.join('..', parent_directory, data_directory, item_metadata_directory, str(item_metadata_file_prefix + '-' + short_id + json_suffix))
        with open(file_out, 'w', encoding='utf-8') as json_file:
            json_file.write(json.dumps(item_metadata.json()['item']))
            file_count += 1
            print('wrote', file_out)
        item_count += 1


print('--- mini LOG ---')
print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/ppmsca.01947/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-ppmsca.01947.json
requested https://www.loc.gov/resource/det.4a31696/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-det.4a31696.json
requested https://www.loc.gov/resource/fsa.8e03362/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-fsa.8e03362.json
requested https://www.loc.gov/resource/pga.01210/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-pga.01210.json
requested https://www.loc.gov/resource/jpd.00505/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-jpd.00505.json
requested https://www.loc.gov/resource/mrg.07416/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-mrg.07416.json
requested https://www.loc.gov/resource/ppmsca.41730/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-metadata-ppmsca.41730.json
requested https://www.loc.gov/resource/hec.07254/?fo=json 200
wrote ../Jordan/cp2/item-metadata/item-meta

Harvest Individual Item Images

In [52]:
key_dir = os.path.join('/','Users','jrhym','Desktop')
parent_directory = 'Jordan'
project_dir = 'cp2'
image_dir = 'img-files'
metadata_dir = 'item-metadata'

img_loc = os.path.join(key_dir, parent_directory, project_dir, image_dir)
print('Checking for', img_loc)

if os.path.isdir(img_loc):
    print('Image directory exists')
else:
    os.mkdir(img_loc)
    print('Created directory for:', img_loc)

Checking for /Users/jrhym/Desktop/Jordan/cp2/img-files
Image directory exists


In [53]:
metadata_hotspot = os.path.join('..', parent_directory, project_dir, metadata_dir)
print(metadata_hotspot)

metadata_file_list = glob.glob(metadata_hotspot + '/*.json')
print(metadata_file_list)

../Jordan/cp2/item-metadata
['../Jordan/cp2/item-metadata/item-metadata-highsm.16470.json', '../Jordan/cp2/item-metadata/item-metadata-highsm.46917.json', '../Jordan/cp2/item-metadata/item-metadata-afc1991022.afc1991022_hh_002.json', '../Jordan/cp2/item-metadata/item-metadata-ggbain.10041.json', '../Jordan/cp2/item-metadata/item-metadata-thc.5a36385.json', '../Jordan/cp2/item-metadata/item-metadata-nclc.00972.json', '../Jordan/cp2/item-metadata/item-metadata-hec.40272.json', '../Jordan/cp2/item-metadata/item-metadata-det.4a31696.json', '../Jordan/cp2/item-metadata/item-metadata-cph.3b03723.json', '../Jordan/cp2/item-metadata/item-metadata-ppmsca.58753.json', '../Jordan/cp2/item-metadata/item-metadata-hec.07254.json', '../Jordan/cp2/item-metadata/item-metadata-var.0092.json', '../Jordan/cp2/item-metadata/item-metadata-cph.3b03958.json', '../Jordan/cp2/item-metadata/item-metadata-gdclccn.ca06001657v20n1.json', '../Jordan/cp2/item-metadata/item-metadata-ppmsca.01947.json', '../Jordan/cp2/

In [54]:
item_img_url = list()
count = 0

for file in metadata_file_list:
    with open(file, 'r', encoding='utf-8') as json_file:
        metadata = json.load(json_file)
        img_url_id = len(metadata['image_url'])
        img_url = metadata['image_url'][-1]
        item_img_url.append(img_url)
        count += 1
print(f'Identified {str(count)} image URLs')

Identified 50 image URLs


In [55]:
item_img_url

['https://tile.loc.gov/image-services/iiif/service:pnp:highsm:16400:16470/full/pct:25/0/default.jpg#h=1207&w=1537',
 'https://tile.loc.gov/image-services/iiif/service:pnp:highsm:46900:46917/full/pct:25/0/default.jpg#h=2172&w=1448',
 'https://tile.loc.gov/image-services/iiif/service:afc:afc1991022:afc1991022_hh_002:afc1991022_hh_002_02/full/pct:100/0/default.jpg#h=5047&w=3481',
 'https://tile.loc.gov/storage-services/service/pnp/ggbain/10000/10041v.jpg#h=738&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/thc/5a36000/5a36300/5a36385r.jpg#h=420&w=537',
 'https://tile.loc.gov/storage-services/service/pnp/nclc/00900/00972v.jpg#h=606&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/hec/40200/40272v.jpg#h=793&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/det/4a30000/4a31000/4a31600/4a31696v.jpg#h=530&w=1024',
 'https://tile.loc.gov/storage-services/service/pnp/ds/15100/15146v.jpg#h=1024&w=649',
 'https://tile.loc.gov/storage-services/service/pnp/ppmsca

In [56]:
cp2_set_list_with_imgs = list()

for item in metadata_file_list:
    with open(item, 'r', encoding='utf-8') as item_data:
        item_metadata = json.load(item_data)

        item_metadata_dict = dict()
        item_metadata_dict['item_URI'] = item_metadata['id']
        try:
            item_metadata_dict['lccn'] = item_metadata['library_of_congress_control_number']
        except:
            item_metadata_dict['lccn'] = None
        item_metadata_dict['title'] = item_metadata['title']
        item_metadata_dict['image_URL_large'] = item_metadata['image_url'][-1]

        cp2_set_list_with_imgs.append(item_metadata_dict)
print(cp2_set_list_with_imgs[0])

{'item_URI': 'http://www.loc.gov/item/2011634663/', 'lccn': '2011634663', 'title': '"Fish thrower" at the Pike Place Fish Company, Seattle, Washington', 'image_URL_large': 'https://tile.loc.gov/image-services/iiif/service:pnp:highsm:16400:16470/full/pct:25/0/default.jpg#h=1207&w=1537'}


In [57]:
item_count = 0
error_count = 0
file_count = 0

img_file_prefix = 'img_'

for item in cp2_set_list_with_imgs:
    img_url = item['image_URL_large']
    short_id = item['item_URI'].split('/')[-2]
    print('... fetching', img_url)
    item_count += 1

    r = requests.get(img_url)
    if r.status_code == 200:
        img_out = os.path.join(img_loc,str(img_file_prefix + short_id + '.jpg'))
        with open(img_out, 'wb') as img_file:
            img_file.write(r.content)
            print('Saved', img_out)
            file_count += 1

print('--- mini LOG ---')
print('files requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)


... fetching https://tile.loc.gov/image-services/iiif/service:pnp:highsm:16400:16470/full/pct:25/0/default.jpg#h=1207&w=1537
Saved /Users/jrhym/Desktop/Jordan/cp2/img-files/img_2011634663.jpg
... fetching https://tile.loc.gov/image-services/iiif/service:pnp:highsm:46900:46917/full/pct:25/0/default.jpg#h=2172&w=1448
Saved /Users/jrhym/Desktop/Jordan/cp2/img-files/img_2017883639.jpg
... fetching https://tile.loc.gov/image-services/iiif/service:afc:afc1991022:afc1991022_hh_002:afc1991022_hh_002_02/full/pct:100/0/default.jpg#h=5047&w=3481
Saved /Users/jrhym/Desktop/Jordan/cp2/img-files/img_afc1991022_hh_002.jpg
... fetching https://tile.loc.gov/storage-services/service/pnp/ggbain/10000/10041v.jpg#h=738&w=1024
Saved /Users/jrhym/Desktop/Jordan/cp2/img-files/img_2014690025.jpg
... fetching https://tile.loc.gov/storage-services/service/pnp/thc/5a36000/5a36300/5a36385r.jpg#h=420&w=537
Saved /Users/jrhym/Desktop/Jordan/cp2/img-files/img_2019677606.jpg
... fetching https://tile.loc.gov/storage-s