# Scraping vs Using an API

This is a demo of two ways of getting Trade Card images from the Harvard Digital Collections: one by scraping, and the other by using a purpose-built API called LibraryCloud.

## Scraping HOLLIS Images (Selenium needed for JavaScript)

This is relatively involved: to get the image URL, we need to emulate a human browsing and run JavaScript locally in the browser to view the HOLLIS Images pages.

In [1]:
import json
import time

import requests
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [None]:
# URLs used by the scraper, plus the Chrome session that drives the browser.
# The chromedriver path is passed through a Service object: Selenium 4
# deprecated the `executable_path=` argument of webdriver.Chrome and later
# 4.x releases no longer accept it.
from selenium.webdriver.chrome.service import Service

#login_url = "http://pdharvard.egreenapple.com.ezp-prod1.hul.harvard.edu/index2.html"
login_url = "https://www.pin1.harvard.edu/cas/login?service=https%3A%2F%2Fhollis.harvard.edu%3A443%2Fprimo_library%2Flibweb%2FcasRedirect%3FauthenticationProfile%3DHarvard+Primo%26targetURL%3Dhttps%253A%252F%252Fhollis.harvard.edu%252Fprimo-explore%252Fsearch%253Fvid%253DHVD2%2526sortby%253Drank%2526lang%253Den_US%2526from-new-ui%253D1%2526authenticationProfile%253DHarvard%252BPrimo%26institution%3D01HVD"
# search results for "trade cards", starting at offset 0
base_url = 'https://images.hollis.harvard.edu/primo-explore/search?query=any,contains,trade%20cards&tab=default_tab&search_scope=default_scope&vid=HVD_IMAGES&lang=en_US&offset=0#searchResultList'
driver = webdriver.Chrome(service=Service(executable_path=r'../../../Scripts/chromedriver'))


### Run the next two cells only if working off-campus

In [None]:
# off-campus setup: load the login page in the Selenium-driven browser
# so that you can sign in by hand before continuing
driver.get(login_url)

### Log in before running next cell

In [None]:
# Build a requests session that carries the browser's cookies and headers.
s = requests.Session()

# Copy each Selenium cookie (name -> value) into the requests cookie jar
browser_cookies = {entry['name']: entry['value'] for entry in driver.get_cookies()}
s.cookies.update(browser_cookies)

# Show the new session cookies
# s.cookies.get_dict()

# Send the same User-Agent as the browser, with the login page as the Referer
user_agent = driver.execute_script('return navigator.userAgent;')
s.headers.update({'User-Agent': user_agent, 'Referer': login_url})

# Show the new headers
# s.headers


In [None]:
# driver.switch_to.window(original_window)
# Load the trade-card search results (base_url starts at offset=0)
driver.get(base_url)

In [None]:
# Walk the HOLLIS Images result pages. For each thumbnail: open its record
# dialog, store the detail rows in `metadata`, store the "Download IIIF"
# link(s) in `links`, and note any failure in `problems`.

# set starting point | e.g. if process is interrupted for some reason
starting_page = 1
starting_image = 0  # 0-based count
last_page = 29  # last results page to visit

links = []
problems = []
metadata = []
page_no = starting_page
original_window = driver.current_window_handle
driver.implicitly_wait(10)
wait = WebDriverWait(driver, 30)


def _record_problem(label, page, image_idx, error):
    """Append a description of *error* to `problems` and print a short notice.

    Selenium exceptions carry `msg` and `stacktrace`; other exceptions
    (IndexError, ValueError, ...) do not, so fall back to `str(error)`
    rather than failing inside the error handler itself.
    """
    message = getattr(error, 'msg', None) or str(error)
    stacktrace = getattr(error, 'stacktrace', None)
    problems.append(f'{label}: Page {page}, image {image_idx + 1}: {message}\n{stacktrace}')
    print(f'Error: Page {page}, image {image_idx + 1}: {message[0:50]}')


def _dismiss_dialog():
    """Return to the top-level document, click the result container and press Escape."""
    driver.switch_to.default_content()
    dialogue = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'prm-brief-result-container')))
    dialogue.click()
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()


# need to go to starting page if it's not the first
if starting_page > 1:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, f'button[aria-label="Go to page {starting_page}"]')))
    goto_page = driver.find_elements(By.CSS_SELECTOR, f'button[aria-label="Go to page {starting_page}"]')
    goto_page[0].click()
    time.sleep(10)

while page_no <= last_page:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.thumbnail')))
    image_elements = driver.find_elements(By.CSS_SELECTOR, 'a.thumbnail')
    no_thumbs = len(image_elements)

    for idx in range(starting_image, no_thumbs):
        try:
            popup = False

            # Refresh the thumbnail references before every click so that
            # references invalidated by the previous dialog are not reused.
            image_elements = driver.find_elements(By.CSS_SELECTOR, 'a.thumbnail')

            try:
                wait.until(EC.element_to_be_clickable(image_elements[idx]))
                image_elements[idx].click()

            except ElementClickInterceptedException:
                # something is covering the thumbnail: dismiss it and retry
                _dismiss_dialog()
                image_elements[idx].click()

            except Exception:
                # last resort: reload the results and click again
                # NOTE(review): base_url has offset=0, so on later pages this
                # presumably lands back on the first page - confirm
                driver.get(base_url)
                time.sleep(10)
                image_elements = driver.find_elements(By.CSS_SELECTOR, 'a.thumbnail')
                image_elements[idx].click()

            # a '.nophoto' element inside the thumbnail marks a record without an image
            try:
                test = image_elements[idx].find_elements(By.CSS_SELECTOR, '.nophoto')
                has_photo = not test
            except Exception:
                has_photo = True

            wait.until(EC.presence_of_element_located((By.TAG_NAME, 'md-dialog-content')))

            # collect label -> value(s) from the detail rows of the dialog
            detail_rows = driver.find_elements(By.CSS_SELECTOR, '#details div.spaced-rows div[layout="row"]')
            data = {}
            for row in detail_rows:
                title = row.find_elements(By.CSS_SELECTOR, 'span[data-details-label]')[0].text
                valueList = row.find_elements(By.CSS_SELECTOR, 'div[role="listitem"]')
                values = [v.text for v in valueList]
                if values:
                    # a single value is stored bare, several values as a list
                    data[title] = values if len(values) > 1 else values[0]

            metadata.append(data)

            if has_photo:
                link_list = []
                # the dialog shows either one embedded viewer iframe or a grid of thumbnails
                target = wait.until(EC.any_of(
                    EC.presence_of_element_located((By.ID, 'iframe')),
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'md-grid-list a.thumbnail'))
                ))

                if target.tag_name == 'iframe':
                    driver.switch_to.frame('iframe')
                    img_count = 1
                else:
                    popup = True
                    images = driver.find_elements(By.CSS_SELECTOR, 'md-grid-list a.thumbnail')

                    try:
                        wait.until(EC.element_to_be_clickable(images[0]))
                        images[0].click()
                    except Exception:
                        images[1].click()

                    # the viewer opens in a second window; switch to it
                    wait.until(EC.number_of_windows_to_be(2))

                    for window_handle in driver.window_handles:
                        if window_handle != original_window:
                            driver.switch_to.window(window_handle)
                            break

                    # NOTE(review): assumes the image total sits at characters
                    # 4-5 of the navigation label - confirm the label format
                    img_count = int(driver.find_element(By.CSS_SELECTOR, '.imageNav label.label').text[4:6])
                    next_button = driver.find_element(By.CSS_SELECTOR, '.imageNav button[aria-label*="Next"]')
                    wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, 'iframe')))

                count = 0
                while count < img_count:
                    count = count + 1
                    try:
                        try:
                            button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.mirador-icon-download a')))
                            button.click()
                            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li[title*="Download IIIF"] a')))
                        except Exception:
                            # the click failed; the link may be present anyway
                            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li[title*="Download IIIF"] a')))

                    except Exception as e:
                        _record_problem('Error A', page_no, idx, e)

                    else:
                        manifest = driver.find_elements(By.CSS_SELECTOR, 'li[title*="Download IIIF"] a')
                        link_list.append(manifest[0].get_attribute('href'))

                    if img_count > 1:
                        # the Next button lives outside the viewer iframe
                        driver.switch_to.default_content()
                        next_button.click()
                        time.sleep(5)
                        wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, 'iframe')))

                if popup:
                    driver.close()
                    driver.switch_to.window(original_window)

                _dismiss_dialog()

                if link_list:
                    links.append(link_list)

            else:
                _dismiss_dialog()

        except Exception as e:
            # close any extra window, return to the results and log the failure
            if len(driver.window_handles) > 1:
                for window_handle in driver.window_handles:
                    if window_handle != original_window:
                        driver.switch_to.window(window_handle)
                        driver.close()

                driver.switch_to.window(original_window)

            driver.switch_to.default_content()
            webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
            _record_problem('Error B', page_no, idx, e)

    print(f'Page {page_no} completed.')

    # only move on while there is another page left to visit
    if page_no < last_page:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[aria-label="Next page"]')))
        next_page = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label="Next page"]')
        next_page[0].click()
        time.sleep(5)

    starting_image = 0  # reset starting image for pages going forwards
    page_no = page_no + 1


In [None]:
# Write each collected list to its own file as JSON text.
outputs = (
    ('links.json', links),
    ('metadata.json', metadata),
    ('problems.txt', problems),
)
for target_name, payload in outputs:
    with open(target_name, 'w') as handle:
        json.dump(payload, handle)


In [None]:
# Show every recorded problem, one after another.
if problems:
    print('\n'.join(problems))

In [None]:
# Download a full-size JPEG for every collected link through the requests
# session `s`, saving it as images/<record index>-<image index>-img.jpg.
from pathlib import Path

image_dir = Path('images')
image_dir.mkdir(exist_ok=True)  # open() below fails if the folder is missing

for img, lset in enumerate(links):
    for idx, link in enumerate(lset):
        if not link:
            # the href attribute was missing when the link was collected
            print(f'Skipped image {img}-{idx}: no link recorded')
            continue

        # the image id is whatever follows the last ':' in the link
        img_id = link[link.rfind(':') + 1:]
        response = s.get(f'https://ids.lib.harvard.edu/ids/iiif/{img_id}/full/full/0/default.jpg')
        if not response.ok:
            # do not save an error page under a .jpg name
            print(f'Skipped image {img}-{idx}: HTTP {response.status_code}')
            continue

        with open(image_dir / f'{img}-{idx}-img.jpg', 'wb') as outfile:
            outfile.write(response.content)


## Using LibraryCloud API

All of the above can be reduced to the next two code blocks, which are broken down and explained below. We use Python's built-in `xml` library to parse the response from the API (the API can also return JSON, which could be parsed instead).

In [2]:
import requests
import time
from xml.etree import ElementTree as ET

# Prefix -> namespace URI map passed to ElementTree's find()/findall()
# so that the prefixed paths used on the API's XML can be resolved.
namespaces = dict(
    mods="http://www.loc.gov/mods/v3",
    default="http://api.lib.harvard.edu/v2/item",
)

In [5]:
# Page through the LibraryCloud results for genre "trade cards" and save
# every "Full Image" URL found on each page as a .jpg file.
import os
from urllib.parse import quote

limit = '1'  # the number of hits per page to our search, can be up to 250
next_cursor = "*"  # needs to be * to start
BASE_URL = f"https://api.lib.harvard.edu/v2/items?genre=trade+cards&limit={limit}&cursor="

image_dir = '../images'
os.makedirs(image_dir, exist_ok=True)  # open() below fails if the folder is missing

# keep getting pages while there are more
while next_cursor:
    # the cursor is an opaque token; percent-encode it so characters such
    # as '+' or '/' reach the server unchanged ('*' is left as is)
    response = requests.get(f'{BASE_URL}{quote(next_cursor, safe="*")}')
    response.raise_for_status()  # stop with a clear error instead of parsing an error page
    api_call = response.text
    root = ET.fromstring(api_call)
    urls = root.findall('.//mods:url[@displayLabel="Full Image"]', namespaces)

    for url in urls:
        time.sleep(1)  # so we don't download too fast
        img_response = requests.get(url.text)
        if not img_response.ok:
            # do not save an error page under a .jpg name
            print(f'Skipped {url.text}: HTTP {img_response.status_code}')
            continue

        img_data = img_response.content
        # here we can properly add the .jpg extension for each file as we save
        file_name = url.text.rsplit("/", 1)[-1]
        with open(f'{image_dir}/{file_name}.jpg', 'wb') as f:
            f.write(img_data)

    # can also "hard code" it with: root[0][1].text
    # When the response has no nextCursor element, or the element is empty,
    # next_cursor becomes None and the while loop ends.
    cursor_element = root.find('.//default:nextCursor', namespaces)
    next_cursor = cursor_element.text if cursor_element is not None else None

KeyboardInterrupt: 

### API step-by-step (for one sample results page only)

We can follow the instructions on the [Harvard Library wiki](https://wiki.harvard.edu/confluence/display/LibraryStaffDoc/LibraryCloud+APIs) to form our requests to the LibraryCloud Items API. Since we are not looking at multiple pages, we don't need the cursor argument.

In [None]:
# Request one page of trade-card records and keep the response body as text.
limit = '10'
BASE_URL = f"https://api.lib.harvard.edu/v2/items?genre=trade+cards&limit={limit}"

sample_response = requests.get(BASE_URL)
api_call = sample_response.text

Parse the text from the requests call into an ElementTree, so we can search it to find the URLs we want

In [None]:
# Parse the XML text returned by the API; `root` is the top-level element
root = ET.fromstring(api_call)

We are using an XPath search to find all the URLs whose displayLabel attribute equals "Full Image" (this is how full-sized image URLs are coded in the MODS XML we get back from LibraryCloud).

In [None]:
# Every mods:url element, at any depth, whose displayLabel is "Full Image";
# `namespaces` resolves the "mods" prefix
urls = root.findall('.//mods:url[@displayLabel="Full Image"]', namespaces)

We can then use the collected URLs to save each as a jpg file in the images folder.

In [None]:
# Save every collected "Full Image" URL as a .jpg file in the images folder.
import os

os.makedirs('images', exist_ok=True)  # open() below fails if the folder is missing

for url in urls:
    time.sleep(1)  # so we don't download too fast
    img_response = requests.get(url.text)
    if not img_response.ok:
        # do not save an error page under a .jpg name
        print(f'Skipped {url.text}: HTTP {img_response.status_code}')
        continue

    img_data = img_response.content
    # here we can properly add the .jpg extension for each file as we save
    file_name = url.text.rsplit("/", 1)[-1]
    with open(f'images/{file_name}.jpg', 'wb') as f:
        f.write(img_data)

If we'd rather just collect the links themselves into a list, we could do that and then download them using something like wget. Here is how you would do that.

In [None]:
# Collect the text of every URL element and write one link per line.
links = [element.text for element in urls]

# each link is followed by a newline, including the last one
out_file = "".join(address + '\n' for address in links)

with open("links.txt", "w") as handle:
    handle.write(out_file)

We can now use wget from the command line (! in Google Colab) to download all of the links at once.

If you want the files to be named after the NRS links, like urn-3:HBS.Baker.GEN:10733692, then don't use the --trust-server-names flag. If you want the files to be named after the digital repository image numbers, then use the flag.

In [None]:
# Shell command (IPython "!" syntax): download every URL listed in links.txt
!wget -i links.txt # --trust-server-names

--2024-03-06 17:28:15--  https://nrs.harvard.edu/urn-3:HBS.Baker.GEN:10733688
Resolving nrs.harvard.edu (nrs.harvard.edu)... 23.23.150.153, 54.80.243.171, 44.212.254.238
Connecting to nrs.harvard.edu (nrs.harvard.edu)|23.23.150.153|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://ids.lib.harvard.edu/ids/view/46222229 [following]
--2024-03-06 17:28:16--  https://ids.lib.harvard.edu/ids/view/46222229
Resolving ids.lib.harvard.edu (ids.lib.harvard.edu)... 44.205.94.22, 52.72.136.172, 44.216.212.189
Connecting to ids.lib.harvard.edu (ids.lib.harvard.edu)|44.205.94.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [image/jpeg]
Saving to: ‘urn-3:HBS.Baker.GEN:10733688’

urn-3:HBS.Baker.GEN     [   <=>              ] 648.38K   713KB/s    in 0.9s    

2024-03-06 17:28:18 (713 KB/s) - ‘urn-3:HBS.Baker.GEN:10733688’ saved [663942]

--2024-03-06 17:28:18--  https://nrs.harvard.edu/urn-3:HBS.Baker.GEN:10733691
Connect