# Trialing the OCLC WorldCat API

Notes on this process are kept in my OneNote for now.

In [None]:
import sys
if "../" not in sys.path:
    sys.path.append("../")
import os
import io
import json
import sys
import collections
from urllib.parse import quote
from dotenv import load_dotenv
from pymarc import marcxml, Record, Field, MARC8ToUnicode
from lxml import etree as ET

import requests
from bs4 import BeautifulSoup
# import xml.etree.ElementTree as ET

In [None]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# API credentials read from the environment; a missing variable raises KeyError here.
client_id = os.environ["CLIENT_ID"]
client_secret = os.environ["CLIENT_SECRET"]

## Search API v1

This API can use WSKey Lite authentication, but search results are returned as Atom/RSS, so they are not as easy to parse.

In [None]:
# Manual switch: set to True when running from a BL (bl.uk) IP address, where
# outbound HTTP has to go through the caching proxy; elsewhere no proxy is needed.
AT_BL = False

# Extra keyword arguments forwarded to `requests` calls via `**kwargs`.
# Defined unconditionally so that `**kwargs` never raises NameError when
# no proxy is required.
kwargs = {}
if AT_BL:
    # NOTE(review): this proxy URL embeds a username and password in plain
    # text; consider moving it into the .env file and rotating the password.
    kwargs = {"proxies": {"http": "http://ad%5CHLLOYD:SOUTHSPINE16@bspcache.bl.uk:8080"}}

In [None]:
# Base URL of the v1 web services and the endpoint paths appended to it.
# NOTE(review): the SRU endpoint may expect `query=` rather than `q=` -- confirm
# against the Search API documentation.
v1_search_url = "http://www.worldcat.org/webservices/"
v1_oclc_num_endpoint = "catalog/content/"
v1_search_endpoint = "catalog/search/opensearch?q="
v1_sru_endpoint = "catalog/search/sru?q="

# Request headers and the WSKey query-string fragment built from the client id.
v1_headers = dict(accept="application/json")
wskey = "&wskey={}".format(client_id)

# Look up this machine's public IP; when it contains the BL prefix, route HTTP
# traffic through the proxy, otherwise pass no extra request arguments.
# NOTE(review): the proxy URL embeds credentials in plain text.
kwargs = (
    {"proxies": {"http": "http://ad%5CHLLOYD:SOUTHSPINE16@bspcache.bl.uk:8080"}}
    if '194.66.2' in requests.get("https://jsonip.com/").json()['ip']
    else {}
)

In [None]:
# Title + author search expression used to build the OpenSearch URL.
# NOTE(review): the URL-building cell passes this through urllib.parse.quote(),
# which percent-encodes "+" as %2B, so the "+AND+" separators may reach the
# server as literal plus signs rather than spaces -- confirm.
query = 'ti="FENG LING DU"+AND+au="DUANMU (Hongliang)"'
# Title-only expression used to build the SRU URL; the author clause is
# disabled in the trailing comment.
sru_query = 'srw.ti="FENG"'#+AND+au="Duanmu (Hongliang)"'

In [None]:
# OpenSearch request URL: base + endpoint + percent-encoded query, asking for
# RSS output and authenticating with the WSKey. The bare name displays it.
v1_query_url = "{base}{endpoint}{terms}&format=rss&wskey={key}".format(
    base=v1_search_url,
    endpoint=v1_search_endpoint,
    terms=quote(query),
    key=client_id,
)
v1_query_url

In [None]:
# SRU request URL: base + SRU endpoint + percent-encoded query, with the same
# format and WSKey parameters as the OpenSearch URL. The bare name displays it.
sru_query_url = "{base}{endpoint}{terms}&format=rss&wskey={key}".format(
    base=v1_search_url,
    endpoint=v1_sru_endpoint,
    terms=quote(sru_query),
    key=client_id,
)
sru_query_url

In [None]:
# Send the SRU query (with any proxy settings held in `kwargs`) and dump the
# raw response body for inspection.
sru_req = requests.get(sru_query_url, headers=v1_headers, **kwargs)
print(sru_req.text)

In [None]:
# Save the SRU response to disk for offline inspection. The encoding is given
# explicitly so non-ASCII characters in the response are written the same way
# on every platform instead of depending on the locale's default codec.
with open("sru_rss_xml.xml", "w", encoding="utf-8") as f:
    f.write(sru_req.text)

In [None]:
# Send the OpenSearch query (with any proxy settings held in `kwargs`); the
# bare expression displays the raw response text.
req = requests.get(v1_query_url, headers=v1_headers, **kwargs)
req.text

In [None]:
# Parse the response as XML and collect every <item> element (one per hit).
# `find_all` is the current Beautiful Soup name; `findAll` is a deprecated alias.
search_results = BeautifulSoup(req.content, features="xml").find_all('item')

In [None]:
# Number of result items parsed from the search response.
len(search_results)

In [None]:
# Fetch the full record for every search hit and parse it into a pymarc Record.
records = []
for sr in search_results:
    # Each hit carries its record identifier in an oclcterms:recordIdentifier element.
    oclc_num = sr.find("oclcterms:recordIdentifier").text
    v1_oclc_url = f"{v1_search_url}{v1_oclc_num_endpoint}{oclc_num}?wskey={client_id}"
    response = requests.get(v1_oclc_url, headers=v1_headers, **kwargs)
    # Surface HTTP failures directly instead of handing an error body to the
    # MARCXML parser, where they would show up as an opaque parse/index error.
    response.raise_for_status()
    record = response.text
    # parse_xml_to_array returns a list; only the first record is kept.
    marc_record = marcxml.parse_xml_to_array(io.StringIO(record))[0]
    records.append(marc_record)

In [None]:
# Display the list of parsed MARC records.
records

In [None]:
# Data of the first 001 field of each fetched record.
[r.get_fields("001")[0].data for r in records]

## Metadata API

This API requires access-token authorisation, but records can be returned as XML.

In [None]:
# Scope requested for the access token; interpolated into the token URL below.
scope = "WorldCatMetadataAPI"
# Token endpoint, using the client-credentials grant type.
auth_url = f"https://oauth.oclc.org/token?grant_type=client_credentials&scope={scope}"
# Two header sets differing only in the response format they ask for:
# JSON in `bib_headers`, MARCXML in `oclc_headers`.
bib_headers = {"Accept":"application/json"}
oclc_headers = {"Accept":"application/marcxml+xml"}

In [None]:
# Request an access token using HTTP basic auth with the client id and secret.
auth = requests.post(auth_url, headers=bib_headers, auth=(client_id, client_secret))
# Fail with the real HTTP error (bad credentials, wrong scope, ...) rather than
# a bare KeyError on "access_token" when the token request is rejected.
auth.raise_for_status()
token = auth.json()["access_token"]
# Attach the bearer token to both header sets so later requests are authorised.
bib_headers["Authorization"] = f"Bearer {token}"
oclc_headers["Authorization"] = f"Bearer {token}"

# NOTE(review): displaying the headers echoes the live bearer token into the
# notebook output; clear the output before sharing or committing.
bib_headers, oclc_headers

In [None]:
# Show the full decoded token response.
# NOTE(review): this includes the access token, which ends up in the notebook output.
auth.json()

In [None]:
# Title + author search expression; it is percent-encoded with quote() when the
# search URL is built. This rebinds the `query` name used earlier in the notebook.
query = 'ti="Feng Ling Du" AND au="Duanmu (Hongliang)"'

In [None]:
# Root of the Metadata API and the two endpoint paths used in this notebook.
metadata_url = "https://metadata.api.oclc.org/worldcat/"
get_record_by_oclc_num = "manage/bibs/"
search_brief_bibs = "search/brief-bibs?q="

# Stem to which an OCLC number is appended to address a single record.
oclc_search_stem = f"{metadata_url}{get_record_by_oclc_num}"
# Brief-bib search URL for the percent-encoded query, capped at 20 results.
ti_au_search_url = f"{metadata_url}{search_brief_bibs}{quote(query)}&limit=20"

In [None]:
# Display the assembled search URL for inspection.
ti_au_search_url

In [None]:
# Run the brief-bib search using the JSON header set.
brief_bib_search_result = requests.get(ti_au_search_url, headers=bib_headers)

In [None]:
# Display the whole decoded JSON response.
brief_bib_search_result.json()

In [None]:
# First entry of the "briefRecords" list in the response.
brief_bib_search_result.json()["briefRecords"][0]

In [None]:
# URL that addresses the full record of the first search hit.
oclc_search_stem + brief_bib_search_result.json()["briefRecords"][0]["oclcNumber"]

In [None]:
# Request the MARCXML record for every brief record in the search response.
# Keys are the OCLC numbers converted to int; values are the raw Response objects.
marc_xml = {
    int(oclc_number): requests.get(oclc_search_stem + oclc_number, headers=oclc_headers)
    for oclc_number in (
        brief["oclcNumber"]
        for brief in brief_bib_search_result.json()["briefRecords"]
    )
}

In [None]:
# OCLC numbers of all brief records in the search response.
[x["oclcNumber"] for x in brief_bib_search_result.json()["briefRecords"]]

In [None]:
# Print the raw response body for one specific record.
# NOTE(review): the OCLC number is hard-coded; this raises KeyError if that
# number is not among the records fetched into `marc_xml`.
print(marc_xml[23921305].text)

In [None]:
# Parse the same response as MARCXML and print the first resulting record.
# NOTE(review): relies on the same hard-coded OCLC number being present in `marc_xml`.
print(marcxml.parse_xml_to_array(io.StringIO(marc_xml[23921305].text))[0])

### Diagnostic functions

In [None]:
def pretty_print_POST(req):
    """Print a request object in an HTTP-like layout for debugging.

    ``req`` must expose ``method``, ``url``, ``headers`` (a mapping) and
    ``body``. The output is the request line, one ``name: value`` line per
    header, a blank line, then the body, all separated by CRLF.

    The layout is chosen for readability and may differ from the bytes
    that are actually sent on the wire.
    """
    request_line = req.method + ' ' + req.url
    header_block = '\r\n'.join(
        f'{name}: {value}' for name, value in req.headers.items()
    )
    print(f'{request_line}\r\n{header_block}\r\n\r\n{req.body}')

In [None]:
# Download the language code list XML from loc.gov.
lang_xml = requests.get("https://www.loc.gov/standards/codelists/languages.xml")

In [None]:
# Display the downloaded XML as text.
lang_xml.text

In [None]:
# Parse the downloaded XML into an element tree. The raw bytes are passed
# rather than the decoded text because lxml raises ValueError for a str that
# carries an XML encoding declaration; with bytes, lxml honours the declared
# encoding itself.
tree = ET.fromstring(lang_xml.content)