##### Use the BookOps WorldCat wrapper to pull LC classification data based on a list of OCLC numbers

##### Import libraries
This section loads the packages needed to work with data, send API requests, and use the BookOps wrapper with OCLC's WorldCat API.

In [None]:
import pandas as pd
import requests
from bookops_worldcat import WorldcatAccessToken
import time
import re

##### Configure access token
This section contains the authentication details required by the WorldCat API. 'mykey' and 'mysecret' should be updated based on the user's credentials.

In [None]:
# Configure access token
# Placeholder credentials: replace 'mykey' and 'mysecret' with your own
# values before running. get_token() passes all three constants to
# WorldcatAccessToken.
WORLDCAT_KEY = 'mykey'
WORLDCAT_SECRET = 'mysecret'
# Scope requested for the token.
SCOPES = 'WorldCatMetadataAPI'

##### Configure files
This section contains the filepath and name of the file that will be read (INPUT_FILE) and the file where results will be saved (OUTPUT_FILE).

In [None]:
# Configure files
# Placeholder paths: replace both before running.
# INPUT_FILE is read with pandas.read_excel in main(), so it must point to an
# Excel workbook that contains an OCLC_NUMBER column.
INPUT_FILE = 'FILENAME'
# OUTPUT_FILE is written with DataFrame.to_excel in main().
OUTPUT_FILE = 'FILENAME.xlsx'

##### Generate an access token
The **get_token** function uses the API credentials specified above to create a token to access the WorldCat API. Tokens expire after twenty minutes, and should automatically refresh within the script.

In [None]:
#Generate an access token
def get_token():
    """Build a WorldCat access token from the module-level credentials.

    Returns:
        A ``WorldcatAccessToken`` created from WORLDCAT_KEY, WORLDCAT_SECRET
        and SCOPES.
    """
    credentials = {
        'key': WORLDCAT_KEY,
        'secret': WORLDCAT_SECRET,
        'scopes': SCOPES,
    }
    return WorldcatAccessToken(**credentials)

##### Get LC data

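The **get_classification_bibs** function takes an OCLC number from INPUT_FILE, queries the WorldCat Metadata API's classification-bibs endpoint, and returns the classification data for that OCLC number as parsed JSON. If the request fails, it prints an error message and returns an empty dictionary.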

In [None]:
#Get LC data
def get_classification_bibs(oclc_number, token, timeout=30):
    """Fetch classification data for one OCLC number from the WorldCat Metadata API.

    Args:
        oclc_number: OCLC number to look up; interpolated into the request URL.
        token: Access token object whose ``token_str`` attribute is sent as
            the bearer token.
        timeout: Seconds to wait for the server before giving up on the
            request (passed to ``requests.get``).

    Returns:
        The decoded JSON object from the response, or an empty dict when the
        request fails (connection error, timeout, HTTP error status), the
        body is not valid JSON, or the body is not a JSON object.
    """
    url = f'https://metadata.api.oclc.org/worldcat/search/classification-bibs/{oclc_number}'
    headers = {
        'Authorization': f'Bearer {token.token_str}',
        'Accept': 'application/json'
    }

    try:
        # Without a timeout a single stalled connection would hang the whole
        # batch run indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)

        # Raise error if request failed
        response.raise_for_status()
        data = response.json()

    # ValueError covers JSON decoding failures on requests versions where the
    # decode error is not a RequestException subclass.
    except (requests.RequestException, ValueError) as e:
        print(f"[ERROR] Failed to fetch data for OCLC {oclc_number}: {e}")
        return {}

    # Callers use dict lookups on the result, so never hand back a list/scalar.
    return data if isinstance(data, dict) else {}

##### Clean the resulting data

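The **clean_lc_data** function cleans a returned LC value by stripping leading and trailing spaces, converting text to uppercase, removing apostrophes and square brackets, collapsing repeated whitespace, and removing trailing punctuation (periods, commas, semicolons). It returns the cleaned value only if it begins like an LC classification (one to three letters followed by a digit); otherwise it returns None.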

In [None]:
#Clean LC data to remove apostrophes/brackets, normalize spacing, and remove trailing punctuation
def clean_lc_data(lc_value):
    """Normalize a raw LC classification value.

    The value is upper-cased, apostrophes and square brackets are removed,
    runs of whitespace are collapsed to a single space, and surrounding
    whitespace plus trailing periods/commas/semicolons are stripped.

    Args:
        lc_value: Raw LC value. Non-string values (for example a list of
            strings) are converted with ``str()`` before cleaning.

    Returns:
        The cleaned string if it starts with one to three letters followed by
        a digit; otherwise ``None`` (including for empty input and the
        literal string "None").
    """
    if not lc_value or lc_value == "None":
        return None

    # Accept lists/other objects: their str() form is what the bracket and
    # apostrophe removal below is designed to unwrap.
    text = lc_value if isinstance(lc_value, str) else str(lc_value)

    text = re.sub(r"[\'\[\]]", '', text.upper())
    # Trim only after brackets/quotes are gone; otherwise whitespace that sat
    # just inside them (e.g. "[ 'QA76' ]") survives and fails the check below.
    text = re.sub(r'\s+', ' ', text).strip()
    # Include whitespace so "QA76.73 ." does not leave a dangling space.
    text = re.sub(r'[\s.,;]+$', '', text)

    if re.match(r'^[A-Z]{1,3}\d+', text):
        return text
    return None

##### Run the workflow

The **main** function performs the following steps:
1. Reads INPUT_FILE
2. Creates an API token
3. Runs through each OCLC number and sends a query to get the most frequently used LC classification
4. Cleans the resulting data
5. Collects the results of the queries and merges them back with the original data
6. If RECORD_ID does not exist in INPUT_FILE, adds a RECORD_ID column with a unique ID for each record
7. Exports the final dataset as an Excel file

Depending on the structure of INPUT_FILE, names of fields may need to be updated. For example, the file structure here uses "OCLC_NUMBER" as a field name. A different file may use "network_number" instead, which means either the script below needs to be updated to use "network_number", or INPUT_FILE needs to be updated to use "OCLC_NUMBER".


In [None]:
#Run query and export results
def _token_expired(token):
    """Return True when *token* reports itself as expired.

    Supports ``is_expired`` being exposed either as a method or as a plain
    attribute/property, so the check works regardless of which form the
    installed bookops-worldcat release uses.
    """
    expired = token.is_expired
    return bool(expired() if callable(expired) else expired)


def main():
    """Look up the most popular LC class for each OCLC number and export to Excel.

    Steps:
        1. Read INPUT_FILE and trim the OCLC_NUMBER column.
        2. Create an API token, refreshing it whenever it has expired.
        3. Query each distinct, non-blank OCLC number once.
        4. Clean each returned LC value with clean_lc_data.
        5. Merge the results back onto the original rows.
        6. Add a RECORD_ID column if the input has none.
        7. Write the result to OUTPUT_FILE.
    """
    oclclist_df = pd.read_excel(INPUT_FILE, dtype={'ISBN': str, 'OCLC_NUMBER': str})

    # Trim the key column in place so the values queried below are exactly the
    # values used as the merge key. Blank cells are left as missing values.
    oclclist_df['OCLC_NUMBER'] = oclclist_df['OCLC_NUMBER'].map(
        lambda value: value if pd.isna(value) else str(value).strip()
    )

    token = get_token()

    # One entry per distinct OCLC number: avoids repeat API calls and keeps
    # the merge below from multiplying rows for duplicated numbers.
    lc_by_oclc = {}

    for oclc in oclclist_df['OCLC_NUMBER']:
        # Blank cells are read as NaN; str(NaN) would otherwise be queried as 'nan'.
        if pd.isna(oclc) or not oclc or oclc in lc_by_oclc:
            continue

        if _token_expired(token):
            print("Refreshing token!")
            token = get_token()

        result = get_classification_bibs(oclc, token)

        # 'lc' can be absent or null in the response; treat both as "no data".
        lc_section = result.get('lc') or {}
        lc_raw = lc_section.get('mostPopular')
        # NOTE(review): mostPopular may be a list rather than a plain string
        # (confirm against the API response); str() turns it into the
        # bracketed/quoted form that clean_lc_data unwraps.
        lc_data = clean_lc_data(str(lc_raw)) if lc_raw else None

        lc_by_oclc[oclc] = lc_data
        print(f"{oclc}, {lc_data}")
        time.sleep(0.2)  # brief pause between requests

    # Explicit columns keep the merge key present even when nothing was queried.
    subjects_df = pd.DataFrame(list(lc_by_oclc.items()), columns=['OCLC_NUMBER', 'LC'])

    final_df = oclclist_df.merge(subjects_df, on='OCLC_NUMBER', how='left')
    if 'RECORD_ID' not in final_df.columns:
        final_df.insert(0, 'RECORD_ID', range(1, len(final_df) + 1))
    final_df.to_excel(OUTPUT_FILE, index=False)
    print(f"Data exported to {OUTPUT_FILE}")

# Run the workflow only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
    