# POST Processing POD Reports

This notebook filters POD reports based on institution-specific requirements such as data type preferences and recency. Because new uploads are added to POD daily, we use the BorrowDirect API to verify that each item is unique.

The primary goal of this notebook is to confirm that an item is **not** held by any BorrowDirect institution besides Penn. Only items with only Penn holdings are included in the final report.

**Note:** An additional filter excludes collections held at other institutions, even if their records appear in our ILS, when they are not held by Penn Libraries.

In [2]:
# load xlsx file into pandas dataframe
import pandas as pd
import numpy as np

# load xlsx file into pandas dataframe
df = pd.read_excel('unique_penn_text.xlsx')

In [3]:
# show columns
print(df.columns)

Index(['key', 'F001', 'F010_str', 'F245', 'normalized_title',
       'normalized_edition', 'normalized_pub', 'source', 'match_key',
       'F007_str', 'F020_str', 'F250_str', 'F260_str', 'id_list_str',
       'key_array_str', 'F007_code', 'F007_desc'],
      dtype='object')


In [4]:
# Ensure F001 is a string, then replace any occurrence ending with "03680" with "03681"
df['F001'] = df['F001'].astype(str).str.replace(r'03680$', '03681', regex=True)

print(df.head())

                                                 key              F001  \
0  8604 forrest avenue philadelphia pennsylvania ...  9978845258603681   
1                                      9789381005408  9978085185803681   
2                                      9788170565628  9977914437003681   
3                            9788126423415 paperback  9962328533503681   
4                                         8192611396  9978003905503681   

  F010_str                                               F245  \
0      NaN  8604 Forrest avenue, Philadelphia, Pennsylvani...   
1      NaN  880-01 Bhāratīya citrakalā meṃ Jaina citra...   
2      NaN  880-01 Kamaleśvara ke kathā-sāhitya meṃ ma...   
3      NaN  880-01 Mālguḍidinaṅṅaḷ / Ār. Ke. Nārāy...   
4      NaN  880-01 Mōhanasvāmi : kathāsaṅkalana / Vasu...   

                                    normalized_title       normalized_edition  \
0  8604 forrest avenue philadelphia pennsylvania ...                      NaN   
1 

In [5]:
# check if the key column is unique
print(df['match_key'].is_unique)

True


In [6]:
# are there any rows in match_key that are empty?
print(df['match_key'].isnull().values.any())

False


In [7]:
# count the number of rows with match_key empty
print(df['match_key'].isnull().sum())

0


In [None]:
# load hsp-removed-mmsid.txt into a list
with open('hsp-removed-mmsid.txt') as f:
    hsp_removed_mmsid = f.read().splitlines()

# filter out rows with MMSIDs that are not in this list
df = df[df['F001'].isin(hsp_removed_mmsid)]

In [42]:
# how many rows are left?
print(df.shape)

(4670, 17)


In [None]:
import time
import requests
import pandas as pd

def get_borrowdir_ids(match_key):
    url = f"https://borrowdirect.reshare.indexdata.com/api/v1/search?lookfor={match_key}"
    response = requests.get(url)
    data = response.json()
    # Collect all ids; use set() to ensure uniqueness if desired.
    ids = list(set(record['id'] for record in data.get('records', [])))
    time.sleep(1)  # Throttle the requests by sleeping for 1 second
    return ids

# Apply the function to your sample DataFrame.
df['borrowdir_id'] = df['match_key'].apply(get_borrowdir_ids)

print(df.head())

In [37]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.9 (from selenium)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def check_up_holdings_selenium(borrowdir_id, debug=False):
    url = f"https://borrowdirect.reshare.indexdata.com/Record/{borrowdir_id}/Holdings"
    
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    
    driver.get(url)
    # Wait for the dynamic content to load; adjust as necessary.
    time.sleep(3)
    
    if debug:
        print("Accessing URL:", url)
    
    try:
        # Locate the main tab content container
        tab_content = driver.find_element(By.CSS_SELECTOR, "div.tab-content")
        # Within tab_content, get the active holdings pane
        holdings_div = tab_content.find_element(By.CSS_SELECTOR, "div.tab-pane.holdings-tab.active")
        # Look for h3 elements within the holdings pane
        h3_tags = holdings_div.find_elements(By.TAG_NAME, "h3")
        institutions = set(tag.text.strip() for tag in h3_tags if tag.text.strip())
        if debug:
            print("Institutions found:", institutions)
        result = (institutions == {"University of Pennsylvania"})
    except Exception as e:
        if debug:
            print("Error encountered:", e)
        result = False
    driver.quit()
    return result

# Example debugging usage
print(check_up_holdings_selenium("YOUR_KNOWN_BORROWDIR_ID", debug=True))

# Continue processing your DataFrame.
df_exploded = df.explode('borrowdir_id').reset_index(drop=True)
df_exploded['up_holdings'] = df_exploded['borrowdir_id'].apply(lambda x: check_up_holdings_selenium(x, debug=True))
print("Any True values?", df_exploded['up_holdings'].any())

Accessing URL: https://borrowdirect.reshare.indexdata.com/Record/YOUR_KNOWN_BORROWDIR_ID/Holdings
Error encountered: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.tab-content"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000100cce138 cxxbridge1$str$ptr + 3653888
1   chromedriver                        0x0000000100cc6988 cxxbridge1$str$ptr + 3623248
2   chromedriver                        0x000000010072c968 cxxbridge1$string$len + 89228
3   chromedriver                        0x0000000100770d4c cxxbridge1$string$len + 368752
4   chromedriver                        0x00000001007aa4f0 cxxbridge1$string$len + 604180
5   chromedriver                        0x0000000100765564 cxxbridge1$string$len + 321672
6   chromedriver                        0x0

In [39]:
# are there any rows where up_holdings is true?
print(df_exploded['up_holdings'].any())

True


In [40]:
# which ids have up_holdings as true?
print(df_exploded[df_exploded['up_holdings']])

                    key              F001 F010_str  \
0         9788126435432  9959112523503681      NaN   
4         9789380869780  9958147613503681      NaN   
5            3865814115  9977400846503681      NaN   
7  8171308090 paperback  9961600093503681      NaN   
8  8171308090 paperback  9961600093503681      NaN   

                                                F245  \
0  880-01 Prācīna lōkacaritraṃ / Her̲oḍōṭt...   
4       880-01 Himālaẏera sānnidhye / Svapana De.   
5  Sylvicultura oeconomica oder Haußwirthliche Na...   
7          880-01 Marunn / Punattil Kuññabduḷḷa.   
8          880-01 Marunn / Punattil Kuññabduḷḷa.   

                                    normalized_title normalized_edition  \
0  880-01 prācīna lōkacaritraṃ / her̲oḍōṭt...                NaN   
4       880-01 himālaẏera sānnidhye / svapana de.                NaN   
5  sylvicultura oeconomica oder haußwirthliche na...                NaN   
7          880-01 marunn / punattil ku