# Aleph-XML Request/Download
This script requests Aleph XML by BSN and writes each response to its own XML file.
Written by PJB, 2.15.18; last updated 2.16.18

*NB: This needs to be run within the NYU network; requests will fail otherwise.*

In [1]:
# imports
import urllib.request
from tqdm import tqdm #for progress bar

In [2]:
# input/output directories (relative paths)
in_path, out_path = 'data/in', 'data/out'

# file layout
# - input: a file called bsns.txt, read from data/in/
# - output: one {bsn}.xml file per bsn, written to data/out/
# - output: valid_bsns.txt and invalid_bsns.txt, also written to data/out/

In [3]:
# helper functions
def expand_bsn(item):
    """Left-pad a bsn string with '0' characters up to a width of 7.

    Strings that are already 7 or more characters long are returned unchanged.
    """
    return item.rjust(7, '0')

def get_invalid_bsns(bsns):
    """Return the entries of bsns that are not valid, in their original order.

    An entry is invalid when it is longer than 7 characters or when it is
    not made up entirely of digits (which includes the empty string).
    """
    return [
        candidate
        for candidate in bsns
        if len(candidate) > 7 or not candidate.isdigit()
    ]

In [4]:
# retrieve bsns from plaintext file, add to list
with open('{}/bsns.txt'.format(in_path), 'r') as f:
    # strip surrounding whitespace from every line and drop lines that end up empty
    bsns = [line for line in (raw.strip() for raw in f) if line]

# remove first line if header-like; i.e. first character of first line is a letter
# (the `bsns and` guard keeps an empty or all-blank input file from raising IndexError)
if bsns and bsns[0][0].isalpha():
    bsns.pop(0)

# preprocess/pad bsns (every line was already stripped when it was read above)
bsns = [expand_bsn(bsn) for bsn in bsns]

In [5]:
# remove/report invalid bsns
invalid_bsns = sorted(get_invalid_bsns(bsns))  # get invalid; sort
# a set makes each membership test O(1) while filtering; the sorted list is
# kept as-is because it is what gets written to invalid_bsns.txt below
invalid_lookup = set(invalid_bsns)
bsns = sorted([bsn for bsn in bsns if bsn not in invalid_lookup])  # remove invalid from bsns; sort

# write valid_bsns to file, one bsn per line
with open("{}/valid_bsns.txt".format(out_path), 'w') as f:
    for bsn in bsns:
        f.write('{}\n'.format(bsn))

# write invalid_bsns to file, one bsn per line
with open("{}/invalid_bsns.txt".format(out_path), 'w') as f:
    for invalid_bsn in invalid_bsns:
        f.write('{}\n'.format(invalid_bsn))

In [6]:
# iterate over bsns; request aleph-xml; write to xml
for bsn in tqdm(bsns):
    urlstring = 'http://aleph.library.nyu.edu/X?op=publish_avail&library=nyu01&doc_num=%s' % bsn
    # context manager closes the HTTP response even if read/decode raises
    with urllib.request.urlopen(urlstring) as aleph_request:
        aleph_xml = aleph_request.read().decode('utf-8')
    # write with the same encoding the payload was decoded with, instead of
    # whatever the platform's default text encoding happens to be
    with open("{}/{}.xml".format(out_path, bsn), 'w', encoding='utf-8') as f:
        f.write(aleph_xml)

# reports the number of bsns iterated over
print('Processed {} records.'.format(len(bsns)))

100%|██████████| 824/824 [00:00<00:00, 509825.42it/s]

Processed 824 records.



