## HOLLIS Harvest Notebook

#### HOLLIS Harvest
1. Connect to HOLLIS API and pull/store data locally
    - When re-running for new HOLLIS records, copy this notebook to a new directory, and just change the query_params at the top.

#### Bibcode Matching
2. Tranform HOLLIS results into ref strings
3. Query the RefService API with ref strings, return bibcode matches
___
PROJECT OUTPUT: 
- HOLLIS Harvest results, "hollis_results.json" and "hollis_results.xlsx"
- List of reference strings, "ref_list.txt"
- List of bibcodes matched, "ref_results.txt"
- Folder of HOLLIS json files, "items" folder > "id_{hollis id}.json"

## HOLLIS Harvest

In [1]:
HOLLIS_API = 'https://api.lib.harvard.edu/v2/items.json'
MAX_RECORDS = 250

query_params = {
    'classification':'QB*',
    'originDate':'2012 OR 2013 OR 2014 OR 2015 OR 2016 OR 2017 OR 2018 OR 2019 OR 2020 OR 2021 OR 2022',
    'resourceType':'text',
    'issuance':'monographic'
}

In [2]:
import requests
import math
import json
import pandas as pd
import sys, os, io
import argparse
import numpy as np
import re
import csv
import unicodedata

def get_batch(api_url, params):
    get_header = {'Accept': 'text/plain',
                  'Content-type': 'application/json'}
    buff = requests.get(api_url, headers=get_header, params=params).json()
    return buff

def get_records(url, params):
    records = []
    params['limit'] = 1   # First get 1 record to determine the total amount of records
    
    # Do the first query
    try:
        batch = get_batch(url, params)
    except Exception as err:
        raise Exception("Request to Hollis blew up: %s" % err)

    totrecs = batch['pagination']['numFound']
 
    #    Store the first batch of records
    #       Note: the individual records are in the 'mods' attribute
    #       of 'items'. In the case of multiple records, this
    #       is a list of JSON objects, but when only 1 record is
    #       returned, it is just a JSON object (no list).     
    records.append(batch['items']['mods'])
    
    # How often do we need to paginate to get them all?
    num_paginates = int(math.ceil((totrecs) / (1.0*MAX_RECORDS)))
    
    # We harvested the first record to get the total number of records,
    # so we continue with the 2nd
    offset = 1
    params['limit'] = MAX_RECORDS
    for i in range(num_paginates):
        params['start'] = offset
        try:
            batch = get_batch(url, params)
        except Exception as err:
            raise URLError("Request to Hollis blew up: %s" % err)
        records += batch['items']['mods']
        offset += MAX_RECORDS
    return records
        
def get_unique_records(ids, from_hollis):
    """
    goes through the list from_hollis, if the url not in the ids list, 
    adds it in the record to the returned structure

    :param ids:
    :param from_hollis:
    :return:
    """
    unique = []
    for items in from_hollis:
        for one in items.get('extension', []):
            # basing uniqueness on the url provided in originalDocument
            orig_url = one.get('librarycloud', {}).get('originalDocument', '')
            if orig_url not in ids:
                ids.append(orig_url)
                unique.append(items)
    return ids, unique

unique_records = []     # save all unique records in this list
ids = []                # empty list of ids to start

# go through the loop, get items from hollis, call get_unique_records until returned structure is empty
counter = 0
while True:
    from_hollis = get_records(HOLLIS_API, query_params)
    counter += 1
    ids, unique_records_set = get_unique_records(ids, from_hollis)
    print('Attempt',counter,'got number of unique records:', len(unique_records_set),'\n')
    # if no duplicates quit
#     if len(from_hollis) == len(unique_records_set):
#         print('no duplicates, quitting')
#         break
    # if no new records quit
    if not unique_records_set:
        print('no new records, quitting \n')
        break
    # add the new records from this set to the structure
    unique_records += unique_records_set

# Save excel file of results
df = pd.json_normalize(unique_records)
df.to_excel("hollis_results.xlsx", index=False)
print("Retrieved",len(unique_records),"records and saved results as hollis_results.xlsx")

Attempt 1 got number of unique records: 2590 

Attempt 2 got number of unique records: 20 

Attempt 3 got number of unique records: 83 

Attempt 4 got number of unique records: 0 

no new records, quitting 

Retrieved 2693 records and saved results as hollis_results.xlsx


## Metadata Extraction & Transformation

In [9]:
# Open my Hollis results as a data frame
dt = pd.read_excel("/Users/sao/Documents/Python-Projects/hollis_harvest/hollis_results.xlsx")

# HOLLIS ID
hollis_ids = dt["recordInfo.recordIdentifier.#text"].astype(str)
hollis_idents = []
for ident in hollis_ids:
    if ident:
        hollis_idents.append(ident)
    else:
        hollis_idents.append('')

# AUTHORS : Extract authors from dt["name"], else dt["name.namePart"] 
extract_name = re.compile(r"(?:personal\W+namePart\W+([A-Za-z\.,\s]+).+?(?=author))+")
extract_name2 = re.compile(r"^\W*([A-Z][^{[]*)")
names = dt["name"].astype(str)
names2 = dt["name.namePart"].astype(str)

author_matches = []
for name,name2 in zip(names,names2):
    name = unicodedata.normalize('NFD', name).encode('ascii', 'ignore').decode()
    match = extract_name.findall(name)
    if match:
        match = [m.split(" (")[0] for m in match]
        match = [re.sub(r'([a-z])\.$',r'\1', m) for m in match]
        author_matches.append('; '.join(match))
    else:
        if name2 != 'nan':
            match = extract_name2.findall(name2)
            if match:
                match = [m.rstrip().rstrip(",").rstrip("'").split(" (")[0] for m in match]
                match = [re.sub(r'([a-z])\.$',r'\1', m) for m in match]
                author_matches.append('; '.join(match))
            else:
                author_matches.append('')
        else:
            author_matches.append('')

# PUBDATE : Extract pubdate from dt["originInfo"]; else dt["originInfo.dateIssued"]
extract_year = re.compile(r"dateIssued': {'@encoding': 'marc', '#text': '(\d+)")
extract_year2 = re.compile(r"text': '(\d+)")
years = dt["originInfo"].astype(str)
years2 = dt["originInfo.dateIssued"].astype(str)

date_matches = []
for year,year2 in zip(years,years2):
    match = extract_year.findall(year)
    if match:
        date_matches.append(match[0])
    else:
        if year2 != 'nan':
            match = extract_year2.findall(year2)
            if match:
                date_matches.append(match[0])
            else:
                date_matches.append('')
        else:
            date_matches.append('')
            
# TITLE : Extract title from dt["titleInfo"] --> {'nonSort': 'The  ', 'title': '<title>', 'subTitle': '<subtitle>'};
#      else, extract from dt["titleInfo.nonSort"], dt["titleInfo.title"], dt["titleInfo.subTitle"] 
extract_titles = [
re.compile(r"nonSort\W+(\S*)\W+title\W+([\.\w\s\-\'\&\,\:\(\)]+)\W+subTitle\W+([\.\w\s\-\'\&\,\:\(\)]+)"),
re.compile(r"title\W+([\.\w\s\-\'\&\,\:\(\)]+)\W+subTitle\W+([\.\w\s\-\'\&\,\:\(\)]+)"),
re.compile(r"nonSort\W+([\.\w\s\-\']+)\W+title\W+([\.\w\s\-\'\&\,\:\(\)]+)"),
re.compile(r"title\W+([\.\w\s\-\'\&\,\:\(\)]+)")
]
titles = dt["titleInfo"].astype(str)
titleInfo_nonsorts = dt["titleInfo.nonSort"].astype(str)
titleInfo_titles = dt["titleInfo.title"].astype(str)
titleInfo_subtitles = dt["titleInfo.subTitle"].astype(str)

title_matches = []
for title, nonsort, title_2, subtitle in zip(titles, titleInfo_nonsorts, titleInfo_titles, titleInfo_subtitles):
    if title != 'nan':
        title = unicodedata.normalize('NFD', title).encode('ascii', 'ignore').decode()
        title = title.replace('"',"'")
        match_t = extract_titles[0].search(title)
        match_t2 = extract_titles[1].search(title)
        match_t3 = extract_titles[2].search(title)
        match_t4 = extract_titles[3].search(title)
        
        if match_t:
            title_matches.append("%s %s: %s"%(match_t.groups(0)[0],match_t.groups(0)[1].rstrip(" ").rstrip(",").rstrip("'"),match_t.groups(0)[2].rstrip("'").rstrip(" ")))
        elif match_t2:
            title_matches.append("%s: %s"%(match_t2.groups(0)[0].rstrip(" ").rstrip(",").rstrip("'"),match_t2.groups(0)[1].rstrip(",").rstrip("'").rstrip(" ")))
        elif match_t3:
            title_matches.append("%s %s"%(match_t3.groups(0)[0].rstrip(",").rstrip("'").rstrip(" "),match_t3.groups(0)[1].rstrip(",").rstrip("'").rstrip(" ")))
        elif match_t4:
            title_matches.append("%s"%(match_t4.groups(0)[0].rstrip(",").rstrip("'").rstrip(" ")))
        else:
            title_matches.append('')
    else:
        sec_title_str = ''
        for sec_title in [nonsort, title_2, subtitle]:
            if sec_title != "nan":
                if sec_title == title_2 and subtitle != 'nan':
                    sec_title_str += (sec_title.strip().replace('"',"'") + ': ')
                else:
                    sec_title_str += (sec_title.strip().replace('"',"'") + ' ')
        if sec_title_str:
            title_matches.append(sec_title_str.rstrip().replace('"',"'"))
        else:
            title_matches.append('')

# IDENTIFIERS : Extract identifiers from dt["identifier"]; else dt["identifier.#text"] 
extract_doi = re.compile(r"doi\W+#text\W+(\S*)\'")
extract_oclc = re.compile(r"oclc\W+#text\W+(\S*)\'")
extract_isbn = re.compile(r"isbn\W+[@invalid\W+yes\W+]*#text\W+([\w\-]*)")
idents = dt["identifier"].astype(str)
idents2 = dt["identifier.#text"].astype(str)

doi_matches = []
for doi in idents:
    if doi != 'nan':
        match = extract_doi.findall(doi)
        if match:
            doi_matches.append(match[0])
        else:
            doi_matches.append('')
    else:
        doi_matches.append('')
           
oclc_matches = []
oclc_count = 0 
for oclc_id,oclc_id2 in zip(idents,idents2):
    found = False
    if oclc_id != 'nan':
        match = extract_oclc.findall(oclc_id)
        if match:
            oclc_count += 1
            found = True
            oclc_matches.append(match[0])
    if oclc_id2 != 'nan' and found == False:
        match = extract_oclc.findall(oclc_id2)
        if match:
            oclc_count += 1
            oclc_matches.append(oclc_id2)
        else:
            oclc_matches.append('')
    elif found == False:
        oclc_matches.append('')

isbn_matches = []
isbn_count = 0
for isbn_id,isbn_id2 in zip(idents,idents2):
    found = False
    if isbn_id != 'nan':
        match = extract_isbn.findall(isbn_id)
        if match:
            isbn_count += 1
            found = True
            isbn_matches.append(match[0])
    if isbn_id2 != 'nan' and found == False:
        match = extract_isbn.findall(isbn_id2)
        if match:
            isbn_count += 1
            isbn_matches.append(isbn_id2)
        else:
            isbn_matches.append('')
    elif found == False:
        isbn_matches.append('')

# PUBLICATION : Extract publisher & city from dt["originInfo"], else dt["originInfo.publisher"] 
extract_pubs = [
re.compile(r"placeTerm\W+type\W+\w+\W+text\W+([\w\s\-\']+)"),
re.compile(r"publisher\W+([\.\w\s\-\'\&]+)")
]
pubs = dt["originInfo"].astype(str)
publishers = dt["originInfo.publisher"].astype(str)

pub_matches = []
for pub, publisher in zip(pubs, publishers):
    if pub != 'nan':
        pub = unicodedata.normalize('NFD', pub).encode('ascii', 'ignore').decode()
        match_pub = extract_pubs[0].search(pub)
        match_pub2 = extract_pubs[1].search(pub)
        if match_pub and match_pub2:
            pub_matches.append("%s: %s"%(match_pub.groups(0)[0].rstrip(" ").rstrip(":").rstrip(" ").rstrip(",").rstrip(";"),match_pub2.groups(0)[0].rstrip(" ").rstrip(":").rstrip(" ").rstrip(",").rstrip(";")))
        elif match_pub:
            pub_matches.append(match_pub.group(0)[0].rstrip(" ").rstrip(":").rstrip(" ").rstrip(",").rstrip(";"))
        elif match_pub2:
            pub_matches.append(match_pub2.group(0)[0].rstrip(" ").rstrip(":").rstrip(" ").rstrip(",").rstrip(";"))
    elif publisher != 'nan':
        publisher = unicodedata.normalize('NFD', publisher).encode('ascii', 'ignore').decode()
        pub_matches.append(publisher)
    else:
        pub_matches.append('')

#    String together metdata, including OCLC and ISBNs, for publication field
publications = []
for title, author, pub, year, oclc, isbn in zip(title_matches, author_matches, pub_matches, date_matches, oclc_matches, isbn_matches):
    authors = author.split("; ")
    if len(authors) > 5:
        info = ''.join(title + ", by " + '; '.join(authors[:5]) + " et. al., " + str(year) + ".")
    if len(authors) <= 5:
        info = ''.join(title + ", by " + author + ", " + str(year) + ".")
    p = str(info+" "+pub)
    o = str("OCLC: "+oclc+".")
    b = str("ISBN: "+isbn+".")
    
    if title and author and year:
        author = str([m.replace('"',"'").replace('\r','').replace('\n','') for m in author])
        year = year[0].replace('"',"'").replace('\r','').replace('\n','') 
        if pub:
            if oclc:
                if isbn:
                    publications.append(str(p+". "+o+" "+b))
                else:
                    publications.append(str(p+". "+o))
            elif isbn:
                publications.append(str(p+". "+b))
            else:
                publications.append(str(p+"."))
        elif oclc:
            if isbn:
                publications.append(str(o+". "+b))
            else:
                publications.append(o)
        elif isbn:
            publications.append(b)
        else:
            publications.append(info)
    else:
        publications.append('')

# ABSTRACT: Extract abstracts from dt["abstract.#text"]
extract_abs = re.compile(r".*")
abstracts = dt["abstract.#text"].astype(str)

abs_matches = []
for abstract in abstracts:
    if abstract != 'nan':
        match = extract_abs.findall(abstract)
        match = [m.rstrip("--").replace(u'\xa0', u' ') for m in match]
        abs_matches.append(match[0])
    else:
        abs_matches.append([])
        
# PROPERTIES: if doi exists, append doi; else append worldcat+ISBN, else worldcat+OCLC, else worldcat+title
links = []
for doi, oclc, isbn, title in zip(doi_matches, oclc_matches, isbn_matches, title_matches):
    if doi:
        links.append({"DOI":doi})
    elif oclc:
        links.append({"ELECTR": str("http://www.worldcat.org/oclc/"+str(oclc))})
    elif isbn:
        links.append({"ELECTR": str("http://www.worldcat.org/isbn/"+str(isbn))})
    elif title:
        title = re.sub(r"\s","-",title)
        title = re.sub(r":","",title)
        title = re.sub(r",","",title)
        title = re.sub(r"\.","",title)
        links.append({"ELECTR": str("http://www.worldcat.org/title/"+str(title))})
    else:
        links.append('')

records = []
for ident, author, year, title, pub, link, abstract in zip(hollis_idents, author_matches, date_matches, title_matches, publications, links, abs_matches):
    if author and year and title:
        authors = author.split("; ")
        authors = unicodedata.normalize('NFD', author).encode('ascii', 'ignore').decode()
        title = unicodedata.normalize('NFD', title).encode('ascii', 'ignore').decode()
        records.append({"hollis_id":ident,
                        "authors":authors,
                        "pubdate":year,
                        "title":title,
                        "publication":pub,
                        "properties":link,
                        "abstract":abstract})

print("Refined and generated",len(records),"HOLLIS records")   


Refined and generated 1880 HOLLIS records


## ADS Reference Service: Bibcode Matching

In [10]:
# Create reference strings
list_for_REFS = []
for author, year, title in zip(author_matches, date_matches, title_matches):
    if author and year and title:
        ref = {"refstr":"%s, %s, %s"%(author, year, title), "authors":"%s"%author, "year":"%s"%year, "title": "%s"%title}
        list_for_REFS.append(json.dumps(ref))

# # Save refs to txt file
# df = pd.DataFrame(list_for_REFS, columns=['REFS'])
# df.to_csv("/Users/sao/Documents/Python-Projects/hollis_harvest/ref_list.txt", index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

# -- REFERENCE SERVICE -- #

# ADS Prod API Token
token = '<my token here>'
domain = 'https://api.adsabs.harvard.edu/v1/'

# Read my reference strings file and make a list called 'references'
def read_file(filename):
    references = []
    with open(filename, "r") as f:
        for line in f:
            references.append(line)
    return references

# Reference Service API, querying my 'references' list
def resolve(references):
    payload = {'parsed_reference': references}
    response = requests.post(
        url = domain + 'reference/xml',
        headers = {'Authorization': 'Bearer ' + token,
                 'Content-Type': 'application/json',
                 'Accept':'application/json'},
        data = json.dumps(payload))
    if response.status_code == 200:
        return json.loads(response.content)['resolved'], 200
    else:
        print('From reference status_code is ', response.status_code)
    return None, response.status_code

# Output the Reference Service results
def output(results, status):
    if results:
        print('\n')
        for result in results:
            print(result)
        print('\n')
    else:
        print('error code: ', status)
        
# Read my reference strings file
# references = read_file("/Users/sao/Documents/Python-Projects/hollis_harvest/ref_list.txt")
references = list_for_REFS
references = [ref.replace("\n","") for ref in references]
references = [json.loads(ref) for ref in references]

# Resolve my references, results in 'total results' list
total_results = []
for i in range(0, len(references), 32):
    results, status = resolve(references[i:i+32])
    if results:
        total_results += results
        
# Count how many bibcodes matched, results to csv file
bibcodes = []
for r in total_results:
    if r['bibcode']!='...................':
        bibcodes.append(r['refstring'] + "\t" + r['bibcode'] + "\t" + r['score'])
dedupe_bibcodes = list(dict.fromkeys(bibcodes))   # deduplicate bibcodes

# Save bibcode matches to csv
with open('ref_results.csv', 'w') as f:
    lines = (line.split("\t") for line in bibcodes if line)
    writer = csv.writer(f)
    writer.writerow(('refstring', 'bibcode', 'score'))
    writer.writerows(lines)

# Ref Summary
print("Ran",len(references),"references through ADS Reference Service")
print("Matched",len(dedupe_bibcodes),"bibcodes (deduplicated) and saved results to 'ref_results.csv'")

Ran 1880 references through ADS Reference Service
Matched 378 bibcodes (deduplicated) and saved results to 'ref_results.csv'


## Remove matched records, Curate new records for ingest

In [11]:
to_ingest = []
ingest_counter = 0
for record, result, hollis_ident in zip(records, total_results, hollis_idents):
    if result['bibcode'] == '...................':
        to_ingest.append(record)

# # Save master json file of final results
# with open("hollis_records_new.json", 'w') as outfile:
#     json.dump(to_ingest, outfile)

# Save excel file of results
df = pd.json_normalize(to_ingest)
df.to_excel("hollis_records_new.xlsx", index=False)

print("Saved",len(to_ingest),"final records as 'hollis_records_new.xlsx' for review")

Saved 1426 final records as 'hollis_records_new.xlsx' for review


## Results Summary

In [12]:
print(
    'RESULTS SUMMARY\n\n'
    'HOLLIS API Query Parameters:\n',
    query_params,'\n\n',
    '-- Total HOLLIS records retrieved:',len(unique_records),'\n',
    '-- Records refined/refstrings generated:',len(references),'\n',
    '-- ADS Bibcodes matched (deduplicated):',len(dedupe_bibcodes),'\n',
    '-- New records for ingest review:',len(to_ingest)
     )

RESULTS SUMMARY

HOLLIS API Query Parameters:
 {'classification': 'QB*', 'originDate': '2012 OR 2013 OR 2014 OR 2015 OR 2016 OR 2017 OR 2018 OR 2019 OR 2020 OR 2021 OR 2022', 'resourceType': 'text', 'issuance': 'monographic', 'limit': 250, 'start': 2501} 

 -- Total HOLLIS records retrieved: 2693 
 -- Records refined/refstrings generated: 1880 
 -- ADS Bibcodes matched (deduplicated): 378 
 -- New records for ingest review: 1426


In [None]:
dt = pd.read_excel("/Users/sao/Documents/Python-Projects/hollis_harvest/hollis_records_review.xlsx", sheet_name=2)
dt = pd.DataFrame(dt)

ingest = dt["ingest"].astype(str)
authors = dt["authors"].astype(str)
titles = dt["title"].astype(str)
pubdate = dt["pubdate"].astype(str)
pub = dt["publication"].astype(str)
abstracts = dt["abstract"].astype(str)
DOIs = dt["properties.DOI"].astype(str)
ELECTRs = dt["properties.ELECTR"].astype(str)


auth_ls = []
for author in authors:
    if author:
        match = unicodedata.normalize('NFD', author).encode('ascii', 'ignore').decode()
        authors = match.split("; ")
        author_names = []
        for a in authors:
            auth_first = a.rsplit(" ", 1)[0]
            auth_last = a.rsplit(" ", 1)[-1]
            auth = ("%s, %s"%(auth_last,auth_first))
            author_names.append(auth)
        auth_ls.append("; ".join(author_names))
    else:
        auth_ls.append('')
        
title_ls = []
for title in titles:
    if title:
        t = title.replace("’","'").replace("‘","'").replace('“','"').replace('”','"')
        title_ls.append(t)
    else:
        title_ls.append('')
        
DOI_ls = []
for doi in DOIs:
    if doi:
        DOI_ls.append(doi)
    else:
        DOI_ls.append('')
        
PDF_ls = []
for pdf in PDFs:
    if pdf:
        PDF_ls.append(pdf)
    else:
        PDF_ls.append('')
        
links_ls = []
for doi, pdf in zip(DOI_ls, PDF_ls):
    if doi and pdf:
        links_ls.append({"DOI":doi,"PDF":pdf})
    elif doi:
        if not pdf:
            links_ls.append({"DOI":doi})
    elif pdf:
        if not doi:
            links_ls.append({"PDF":pdf})
    else:
        links_ls.append('')
                
abs_ls = []        
for abstract in abstracts:
    if abstract:
        a = unicodedata.normalize('NFD', abstract).encode('ascii', 'ignore').decode()
        abs_ls.append(a)
    else:
        abs_ls.append('')

records = []
for auth, title, links, abstract in zip(auth_ls, title_ls, links_ls, abs_ls):
    authors = auth.split("; ")
    records.append({"bibcode":"",
                    "authors":authors,
                    "pubdate":pubdate,
                    "title":title,
                    "publication":pub,
                    "properties":links,
                    "abstract":abstract,
                    "source":"ADS"})
# for r in records:
#     html.unescape(r)
    
# Save json file of data
with open("curation.json", 'w') as outfile:
    json.dump(records, outfile)
print("Saved",len(records),"records as curation.json")