# STI/NTRS/PubSpace Harvester

This notebook sends a query to STI/NTRS API for harvesting bibliographic records. Data returned are processed through the ADS Reference Service to weed out records already existing in ADS. The Excel workbook output can be reviewed and curated for ingest to ADS.

#### STI Harvest
1. Set name for output file (date + category) and filepath
2. Select API endpoint for querying (regular STI Collection vs Pubspace) and parameters
3. Connect to STI API and pull/store data locally

#### ADS Bibcode Matching
4. Transform STI results into ref strings
5. Query the ADS Reference Service API with ref strings, return bibcode matches
6. Output new STI records (unmatched by RefService) for ingest review
___
NOTEBOOK OUTPUT: 
- Excel workbook: "{name}_STIreview.xlsx"
   - Sheet 1: STI Harvest results
   - Sheet 2: Reference results
   - Sheet 3: New/unmatched items for ingest review

In [None]:
import requests
import json
import pandas as pd
import math
import unicodedata
from pyingest.serializers.classic import Tagged
import re
import csv

# -- Prefix of the output workbook (date_category); "_STIreview.xlsx" is appended on save
name = "2306Geo"

# -- Local directory the output workbook is written to
filepath = "/Users/sao/Documents/Python-Projects/STI/"

# -- Collection to query (keep exactly one of these uncommented)
# path = "/citations/search"  # STI/NTRS Collection
path = "/pubspace/search"     # Pubspace Collection

# -- Query parameters, sent as the JSON body of the search request
params = dict(
    published=dict(gte="2020-01-01"),
    subjectCategory=["Geophysics"],
    # stiType=["ACCEPTED_MANUSCRIPT"],
    sort=dict(field="id", order="asc"),
    page=dict(size=100),
)


## NTRS API Query

In [None]:
# -- API Setup
MAX_RECORDS = 100                      # number of records assumed per page when paginating
base_url = "https://ntrs.nasa.gov/api"
api_url = "".join((base_url, path))    # search endpoint of the selected collection

def get_batch(api_url, params, timeout=120):
    """POST one search query to the STI/NTRS API and return the decoded JSON reply.

    Args:
        api_url: Full URL of the search endpoint.
        params: Query parameters; serialized to JSON and sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
    """
    get_header = {'Accept': 'text/plain',
                  'Content-type': 'application/json'}
    response = requests.post(api_url, headers=get_header,
                             data=json.dumps(params), timeout=timeout)
    # Surface 4xx/5xx replies as errors instead of parsing them as results
    response.raise_for_status()
    return response.json()

def get_records(url, params):
    """Fetch every record matching a query from the STI/NTRS search API.

    The first request reports the total number of matches; the remaining
    pages are then requested one by one using the ``page.from`` offset.

    Args:
        url: Full URL of the search endpoint.
        params: Query parameters. Left unmodified; paging state is kept on
            an internal copy. If ``params['page']['size']`` is missing,
            ``MAX_RECORDS`` is used as the page size.

    Returns:
        A list with the ``results`` entries of all pages, in request order.

    Raises:
        Exception: If any request fails (the original error is chained).
    """
    # Copy the query (and its 'page' sub-dict) so paging offsets never leak
    # back into the caller's params.
    query = dict(params)
    page = dict(query.get('page') or {})
    page_size = page.get('size') or MAX_RECORDS
    page['size'] = page_size   # keep the offset step and the requested size in sync
    query['page'] = page

    # Do the first query
    try:
        batch = get_batch(url, query)
    except Exception as err:
        raise Exception("Request to STI blew up: %s" % err) from err

    # Count of total records
    totrecs = batch['stats']['total']

    # Store the first batch of records
    records = list(batch['results'])

    # Print count of total records and pages
    num_paginates = int(math.ceil(totrecs / float(page_size)))
    print("Total records: %d \nTotal pages: %d" % (totrecs, num_paginates))

    # Page 0 is already stored; request only the remaining pages
    for page_index in range(1, num_paginates):
        page['from'] = page_index * page_size
        try:
            batch = get_batch(url, query)
        except Exception as err:
            raise Exception("Request to STI blew up: %s" % err) from err
        records += batch['results']
    return records

In [None]:
# -- Run API Request
from_sti = get_records(api_url, params)

# Flatten the JSON records into a table
dt = pd.json_normalize(from_sti, meta=['title'])

# Columns to carry forward into the review workbook
desired_fields = [
    "id",
    "subjectCategories",
    "fundingNumbers",
    "authorAffiliations",
    "title",
    "stiType",
    "abstract",
    "publications",
    "center.code",
    "otherReportNumbers",
    "keywords",
    "sourceIdentifiers",
    "meetings"
]

# Keep only the desired columns that the harvest actually returned
available_fields = [col for col in desired_fields if col in dt.columns]

# Select those columns, then discard rows without a Document ID and
# collapse repeated Document IDs (the last occurrence wins)
df = (
    dt[available_fields]
    .dropna(subset=['id'])
    .drop_duplicates(subset=['id'], keep='last')
)

## Metadata Wrangling

In [None]:
# TITLE/T  -- anything that is not text (None, NaN) becomes ''
lsT = [t if isinstance(t, str) else '' for t in df['title']]

# ABSTRACT/B  -- same treatment of missing values as titles
lsB = [b if isinstance(b, str) else '' for b in df['abstract']]

# KEYWORDS/K
lsK = [', '.join(k) if isinstance(k, list) else '' for k in df['keywords']]

# STI Subject Categories - If any of the ES subject categories, insert 'Earth Science'
EScats = ['Geosciences', 
          'Earth Resources and Remote Sensing', 
          'Energy Production and Conversion',
          'Environment Pollution',
          'Geophysics', 
          'Meteorology And Climatology',
          'Oceanography']

# Every row is stored as a '; '-joined string (empty when the record has no
# category list) so the output column and the collection checks below see
# the same type for every record.
STIsubcats = []
for s in df['subjectCategories']:
    cats = list(s) if isinstance(s, list) else []
    if any(cat in cats for cat in EScats):
        cats.append('Earth Science')
    STIsubcats.append('; '.join(cats))


# COLLECTION/W
lsW = []
for s in STIsubcats:
    if "Astronomy" in s:
        lsW.append('AST')
    elif "Geophysics" in s:
        # Checked before "Physics": a Geophysics record gets no collection
        # even when a Physics category is also present
        lsW.append('')
    elif "Physics" in s:
        lsW.append('PHY')
    else:
        lsW.append('')


In [None]:
# AUTHORS/A & AFFILIATIONS/F

# Function to reformat author names if not already in format "Last, First"
    # Also appends periods '.' to initials if necessary

# Trailing name tokens treated as a generational suffix rather than a surname
_NAME_SUFFIXES = ("Jr.", "Sr.", "Jr", "Sr", "II", "III", "IV")


def _dot_initials(tokens):
    """Return tokens with a period appended to every single-character token."""
    return [tok + '.' if len(tok) == 1 and tok != '.' else tok for tok in tokens]


def reformat_author(author_name):
    """Normalize an author name to "Last, First Middle[, Suffix]".

    Names containing a comma are taken to be "Last, First ..." already; text
    after a second comma is kept as a suffix. Names without a comma are read
    as "First [Middle ...] Last [Suffix]". One-letter given names get a
    trailing period in both cases.

    Args:
        author_name: The raw author name.

    Returns:
        The formatted name, or None when the input is empty, blank, or not
        a string. A name made of a single word is returned unchanged.
    """
    if not isinstance(author_name, str):
        return None
    author_name = author_name.strip()
    if not author_name:
        return None

    if ',' in author_name:
        # "Last, First Middle[, Suffix]" -- also accepts "Last,First"
        parts = [part.strip() for part in author_name.split(',')]
        last_name = parts[0]
        given = _dot_initials(parts[1].split())
        suffix = ', '.join(part for part in parts[2:] if part)
    else:
        # "First [Middle ...] Last [Suffix]"
        tokens = author_name.split()
        suffix = ''
        if len(tokens) > 1 and tokens[-1] in _NAME_SUFFIXES:
            suffix = tokens.pop()
        last_name = tokens[-1]
        given = _dot_initials(tokens[:-1])

    # Construct formatted name, leaving out the parts that are absent
    formatted_name = last_name
    if given:
        formatted_name += ", {}".format(' '.join(given))
    if suffix:
        formatted_name += ", {}".format(suffix)
    return formatted_name


# Grab Author/Affiliation Metadata
# lsA / lsF hold one list per record; within a record the i-th affiliation
# belongs to the i-th author.
lsA = [] # Authors List
lsF = [] # Affiliations List

for authors in df['authorAffiliations']:
    row_authors = []
    row_affils = []

    # Records without author data arrive as NaN/None instead of a list;
    # they still get an (empty) entry so lsA/lsF stay aligned with df.
    if not isinstance(authors, list):
        authors = []

    for entry in authors:
        # Tolerate entries with a missing or null 'meta', 'author' or 'organization'
        meta = (entry.get('meta') if isinstance(entry, dict) else None) or {}
        person = meta.get('author') or {}
        organization = meta.get('organization') or {}

        # Author name, reformatted to "Last, First"
        row_authors.append(reformat_author(person.get('name') or ''))

        # Affiliation = organization name and location, whichever are present
        aff_name = organization.get('name', '')
        aff_loc = organization.get('location', '')
        affil = ', '.join(filter(None, [aff_name, aff_loc]))

        # ORCID, when known, is appended to the affiliation as an <ID> tag
        orcid = person.get('orcidId', '')
        if orcid:
            affil += ' <ID system="ORCID">{}</ID>'.format(orcid)

        row_affils.append(affil)

    lsA.append(row_authors)
    lsF.append(row_affils)


In [None]:
# STI Stuff
def _or_blank(value):
    """Return '' for None, NaN or any other falsy value; otherwise the value itself."""
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return ''
    return value if value else ''


STI_ids = [_or_blank(d) for d in df['id']]
STI_types = [_or_blank(t) for t in df['stiType']]
centers = [_or_blank(c) for c in df['center.code']]

# Report Numbers -- exactly one entry per record (the list of its report
# numbers, or '' when it has none) so the list lines up with STI_ids
reportNums = []
for numbers in df['otherReportNumbers']:
    if isinstance(numbers, list):
        row_reports = [str(n) for n in numbers if n]
        reportNums.append(row_reports if row_reports else '')
    else:
        reportNums.append('')

# Funding Numbers -- exactly one entry per record: its "type: number" pairs
# joined with ', ', or '' when it has none
fundingNums = []
for numbers in df['fundingNumbers']:
    row_funding = []
    if isinstance(numbers, list):
        for entry in numbers:
            if isinstance(entry, dict) and 'type' in entry and 'number' in entry:
                row_funding.append('{}: {}'.format(entry['type'], entry['number']))
    fundingNums.append(', '.join(row_funding))

# Concat Numbers: "STI: <id>, <report numbers>, <funding numbers>"
otherNums = []
for STIid, reportNum, fundNum in zip(STI_ids, reportNums, fundingNums):
    parts = []
    if STIid:
        parts.append("STI: {}".format(STIid))
    if reportNum:
        parts.extend(reportNum)
    if fundNum:
        parts.append(fundNum)
    otherNums.append(", ".join(parts))

In [None]:
# PUBDATE/D, PUBLICATION/J, LINK PROPERTIES/I 
## -- future work -> Insert meeting info; sub a different date for missing pubdate

# Leading "https://doi.org/", "http://dx.doi.org/", "doi.org/" or "doi:" (any case)
_DOI_PREFIX = re.compile(
    r'^\s*(?:(?:(?:https?://)?(?:dx\.)?doi\.org/|doi:)\s*)+', re.IGNORECASE)


def _clean_doi(raw):
    """Return raw without a URL/'doi:' prefix and surrounding blanks ('' if not text)."""
    if not isinstance(raw, str):
        return ''
    return _DOI_PREFIX.sub('', raw).strip()


def _unique(items):
    """Return items without repeats, keeping the first occurrence of each."""
    seen = set()
    kept = []
    for item in items:
        if item not in seen:
            seen.add(item)
            kept.append(item)
    return kept


# Metadata for refstrings
pubs_ls = [] # Pubnames
vols_ls = [] # Volumes
dois_ls = [] # DOIs

# Metadata for data curation/ingest
lsD = []  # Pubdates
lsJ = []  # Journal/Pub
lsI = []  # Properties/Links

for pubs, meets, idents, docID in zip(df['publications'], df['meetings'], df['sourceIdentifiers'], STI_ids):
    # Everything is reset per record so nothing leaks over from the previous one
    pubdate = ''
    pubname = ''
    volume = ''
    segments = []   # publication and meeting descriptions, joined with '; ' for J
    links = []      # link properties, joined with '; ' for I
    dois = []       # cleaned DOIs in the order encountered

    # Records without publication data arrive as NaN/None instead of a list
    for entry in (pubs if isinstance(pubs, list) else []):
        if not isinstance(entry, dict):
            continue
        entry_date = (entry.get('publicationDate') or '')[:10]
        publication_name = (entry.get('publicationName') or '').strip()
        vol = entry.get('volume') or ''
        issue = entry.get('issue') or ''
        publisher = entry.get('publisher') or ''
        issn = entry.get('issn') or ''
        eissn = entry.get('eissn') or ''
        isbn = entry.get('isbn') or ''
        eisbn = entry.get('eisbn') or ''
        url = entry.get('url') or ''
        doi = _clean_doi(entry.get('doi'))

        # Describe this publication as "Name, Vol. V, Issue I, Published by P, YYYY, ..."
        parts = []
        if publication_name:
            parts.append(publication_name)
        if vol:
            parts.append('Vol. {}'.format(vol))
        if issue:
            parts.append('Issue {}'.format(issue))
        if publisher:
            year = entry_date[:4]
            if year:
                parts.append('Published by {}, {}'.format(publisher, year))
            else:
                parts.append('Published by {}'.format(publisher))
        if issn:
            parts.append('ISSN: {}'.format(issn))
        if eissn:
            parts.append('eISSN: {}'.format(eissn))
        if isbn:
            parts.append('ISBN: {}'.format(isbn))
        if eisbn:
            parts.append('eISBN: {}'.format(eisbn))
        if parts:
            segments.append(', '.join(parts))

        # The refstring fields come from the first publication entry that has them
        if not pubdate:
            pubdate = entry_date
        if not (pubname or volume):
            pubname, volume = publication_name, str(vol)

        # Concat link info from publications field
        if url:
            links.append('ELECTR: {}'.format(url))
        if doi:
            links.append('DOI: {}'.format(doi))
            dois.append(doi)

    if isinstance(meets, list):         # Concat meeting info
        for entry in meets:
            if isinstance(entry, dict) and entry.get('name'):
                if entry.get('location'):
                    segments.append("{}, {}".format(entry['name'], entry['location']))
                else:
                    segments.append("{}.".format(entry['name']))

    if isinstance(idents, list):       # Concat additional links from sourceIdentifiers field
        for entry in idents:
            if not isinstance(entry, dict):
                continue
            number = entry.get('number')
            if not isinstance(number, str) or not number:
                continue
            kind = entry.get('type')
            if kind == 'URL':
                if 'arXiv' in number or 'arxiv.org' in number:
                    links.append('ARXIV: {}'.format(number.replace('arXiv:', '')))
                else:
                    links.append('ELECTR: {}'.format(number))
            elif kind == 'DOI':
                doi = _clean_doi(number)
                if doi:
                    links.append('DOI: {}'.format(doi))
                    dois.append(doi)

    # Every record links back to its NTRS landing page
    links.append('ELECTR: https://ntrs.nasa.gov/citations/{}'.format(docID))

    # Append metadata to lists for refstrings and curation/ingest
    pubs_ls.append(pubname)
    vols_ls.append(volume)
    # One clean DOI for the refstring; a DOI listed under both publications
    # and sourceIdentifiers must not be concatenated with itself
    dois_ls.append(dois[0] if dois else '')
    lsD.append(pubdate)
    lsJ.append('; '.join(segments))
    lsI.append('; '.join(_unique(links)))


In [None]:
# Build one dict per harvested item from the parallel metadata lists
def _join_present(values):
    """Join the entries that are not None with '; '."""
    return '; '.join([value for value in values if value is not None])


record_columns = zip(lsA, lsF, lsT, lsD, lsJ, lsB, lsI, lsW, lsK,
                     STIsubcats, STI_types, centers, otherNums)

records = [
    {
        "authors": _join_present(authors),
        "affiliations": _join_present(affiliations),
        "title": title,
        "pubdate": pubdate,
        "publication": publication,
        "abstract": abstract,
        "properties": properties,
        "collection": collection,
        "keywords": keywords,
        "STI subject categories": subcat,
        "type": STI_type,
        "NASA center": center,
        "other": other
    }
    for (authors, affiliations, title, pubdate, publication, abstract,
         properties, collection, keywords, subcat, STI_type, center, other)
    in record_columns
]


## Reference Resolver Service - Match Existing ADS Records

In [None]:
# Prepare reference strings to query ADS Reference Resolver Service
# list_for_REFS gets exactly one JSON string per harvested item, in order.
list_for_REFS = []


def _ref_text(value):
    """Return value as text without tab characters ('' for None or NaN)."""
    if value is None or value != value:   # NaN is the only value unequal to itself
        return ''
    return str(value).replace("\t", "")


    # -- Option to generate refstrings of {Author, Year, Publication, Volume}
for A, D, T, pub, vol, doi in zip(lsA, lsD, lsT, pubs_ls, vols_ls, dois_ls):

    # Year = first 4 digits of pubdate
    D = _ref_text(D)[:4]
    T = _ref_text(T)
    pub = _ref_text(pub)
    vol = _ref_text(vol)
    doi = _ref_text(doi)

    # Use up to the first 10 authors, skipping names that could not be formatted
    if isinstance(A, list):
        A = '; '.join([_ref_text(a) for a in A if a][:10])
    else:
        A = _ref_text(A)

    # Concat refstrings
    if A and D and pub and vol:
        ref = {
            "refstr": "%s, %s, %s %s" % (A, D, pub, vol),
            "authors": A,
            "year": D,
            "journal": "%s %s" % (pub, vol)
        }
        if doi:
            ref["refstr"] += ", %s" % doi
            ref["doi"] = doi

    elif A and D and T:
        ref = {
            "refstr": "%s, %s, %s" % (A, D, T),
            "authors": "%s" % A,
            "year": "%s" % D,
            "title": "%s" % T
        }

    else:
        # Too little metadata for either pattern. A reference is still emitted
        # from whatever is available, so this item keeps its own slot instead
        # of silently re-using the previous item's reference.
        ref = {"refstr": ", ".join(part for part in (A, D, T or pub) if part)}
        if A:
            ref["authors"] = A
        if D:
            ref["year"] = D
        if T:
            ref["title"] = T

    # Tabs were already removed from the values, so the JSON text needs no patching
    list_for_REFS.append(json.dumps(ref, ensure_ascii=False))

In [None]:
# -- Reference Resolver Service Setup

# ADS Prod API Token
# Read from the ADS_API_TOKEN environment variable when it is set; otherwise
# the value below is used.
# NOTE(review): a credential stored in the notebook is readable by anyone who
# can open the file -- rotate this token and supply it via the environment.
import os
token = os.environ.get('ADS_API_TOKEN') or 'pHazHxvHjPVPAcotvj7DIijROZXUjG5vXa2OaCQO'
domain = 'https://api.adsabs.harvard.edu/v1/'

# Reference Service API request, querying my 'references' list
def resolve(references, timeout=300):
    """Send parsed references to the ADS Reference Service.

    Args:
        references: List of reference dicts, posted as ``parsed_reference``.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        A tuple ``(resolved, status_code)``: the ``resolved`` list of the
        response and 200 on success, or ``(None, status_code)`` for any other
        HTTP status (the status is also printed).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    payload = {'parsed_reference': references}
    response = requests.post(
        url = domain + 'reference/xml',
        headers = {'Authorization': 'Bearer ' + token,
                 'Content-Type': 'application/json',
                 'Accept':'application/json'},
        data = json.dumps(payload),
        timeout = timeout)
    if response.status_code != 200:
        print('From reference status_code is ', response.status_code)
        return None, response.status_code
    return json.loads(response.content)['resolved'], 200

# -- Run Reference Resolver Service
references = [json.loads(ref.replace("\n", " ")) for ref in list_for_REFS]

UNRESOLVED = '...................'   # bibcode value that marks a reference with no match
BATCH_SIZE = 16                      # references sent per request

# Resolve my references
total_results = []     # every result the service returned
aligned_results = []   # one slot per reference; None where its batch could not be resolved
print("Querying %d references with the Reference Service ..."%len(references))
for i in range(0, len(references), BATCH_SIZE):
    batch = references[i:i + BATCH_SIZE]
    results, status = resolve(batch)
    if results:
        total_results += results
    if results and len(results) == len(batch):
        aligned_results += results
    else:
        # Failed (or incomplete) batch: keep the slots so that later
        # references still pair up with their own records
        aligned_results += [None] * len(batch)

# REF RESULTS (df2): one row per result -> refstring, bibcode, score, comment
bibcodes = []    # the same rows as tab separated text
ref_rows = []
bibcode_counter = 0
no_match_counter = 0
for r in total_results:
    if r['bibcode'] != UNRESOLVED:   # match found: refstring, bibcode and score
        row = (r['refstring'], r['bibcode'], str(r['score']), '')
        bibcode_counter += 1
    else:                            # no match: refstring and error comment
        row = (r['refstring'], '', '', r.get('comment') or '')
        no_match_counter += 1
    ref_rows.append(row)
    bibcodes.append("\t".join(row))

# Built from tuples with explicit column names, so the frame has all four
# columns even when every reference matched or nothing was returned
df2 = pd.DataFrame(ref_rows, columns=['refstring', 'bibcode', 'score', 'comment'])
df2 = df2.drop_duplicates(subset='refstring')                       # Drop duplicates
df2 = df2.sort_values(by=['score','comment'],ascending=False)       # sort by score, then comment

# NEW INGESTS (df3): records whose reference found no match
to_ingest = []
unchecked = 0
for record, result in zip(records, aligned_results):
    if result is None:
        unchecked += 1
    elif result['bibcode'] == UNRESOLVED:
        to_ingest.append(record)

if unchecked:
    print("WARNING: %d records could not be checked against ADS (failed Reference "
          "Service requests) and are not in the ingest list; re-run to retry." % unchecked)

df3 = pd.json_normalize(to_ingest)                  # Generate data frame of new ingests
if 'title' in df3.columns:                          # absent when nothing needs ingesting
    df3 = df3.sort_values(by=['title'])             # sort by title

## Output Results

In [None]:
# Write the three result tables into one Excel workbook, one sheet per table
outfile = name + "_STIreview.xlsx"
sheets = [
    ('sti_output', df),     # Original NTRS API output
    ('ref_results', df2),   # Reference Resolver results
    ('ingest_new', df3),    # non-matched records for ingest review
]
with pd.ExcelWriter(filepath + outfile) as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Print summary: query used, record counts, and the workbook name
query_part = 'RESULTS SUMMARY\n\n STI API Query Parameters: {}\n'.format(params)
counts_part = '\n > Records generated: {}\n > Records matched (ADS): {}\n > Records for ingest: {}'.format(
    len(references), bibcode_counter, len(to_ingest))
saved_part = '\n Results saved to {}'.format(outfile)
print(query_part, counts_part, saved_part)