# HOLLIS Curation and Libraries

#### HOLLIS Curation
1. Choose classification input (QB, QC, etc.)
2. Input records for ADS metadata updates
3. Input records for new ingests
___
NOTEBOOK OUTPUT:
- Metadata updates file: "{date}{classification}_metadata_updates.json"
- New ingests file: "{date}{classification}_ingest_new.json"

## HOLLIS Reviewed: Metadata Updates to ADS

In [None]:
# Classification code for this run (e.g. QB, QC). It is part of every
# input/output filename and names the working sub-folder.
classification = 'QB'
# classification = "QC"

# Run label placed at the front of the input workbook name and of both
# JSON output names.
date = 'Test09'

In [None]:
import requests
import math
import json
import pandas as pd
import sys, os, io
import argparse
import numpy as np
import re
import csv
import unicodedata
from openpyxl import load_workbook

# File input: a single workbook per run. This cell reads the sheet at
# index 1 and converts its rows to metadata-update records.
input_filename = date + classification + "_ingest_new.xlsx"
path = "/Users/sao/Documents/Python-Projects/hollis_harvest/" + classification + "/"

# File output (json):
metadata_updates_outfilename = date + classification + "_metadata_updates.json"
ingest_new_outfilename = date + classification + "_ingest_new.json"

# --------


def _column_as_text(frame, column):
    """Return ``frame[column]`` as a list of strings, with missing cells as ''.

    Values are converted with ``astype(str)``. Cells that pandas reports as
    missing are replaced by the empty string, so they never reach the output
    as the literal text 'nan'.
    """
    values = frame[column]
    # Build the mask from the raw column: after astype(str) a missing cell is
    # indistinguishable from a cell that really contains the text "nan".
    return values.astype(str).where(values.notna(), "").tolist()


dt = pd.read_excel(path + input_filename, sheet_name=1)

# Columns copied through as plain strings.
bib_ls = dt["bibcode"].astype(str).tolist()
pubdates_ls = dt["pubdate"].astype(str).tolist()
pubs_ls = dt["publication"].astype(str).tolist()

# Columns where an empty cell must become '' rather than 'nan'.
auth_ls = _column_as_text(dt, "authors")
title_ls = _column_as_text(dt, "title")
abs_ls = _column_as_text(dt, "abstract")
DOI_ls = _column_as_text(dt, "properties.DOI")
ELECTR_ls = _column_as_text(dt, "properties.ELECTR")

# One link entry per row: the DOI when present, otherwise the electronic
# link, otherwise '' (no link available).
links_ls = []
for doi, electr in zip(DOI_ls, ELECTR_ls):
    if doi:
        links_ls.append({"DOI": doi})
    elif electr:
        links_ls.append({"ELECTR": electr})
    else:
        links_ls.append('')

records = []
for bib, auth, title, pubdate, pub, links, abstract in zip(
        bib_ls, auth_ls, title_ls, pubdates_ls, pubs_ls, links_ls, abs_ls):
    records.append({"bibcode": bib,
                    # Authors are stored in one cell, separated by "; ".
                    "authors": auth.split("; "),
                    "pubdate": pubdate,
                    "title": title,
                    "publication": pub,
                    "properties": links,
                    "abstract": abstract,
                    "source": "ADS"})

# Save json file of data
with open(path + metadata_updates_outfilename, 'w') as outfile:
    json.dump(records, outfile)
print("Saved",len(records),"records as",metadata_updates_outfilename)

## HOLLIS Reviewed: Ingest New to ADS

In [None]:
import html


def _column_as_text(frame, column):
    """Return ``frame[column]`` as a list of strings, with missing cells as ''.

    Values are converted with ``astype(str)``. Cells that pandas reports as
    missing are replaced by the empty string, so they never reach the output
    as the literal text 'nan'.
    """
    values = frame[column]
    # Build the mask from the raw column: after astype(str) a missing cell is
    # indistinguishable from a cell that really contains the text "nan".
    return values.astype(str).where(values.notna(), "").tolist()


# Same workbook as the metadata updates; new ingests are on the first sheet.
dt = pd.read_excel(path + input_filename, sheet_name=0)

# Columns copied through as plain strings.
pubdates_ls = dt["pubdate"].astype(str).tolist()
pubs_ls = dt["publication"].astype(str).tolist()

# Columns where an empty cell must become '' rather than 'nan'. Each value is
# judged on its own cell, not on a variable left over from an earlier loop.
auth_ls = _column_as_text(dt, "authors")
title_ls = _column_as_text(dt, "title")
abs_ls = _column_as_text(dt, "abstract")
DOI_ls = _column_as_text(dt, "properties.DOI")
ELECTR_ls = _column_as_text(dt, "properties.ELECTR")

# One link entry per row: the DOI when present, otherwise the electronic
# link, otherwise '' (no link available).
links_ls = []
for doi, electr in zip(DOI_ls, ELECTR_ls):
    if doi:
        links_ls.append({"DOI": doi})
    elif electr:
        links_ls.append({"ELECTR": electr})
    else:
        links_ls.append('')

records = []
for auth, title, pubdate, pub, links, abstract in zip(
        auth_ls, title_ls, pubdates_ls, pubs_ls, links_ls, abs_ls):
    # html.unescape() only accepts str and returns a new string, so it is
    # applied to each text field and the result is stored. The author cell is
    # decoded before splitting because an entity such as "&amp; " contains
    # the "; " separator. Link values are left untouched: a URL query string
    # like "?a=1&copy=2" would be corrupted by entity decoding.
    records.append({"bibcode": "",
                    "authors": html.unescape(auth).split("; "),
                    "pubdate": pubdate,
                    "title": html.unescape(title),
                    "publication": html.unescape(pub),
                    "properties": links,
                    "abstract": html.unescape(abstract),
                    "source": "ADS"})

# Save json file of data
with open(path + ingest_new_outfilename, 'w') as outfile:
    json.dump(records, outfile)
print("Saved",len(records),"records as",ingest_new_outfilename)

In [None]:
## Learn how to manipulate serializers with file names