# HOLLIS Curation

NOTEBOOK INPUT:
1. Choose classification input (QB, QC, etc), collection, and subject category
2. Choose process to run for records: new ingests or metadata updates

NOTEBOOK OUTPUT:
- JSON file and tagged file named "{date}{classification}_{process}.{file format}", where {process} is "ingest" or "updates" and {file format} is "json" or "tag"

In [None]:
import requests
import math
import json
import pandas as pd
import sys, os, io
import argparse
import numpy as np
import re
import csv
import unicodedata
from openpyxl import load_workbook
import html
from pyingest.serializers.classic import Tagged


## --- INPUT DATE OF PROCESSING
# Used as the leading part of the input workbook name and of both output file names.
# (presumably a YYMM stamp — confirm against the harvest workflow)
date = "2305"

## --- SELECT PROCESS TO RUN
# Name of the worksheet read from the reviewed workbook; leave exactly one uncommented.
# "ingest_new" produces "..._ingest" outputs, "metadata_updates" produces "..._updates".
# sheet = "metadata_updates"
sheet = "ingest_new"

## --- SELECT CLASSIFICATION, COLLECTION, SUBJECT CATEGORY
# Uncomment exactly one group below before running the processing cell.
# `classification` and `collection` have no default in this cell, so the
# processing cell fails with NameError if every group stays commented out.
# `classification` selects the input folder/file name; `collection` is written
# to each record's "database" field.
# NOTE(review): `subcat` is never read by the processing cell — each record's
# subject category comes from the workbook's "subject category" column instead.
# Confirm whether the value set here was meant to be a fallback.
# #   ASTRONOMY:
# classification = "QB"
# collection = "AST"

# #   PHYSICS:
# classification = "QC"
# collection = "PHY"

# #   EARTH SCIENCE:
# classification = "QE"
# collection = ""
# subcat = "Earth Science"


In [None]:
# FILE INPUT
input_filename = date + classification + "_reviewed.xlsx"
path = "/Users/sao/Documents/Python-Projects/hollis_harvest/" + classification + "/"

# FILE OUTPUT
# Fail fast on an unrecognised sheet name instead of hitting a NameError on
# `outfilename` a few lines further down.
if sheet == "ingest_new":
    outfilename = date + classification + "_ingest"
elif sheet == "metadata_updates":
    outfilename = date + classification + "_updates"
else:
    raise ValueError(
        "Unknown sheet {!r}: expected 'ingest_new' or 'metadata_updates'".format(sheet)
    )
json_output = outfilename + ".json"
tagged = outfilename + ".tag"


def _column_as_strings(frame, column):
    """Return `column` of `frame` as a list of str, mapping missing cells to ''.

    Missing cells become the string 'nan' after `astype(str)`; those are
    replaced with the empty string.
    """
    return [value if value != 'nan' else '' for value in frame[column].astype(str)]


def _column_as_text(frame, column):
    """Like `_column_as_strings`, but also decode HTML entities (e.g. '&amp;')."""
    return [html.unescape(value) for value in _column_as_strings(frame, column)]


def _strip_float_suffix(value):
    """Drop a trailing '.0' left behind when a numeric cell is rendered as a float.

    Only the suffix is removed, so a value such as '2020.05' is left intact.
    """
    return value[:-2] if value.endswith('.0') else value


# READ FILE AND GET METADATA
# read_excel with a single sheet name already returns a DataFrame.
dt = pd.read_excel(path + input_filename, sheet_name=sheet)

# Optional columns: create them empty so the extraction below is uniform.
if "bibcode" not in dt.columns:
    dt["bibcode"] = np.nan

if "subject category" not in dt.columns:
    dt["subject category"] = np.nan

# Identifiers and links are taken verbatim: HTML-unescaping them could corrupt
# values that legitimately contain '&' (e.g. a URL query with '&copy=...').
bibcodes_ls = _column_as_strings(dt, "bibcode")
pubdates_ls = [_strip_float_suffix(d) for d in _column_as_strings(dt, "pubdate")]
DOI_ls = _column_as_strings(dt, "properties.DOI")
ELECTR_ls = _column_as_strings(dt, "properties.ELECTR")
# Prefer the DOI; fall back to the electronic link; '' when neither is present.
links_ls = [{"DOI": doi} if doi else {"ELECTR": electr} if electr else '' for doi, electr in zip(DOI_ls, ELECTR_ls)]

# Descriptive text fields have their HTML entities decoded.
subcats_ls = _column_as_text(dt, "subject category")
auth_ls = _column_as_text(dt, "authors")
title_ls = _column_as_text(dt, "title")
pubs_ls = _column_as_text(dt, "publication")
abs_ls = _column_as_text(dt, "abstract")

# ZIP TOGETHER RECORDS
records = []
for bib, subcat, auth, title, pubdate, pub, links, abstract in zip(bibcodes_ls, subcats_ls, auth_ls, title_ls, pubdates_ls, pubs_ls, links_ls, abs_ls):
    records.append({
                    "bibcode":bib,
                    # `auth` was unescaped before this split, so an entity such
                    # as '&amp; ' cannot be mistaken for the '; ' separator.
                    "authors":auth.split("; "),
                    "pubdate":pubdate,
                    "title":title,
                    "publication":pub,
                    "properties":links,
                    "abstract":abstract,
                    "database":collection,
                    "subjectcategory":subcat,
                    "source":"ADS"})

# SAVE JSON FILE
with open(path + json_output, 'w') as outfile:
    json.dump(records, outfile)
print("Saved",len(records),"records as",json_output)

# Pyingest Serializer - Transform json into tagged format
with open(path + json_output) as f:
    json_file = json.load(f)

# NOTE(review): the tagged file is opened in append mode, as before, so
# re-running this cell for the same date/classification/process appends the
# records again. Delete the .tag file first (or switch to 'w') if unintended.
with open(path + tagged, 'a', encoding='utf-8') as outputfp:
    for record in json_file:
        serializer = Tagged()
        serializer.write(record, outputfp)
        print(record,'\n')