# STI Curation

NOTEBOOK INPUT:
1. Set file names
2. Choose process to run for records: new ingests or metadata updates

NOTEBOOK OUTPUT:
- JSON file (`.json`) and tagged-format file (`.tag`) of the records

In [None]:
import requests
import math
import json
import pandas as pd
import sys, os, io
import argparse
import numpy as np
import re
import csv
import unicodedata
from openpyxl import load_workbook
import html
from pyingest.serializers.classic import Tagged


# -- Base name (date_category) used to build the input and output file names
name = "2306AST"

# -- Local directory the files are read from and written to
filepath = "/Users/sao/Documents/Python-Projects/STI/"

# --- PROCESS TO RUN: leave exactly one `sheet` assignment uncommented
# sheet = "metadata_updates"
sheet = "ingest_new"

# --- COLLECTION: optional selectors, all disabled by default
# collection = "AST"     # ASTRONOMY
# collection = "PHY"      # PHYSICS
# collection = ""         # EARTH SCIENCE
# subcat = "Earth Science"


In [None]:
# FILE INPUT
# The review spreadsheet is read from <filepath><name>_STIreview.xlsx.
input_filename = name + "_STIreview.xlsx"
path = filepath + input_filename

# FILE OUTPUT
# Each process writes to its own base name. An unrecognised `sheet` value is
# rejected here with a clear message instead of surfacing further down as a
# NameError on `outfilename`.
if sheet == "ingest_new":
    outfilename = name + "_STIingest"
elif sheet == "metadata_updates":
    outfilename = name + "_STIupdates"
else:
    raise ValueError(
        f"Unknown sheet {sheet!r}: expected 'ingest_new' or 'metadata_updates'"
    )
json_output = outfilename + ".json"
tagged = outfilename + ".tag"

# READ FILE AND GET METADATA
# A single sheet name makes read_excel return a DataFrame, so no re-wrapping
# in pd.DataFrame is needed.
dt = pd.read_excel(path, sheet_name=sheet)

# Sheets without a bibcode column get an all-empty one so the rest of the
# cell can treat every sheet the same way.
if "bibcode" not in dt.columns:
    dt["bibcode"] = np.nan

bibcodes = dt["bibcode"].astype(str)
authors = dt["authors"].astype(str)
affiliations = dt["affiliations"].astype(str)
titles = dt["title"].astype(str)
pubdates = dt["pubdate"].astype(str)
pubs = dt["publication"].astype(str)
abstracts = dt["abstract"].astype(str)
links = dt["properties"].astype(str)
subcats = dt["STI subject categories"].astype(str)
collections = dt["collection"].astype(str)
keywords = dt["keywords"].astype(str)


def _blank_missing(values):
    """Return *values* as a list, with the 'nan' placeholder replaced by ''."""
    return ['' if v == 'nan' else v for v in values]


def _strip_float_suffix(value):
    """Remove one trailing '.0' (left by float-to-str conversion) from *value*.

    Only the end of the string is touched, so a date such as '2023.05' is
    kept intact instead of being collapsed to '20235'.
    """
    return value[:-2] if value.endswith('.0') else value


lsR = _blank_missing(bibcodes)
lsA = _blank_missing(authors)
lsF = _blank_missing(affiliations)
lsT = _blank_missing(titles)
lsD = [_strip_float_suffix(d) for d in _blank_missing(pubdates)]
lsJ = _blank_missing(pubs)
lsI = _blank_missing(links)
lsB = _blank_missing(abstracts)
lsQ = _blank_missing(subcats)
lsW = _blank_missing(collections)
lsK = _blank_missing(keywords)

# ZIP TOGETHER RECORDS
# One dict per spreadsheet row. The author and affiliation cells are split
# on "; " into lists; every other field is passed through as a string.
records = [
    {
        "bibcode": bibcode,
        "authors": author_cell.split("; "),
        "affiliations": aff_cell.split("; "),
        "pubdate": pubdate,
        "title": title,
        "publication": publication,
        "properties": properties,
        "abstract": abstract,
        "database": database,
        "subjectcategory": subject,
        "keywords": keyword_cell,
        "source": "ADS",
    }
    for (
        bibcode,
        author_cell,
        aff_cell,
        title,
        pubdate,
        publication,
        properties,
        abstract,
        subject,
        database,
        keyword_cell,
    ) in zip(lsR, lsA, lsF, lsT, lsD, lsJ, lsI, lsB, lsQ, lsW, lsK)
]

def _unescape_value(value):
    """Return *value* with HTML entities decoded.

    Strings are decoded directly, lists are decoded element by element, and
    any other type is returned unchanged.
    """
    if isinstance(value, str):
        return html.unescape(value)
    if isinstance(value, list):
        return [_unescape_value(item) for item in value]
    return value


# Decode HTML entities (e.g. "&amp;" -> "&") in every field of every record.
# html.unescape works on strings and returns a new one, so it has to be applied
# per value and the result stored back; calling it on the dict changes nothing.
# NOTE(review): html.unescape also decodes some legacy entity names that lack
# a trailing ';' -- confirm this is acceptable for URLs in "properties".
for r in records:
    for key, value in r.items():
        r[key] = _unescape_value(value)
    
# SAVE JSON FILE
# Serialize all records to one JSON document, then report how many were saved.
json_path = filepath + json_output
with open(json_path, 'w') as outfile:
    outfile.write(json.dumps(records))
print(f"Saved {len(records)} records as {json_output}")

# Pyingest Serializer - Transform json into tagged format
# Both files are opened in `with` blocks so the handles are closed even if
# loading or serializing a record raises.
with open(filepath + json_output) as f:
    json_file = json.load(f)

# NOTE(review): the tagged file is opened in append mode, so re-running this
# cell adds the same records to an existing .tag file again -- confirm that
# this is intended.
with open(filepath + tagged, 'a') as outputfp:
    for record in json_file:
        serializer = Tagged()
        serializer.write(record, outputfp)