# STI Curation

INPUT:
- Excel workbook `<name>_data.xlsx` (set `name` and `filepath` in the first code cell)

OUTPUT:
- JSON file (`<name>_STIingest.json`) and ADS tagged file (`<name>_STIingest.tag`) of records

In [None]:
import requests
import math
import json
import pandas as pd
import sys, os, io
import argparse
import numpy as np
import re
import csv
import unicodedata
from openpyxl import load_workbook
import html
from pyingest.serializers.classic import Tagged

# -- Set name of output file (year).
#    The same value is used as the prefix of the input workbook
#    ("<name>_data.xlsx") and of the outputs ("<name>_STIingest.*").
name = "2023"

# -- Set local filepath of the directory that holds the input workbook.
#    It is concatenated directly with the file name, so it must end with "/".
#    NOTE: the cell that writes the output files rebinds `filepath` to a
#    separate output directory before saving.
filepath = "/Users/sao/Documents/Python-Projects/NASA_STI/data/"

In [None]:
# INPUT: the workbook "<name>_data.xlsx", located directly under `filepath`
# (`filepath` is expected to already end with a path separator).
input_filename = f"{name}_data.xlsx"
path = f"{filepath}{input_filename}"

# OUTPUT: both artefacts share the "<name>_STIingest" stem and differ only
# in extension (.json for the intermediate dump, .tag for the tagged file).
outfilename = f"{name}_STIingest"
json_output = f"{outfilename}.json"
tagged = f"{outfilename}.tag"

# READ FILE AND GET METADATA
# pd.read_excel already returns a DataFrame, so no re-wrapping is needed.
dt = pd.read_excel(path)

# Every column is coerced to str; empty cells (NaN) therefore become the
# literal text 'nan', which is blanked out further below.
bibcodes = dt["bibcode"].astype(str)
authors = dt["authors"].astype(str)
affiliations = dt["affiliations"].astype(str)
titles = dt["title"].astype(str)
pubdates = dt["pubdate"].astype(str)
pubs = dt["publication"].astype(str)
abstracts = dt["abstract"].astype(str)
links = dt["properties"].astype(str)
subcats = dt["subjectcategory"].astype(str)
collections = dt["collection"].astype(str)
keywords = dt["keywords"].astype(str)
comments = dt["comment"].astype(str)


def _blank_nan(values):
    """Return *values* as a list in which every 'nan' string is replaced by ''."""
    return ['' if value == 'nan' else value for value in values]


def _strip_float_suffix(text):
    """Remove a single trailing '.0' (left by float-to-str coercion) from *text*.

    Only the end of the string is touched, so a value such as '2023.05'
    is preserved instead of being collapsed to '20235'.
    """
    return text[:-2] if text.endswith('.0') else text


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or '' if none."""
    match = pattern.search(text)
    return match.group(1) if match else ''


# One cleaned list per column; the suffix letters are the single-letter
# names used when the records are zipped together later in this cell.
lsR = _blank_nan(bibcodes)
lsA = _blank_nan(authors)
lsF = _blank_nan(affiliations)
lsT = _blank_nan(titles)
lsD = [_strip_float_suffix(d) for d in _blank_nan(pubdates)]
lsJ = _blank_nan(pubs)
lsI = _blank_nan(links)
lsB = _blank_nan(abstracts)
lsQ = _blank_nan(subcats)
lsW = _blank_nan(collections)
lsK = _blank_nan(keywords)
lsX = _blank_nan(comments)

# Both patterns require whitespace before the label, so the ISSN pattern
# does not fire on the "ISSN" inside "eISSN". The captured value is the
# first run of non-space characters after the label and its punctuation.
extract_ISSNs = re.compile(r"\sISSN\W+(\S*)")
extract_eISSNs = re.compile(r"\seISSN\W+(\S*)")

# First (e)ISSN found in each publication string, or '' when absent.
ISSNs = [_first_group(extract_ISSNs, i) for i in lsJ]
eISSNs = [_first_group(extract_eISSNs, i) for i in lsJ]

# ZIP TOGETHER RECORDS
# Only rows whose bibcode cell is exactly the 19-dot placeholder are exported,
# and they are written with an empty bibcode.
# NOTE(review): presumably the placeholder marks rows that have no ADS
# bibcode yet - confirm against the spreadsheet conventions.
#
# HTML entities (e.g. "&amp;") are decoded with html.unescape as each field
# is stored. Decoding happens BEFORE the "; " split so that an entity
# followed by a space ("A &amp; B") is not mistaken for a list separator.
records = []
for R, A, F, T, D, J, I, B, Q, W, K, X, issn, eissn in zip(lsR, lsA, lsF, lsT, lsD, lsJ, lsI, lsB, lsQ, lsW, lsK, lsX, ISSNs, eISSNs):
    if R != "...................":
        continue
    records.append({
                    "bibcode": "",
                    "authors": html.unescape(A).split("; "),
                    "affiliations": html.unescape(F).split("; "),
                    "pubdate": html.unescape(D),
                    "title": html.unescape(T),
                    "publication": html.unescape(J),
                    #"ISSN": issn,  # include ISSN
                    #"eISSN": eissn, # include eISSN
                    "properties": html.unescape(I),
                    "abstract": html.unescape(B),
                    "database": html.unescape(W),
                    "subjectcategory": html.unescape(Q),
                    "keywords": html.unescape(K),
                    "comments": html.unescape(X),
                    "source":"ADS"})

# SAVE JSON FILE
# `filepath` is rebound here: from this point on it is the output directory.
filepath = "/Users/sao/Documents/Python-Projects/NASA_STI/tagged/"
with open(filepath + json_output, 'w') as outfile:
    json.dump(records, outfile)
print("Saved",len(records),"records as",json_output)

# Pyingest Serializer - Transform json into tagged format
# The JSON file just written is read back and each record is passed to the
# pyingest Tagged serializer. Both files are opened with `with` so the
# handles are closed even if loading or serialisation raises.
with open(filepath + json_output) as f:
    json_file = json.load(f)

# NOTE(review): the tagged file is opened in append mode, so running this
# cell again for the same `name` adds the records a second time - confirm
# this is intended, or delete the old .tag file before re-running.
with open(filepath + tagged, 'a') as outputfp:
    for record in json_file:
        serializer = Tagged()
        serializer.write(record, outputfp)
#         print(record,'\n')
print("Saved",len(records),"records as",tagged)

In [None]:
# # Include ntrs link for existing ADS records (matched)
# name = "2023"
# filepath = "/Users/sao/Documents/Python-Projects/STI/data/"
# input_filename = name + "_STIreview.xlsx"
# path = filepath + input_filename

# # Load the data from the Excel files into DataFrames
# ref_results_df = pd.read_excel(path, sheet_name='ref_results')
# sti_output_df = pd.read_excel(path, sheet_name='sti_output')

# # Create an empty list to store the matched data
# ads_records = []

# # Iterate through each row in 'ref_results' DataFrame
# for index, row in ref_results_df.iterrows():
#     refstring = row['refstring']
#     bibcode = row['bibcode']
    
#     # Check if the 'bibcode' is not NaN and the 'refstring' value can be found in 'sti_output' DataFrame
#     if not pd.isna(bibcode):
#         matched_row = sti_output_df[sti_output_df['publications'].str.contains(refstring, na=False)]
    
#         if not matched_row.empty:
#             # If there's a match, grab the 'id' and 'bibcode' values and add them to the list
#             ident = matched_row.iloc[0]['id']
#             ads_record = f'%R {bibcode}\n%I ELECTR: https://ntrs.nasa.gov/citations/{ident}\n'
#             ads_records.append(ads_record)

# # Join all the ADS records into a single string
# ads_records_str = '\n'.join(ads_records)

# # Save the ADS records to a text file
# with open('ads_records.txt', 'w') as file:
#     file.write(ads_records_str)
