# EarthArXiv Harvester 

In [None]:
from habanero import Crossref
import json
from datetime import datetime
from pyingest.serializers.classic import Tagged
import pandas as pd
import ast

# Crossref REST API client provided by habanero.
cr = Crossref()
# DOI prefix whose works are harvested (presumably EarthArXiv's, per the notebook title).
doi_prefix = '10.31223'
# Fetch the works registered under the prefix. cursor="*" requests cursor (deep) paging
# and limit=200 asks for 200 items per request; the harvest cell iterates `res` as a
# sequence of response pages, each with a ['message']['items'] list.
# NOTE(review): habanero appears to cap cursor paging through `cursor_max` — confirm its
# default is large enough to retrieve every work under this prefix.
res = cr.prefixes(ids = doi_prefix, works = True, cursor = "*", limit = 200)

In [None]:
# URL forms that may precede a bare identifier in the Crossref metadata.
ORCID_URL_PREFIXES = ("https://orcid.org/", "http://orcid.org/")
DOI_URL_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
)


def _strip_prefix(value, prefixes):
    """Return `value` without the first of `prefixes` it starts with.

    `str.lstrip` is deliberately not used: it removes any leading run of the
    given *characters* rather than a literal prefix, which corrupts identifiers
    (e.g. "https://orcid.org/0000-..." became "s://orcid.org/0000-...").
    """
    for prefix in prefixes:
        if value.startswith(prefix):
            return value[len(prefix):]
    return value


def _is_listed_author(author):
    """Tell whether `author` has a family name and is therefore output."""
    return bool(author.get('family', ''))


def format_authors(authors_data):
    """Return the authors as a "; "-joined string of "Family, Given" names.

    Authors without a family name are skipped; authors with only a family
    name are written as the family name alone.
    """
    formatted_authors = []
    for author in authors_data:
        if not _is_listed_author(author):
            continue
        last_name = author.get('family', '')
        first_name = author.get('given', '')
        if first_name:
            formatted_authors.append(f"{last_name}, {first_name}")
        else:
            formatted_authors.append(last_name)
    return "; ".join(formatted_authors)


def format_affs(authors_data):
    """Return one "; "-joined affiliation entry per author kept by format_authors.

    Each entry is the author's affiliation value followed by an
    <ID system="ORCID"> element when an ORCID is present; an author with
    neither contributes an empty entry so positions stay aligned with the
    author list.
    """
    formatted_affs = []
    for author in authors_data:
        # Skip exactly the authors format_authors skips, otherwise the two
        # lists drift apart and affiliations attach to the wrong author.
        if not _is_listed_author(author):
            continue
        aff = author.get('affiliation', '')
        orcid = _strip_prefix(author.get('ORCID', ''), ORCID_URL_PREFIXES)
        formatted_aff = ""
        if aff:
            # NOTE(review): if 'affiliation' is a list of objects (as Crossref
            # responses appear to provide — confirm), this stores its Python
            # repr rather than the affiliation names.
            formatted_aff = f'{aff}'
        if orcid:
            formatted_aff += f'<ID system="ORCID">{orcid}</ID>'
        formatted_affs.append(formatted_aff)
    return "; ".join(formatted_affs)


def format_pubdate(item):
    """Return the item's "published" date as YYYY[/MM[/DD]], or "" if absent.

    Only the date parts that are present are written, so a year-only or
    year-month date no longer raises during unpacking.
    """
    date_parts = (item.get("published") or {}).get("date-parts") or []
    if not date_parts:
        return ""
    known_parts = []
    for part in date_parts[0][:3]:
        if part is None:
            break
        known_parts.append(part)
    if not known_parts:
        return ""
    year = known_parts[0]
    return "/".join([f"{year}"] + [f"{part:02d}" for part in known_parts[1:]])


def format_article_doi(item):
    """Return the DOI of the first "is-preprint-of" relation, or ""."""
    is_preprint_of = (item.get("relation") or {}).get("is-preprint-of")
    if isinstance(is_preprint_of, list) and is_preprint_of:
        return _strip_prefix(is_preprint_of[0].get("id", ""), DOI_URL_PREFIXES)
    return ""


# `res` is iterated as a list of response pages; a lone response dict is
# wrapped so that a single-page result is handled the same way.
pages = res if isinstance(res, list) else [res]

records = []
for entry in pages:
    for item in entry['message']['items']:
        authors_data = item.get('author', [])

        # 'title' is indexed as a sequence; a missing or empty one yields ""
        # (and the record is skipped below) instead of raising IndexError.
        titles = item.get('title') or []
        title = titles[0] if titles else ""
        if not title:
            continue

        preprint_doi = item.get('DOI', '')
        url = item.get('resource', {}).get('primary', {}).get('URL', '')

        link_parts = []
        if preprint_doi:
            link_parts.append(f"DOI: {preprint_doi}")
        if url:
            link_parts.append(f"ELECTR: {url}")
        # Trailing slashes are dropped from the end of the combined string.
        links = "; ".join(link_parts).rstrip("/")

        records.append({
            "authors": format_authors(authors_data),
            "affiliations": format_affs(authors_data),
            "pubdate": format_pubdate(item),
            "title": title,
            "properties": links,
            # Only the JATS paragraph tags are removed from the abstract.
            "abstract": item.get('abstract', '').replace("<jats:p>", "").replace("</jats:p>", ""),
            "keywords": item.get('group-title', ''),
            "preprint_doi": preprint_doi,
            "article_doi": format_article_doi(item),
            "source": "CrossRef"
        })

## Curation: Add new records to data file

In [None]:
# Read the master file of previously harvested records from Excel.
excel_file = "eartharxiv_data.xlsx"
master_file = pd.read_excel(excel_file)

# Set of preprint DOIs that were already harvested.
exclusions = set(master_file['preprint_doi'].tolist())

# Keep only records whose preprint DOI is new. Each accepted DOI is added to
# `exclusions` so a DOI that occurs twice in the same harvest is appended once.
records_to_add = []
for r in records:
    doi = r["preprint_doi"]
    if doi in exclusions:
        continue
    if doi:
        # Records without a DOI cannot be de-duplicated and are all kept.
        exclusions.add(doi)
    records_to_add.append(r)

# DataFrames for the new records and for the existing master data.
new_records_df = pd.DataFrame(records_to_add)
original_df = master_file

if records_to_add:
    # Append the new records and write the result back to the master file.
    merged_df = pd.concat([original_df, new_records_df], ignore_index=True)
    merged_df.to_excel(excel_file, index=False)
else:
    # Nothing new: leave the workbook untouched instead of concatenating an
    # empty frame and rewriting identical data.
    merged_df = original_df
print(f"Added {len(records_to_add)} records to {excel_file}")

## Curation: Generate tagged format records

In [None]:
# READ DATA FROM EXCEL
master_file = pd.read_excel("eartharxiv_data.xlsx")
dt = pd.DataFrame(master_file)

json_output = "eartharxiv.json"
tagged_output = "eartharxiv.tag"


def _column_as_text(frame, column):
    """Return `column` of `frame` as a list of strings, with 'nan' cells as ''."""
    return ['' if value == 'nan' else value for value in frame[column].astype(str)]


def _drop_float_suffix(text):
    """Remove a trailing '.0' (left when a cell was stringified as a float).

    Only the suffix is removed; a blanket replace('.0', '') would also delete
    a '.0' occurring elsewhere in the value.
    """
    return text[:-2] if text.endswith('.0') else text


# The "bibcode" column is not read; every record is emitted with an empty bibcode.
lsA = _column_as_text(dt, "authors")
lsF = _column_as_text(dt, "affiliations")
lsT = _column_as_text(dt, "title")
lsD = [_drop_float_suffix(d) for d in _column_as_text(dt, "pubdate")]
lsI = _column_as_text(dt, "properties")
lsB = _column_as_text(dt, "abstract")
lsK = _column_as_text(dt, "keywords")

# ZIP TOGETHER RECORDS
records = []
for A, F, T, D, I, B, K in zip(lsA, lsF, lsT, lsD, lsI, lsB, lsK):
    records.append({
                    "bibcode": "",
                    # Authors and affiliations are stored "; "-joined in the
                    # master file and split back into lists here.
                    "authors": A.split("; "),
                    "affiliations": F.split("; "),
                    "pubdate": D,
                    "title": T,
                    "properties": I,
                    "abstract": B,
                    "keywords": K,
                    "source":"CrossRef"})

# SAVE JSON FILE
with open(json_output, 'w') as outfile:
    json.dump(records, outfile)
print(f"Saved {len(records)} records as {json_output}")

# Pyingest Serializer - Transform json into tagged format
# Both files are opened with context managers so the handles are closed and
# the tagged output is flushed to disk when the cell finishes.
with open(json_output) as infile:
    json_file = json.load(infile)
# NOTE(review): the tagged file is opened in append mode, so re-running this
# cell adds every record again after the existing content — confirm intended.
with open(tagged_output, 'a') as outputfp:
    for record in json_file:
        serializer = Tagged()
        serializer.write(record, outputfp)
print(f"Saved {len(records)} records as {tagged_output}")

## Additional resolver 

In [None]:
# # Read the Excel file into a DataFrame
# master_file = pd.read_excel("eartharxiv_data.xlsx")
# dt = pd.DataFrame(master_file)

# # Get rows with a bibcode value
# rows_with_bibcode = dt[dt['bibcode'] != '...................']
# count_with_bibcode = len(rows_with_bibcode)
# print(f"Rows with a bibcode: {count_with_bibcode}")

# # Get rows with no bibcode
# rows_with_no_bibcode = dt[dt['bibcode'] == '...................']
# count_no_bibcode = len(rows_with_no_bibcode)
# print(f"Rows with no bibcode: {count_no_bibcode}")

In [None]:
# # Initialize an empty list for references
# ref_list = []

# # Iterate through rows with no bibcode
# for index, row in rows_with_no_bibcode.iterrows():
#     T = row["title"]
#     A = row["authors"]
#     doi = row["preprint_doi"]
    
#     if A and T and doi:
#         ref = {
#             "refstr": f"{A}, {T}, {doi}",
#             "authors": A,
#             "title": T,
#             "doi": doi
#         }
#     elif doi:
#         ref = {
#             "refstr": f"{doi}",
#             "doi": doi
#         }
#     else:
#         ref = {"refstr":""}
#     ref_string = json.dumps(ref, ensure_ascii=False)
#     ref_list.append(ref_string)

# # Reference Service API request, querying my 'references' list
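# # ADS Prod API Token: read it from the environment (requires `import os`); never commit the literal value.
# # NOTE: the token that was previously hard-coded on this line should be treated as leaked and revoked.
# token = os.environ['ADS_API_TOKEN']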
# domain = 'https://api.adsabs.harvard.edu/v1/'
# def resolve(references):
#     payload = {'parsed_reference': references}
#     response = requests.post(
#         url = domain + 'reference/xml',
#         headers = {'Authorization': 'Bearer ' + token,
#                  'Content-Type': 'application/json',
#                  'Accept':'application/json'},
#         data = json.dumps(payload))
#     if response.status_code == 200:
#         return json.loads(response.content)['resolved'], 200
#     else:
#         print('From reference status_code is', response.status_code)
#         return None, response.status_code

# # Resolve my references, results in 'total results' list
# references = [json.loads(ref) for ref in ref_list]
# total_results = []
# print('Querying %d references with the Reference Service ...'%len(references))
# for i in range(0, len(references), 16):
#     results, status = resolve(references[i:i+16])
#     if results:
#         total_results += results

# # Save the results to excel
# df = pd.DataFrame(total_results)
# df.to_excel("ref_review.xlsx", index=False)
# print("Saved ref results to ref_review.xlsx")