# AGU Harvester Notebook - Version 2

Input:
- Meeting code
- Publication/meeting info further down in Metadata Transformation phase

Outfiles:
- AGU Abstracts as (agu_results_v2.xlsx)
- AGU Author Affiliations/Roles as individual JSON files in directory "json_roles"
- Combined author role/affiliation table as (role_results_v2.xlsx)
- Merged data as (agu_final_data_v2.xlsx)
- JSON file for ingest as (agu_final_data_v2.json)

In [None]:
import sys
import requests
import math
import json
import pandas as pd
import sys, os, io
import argparse
import csv
import time
import datetime
import numpy as np
from numpy import nan
import re
import unicodedata

# Get AGU Meeting Abstracts, Author Roles w/ Affiliations

In [None]:
# Get meeting papers -- just change meeting code
#
# Produces two names used by the rest of the notebook:
#   paper_results : DataFrame of abstracts (cached in AGUresults)
#   roles         : de-duplicated list of role IDs (e.g. 'Role/4501570') to harvest
import ast  # cell-local import: parses list literals read back from the spreadsheet

meetingcode = "fm21"

# ----------------------------------------------
os.chdir('/Users/sao/Documents/Python-Projects/AGU/version2')
AGUresults = "agu_results_v2.xlsx"
if os.path.exists(AGUresults):
    # A cached spreadsheet exists: reuse it instead of querying the API again.
    paper_results = pd.read_excel(AGUresults)
    print('Read',len(paper_results),'papers from file')

    # After the Excel round trip each role list is a *string*, e.g.
    #   "['Role/4501570', 'Role/4503884']"
    # ast.literal_eval turns that text back into a real list without executing
    # arbitrary code (which eval() would do). Empty cells are read back as NaN,
    # not as strings, so they are skipped instead of crashing the parse.
    role_lists = [
        ast.literal_eval(cell)
        for cell in paper_results['ChildList_Role']
        if isinstance(cell, str)
    ]
    # Flatten the list of lists and de-duplicate; sorting keeps the request
    # order reproducible between runs.
    roles = sorted({role for role_list in role_lists for role in role_list})
    print('Read',len(roles),'author roles from file')
else:
    # API Query for Paper data
    AGU_API = f"https://agu.confex.com/agu/{meetingcode}/meetingapi.cgi/Paper"
    response = requests.get(AGU_API, timeout=300)
    response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
    data = response.json()
    print('Got',len(data),'papers from',meetingcode)

    # Extract specified metadata from results
    paper_results = pd.json_normalize(data)[[
        "Abstract",
        "Title",
        "ChildList_Role",
        "FinalPaperNumber",
        "_url",
        "Withdrawn",
        "GoodType",
    ]]

    # Drop rows where GoodType = Break (these aren't abstracts); Drop withdrawn papers
    keep = (paper_results.GoodType != 'Break') & (paper_results.Withdrawn != 'w')
    paper_results = paper_results[keep].drop('Withdrawn', axis=1)

    # Save excel file of results
    paper_results.to_excel(AGUresults, index=False)
    print('Refined results and saved',len(paper_results),f"papers as '{AGUresults}'")

    # Prepare roles list to query API for roles/affiliations:
    # one row per role ID, then drop missing values and duplicates.
    dt = paper_results.explode("ChildList_Role")
    roles = sorted({role for role in dt['ChildList_Role'] if pd.notna(role)})
    print('Extracted', len(roles),'author roles to query')

# API Query for Role/Author data
#
# For every role ID in `roles`, fetch the role record from the meeting API and
# cache the fields we need as json_roles/<id>.json. Role IDs that already have
# a cached file are not requested again, so the cell can be re-run to resume.
AGU_API = f"https://agu.confex.com/agu/{meetingcode}/meetingapi.cgi/"

# Build file paths explicitly rather than os.chdir()-ing into the cache folder:
# changing the working directory here would make every later relative path
# (role_results_v2.xlsx, agu_final_data_v2.*) resolve inside json_roles, while
# the transformation cell reads its inputs from the project directory.
roles_dir = os.path.abspath("json_roles")
os.makedirs(roles_dir, exist_ok=True)

# Output key -> field name in the API response
ROLE_FIELDS = {
    "RoleID": "_url",
    "Role": "Role",
    "PaperID": "Parent_Entry",
    "Position": "Priority",
    "FirstName": "Person_FirstName",
    "LastName": "Person_LastName",
    "Affiliation": "Person_Affiliation",
    "City": "Person_City",
    "Country": "Person_Country",
    "ORCID": "Person_ORCIDiD",
}

print('Started role requests at',datetime.datetime.now(),'...\n')
file_counter = 0    # roles available on disk (already cached + newly harvested)
failed_counter = 0  # roles whose request/parse/write failed this run

# A Session reuses the HTTP connection across the many small requests.
with requests.Session() as session:
    for ident in roles:
        # 'Role/4501570' -> json_roles/4501570.json
        role_file = os.path.join(
            roles_dir, "{}.json".format(str(ident).rsplit("/", 1)[-1])
        )
        if os.path.exists(role_file):
            file_counter += 1
            continue

        try:
            response = session.get(AGU_API + str(ident), timeout=60)
            response.raise_for_status()
            data = response.json()
            new_row = {column: data[field] for column, field in ROLE_FIELDS.items()}

            with open(role_file, "w") as outfile:
                json.dump(new_row, outfile)
            file_counter += 1

        # Best effort: report the failing role and carry on with the next one.
        except (requests.RequestException, ValueError, KeyError, TypeError, OSError) as err:
            failed_counter += 1
            print("Harvesting failed for {0} with the following reason: {1}".format(ident, err))


print('Finished role requests at',datetime.datetime.now())
print('Retrieved',file_counter,'author roles \n')
if failed_counter:
    print(failed_counter,'author roles failed; re-run this cell to retry them')


## Convert json files to excel

In [None]:
# Combine the per-role JSON files in json_roles into one spreadsheet.
path_to_json = "/Users/sao/Documents/Python-Projects/AGU/version2/json_roles/"
# Sorted so the row order of the spreadsheet is reproducible (os.listdir order is arbitrary).
json_files = sorted(pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json'))

# Columns to take from each JSON file; every role file is expected to hold all of them.
role_columns = ['RoleID','Role','PaperID','Position','FirstName',
                'LastName','Affiliation','City','Country','ORCID']

# Collect plain rows first and build the DataFrame once at the end; assigning
# role_results.loc[index] row by row gets slower as the frame grows.
role_rows = []
for js in json_files:
    try:
        with open(os.path.join(path_to_json, js)) as json_file:
            json_text = json.load(json_file)
    except ValueError as err:
        print("Skipping", js, "- not valid JSON:", err)
        continue

    # Any other .json file that ended up in this folder is not a role record;
    # skip it with a message rather than aborting the whole conversion.
    if not isinstance(json_text, dict) or any(col not in json_text for col in role_columns):
        print("Skipping", js, "- not a role record")
        continue

    role_rows.append([json_text[col] for col in role_columns])

role_results = pd.DataFrame(role_rows, columns=role_columns)

# Save master excel file of role results next to the json_roles folder, which is
# where the transformation step reads role_results_v2.xlsx from.
role_results_file = os.path.join(
    os.path.dirname(os.path.normpath(path_to_json)), "role_results_v2.xlsx"
)
role_results.to_excel(role_results_file, index=False)
print("Saved author affiliation results as \'role_results_v2.xlsx\'")
                       
                            

# Metadata Transformation and Curation

In [None]:
# Load the two spreadsheets produced by the harvesting steps:
# the abstracts table and the author role/affiliation table.
paper_results, aff_results = (
    pd.read_excel(workbook)
    for workbook in (
        "/Users/sao/Documents/Python-Projects/AGU/version2/agu_results_v2.xlsx",
        "/Users/sao/Documents/Python-Projects/AGU/version2/role_results_v2.xlsx",
    )
)
# Working frames used by the formatting steps below.
papers, affs = pd.DataFrame(paper_results), pd.DataFrame(aff_results)

# Method to remove HTML tags
_TAG_PATTERN = re.compile(r'<.*?>')  # shortest run from '<' to the next '>' on the same line


def remove_tags(string):
    """Return *string* with every ``<...>`` span removed.

    Values that are not ``str`` (for example the NaN pandas uses for an empty
    spreadsheet cell) are returned unchanged, so the function can be applied
    to a whole column without raising ``TypeError`` on missing entries.
    """
    if not isinstance(string, str):
        return string
    return _TAG_PATTERN.sub('', string)

# Format Abstracts
# Strip the '_x000D_' line-break residue (longest sequence first, so the shorter
# patterns do not break up a longer one), then remove HTML tags.
papers["Abstract"] = papers["Abstract"].astype(str)
for residue in ('_x000D_\n\t_x000D_\n_x000D_\n', '_x000D_\n_x000D_\n', '_x000D_\n\t'):
    papers["Abstract"] = papers["Abstract"].replace({residue: ''}, regex=True)
papers["Abstract"] = papers["Abstract"].apply(remove_tags)

# Format Titles
# Only string cells are cleaned: an empty title is read from Excel as NaN (a
# float), which the regex in remove_tags cannot process.
papers["Title"] = papers["Title"].apply(
    lambda cw: remove_tags(cw) if isinstance(cw, str) else cw
)
papers["Title"] = papers["Title"].replace({'_x000D_\n':''}, regex=True)

# Format Publications -- *Change Meeting Info Here* --->
papers["Pub"] = "AGU Fall Meeting 2021, held in New Orleans, LA, 13-17 December 2021, id. " + papers["FinalPaperNumber"] + "."

# Format Authors
affs["Author"] = affs["LastName"] + ", " + affs["FirstName"]

# Format full affiliations (Aff, city, country, ORCID)
# String concatenation with a missing value yields NaN, so rows without an
# ORCID fall back to the plain "Affiliation, City, Country" form.
affs["ORCIDs"] = '<ORCID>' + affs["ORCID"] + '</ORCID>'
aff_place = affs["Affiliation"] + ", " + affs["City"] + ", " + affs["Country"]
affs["Aff_full"] = (aff_place + " " + affs["ORCIDs"]).fillna(aff_place)

# Sort author/affiliation list by Paper, and then Position; Sort paper list by PaperID
affs = affs.sort_values(by=['PaperID', 'Position'])
papers = papers.sort_values(by=['_url'])

# Select needed columns as new dataframes.
# rename() returns a new frame here; renaming a column slice in place raises
# pandas' SettingWithCopyWarning.
papers_data = papers[['Abstract','Title','Pub','_url']].rename(columns={'_url':'PaperID'})

# Aggregate Authors and Affiliations by PaperID
# Missing values become the text 'NA' so that "; ".join only ever sees strings.
# The rows are already ordered by Position, and groupby keeps that order within each paper.
affs_data = affs[['Author','Aff_full','PaperID']].fillna('NA')
affs_data = affs_data.groupby(["PaperID"]).agg(Authors=("Author", "; ".join),Aff_full=("Aff_full", "; ".join))

# Merge papers and authors/affils by PaperID (left join: keep papers without any author rows)
merged = pd.merge(papers_data, affs_data, how='left', on='PaperID')


In [None]:
# Identify columns from merged data to take metadata from
# astype(str) turns missing cells into the text 'nan', which is detected below.
authors = merged["Authors"].astype(str)
affs = merged["Aff_full"].astype(str)
abstracts = merged["Abstract"].astype(str)
titles = merged["Title"].astype(str)
pubs = merged["Pub"].astype(str)
paperIDs = merged["PaperID"].astype(str)


def _ascii_fold(text):
    """Return *text* reduced to ASCII.

    Accented letters are decomposed (NFD) and lose their accents; characters
    with no ASCII decomposition are dropped.
    """
    return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode()


def _clean_column(values, missing=('nan',)):
    """Return a list with each value ASCII-folded, or '' where it is a missing marker."""
    return ['' if value in missing else _ascii_fold(value) for value in values]


# Get author names to list
# 'NA' marks a missing author name; 'nan' is what a paper with no author rows
# at all looks like after the left merge and astype(str). Both mean "no authors".
authors_list = _clean_column(authors, missing=('NA', 'nan'))

# Get author affiliations to list
affs_list = _clean_column(affs)

# Get abstracts to list (line breaks become two spaces)
abs_list = [text.replace("\n", "  ") for text in _clean_column(abstracts)]

# Get titles to list
titles_list = _clean_column(titles)

# Get publications to list
# (each publication is tested against its own value, not a leftover title)
pubs_list = _clean_column(pubs)

# Package metadata as json records
records = []
for paper_authors, paper_affs, title, pub, abstract in zip(
        authors_list, affs_list, titles_list, pubs_list, abs_list):
    records.append({"authors":paper_authors.split("; "),
                    "affiliations":paper_affs.split("; "),
                    "pubdate":"12/2021",
                    "title":title,
                    "publication":pub,
                    "abstract":abstract,
                    "source":"ADS"})
records_counter = len(records)

# Save json file of data
with open("agu_final_data_v2.json", 'w') as outfile:
    json.dump(records, outfile)
print("Saved",records_counter,"records as agu_final_data_v2.json")

# Save excel file of data
agu_final_papers = pd.json_normalize(records)
agu_final_papers.to_excel("agu_final_data_v2.xlsx", index=False)
print("Saved",records_counter,"records as agu_final_data_v2.xlsx")