In [1]:
### PREAMBLE: LOADING NECESSARY PYTHON PACKAGES
import pandas as pd
import requests
import sys
import json
import time
import re
import numpy as np
import os 

from tqdm.notebook import tqdm
from datetime import date

In [155]:
### PREAMBLE: LOADING LOCAL ENVIRONMENT VARIABLES (API KEYS) Accessing secure API keys
# The keys are read from a local JSON file so that they never appear in the notebook itself.
# The context manager closes the file as soon as it has been parsed.
with open("env_keys.json") as keys_file:
    keys_json = json.load(keys_file)
scopus_key = keys_json['scopus_key']
aditya_key = keys_json['aditya_imperial_scopus_key']
thomas_key = keys_json['thomas_chicago_scopus_key']

In [159]:
### SCOPUS API REQUEST HEADERS
# Exactly one API key is sent in the 'X-ELS-APIKey' header. To use a different key,
# swap `thomas_key` below for `scopus_key` or `aditya_key`.
req_headers = {'X-ELS-APIKey' : thomas_key}

In [81]:
# This is a function that helps to fill out article-level observations from the API response without throwing too many failures or blowing up the whole process without getting at least the most important information. It is also used to collect as much author/abstract/reference information as possible without killing the whole process.
def _navigate_json(json_object, navigation):
    """Return the value reached by applying the subscript string ``navigation`` to ``json_object``.

    ``navigation`` is Python subscript source such as ``"['author'][0]['@auid']"``.
    NOTE(review): the string is evaluated as code, so it must only ever come from the
    hard-coded navigation dicts in this notebook, never from API data.
    """
    return eval('json_object' + navigation)


def field_population(temp_df, json_obj, navigation_dict, field, optional_dict_modification=0):
    """Copy one field from a Scopus JSON object into ``temp_df`` without ever raising on bad data.

    The value is located with ``navigation_dict[field]`` and written to column ``field``.
    If the value cannot be found (or cannot be stored), the cell is set to the string
    'SCOPUS FAILURE' so that the remaining fields of the observation are still collected.

    Args:
        temp_df: DataFrame that receives the value; it is modified in place.
        json_obj: JSON object to explore (an article- or an author-observation, for example).
        navigation_dict: maps field names to subscript strings such as "['prism:doi']".
            It is only read; it is never modified.
        field: key of ``navigation_dict`` / column of ``temp_df`` to fill.
        optional_dict_modification: when a single affiliation has several authors, the
            position of the author to read. "['author'][0]" in the navigation string is
            replaced by "['author'][<position>]" and the value is written to row <position>.
            The default 0 reads the first author and writes to row 0.

    Returns:
        ``temp_df`` (the same object that was passed in).
    """
    # This is the json object that we want to explore. It could be an article- or an author-observation, for example.
    json_object = json_obj

    # Row used by the single-affiliation fallback of 'sc_author_affil_id' below: the row that
    # is being filled right now (a new row if this is the first field, else the last row).
    if field == list(navigation_dict.keys())[0]:
        row_to_enter_info = len(temp_df)
    else:
        row_to_enter_info = len(temp_df) - 1

    try:
        navigation = navigation_dict[field]
        # If a single affiliation has multiple authors associated with it, the structure that
        # we used before has to be inverted: point the navigation at the requested author.
        # The change is made on a local copy so the caller's dict keeps its [0] patterns.
        if optional_dict_modification != 0:
            navigation = re.sub(r'\[\'author\'\]\[0\]',
                                '[\'author\'][{}]'.format(optional_dict_modification),
                                navigation)

        ## We have only a small number of special cases so that the general function holds as much as possible:
        if field == 'sc_author_affil_id':
            # SOMETIMES AN AUTHOR HAS MULTIPLE AFFILIATIONS (EG NBER AND HARVARD). IN THIS CASE, WE CONCATENATE ALL OF THE AFFILIATIONS, DELIMITED BY '+' SYMBOLS AND STORE THEM AS A STRING (TO BE UNNESTED LATER)
            affiliation_id_list_of_dicts = json_object['affiliation']['affiliation-id']
            if isinstance(affiliation_id_list_of_dicts, list):
                temp_df.loc[optional_dict_modification, field] = '+'.join(
                    affil['@afid'] for affil in affiliation_id_list_of_dicts)
            else:
                temp_df.loc[row_to_enter_info, field] = _navigate_json(json_object, navigation)
        elif field == 'sc_funding_agency':
            ## SOMETIMES THERE ARE MULTIPLE FUNDING AGENCIES. IN THIS CASE, WE CONCATENATE ALL OF THOSE AGENCIES, DELIMITED BY '+' SYMBOLS AND STORE THEM AS A STRING.
            funding_agencies_list_of_dicts = json_object['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding']
            # The list check has to look at the funding entries themselves (not at the whole
            # response), and the loop has to walk those entries.
            if isinstance(funding_agencies_list_of_dicts, list):
                temp_df.loc[optional_dict_modification, field] = '+'.join(
                    agency['xocs:funding-agency'] for agency in funding_agencies_list_of_dicts)
            else:
                temp_df.loc[optional_dict_modification, field] = _navigate_json(json_object, navigation)
        else:
            temp_df.loc[optional_dict_modification, field] = _navigate_json(json_object, navigation)

    except Exception:
        # Any lookup problem (missing key, wrong nesting, unexpected type) is recorded in the
        # cell itself so that one bad field never aborts the whole collection.
        temp_df.loc[optional_dict_modification, field] = 'SCOPUS FAILURE'

    return temp_df

In [5]:
# Given an issn and a year in which to begin, this function collects all articles from the SCOPUS Search API and returns a publication-specific df 
def publication_collect(issn:str, start_year:str, journal_name:str, end_year:int=2021):
    """Collect the articles of one journal from the Scopus Search API.

    The search is paged with the API's cursor: every response carries a '@current' and a
    '@next' cursor value, and the loop stops as soon as the two are equal (that last
    response is not unpacked). Each article is unpacked with `field_population`, so a
    missing field is stored as 'SCOPUS FAILURE' instead of aborting the collection.

    Reads the module-level `scopus_key` for the 'X-ELS-APIKey' request header.

    Args:
        issn: ISSN of the journal, used in the 'ISSN(...)' query.
        start_year: first year of the '<start_year>-<end_year>' date range.
        journal_name: name used in the progress messages only.
        end_year: last year of the date range (default 2021, the value previously hard-coded).

    Returns:
        A DataFrame with one row per article and the columns filled from
        `article_obj_nav_dict`, plus 'sc_query_used'.
    """
    search_endpoint = 'https://api.elsevier.com/content/search/scopus'

    #### FIRST WE NEED TO SEE HOW MANY ARTICLES WE NEED TO COLLECT FROM THIS PUBLICATION
    # How to construct a scopus query
    # Guide: https://dev.elsevier.com/sc_search_tips.html
    # Practice at: https://www.scopus.com/search/form.uri?display=advanced
    human_query_issn = 'ISSN({issn_str})'.format(issn_str=issn)
    human_query_date = '{start_year_str}-{current_year_str}'.format(start_year_str=start_year,
                                                                    current_year_str=end_year)
    req_headers = {
        'X-ELS-APIKey' : scopus_key
    }

    def search_query(count, cursor):
        # Every request differs only in the page size and the cursor position.
        return {
            'httpAccept' : 'application/json',
            'query' : human_query_issn,
            'date' : human_query_date,
            'count' : count,
            'cursor' : cursor
        }

    prelim_r = requests.get(search_endpoint,
                            headers=req_headers,
                            params=search_query('1', '*'))
    print("PRELIM {pub_name} API STATUS CODE: {status}".format(pub_name=journal_name, status=prelim_r.status_code))
    prelim_json = prelim_r.json()
    article_count = int(prelim_json['search-results']['opensearch:totalResults'])
    print("PRELIM COUNT OF ARTICLES FOUND BY API: {}".format(article_count))
    print("-----------------------------------------------------")

    ## THIS DICTIONARY SHOULD BE STRUCTURED IN ORDER OF IMPORTANCE. IE. PULLING THE API ENDPOINT IS MORE IMPORTANT THAN PULLING THE DOI, WHICH IS MORE IMPORTANT THE TITLE ETC.
    article_obj_nav_dict = {
        'sc_abstract_api_endpoint' : '["link"][0]["@href"]',
        'doi' : "['prism:doi']",
        'sc_title' : "['dc:title']",
        'sc_issn' : "['prism:issn']",
        'sc_pub_name' : "['prism:publicationName']",
        'sc_pub_date' : "['prism:coverDate']",
        'sc_open_access_status' : "['openaccessFlag']",
        'sc_vol' : "['prism:volume']",
        'sc_issue' : "['prism:issueIdentifier']",
        'sc_page_range' : "['prism:pageRange']",
        'sc_human_url' : "['link'][3]['@href']"
    }

    call_frames = []            # one DataFrame per API call
    articles_collected = 0      # running total, for the progress messages
    cursor = '*'                # '*' asks the API for the first page
    call_number = 0

    ### THE API WILL ONLY RETURN 200 ARTICLES AT TIME, SO WE PULL 200 RESULTS AT A TIME AS LONG AS THE CURRENT RESPONSE LINKS TO A POTENTIAL 'next' PAGE.
    while True:
        call_number += 1
        print("\t Preparing to make call {} to {}".format(call_number, journal_name))

        # THIS IS WHERE WE CALL THE API TO GET THE LIST OF ARTICLES (200 AT A TIME BY USING 'cursor/@next')
        r = requests.get(search_endpoint,
                         headers=req_headers,
                         params=search_query('200', cursor))

        print("\t CALL {n} FOR {pub_name} RETURNING STATUS CODE: {code}".format(pub_name=journal_name,
                                                                                    n=call_number,
                                                                                    code=r.status_code))

        r_json = r.json()['search-results']

        if r_json['cursor']['@current'] == r_json['cursor']['@next']:
            print("---------------------------------------------")
            print("We have reached the end of the 'cursor-next' chain. breaking out of this pub")
            print("---------------------------------------------")
            print("---------------------------------------------")
            print("---------------------------------------------")
            break
        print("Going to finish collecting this page and then there still at least one more to go.")
        cursor = r_json['cursor']['@next']

        # NOW WE BEGIN UNPACKING EACH BATCH OF 200
        sc_query_used = r_json['link'][0]['@href']
        r_results = r_json['entry']
        # Report the number of the call that was just made (the same number as in the
        # status-code message above).
        print("{pub_name} CALL {n} ARTICLES FOUND RESULTS: {num}".format(pub_name=journal_name,
                                                                                    n=call_number,
                                                                                    num=len(r_results)))

        article_frames = []
        for article_obj in r_results:
            # CONSTRUCT A TEMP_DF THAT CONTAINS A SINGLE ARTICLE
            temp_df = pd.DataFrame({
                'doi' : [None],
                'sc_title' : [None],
                'sc_issn' : [None],
                'sc_pub_name' : [None],
                'sc_vol' : [None],
                'sc_issue' : [None],
                'sc_page_range' : [None],
                'sc_abstract_api_endpoint' : [None],
                'sc_human_url' : [None]
            })
            # FOR EACH OF THE FIELDS IDENTIFIED ABOVE (article_obj_nav_dict), TRY TO ACCESS THAT FIELD ACCORDING TO THE GIVEN SUBSCRIPT. IF UNAVAILABLE, CONTINUE WITH NOTATION OF FAILURE
            for field in article_obj_nav_dict:
                temp_df = field_population(temp_df, article_obj, article_obj_nav_dict, field)
            article_frames.append(temp_df)

        # Build the per-call frame once instead of re-copying it for every article.
        call_df = pd.concat(article_frames, ignore_index=True) if article_frames else pd.DataFrame()

        # ADD TO THE call_df THE CONSTRUCTED API-ENDPOINT QUERY THAT WAS USED TO GENERATE ALL OF THE RESULTS (ARTICLES) FOR THIS call_df
        call_df['sc_query_used'] = sc_query_used
        print("\t Number of articles collected on this call : {}".format(len(call_df)))

        call_frames.append(call_df)
        articles_collected += len(call_df)
        print("\t Number of articles cumulative collected for {}: {}".format(journal_name, articles_collected))

        ### TIMER HERE TO ENSURE WE DON'T EXCEED THE SCOPUS API'S QUERY THROTTLE
        time.sleep(0.15)
        print("---------------------------------------------")

    pub_df = pd.concat(call_frames, ignore_index=True) if call_frames else pd.DataFrame()
    return pub_df

In [160]:
# Journals to collect in this run, written as (code, ISSN, display name). Every journal uses
# the same start year. Comment a line in or out to change which journals are processed.
collection_dict = {
    code : {'issn' : issn, 'start_year' : '1990', 'print_name' : print_name}
    for code, issn, print_name in [
        # ('AER', '00028282', 'American Economic Review'),
        # ('ECA', '00129682', 'Econometrica'),
        # ('JPE', '00223808', 'Journal of Political Economy'),
        # ('QJE', '00335533', 'Quarterly Journal of Economics'),
        # ('RES', '00346527', 'Review of Economic Studies'),
        # ('RJE', '07416261', 'RAND Journal of Economics'),
        # ('JOF', '00221082', 'Journal of Finance'),
        ('JFE', '0304405X', 'Journal of Financial Economics'),
        ('RFS', '08939454', 'Review of Financial Studies'),
        ('JEM', '15309134', 'Journal of Economics and Management Strategy'),
        # ('ALJ', '00036056', 'Antitrust Law Journal'),
        # ('JLE', '00222186', 'Journal of Law and Economics'),
        # ('JLO', '87566222', 'Journal of Law, Economics, and Organization'),
        # ('JOL', '0734306X', 'Journal of Labor Economics'),
        # ('JHR', '0022166X', 'Journal of Human Resources'),
        # ('ATB', '0003603X', 'Antitrust Bulletin'),
    ]
}
# This is the dictionary that we are going to use to store and access the publication-specific dfs. We will programmatically generate it based on the collection order (see above dict)
# It is created only when it does not exist yet: a fresh kernel gets an empty dict, while
# re-running this cell never throws away dfs that were already collected.
try:
    pub_dict
except NameError:
    pub_dict = {}


In [162]:
# Display the publication codes that currently have an entry in pub_dict.
pub_dict.keys()

dict_keys(['ECA', 'JPE', 'QJE', 'RES', 'RJE', 'JOF', 'JFE', 'AER', 'ALJ', 'JLE', 'JLO', 'JOL', 'JHR', 'ATB'])

In [163]:
#### MASTER RUN BLOCK HERE

# Give every journal selected in collection_dict an empty slot for each of its three dfs.
for pub in collection_dict:
    pub_dict[pub] = {
        '{}_core_df'.format(pub) : None,
        '{}_author_abstract_df'.format(pub) : None,
        '{}_cites_df'.format(pub) : None,
    }


for pub in collection_dict:

    ##### 1. FIND ALL OF THE ARTICLES PUBLISHED IN THE JOURNALS OF INTEREST

    pub_code = pub
    pub_issn = collection_dict[pub].get('issn')
    pub_start_year = collection_dict[pub].get('start_year')
    pub_name = collection_dict[pub].get('print_name')

    ## These are the lines of code that actually do things (download the core df, store it, save it)
    # globals()[pub_code + '_core_df'] = publication_collect(pub_issn, pub_start_year, pub_name)
    # pub_dict[pub_code][pub_code + '_core_df'] = globals()[pub_code + '_core_df']
    # globals()[pub_code + '_core_df'].to_csv('scopus_data/{}_scopus_core.csv'.format(pub_code), index=False, encoding='utf-8')


    ### In the case that the core_dfs have already been downloaded turn this chunk on and turn chunk 1 off.
    print(pub_name)
    pub_dict[pub_code][pub_code + '_core_df'] = pd.read_csv('scopus_data/{}_scopus_core.csv'.format(pub_code))
    print(len(pub_dict[pub_code][pub_code + '_core_df']))
    print('-------------------------------------------------')


    ##### 2. FIND THOSE ARTICLES' A) AUTHORS, ABSTRACTS, AND FUNDERS; AND B) PRIOR ARTICLES THAT THEY CITED
    # The two results are published as notebook-level variables named after the journal
    # (<code>_author_abstract_funding_df and <code>_cites_df).
    (globals()[pub_code + '_author_abstract_funding_df'],
     globals()[pub_code + '_cites_df']) = abstract_references_collect(pub_code)
    pub_dict[pub_code][pub_code + '_author_abstract_df'] = globals()[pub_code + '_author_abstract_funding_df']
    globals()[pub_code + '_author_abstract_funding_df'].to_csv(
        'scopus_data/{}_author_abstract_funding.csv'.format(pub_code), index=False, encoding='utf-8')

    # pub_dict[pub_code][pub_code + '_cites_df'] = globals()[pub_code + '_cites_df']
    # globals()[pub_code + '_cites_df'].to_csv('scopus_data/{}_cites.csv'.format(pub_code), index=False, encoding='utf-8')


Journal of Financial Economics
2873
-------------------------------------------------
Working on: https://api.elsevier.com/content/abstract/scopus_id/85125822152
found a duplicated author (i.e with multiple affiliations)
Working on: https://api.elsevier.com/content/abstract/scopus_id/85112538240
Working on: https://api.elsevier.com/content/abstract/scopus_id/85112045038
Working on: https://api.elsevier.com/content/abstract/scopus_id/85111544498
Working on: https://api.elsevier.com/content/abstract/scopus_id/85111027914
found a duplicated author (i.e with multiple affiliations)
Working on: https://api.elsevier.com/content/abstract/scopus_id/85110415392
Working on: https://api.elsevier.com/content/abstract/scopus_id/85110393196
Working on: https://api.elsevier.com/content/abstract/scopus_id/85109710347
Working on: https://api.elsevier.com/content/abstract/scopus_id/85108952537
Working on: https://api.elsevier.com/content/abstract/scopus_id/85108578051
Working on: https://api.elsevier.com

In [106]:
# Reset the three df slots of every selected journal ...
for pub in collection_dict:
    pub_dict[pub] = {
        '{}_core_df'.format(pub) : None,
        '{}_author_abstract_df'.format(pub) : None,
        '{}_cites_df'.format(pub) : None,
    }

# ... then reload each journal's previously saved core df from disk and report its length.
for pub in collection_dict:
    pub_code = pub
    pub_issn = collection_dict[pub].get('issn')
    pub_start_year = collection_dict[pub].get('start_year')
    pub_name = collection_dict[pub].get('print_name')

    print(pub_name)
    pub_dict[pub_code][pub_code + '_core_df'] = pd.read_csv('scopus_data/{}_scopus_core.csv'.format(pub_code))
    print(len(pub_dict[pub_code][pub_code + '_core_df']))
    print('-------------------------------------------------')

Econometrica
1684
-------------------------------------------------
Journal of Political Economy
1331
-------------------------------------------------
Quarterly Journal of Economics
1321
-------------------------------------------------
American Economic Review
5139
-------------------------------------------------
Review of Economic Studies
1462
-------------------------------------------------
RAND Journal of Economics
1195
-------------------------------------------------
Journal of Finance
2510
-------------------------------------------------
Journal of Financial Economics
2873
-------------------------------------------------
Review of Financial Studies
2078
-------------------------------------------------
Journal of Economics and Management Strategy
746
-------------------------------------------------
Antitrust Law Journal
440
-------------------------------------------------
Journal of Law and Economics
729
-------------------------------------------------
Journal of Law, Ec

In [114]:
# Given a pub_code (see 'collection_dict' keys), this function identifies the publication-specific df generated by 'publication_collect', and generates two new dfs: 1) authors, abstract and funding; and 2) the citations that the article of interest makes. It then merges each of these back onto the pub_df (1:m) and returns the two merged dfs. We DO NOT merge on 'doi' (even though it should be a unique identifier) because if a SCOPUS object does not have a doi, the observation is coded generically as 'SCOPUS FAILURE'. This causes an (m:m) merge that is way too big/inaccurate. Instead we merge on 'sc_abstract_api_endpoint', which is a unique identifier generated by SCOPUS.
def _author_abstract_rows(r_json_object, doi, url, author_abstract_obs,
                          author_affil_navigation_dict, aa_funding_navigation_dict):
    """Build the author/abstract/funding rows of one article, degrading gracefully.

    The abstract and the author group are looked up independently, so that a missing
    abstract still yields the author rows and a missing author group still yields one
    placeholder row carrying the abstract and funding information. If the rows cannot
    be built at all, the problem is printed and an empty DataFrame is returned.
    """
    problems = []

    try:
        abstract_text = r_json_object['abstracts-retrieval-response']['item']['bibrecord']['head']['abstracts']
        abstract_found = True
    except Exception as error:
        abstract_found = False
        problems.append(('ABSTRACT', error))

    try:
        authors_object = r_json_object['abstracts-retrieval-response']['item']['bibrecord']['head']['author-group']
        authors_found = True
    except Exception as error:
        authors_found = False
        problems.append(('AUTHORS', error))

    if problems:
        print("FAILURE TO IDENTIFY ABSTRACT/AUTHORS FOR ARTICLE {}\n\t ENDPOINT QUERIED WAS: {}".format(doi, url))
        for missing_part, error in problems:
            print('\t FILLING IN FAILURE ON IDENTIFYING {} IN JSON FOR ARTICLE: {}'.format(missing_part, doi))
            print("\tFAILED BECAUSE {}".format(repr(error)))

    try:
        if authors_found:
            aa_df_temp = author_obs_function(authors_object, pd.DataFrame(), author_abstract_obs, author_affil_navigation_dict)
        else:
            # No author group: keep the single placeholder row so funding/abstract still have a home.
            aa_df_temp = author_abstract_obs

        ### TRYING TO GET INFORMATION ON FUNDING IF ITS AVAILABLE
        # EXPECT QUITE A FEW FAILURES/ 'SCOPUS FAILURES' IN THIS SECTION
        aa_df_temp = funding_obs_function(aa_df_temp, r_json_object, aa_funding_navigation_dict)

        if abstract_found:
            aa_df_temp['sc_abstract_text'] = abstract_text
        aa_df_temp['sc_abstract_api_endpoint'] = url
    except Exception as error:
        print("FAILURE TO IDENTIFY ABSTRACT/AUTHORS FOR ARTICLE {}\n\t ENDPOINT QUERIED WAS: {}".format(doi, url))
        print("\tFAILED BECAUSE {}".format(repr(error)))
        return pd.DataFrame()

    return aa_df_temp


def abstract_references_collect(pub_code):
    """Collect authors, abstracts, funding and cited references for every article of a journal.

    Reads the journal's core df from the module-level `pub_dict`, requests each article's
    'sc_abstract_api_endpoint' (with the module-level `req_headers`), and builds two
    article-level tables which are then left-merged onto the core df on
    'sc_abstract_api_endpoint'.

    Args:
        pub_code: key of `pub_dict` whose '<pub_code>_core_df' entry holds the core df.

    Returns:
        (author_abstract_funding_df, cites_df): the core df merged with the
        author/abstract/funding rows, and the core df merged with the cited references.
    """
    pub_df = pub_dict.get(pub_code).get('{}_core_df'.format(pub_code))
    abstract_query = {
        'httpAccept' : 'application/json',
        'view' : 'FULL'
    }

    # reset_index() returns a new frame, so the selection from pub_df is never modified in place.
    articles = pub_df[['doi', 'sc_abstract_api_endpoint']].reset_index()

    author_affil_navigation_dict = {
        'sc_author_id' : "['author'][0]['@auid']",
        'sc_author_given_name' : "['author'][0]['preferred-name']['ce:given-name']",
        'sc_author_last_name' : "['author'][0]['preferred-name']['ce:surname']",
        'sc_author_indexed_name' : "['author'][0]['preferred-name']['ce:indexed-name']",
        'sc_author_affil_id' : "['affiliation']['affiliation-id']['@afid']",
        'sc_author_affil_indexed' : "['affiliation']['ce:source-text']",
    }

    aa_funding_navigation_dict = {
        'sc_grant_text' : "['abstracts-retrieval-response']['item']['bibrecord']['head']['grantlist']['grant-text']['$']",
        'sc_funding_text' : "['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding-list']['xocs:funding-text']",
        'sc_funding_agency' : "['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding']['xocs:funding-agency']",
    }

    author_frames = []
    cites_frames = []

    # WE ARE GOING TO GO THROUGH EVERY ARTICLE IN THIS PUBLICATION
    for row in range(len(articles)):
        doi = articles.loc[row, 'doi']
        url = articles.loc[row, 'sc_abstract_api_endpoint']
        print('Working on: {}'.format(url))

        abstract_r = requests.get(url, headers=req_headers, params=abstract_query)
        if abstract_r.status_code != 200:
            # Say why the article is skipped, and still pause so that a run of failing
            # requests does not hit the API without any throttle.
            print('\t SKIPPING ARTICLE {} BECAUSE THE API RETURNED STATUS CODE: {}'.format(doi, abstract_r.status_code))
            time.sleep(0.2)
            continue

        r_json_object = abstract_r.json()

        # A fresh placeholder row per article: it is filled in place further down.
        author_abstract_obs = pd.DataFrame({
                'sc_author_id' : [None],
                'sc_author_given_name' : [None],
                'sc_author_last_name' : [None],
                'sc_author_affil_id' : [None],
                'sc_author_affil_indexed' : [None],
                'sc_grant_text' :  [None],
                'sc_funding_text' :  [None],
                'sc_funding_agency' :  [None],
            })

        ###################################
        # WE DO AUTHORS AND ABSTRACTS FIRST
        ###################################
        aa_df_temp = _author_abstract_rows(r_json_object, doi, url, author_abstract_obs,
                                           author_affil_navigation_dict, aa_funding_navigation_dict)

        #################################
        # WE DO REFERENCES AND CITES NEXT
        #################################
        try:
            references_object = r_json_object['abstracts-retrieval-response']['item']['bibrecord']['tail']['bibliography']
            article_cites_df_temp = references_function(references_object, doi, url)
        except Exception:
            # `except Exception` (not a bare except) so that interrupting the kernel still works.
            article_cites_df_temp = pd.DataFrame({
                'doi' : [doi],
                'sc_abstract_api_endpoint' : [url],
                'sc_article_cites_scopus_citation_text' : [None],
                'sc_article_cites_scopus_group_id' : [None],
                'sc_article_cites_api_endpoint_list' : [None]
            })
            print("FAILURE TO IDENTIFY REFERENCES FOR ARTICLE {}\n\t ENDPOINT QUERIED WAS: {}".format(doi, url))

        # KEEP THE ARTICLE'S AUTHORS/ABSTRACTS AND REFERENCES FOR THE PUBLICATION-LEVEL DFS
        if not aa_df_temp.empty:
            author_frames.append(aa_df_temp)
        if not article_cites_df_temp.empty:
            cites_frames.append(article_cites_df_temp)

        time.sleep(0.2)

    # Build each publication-level df once instead of re-copying it for every article.
    authors_abstracts_df = pd.concat(author_frames, ignore_index=True) if author_frames else pd.DataFrame()
    article_cites_df = pd.concat(cites_frames, ignore_index=True) if cites_frames else pd.DataFrame()

    # MERGE THE AUTHORS/ABSTRACTS AND REFERENCES DATA ONTO THE THE CORE ARTICLE DATA (AT THE pub_df LEVEL)
    # The merge key is the endpoint rather than 'doi', because a missing doi is stored as the
    # generic string 'SCOPUS FAILURE' and is therefore not unique.
    author_abstract_funding_df = pd.merge(pub_df, authors_abstracts_df, how='left', on='sc_abstract_api_endpoint')
    cites_df = pd.merge(pub_df, article_cites_df, how='left', on='sc_abstract_api_endpoint')

    return author_abstract_funding_df, cites_df
    


In [112]:
def _default_author_navigation():
    """Return a newly built dict of the per-author navigation strings.

    A new dict is built on every call so that each pass over the fields starts from the
    unmodified "['author'][0]" patterns.
    """
    return {
        'sc_author_id' : "['author'][0]['@auid']",
        'sc_author_given_name' : "['author'][0]['preferred-name']['ce:given-name']",
        'sc_author_last_name' : "['author'][0]['preferred-name']['ce:surname']",
        'sc_author_indexed_name' : "['author'][0]['preferred-name']['ce:indexed-name']",
        'sc_author_affil_id' : "['affiliation']['affiliation-id']['@afid']",
        'sc_author_affil_indexed' : "['affiliation']['ce:source-text']",
    }


def _populate_author_group(group_json, obs_df, single_author_navigation):
    """Fill ``obs_df`` with the author(s) of one affiliation group and return it.

    With several authors under the same affiliation, every author position is filled with
    its own fresh navigation dict; otherwise ``single_author_navigation`` is used as is.
    """
    if len(group_json['author']) > 1:
        for position in range(len(group_json['author'])):
            navigation = _default_author_navigation()
            for field in navigation:
                obs_df = field_population(obs_df, group_json, navigation, field,
                                          optional_dict_modification=position)
    else:
        for field in single_author_navigation:
            obs_df = field_population(obs_df, group_json, single_author_navigation, field)
    return obs_df


def author_obs_function(authors_json_object, df_so_far, author_abstract_obs_df, author_affil_navigation_dict):
    """Append the author observations of one article to ``df_so_far``.

    Args:
        authors_json_object: the article's author-group; either a single dict (one
            affiliation group) or an iterable of such dicts.
        df_so_far: DataFrame the new author rows are appended to.
        author_abstract_obs_df: placeholder row that is filled when a single dict is given.
        author_affil_navigation_dict: navigation strings used when a single dict holding
            exactly one author is given.

    Returns:
        A DataFrame of ``df_so_far`` followed by the new rows. When several groups were
        given, rows sharing an 'sc_author_id' are collapsed into one row whose affiliation
        columns are joined with '+'.
    """
    # A single affiliation group arrives as one json/dict ...
    if type(authors_json_object) is dict:
        filled = _populate_author_group(authors_json_object, author_abstract_obs_df,
                                        author_affil_navigation_dict)
        return pd.concat([df_so_far, filled], ignore_index=True)

    # ... several groups arrive as a list of jsons/dicts.
    collected = df_so_far
    for group_json in authors_json_object:
        blank_row = pd.DataFrame({
            'sc_author_id' : [None],
            'sc_author_given_name' : [None],
            'sc_author_last_name' : [None],
            'sc_author_affil_id' : [None],
            'sc_author_affil_indexed' : [None],
            'sc_grant_text' :  [None],
            'sc_funding_text' :  [None],
            'sc_funding_agency' :  [None],
        })
        filled = _populate_author_group(group_json, blank_row, _default_author_navigation())
        collected = pd.concat([collected, filled], ignore_index=True)

    # Some journals list a single author with multiple affiliations as multiple authors each
    # with a single affiliation. Those rows are folded into one row per author here.
    if not collected.duplicated(subset=['sc_author_id']).any():
        return collected

    print('found a duplicated author (i.e with multiple affiliations)')

    def plus_join(values):
        return '+'.join(values)

    per_author = collected.groupby('sc_author_id')
    joined_ids = per_author['sc_author_affil_id'].apply(plus_join)
    joined_names = per_author['sc_author_affil_indexed'].apply(plus_join)
    concatenated_obs = pd.concat([joined_ids, joined_names], axis=1).reset_index()

    # The merge suffixes the original columns with _x and the joined ones with _y;
    # keep only the joined versions under the original names.
    with_concat_df = (
        pd.merge(collected, concatenated_obs, how='left', on='sc_author_id')
        .drop(['sc_author_affil_id_x', 'sc_author_affil_indexed_x'], axis=1)
        .rename(columns={
            'sc_author_affil_id_y' : 'sc_author_affil_id',
            'sc_author_affil_indexed_y' : 'sc_author_affil_indexed',
        })
    )
    return with_concat_df.drop_duplicates(subset=['sc_author_id'], keep='first')

In [11]:
def funding_obs_function(df_so_far, json_object, aa_funding_navigation_dict):
    """Populate the funding fields of an observation, one navigation entry at a time.

    Every key of ``aa_funding_navigation_dict`` is passed to ``field_population``
    in iteration order, and the frame returned by one call becomes the input of
    the next call. When the dictionary is empty, ``df_so_far`` is returned as is.
    """
    populated_df = df_so_far
    for funding_field in aa_funding_navigation_dict:
        # Thread the running frame through field_population for this field.
        populated_df = field_population(
            populated_df,
            json_object,
            aa_funding_navigation_dict,
            funding_field,
        )

    return populated_df

In [12]:
def references_function(references_object, doi, url):
    """Flatten an article's bibliography into a DataFrame with one row per reference.

    Parameters
    ----------
    references_object : dict
        Object holding the cited references under the key ``'reference'``
        (a list of reference dicts, or a single reference dict).
    doi
        Value written to the ``'doi'`` column of every row.
    url
        Value written to the ``'sc_abstract_api_endpoint'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``'doi'``, ``'sc_abstract_api_endpoint'``,
        ``'sc_article_cites_scopus_citation_text'``,
        ``'sc_article_cites_scopus_group_id'`` and
        ``'sc_article_cites_api_endpoint_list'``. A field that cannot be read
        from a reference holds the string ``"SCOPUS FAILURE"``. The frame is
        empty when there are no references.

    Raises
    ------
    KeyError
        If ``references_object`` has no ``'reference'`` entry.
    """
    failure_marker = "SCOPUS FAILURE"
    # Only missing keys / wrongly shaped nodes are treated as "field absent";
    # anything else is a real error and should surface.
    lookup_errors = (KeyError, TypeError, IndexError)

    references_list = references_object['reference']
    # Iterating a dict would walk its keys and yield one bogus failure row per key,
    # so a lone reference dict is wrapped into a one-element list.
    if isinstance(references_list, dict):
        references_list = [references_list]

    sc_article_cites_scopus_group_id_list = []
    sc_article_cites_scopus_citation_text_list = []
    sc_article_cites_api_endpoint_list = []

    for ref in references_list:
        # The bibliography returns itemids of scopus-group id types rather than scopus id type. I don't think it should matter long-run because the articles can still be accessed by the 'https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}' API. For more info see: https://silo.tips/download/sciverse-scopus-custom-data-documentation page 59.
        try:
            sc_article_cites_scopus_citation_text = ref['ref-fulltext']
        except lookup_errors:
            sc_article_cites_scopus_citation_text = failure_marker

        try:
            sc_article_cites_scopus_group_id = ref['ref-info']['refd-itemidlist']['itemid']['$']
        except lookup_errors:
            sc_article_cites_scopus_group_id = failure_marker

        # Without an id there is no endpoint to build.
        if sc_article_cites_scopus_group_id == failure_marker:
            sc_article_cites_api_endpoint = failure_marker
        else:
            sc_article_cites_api_endpoint = 'https://api.elsevier.com/content/abstract/scopus_id/{}'.format(sc_article_cites_scopus_group_id)

        sc_article_cites_scopus_group_id_list.append(sc_article_cites_scopus_group_id)
        sc_article_cites_scopus_citation_text_list.append(sc_article_cites_scopus_citation_text)
        sc_article_cites_api_endpoint_list.append(sc_article_cites_api_endpoint)

    # 'doi' and 'url' are scalars and get broadcast to the length of the lists.
    # The endpoint column must be fed the per-reference list: passing the loop
    # variable would repeat the last reference's endpoint on every row and fail
    # with NameError when there are no references at all.
    article_cites_df_temp = pd.DataFrame({
        'doi' : doi,
        'sc_abstract_api_endpoint' : url,
        'sc_article_cites_scopus_citation_text' : sc_article_cites_scopus_citation_text_list,
        'sc_article_cites_scopus_group_id' : sc_article_cites_scopus_group_id_list,
        'sc_article_cites_api_endpoint_list' : sc_article_cites_api_endpoint_list
    })

    return article_cites_df_temp

In [None]:
## JOURNAL LEVEL CHECK API CALL
# Manual sanity check: send one search request for a single journal (by ISSN)
# and display the raw JSON of the first page of results.

human_query_issn = 'ISSN({issn_str})'.format(issn_str='00129682')
human_query_date = '{start_year_str}-{current_year_str}'.format(start_year_str=1990,
                                                                current_year_str=2021)


url = 'https://api.elsevier.com/content/search/scopus'

call_query = {
                'httpAccept' : 'application/json',
                'query' : human_query_issn,
                'date' : human_query_date,
                'count' : '25',
                'cursor' : '*',
                'view' : 'COMPLETE'
            }

# THIS IS WHERE WE CALL THE API TO GET THE LIST OF ARTICLES (x AT A TIME BY USING 'cursor/@next')
# A timeout is set because requests waits indefinitely by default, which would
# hang the kernel if the server stops responding. 30 seconds is a generous bound.
test = requests.get(url,
                headers=req_headers,
                params=call_query,
                timeout=30
                )

test.json()



In [157]:
# ARTICLE/ABSTRACT LEVEL CHECK API CALL
# Manual sanity check: fetch a single abstract record by its scopus id and
# display the raw JSON so its structure can be inspected.
abstract_url = 'https://api.elsevier.com/content/abstract/scopus_id/85116396273'

# A timeout is set because requests waits indefinitely by default, which would
# hang the kernel if the server stops responding. 30 seconds is a generous bound.
abstract_test = requests.get(abstract_url,
    headers = req_headers,
    params = {
        'httpAccept' : 'application/json',
        'view' : 'FULL'
    },
    timeout = 30)

x = abstract_test.json()
x

{'abstracts-retrieval-response': {'item': {'ait:process-info': {'ait:status': {'@state': 'update',
     '@type': 'core',
     '@stage': 'S300'},
    'ait:date-delivered': {'@day': '12',
     '@timestamp': '2022-01-12T19:09:46.000046-05:00',
     '@year': '2022',
     '@month': '01'},
    'ait:date-sort': {'@day': '01', '@year': '2021', '@month': '12'}},
   'bibrecord': {'head': {'author-group': {'affiliation': {'country': 'South Africa',
       '@afid': '60197573',
       '@country': 'zaf',
       'city': 'Johannesburg',
       'organization': [{'$': 'Centre for Competition'},
        {'$': 'Regulation and Economic Development'},
        {'$': 'College of Business and Economics'},
        {'$': 'University of Johannesburg'}],
       'affiliation-id': [{'@afid': '60197573', '@dptid': '127012301'},
        {'@afid': '60000717'}],
       '@affiliation-instance-id': '2013838812-a4c3d7058e1565447edba12fa91f6f46',
       'ce:source-text': 'Centre for Competition, Regulation and Economic Deve

In [158]:
# Inspect the response headers (e.g. the X-RateLimit-* quota counters).
# The response echoes the API key back in 'X-ELS-APIKey', so that value is masked
# before display to keep the secret out of the saved notebook output.
{
    header_name: ('<redacted>' if header_name.lower() == 'x-els-apikey' else header_value)
    for header_name, header_value in abstract_test.headers.items()
}

{'Date': 'Mon, 25 Jul 2022 15:38:04 GMT', 'Content-Type': 'application/json;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'allow': 'GET', 'Content-Encoding': 'gzip', 'Last-Modified': 'Thu, 13 Jan 2022 18:22:38 GMT', 'Vary': 'Accept-Encoding, Origin, Access-Control-Request-Method, Access-Control-Request-Headers', 'X-ELS-APIKey': '<redacted>', 'X-ELS-ReqId': 'b7fa3abd2ea4dad0', 'X-ELS-ResourceVersion': 'default', 'X-ELS-Status': 'OK', 'X-ELS-TransId': 'd0a19fa5f1fcadb1', 'X-RateLimit-Limit': '10000', 'X-RateLimit-Remaining': '10000', 'X-RateLimit-Reset': '1659324982', 'CF-Cache-Status': 'DYNAMIC', 'Expect-CT': 'max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"', 'Server': 'cloudflare', 'CF-RAY': '7305fb028c4e8704-ORD'}