In [395]:
### LOADING NECESSARY PACKAGES
import json
import re
import time
import traceback
from datetime import date

import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm

In [6]:
# Accessing secure API keys
# The key is read from a local "env_keys.json" file (resolved against the working directory)
# rather than being hard-coded. The file is opened in a `with` block so the handle is closed
# as soon as the JSON has been parsed.
# NOTE: do not end this cell with a bare `scopus_key` expression - that echoes the secret
# into the saved notebook output.
with open("env_keys.json") as keys_file:
    keys_json = json.load(keys_file)
scopus_key = keys_json['scopus_key']

[output redacted: this cell output appears to have echoed the Scopus API key; clear the output before sharing and rotate the key]


In [45]:
### SCOPUS HEADERS
# Headers passed to the Scopus API requests; the API key loaded above travels in 'X-ELS-APIKey'.
req_headers = {'X-ELS-APIKey': scopus_key}

In [423]:
# Matches one subscript step of a navigation path: ['key'], ["key"] or [3].
_NAVIGATION_STEP = re.compile(r"""\[\s*(?:'([^']*)'|"([^"]*)"|(-?\d+))\s*\]\s*""")


def _parse_navigation_path(path):
    """Turn a subscript chain such as "['link'][0]['@href']" into ['link', 0, '@href'].

    Raises ValueError if the text is anything other than a chain of quoted-string
    or integer subscripts.
    """
    text = path.strip()
    steps = []
    position = 0
    while position < len(text):
        match = _NAVIGATION_STEP.match(text, position)
        if match is None:
            raise ValueError("unsupported navigation path: {!r}".format(path))
        single_quoted, double_quoted, index = match.groups()
        if index is not None:
            steps.append(int(index))
        elif single_quoted is not None:
            steps.append(single_quoted)
        else:
            steps.append(double_quoted)
        position = match.end()
    return steps


def _follow_navigation_path(json_object, path):
    """Return the value reached by applying each subscript of `path` to `json_object` in turn."""
    value = json_object
    for step in _parse_navigation_path(path):
        value = value[step]
    return value


def field_population(temp_df, json_obj, navigation_dict, field):
    """Fill one column of `temp_df` from an API response object without aborting on failure.

    This is used to fill out article-, author-, abstract- and funding-level observations
    field by field, so that one missing field does not lose the rest of the observation.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame holding the observation(s) being built; the column `field` is set on it.
    json_obj : dict or list
        The JSON object to explore (e.g. an article- or author-observation).
    navigation_dict : dict
        Maps a field name to a subscript chain written as text, e.g.
        "['link'][0]['@href']". Only quoted-string keys and integer indices are supported.
    field : str
        The field to populate; must be a key of `navigation_dict`.

    Returns
    -------
    pandas.DataFrame
        The same `temp_df` (modified in place). If anything goes wrong the column is set
        to the string 'SCOPUS FAILURE' and a notice is printed; no exception escapes.
    """
    # This is the json object that we want to explore. It could be an article- or an author-observation, for example.
    json_object = json_obj

    try:
        ## We have only a small number of special cases so that the general function holds as much as possible:
        if field == 'sc_author_affil_id':
            # SOMETIMES AN AUTHOR HAS MULTIPLE AFFILIATIONS (EG NBER AND HARVARD). IN THIS CASE, WE CONCATENATE ALL OF
            # THE AFFILIATIONS, DELIMITED BY '+' SYMBOLS AND STORE THEM AS A STRING (TO BE UNNESTED LATER)
            affiliation_ids = json_object['affiliation']['affiliation-id']
            if isinstance(affiliation_ids, list):
                temp_df[field] = '+'.join(affil['@afid'] for affil in affiliation_ids)
            else:
                temp_df[field] = _follow_navigation_path(json_object, navigation_dict[field])
        elif field in ('sc_funding_agency', 'sc_fudning_agency'):
            # The historical misspelling 'sc_fudning_agency' is still accepted so that any caller
            # relying on it keeps working; 'sc_funding_agency' is the name used by the navigation dicts.
            ## SOMETIMES THERE ARE MULTIPLE FUNDING AGENCIES. IN THIS CASE, WE CONCATENATE ALL OF THOSE AGENCIES,
            ## DELIMITED BY '+' SYMBOLS AND STORE THEM AS A STRING.
            funding_entries = json_object['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding']
            if isinstance(funding_entries, list):
                temp_df[field] = '+'.join(entry['xocs:funding-agency'] for entry in funding_entries)
            else:
                temp_df[field] = _follow_navigation_path(json_object, navigation_dict[field])
        else:
            temp_df[field] = _follow_navigation_path(json_object, navigation_dict[field])

    except Exception:
        # Deliberate best-effort: record the failure in the data and carry on with the next field.
        print("SCOPUS FAILURE ON: {}".format(field))
        print('------------------------------------------------------')
        temp_df[field] = 'SCOPUS FAILURE'

    return temp_df

In [421]:
# Given an issn and a year in which to begin, this function collects all articles from the SCOPUS Search API and returns a publication-specific df
def publication_collect(issn:str, start_year:str, journal_name:str):
    """Collect the articles the Scopus Search API returns for one journal.

    Parameters
    ----------
    issn : str
        ISSN of the journal, inserted into the query as ``ISSN(<issn>)``.
    start_year : str
        First year of the date range; the range ends at last calendar year.
    journal_name : str
        Human-readable journal name, used only in the progress messages.

    Returns
    -------
    pandas.DataFrame
        One row per article with the fields listed in `article_obj_nav_dict`
        plus 'sc_query_used' (the query link reported by the API for that page).
        Fields that could not be read hold the string 'SCOPUS FAILURE'.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (the status code is printed first).

    Notes
    -----
    Relies on the module-level `scopus_key` and `field_population`.
    """
    search_url = 'https://api.elsevier.com/content/search/scopus'
    page_size = '200'
    request_timeout = 60  # seconds; keeps a stalled connection from hanging the notebook forever

    #### FIRST WE NEED TO SEE HOW MANY ARTICLES WE NEED TO COLLECT FROM THIS PUBLICATION
    # How to construct a scopus query
    # Guide: https://dev.elsevier.com/sc_search_tips.html
    # Practice at: https://www.scopus.com/search/form.uri?display=advanced

    # NOTE(review): despite its name this is LAST year (today's year minus one) - presumably so that
    # only completed years are collected; confirm.
    current_year = date.today().year - 1
    human_query_issn = 'ISSN({issn_str})'.format(issn_str=issn)
    human_query_date = '{start_year_str}-{current_year_str}'.format(start_year_str=start_year,
                                                                    current_year_str=current_year)
    req_headers = {
        'X-ELS-APIKey' : scopus_key
    }

    def build_query(count, cursor):
        # Same query for the preliminary count and for every page; only count and cursor vary.
        return {
            'httpAccept' : 'application/json',
            'query' : human_query_issn,
            'date' : human_query_date,
            'count' : count,
            'cursor' : cursor
        }

    prelim_r = requests.get(search_url,
                            headers=req_headers,
                            params=build_query('1', '*'),
                            timeout=request_timeout)
    print("PRELIM {pub_name} API STATUS CODE: {status}".format(pub_name=journal_name, status=prelim_r.status_code))
    # Fail with the HTTP status instead of an obscure KeyError on the error payload.
    prelim_r.raise_for_status()
    article_count = int(prelim_r.json()['search-results']['opensearch:totalResults'])
    print("PRELIM COUNT OF ARTICLES FOUND BY API: {}".format(article_count))
    print("-----------------------------------------------------")

    ## THIS DICTIONARY SHOULD BE STRUCTURED IN ORDER OF IMPORTANCE. IE. PULLING THE API ENDPOINT IS MORE IMPORTANT THAN PULLING THE DOI, WHICH IS MORE IMPORTANT THE TITLE ETC.
    article_obj_nav_dict = {
        'sc_abstract_api_endpoint' : '["link"][0]["@href"]',
        'doi' : "['prism:doi']",
        'sc_title' : "['dc:title']",
        'sc_issn' : "['prism:issn']",
        'sc_pub_name' : "['prism:publicationName']",
        'sc_pub_date' : "['prism:coverDate']",
        'sc_open_access_status' : "['openaccessFlag']",
        'sc_vol' : "['prism:volume']",
        'sc_issue' : "['prism:issueIdentifier']",
        'sc_page_range' : "['prism:pageRange']",
        'sc_human_url' : "['link'][3]['@href']"
    }

    ### THE API WILL ONLY RETURN 200 ARTICLES AT TIME, SO WE PULL 200 RESULTS AT A TIME AS LONG AS THE CURRENT RESPONSE LINKS TO A POTENTIAL 'next' PAGE.
    page_frames = []       # one DataFrame per page, concatenated once at the end
    next_cursor = '*'      # '*' requests the first page
    call_number = 0
    while True:
        call_number += 1
        call_query = build_query(page_size, next_cursor)

        # THIS IS WHERE WE CALL THE API TO GET THE LIST OF ARTICLES (200 AT A TIME BY USING 'cursor/@next')
        r = requests.get(search_url,
                         headers=req_headers,
                         params=call_query,
                         timeout=request_timeout)
        print("---------------------------------------------")
        print("CALL {n} FOR {pub_name} AND {count} ARTICLES RETURNING STATUS CODE: {code}".format(pub_name=journal_name,
                                                                                    n=call_number,
                                                                                    count=call_query['count'],
                                                                                    code=r.status_code))
        r.raise_for_status()
        r_json = r.json()['search-results']

        # When the cursor no longer advances the page is treated as the end of the chain and is not unpacked.
        if r_json['cursor']['@current'] == r_json['cursor']['@next']:
            print("We have reached the end of the 'cursor-next' chain. breaking out of this pub")
            break
        print("Going to finish collecting this page and then there still at least one more to go.")
        next_cursor = r_json['cursor']['@next']

        # NOW WE BEGIN UNPACKING EACH BATCH OF 200
        sc_query_used = r_json['link'][0]['@href']
        r_results = r_json['entry']
        # Report the same call number as the status line above (it used to be one too high).
        print("{pub_name} CALL {n} FOR {count} ARTICLES FOUND RESULTS: {num}".format(pub_name=journal_name,
                                                                                    n=call_number,
                                                                                    count=call_query['count'],
                                                                                    num=len(r_results)))

        article_frames = []
        for article_obj in r_results:
            # CONSTRUCT A TEMP_DF THAT CONTAINS A SINGLE ARTICLE
            temp_df = pd.DataFrame({
                'doi' : [None],
                'sc_title' : [None],
                'sc_issn' : [None],
                'sc_pub_name' : [None],
                'sc_vol' : [None],
                'sc_issue' : [None],
                'sc_page_range' : [None],
                'sc_abstract_api_endpoint' : [None],
                'sc_human_url' : [None]
            })
            for field in article_obj_nav_dict:
                # TRY TO ACCESS EACH FIELD ACCORDING TO ITS SUBSCRIPT. IF UNAVAILABLE, CONTINUE WITH NOTATION OF FAILURE
                temp_df = field_population(temp_df, article_obj, article_obj_nav_dict, field)
            article_frames.append(temp_df)

        # One concat per page instead of one per article.
        call_df = pd.concat(article_frames, ignore_index=True) if article_frames else pd.DataFrame()

        # ADD THE CONSTRUCTED API-ENDPOINT QUERY THAT WAS USED TO GENERATE ALL OF THE RESULTS (ARTICLES) FOR THIS PAGE
        call_df['sc_query_used'] = sc_query_used
        page_frames.append(call_df)

        ### TIMER HERE TO ENSURE WE DON'T EXCEED THE SCOPUS API'S QUERY THROTTLE
        time.sleep(0.15)

    # INSTANTIATE AN EMPTY DATAFRAME IF NOTHING WAS COLLECTED, OTHERWISE STACK ALL PAGES
    pub_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
    return pub_df

In [422]:
#### MASTER RUN BLOCK HERE

# Journals to collect, keyed by a short publication code. Each entry carries the ISSN used in the
# Scopus query, the first year to collect, and the name printed in progress messages.
collection_dict = {
    'JPE' : dict(issn='00223808', start_year='1990', print_name='Journal of Political Economy'),
    'QJE' : dict(issn='00335533', start_year='1990', print_name='Quarterly Journal of Economics'),
    'AER' : dict(issn='00028282', start_year='1990', print_name='American Economic Review'),
    'RES' : dict(issn='00346527', start_year='1990', print_name='Review of Economic Studies'),
    'ECA' : dict(issn='00129682', start_year='1990', print_name='Econometrica'),
    'RJE' : dict(issn='07416261', start_year='1990', print_name='RAND Journal of Economics'),
}

# Storage for the publication-specific dfs, generated programmatically in collection order:
# pub_dict[<code>]['<code>_core_df' | '<code>_author_abstract_df' | '<code>_cites_df'], all None to start.
pub_dict = {
    pub: {
        '{}_{}'.format(pub, suffix): None
        for suffix in ('core_df', 'author_abstract_df', 'cites_df')
    }
    for pub in collection_dict
}

for pub in collection_dict:

    ##### 1. FIND ALL OF THE ARTICLES PUBLISHED IN THE JOURNALS OF INTEREST

    pub_code = pub
    pub_issn, pub_start_year, pub_name = (
        collection_dict[pub][key] for key in ('issn', 'start_year', 'print_name')
    )

    # The collection call is currently disabled (it was left commented out):
    # exec('{}_core_df = publication_collect("{}", "{}","{}")'.format(pub_code, pub_issn, pub_start_year, pub_name))
    # exec('pub_dict["{}"]["{}_core_df"] = {}_core_df'.format(pub_code,pub_code, pub_code))

    ##### 2. FIND THOSE ARTICLES' A) AUTHORS, ABSTRACTS, AND FUNDERS; AND B) PRIOR ARTICLES THAT THEY CITED


pub_dict


PRELIM Journal of Political Economy API STATUS CODE: 200
PRELIM COUNT OF ARTICLES FOUND BY API: 1331
-----------------------------------------------------
---------------------------------------------
CALL 1 FOR Journal of Political Economy AND 200 ARTICLES RETURNING STATUS CODE: 200
Going to finish collecting this page and then there still at least one more to go.
Journal of Political Economy CALL 2 FOR 200 ARTICLES FOUND RESULTS: 200
---------------------------------------------
CALL 2 FOR Journal of Political Economy AND 200 ARTICLES RETURNING STATUS CODE: 200
Going to finish collecting this page and then there still at least one more to go.
Journal of Political Economy CALL 3 FOR 200 ARTICLES FOUND RESULTS: 200
SCOPUS FAILURE ON: sc_vol
Traceback (most recent call last):
  File "C:\Users\Joshualevy\AppData\Local\Temp\ipykernel_1604\1523771043.py", line 33, in field_population
    exec('temp_df["{}"] = json_object{}'.format(field, navigation_dict[field]))
  File "<string>", line 1, 

{'JPE': {'JPE_core_df':                  doi                                           sc_title  \
  0     10.1086/716564      Financial development and international trade   
  1     10.1086/716563  Efficiency and foreclosure effects of vertical...   
  2     10.1086/716562  Child’s gender, young fathers’ crime, and spil...   
  3     10.1086/716559                   Forecast hedging and calibration   
  4     10.1086/716561  How does incarceration affect reoffending? Est...   
  ...              ...                                                ...   
  1326  10.1086/261727  Public policy and economic growth: developing ...   
  1327  10.1086/261732  Collectivization and China's agricultural cris...   
  1328  10.1086/261692  International evidence on the size of the rand...   
  1329  10.1086/261708  An estimate of a sectoral model of labor mobility   
  1330  10.1086/261724  Population growth and human capital investment...   
  
         sc_issn                   sc_pub_name sc_v

In [424]:
def _dig(container, keys, default=None):
    """Apply each subscript in `keys` in turn; return `default` if any step cannot be taken."""
    current = container
    try:
        for key in keys:
            current = current[key]
    except (KeyError, IndexError, TypeError):
        return default
    return current


#Given a pub_code (see 'collection_dict' keys), this function identifies the publication-specific df generated by 'publication_collect', and generates two new dfs: 1) authors and abstract and funding; and 2) the citations that the article of interest makes. It then merges these back on to the pub_df (1:m) and returns that updated pub_df. We ALWAYS merge on 'doi' because it is universal (so we can merge with other datasets later).
def abstract_references_collect(pub_code, max_articles=2):
    """Fetch authors/abstract/funding and cited references for a publication's articles.

    Parameters
    ----------
    pub_code : str
        Key into the module-level `pub_dict`; the frame stored under '<pub_code>_core_df' is used.
    max_articles : int or None, default 2
        Only the first `max_articles` rows of the core frame are queried. The default keeps the
        two-article limit this function has always had (apparently a development safeguard);
        pass None to process every article.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `author_abstract_funding_df`: the core frame left-merged on 'doi' with one row per
        author group (plus abstract and funding fields).
        `cites_df`: the core frame left-merged on 'doi' with one row per cited reference.

    Notes
    -----
    Relies on the module-level `pub_dict`, `req_headers` and `field_population`.
    Articles whose endpoint is unusable or whose request does not return status 200 are
    skipped (a message is printed) and therefore carry no merged data.
    """
    pub_df = pub_dict.get(pub_code).get('{}_core_df'.format(pub_code))

    request_timeout = 60  # seconds; keeps a stalled connection from hanging the notebook forever
    abstract_query = {
        'httpAccept' : 'application/json',
        'view' : 'FULL'
    }

    articles = pub_df[['doi', 'sc_abstract_api_endpoint']]
    if max_articles is not None:
        articles = articles.head(max_articles)

    # NOTE(review): each author-group yields ONE row built from the first entry of its 'author'
    # list ([0]); if a group can hold several authors the others are not captured - confirm.
    author_affil_navigation_dict = {
        'sc_author_id' : "['author'][0]['@auid']",
        'sc_author_given_name' : "['author'][0]['preferred-name']['ce:given-name']",
        'sc_author_last_name' : "['author'][0]['preferred-name']['ce:surname']",
        'sc_author_indexed_name' : "['author'][0]['preferred-name']['ce:indexed-name']",
        'sc_author_affil_id' : "['affiliation']['affiliation-id']['@afid']",
        'sc_author_affil_indexed' : "['affiliation']['ce:source-text']",
    }

    ### TRYING TO GET INFORMATION ON FUNDING IF ITS AVAILABLE
    # EXPECT QUITE A FEW FAILURES/ 'SCOPUS FAILURES' IN THIS SECTION
    aa_funding_navigation_dict = {
        'sc_grant_text' : "['abstracts-retrieval-response']['item']['bibrecord']['head']['grantlist']['grant-text']['$']",
        'sc_funding_text' : "['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding-list']['xocs:funding-text']",
        'sc_funding_agency' : "['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding']['xocs:funding-agency']",
    }

    cites_columns = [
        'doi',
        'sc_article_cites_scopus_citation_text',
        'sc_article_cites_scopus_group_id',
        'sc_article_cites_api_endpoint_list',
    ]

    author_frames = []   # one frame per article, concatenated once at the end
    cite_records = []    # one dict per cited reference

    # WE GO THROUGH THE SELECTED ARTICLES IN THIS PUBLICATION
    for doi, url in articles.itertuples(index=False, name=None):
        # field_population stores 'SCOPUS FAILURE' when the endpoint could not be read; that is not a URL.
        if not isinstance(url, str) or url == 'SCOPUS FAILURE':
            print("SKIPPING {}: NO USABLE ABSTRACT API ENDPOINT".format(doi))
            continue

        abstract_r = requests.get(url, headers=req_headers, params=abstract_query, timeout=request_timeout)
        if abstract_r.status_code != 200:
            print("SKIPPING {}: ABSTRACT API STATUS CODE: {}".format(doi, abstract_r.status_code))
            continue

        abstract_r_json = abstract_r.json()
        bibrecord = _dig(abstract_r_json, ['abstracts-retrieval-response', 'item', 'bibrecord'])

        ###################################
        # WE DO AUTHORS AND ABSTRACTS FIRST
        ###################################

        authors_object = _dig(bibrecord, ['head', 'author-group'])
        abstract_text = _dig(bibrecord, ['head', 'abstracts'], default='SCOPUS FAILURE')

        if authors_object is None:
            print("SCOPUS FAILURE ON: author-group")
            authors_object = []
        elif isinstance(authors_object, dict):
            # A single author group arrives as a dict rather than a one-element list.
            authors_object = [authors_object]

        group_frames = []
        for author_affil in authors_object:
            author_abstract_obs = pd.DataFrame({
                'sc_author_id' : [None],
                'sc_author_given_name' : [None],
                'sc_author_last_name' : [None],
                'sc_author_affil_id' : [None],
                'sc_author_affil_indexed' : [None],
            })
            for field in author_affil_navigation_dict:
                author_abstract_obs = field_population(author_abstract_obs, author_affil, author_affil_navigation_dict, field)
            group_frames.append(author_abstract_obs)

        aa_df_temp = pd.concat(group_frames, ignore_index=True) if group_frames else pd.DataFrame()
        aa_df_temp['sc_abstract_text'] = abstract_text
        aa_df_temp['doi'] = doi

        for field in aa_funding_navigation_dict:
            aa_df_temp = field_population(aa_df_temp, abstract_r_json, aa_funding_navigation_dict, field)

        author_frames.append(aa_df_temp)

        #################################
        # WE DO REFERENCES AND CITES NEXT
        #################################

        references_object = _dig(bibrecord, ['tail', 'bibliography'])
        if references_object is None:
            print("SCOPUS FAILURE ON: bibliography")
            references_list = []
        else:
            print(_dig(references_object, ['@refcount']))
            references_list = _dig(references_object, ['reference'], default=[])

        if references_list is None:
            references_list = []
        elif isinstance(references_list, dict):
            # Presumably a lone reference is delivered as a dict, like a lone author group - TODO confirm.
            references_list = [references_list]

        for ref in references_list:
            # The bibliography returns itemids of scopus-group id types rather than scopus id type. I don't think it should matter long-run because the articles can still be accessed by the 'https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}' API. For more info see: https://silo.tips/download/sciverse-scopus-custom-data-documentation page 59.
            citation_text = _dig(ref, ['ref-fulltext'], default="SCOPUS FAILURE")
            group_id = _dig(ref, ['ref-info', 'refd-itemidlist', 'itemid', '$'], default="SCOPUS FAILURE")

            if group_id == "SCOPUS FAILURE":
                api_endpoint = "SCOPUS FAILURE"
            else:
                api_endpoint = 'https://api.elsevier.com/content/abstract/scopus_id/{}'.format(group_id)

            # Each reference keeps ITS OWN endpoint (previously every row received the last one).
            cite_records.append({
                'doi' : doi,
                'sc_article_cites_scopus_citation_text' : citation_text,
                'sc_article_cites_scopus_group_id' : group_id,
                'sc_article_cites_api_endpoint_list' : api_endpoint,
            })

        time.sleep(0.15)

    # BUILD THE PUBLICATION-LEVEL DFS. Both always carry a 'doi' column so the merges below work
    # even when nothing could be collected.
    if author_frames:
        authors_abstracts_df = pd.concat(author_frames, ignore_index=True)
    else:
        authors_abstracts_df = pd.DataFrame({'doi' : pd.Series(dtype=object)})
    article_cites_df = pd.DataFrame(cite_records, columns=cites_columns)

    # MERGE THE AUTHORS/ABSTRACTS AND REFERENCES DATA ONTO THE THE CORE ARTICLE DATA (AT THE pub_df LEVEL)
    author_abstract_funding_df = pd.merge(pub_df, authors_abstracts_df, how='left', on='doi')
    cites_df = pd.merge(pub_df, article_cites_df, how='left', on='doi')

    return author_abstract_funding_df, cites_df
        

    

# jpe_author_abstract_funding_df, jpe_cites_df = abstract_references_collect("JPE")



In [None]:
# Scratch cell for poking at the Scopus APIs by hand.
#
# Example endpoints:
#   search page   : https://api.elsevier.com/content/search/scopus?cursor=AoJR%2FoJNMjItczIuMC04NTA2NDEzMTMxNg%3D%3D&count=200&query=ISSN%2800223808%29&date=1990-2021
#   abstract      : https://api.elsevier.com/content/abstract/scopus_id/85117518428
#   with grants?  : https://api.elsevier.com/content/abstract/doi/10.1086/708815
#   with funds?   : https://api.elsevier.com/content/abstract/scopus_id/85117518428
#
# Disabled search-query experiment, kept for reference:
# test_hash = 'AoJV7PRLMTItczIuMC0wMDI1NTg2MTMy'
# human_query_issn = 'ISSN({issn_str})'.format(issn_str='00223808')
# human_query_date = '{start_year_str}-{current_year_str}'.format(start_year_str=1990,
#                                                                     current_year_str=2021)
# call_query = {
#                 'httpAccept' : 'application/json',
#                 'query' : human_query_issn,
#                 'date' : human_query_date,
#                 'count' : '10',
#                 'cursor' : '*',
#                 'view' : 'COMPLETE'
#             }

# Fetch the FULL view of a single abstract by DOI (this is NOT the paged article search).
# An optional 'field' parameter was tried and left disabled:
#   'field' : 'dc:title,item,prism:doi,authors,xocs:meta,dc:description'
test = requests.get(
    'https://api.elsevier.com/content/abstract/doi/10.1086/708815',
    params={'httpAccept' : 'application/json', 'view' : 'FULL'},
    headers=req_headers,
)

# Show the whole parsed response; to look at the authors only, use
# test.json()['abstracts-retrieval-response']['item']['bibrecord']['head']['author-group']
test.json()

