In [364]:
### LOADING NECESSARY PACKAGES
import json
import re
import time
from datetime import date

import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm

In [6]:
# Accessing secure API keys
# The key is read from a separate JSON file instead of being written into the notebook source.
# The `with` block guarantees the file handle is closed once the JSON has been parsed.
with open("env_keys.json") as keys_file:
    keys_json = json.load(keys_file)
scopus_key = keys_json['scopus_key']

[output cleared: this cell output echoed the Scopus API key; rotate the key if this notebook has been shared]


In [45]:
### SCOPUS HEADERS
# Headers sent with the Scopus requests below; the API key travels in 'X-ELS-APIKey'.
req_headers = dict([('X-ELS-APIKey', scopus_key)])

In [123]:
# Building a Scopus search query
# Syntax guide: https://dev.elsevier.com/sc_search_tips.html
# Try queries interactively at: https://www.scopus.com/search/form.uri?display=advanced
#
# Pass the query string as-is: do NOT run it through urllib.parse.____ first,
# because the requests package already URL-encodes the params for you.
human_query = "ISSN('{}')".format('00223808')


### SCOPUS ARTICLE QUERY
req_query = dict(
    httpAccept='application/json',
    query=human_query,
    date='1990-2022',
    count='200',
    start='1201',
)


In [None]:


# Exploratory call to the Scopus Search endpoint using the query built above.
# A timeout is set so a stalled connection raises instead of hanging the kernel forever.
r = requests.get('https://api.elsevier.com/content/search/scopus',
    headers=req_headers,
    params=req_query,
    timeout=60)
print(r.status_code)


In [None]:
# Only the last expression of a cell is displayed, so a bare `r.json()` above it
# would just parse the body a second time and throw the result away.
r.json()['search-results']

In [365]:
# Matches one subscript step of a navigation path: ['key'], ["key"] or [3].
_SUBSCRIPT_TOKEN = re.compile(r"""\s*\[\s*(?:'([^']*)'|"([^"]*)"|(-?\d+))\s*\]""")


def _parse_subscript_path(path):
    """Turn a subscript string such as '["link"][0]["@href"]' into ['link', 0, '@href'].

    Raises ValueError when the string contains anything other than a chain of
    quoted-string or integer subscripts.
    """
    path = path.strip()
    keys = []
    pos = 0
    while pos < len(path):
        match = _SUBSCRIPT_TOKEN.match(path, pos)
        if match is None:
            raise ValueError("Unsupported subscript path: {!r}".format(path))
        single_quoted, double_quoted, index = match.groups()
        if index is not None:
            keys.append(int(index))
        elif single_quoted is not None:
            keys.append(single_quoted)
        else:
            keys.append(double_quoted)
        pos = match.end()
    return keys


def field_population(temp_df, json_obj, navigation_dict, field):
    """Copy one field from an API response object into a column of ``temp_df``.

    This helps fill out article-level observations from the API response
    without one missing field blowing up the whole collection: if the value
    cannot be reached, the column is set to the sentinel 'SCOPUS FAILURE' and
    processing continues.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame to write into; it is modified in place and also returned.
    json_obj : dict or list
        Parsed JSON object to navigate.
    navigation_dict : dict
        Maps each field name to a subscript path written as a string, e.g.
        ``"['prism:doi']"`` or ``'["link"][0]["@href"]'``. Only chains of
        quoted-string / integer subscripts are supported; anything else is
        reported as a failure for that field.
    field : str
        Key of ``navigation_dict`` to populate; also the destination column name.

    Returns
    -------
    pandas.DataFrame
        ``temp_df`` with column ``field`` set to the value found, or to
        'SCOPUS FAILURE' when the lookup or the assignment failed.
    """
    try:
        # Walk the path with ordinary indexing rather than building source code
        # and exec-ing it: no code is generated from data, and field names that
        # contain quote characters work.
        value = json_obj
        for key in _parse_subscript_path(navigation_dict[field]):
            value = value[key]
        temp_df[field] = value
    except Exception:
        # Deliberately broad: any missing key / bad index / unassignable value is
        # recorded as a per-field failure so the rest of the article is kept.
        print("SCOPUS FAILURE ON: {}".format(field))
        temp_df[field] = 'SCOPUS FAILURE'

    return temp_df

In [366]:
def publication_collect(issn:str, start_year:str, journal_name:str):
    """Collect all articles of one journal from the Scopus Search API.

    Given an ISSN and a year in which to begin, this pages through the search
    results 200 at a time (following the response's 'cursor'/'@next' value)
    and returns a publication-specific DataFrame with one row per article.

    Parameters
    ----------
    issn : str
        ISSN of the journal, inserted into the query as ``ISSN(<issn>)``.
    start_year : str
        First year of the date range to search.
    journal_name : str
        Human-readable journal name; used only in progress messages.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns listed in the navigation dict
        below plus 'sc_query_used'. Fields that could not be read hold the
        'SCOPUS FAILURE' sentinel written by ``field_population``. An empty
        DataFrame is returned when no page was unpacked.
    """
    search_url = 'https://api.elsevier.com/content/search/scopus'
    request_timeout = 60  # seconds; without it a stalled connection blocks forever

    #### FIRST WE NEED TO SEE HOW MANY ARTICLES WE NEED TO COLLECT FROM THIS PUBLICATION
    # NOTE: despite its name, `current_year` is the previous calendar year, so the
    # date range ends with last year.
    current_year = date.today().year - 1
    human_query_issn = 'ISSN({issn_str})'.format(issn_str=issn)
    human_query_date = '{start_year_str}-{current_year_str}'.format(start_year_str=start_year,
                                                                    current_year_str=current_year)
    req_headers = {
        'X-ELS-APIKey' : scopus_key
    }

    prelim_query = {
        'httpAccept' : 'application/json',
        'query' : human_query_issn,
        'date' : human_query_date,
        'count' : '1',
        'cursor' : '*'
    }

    prelim_r = requests.get(search_url,
                            headers=req_headers,
                            params=prelim_query,
                            timeout=request_timeout)
    print("PRELIM {pub_name} API STATUS CODE: {status}".format(pub_name=journal_name, status=prelim_r.status_code))
    prelim_json = prelim_r.json()
    article_count = int(prelim_json['search-results']['opensearch:totalResults'])
    print("PRELIM COUNT OF ARTICLES FOUND BY API: {}".format(article_count))
    print("-----------------------------------------------------")

    ## THIS DICTIONARY SHOULD BE STRUCTURED IN ORDER OF IMPORTANCE. IE. PULLING THE API ENDPOINT IS MORE IMPORTANT THAN PULLING THE DOI, WHICH IS MORE IMPORTANT THE TITLE ETC.
    # It does not change between pages, so it is built once, outside the paging loop.
    article_obj_nav_dict = {
        'sc_abstract_api_endpoint' : '["link"][0]["@href"]',
        'doi' : "['prism:doi']",
        'sc_title' : "['dc:title']",
        'sc_issn' : "['prism:issn']",
        'sc_pub_name' : "['prism:publicationName']",
        'sc_pub_date' : "['prism:coverDate']",
        'sc_open_access_status' : "['openaccessFlag']",
        'sc_vol' : "['prism:volume']",
        'sc_issue' : "['prism:issueIdentifier']",
        'sc_page_range' : "['prism:pageRange']",
        'sc_human_url' : "['link'][3]['@href']"
    }

    # Frames are gathered in lists and concatenated once; concatenating inside the
    # loops re-copies everything collected so far on every article.
    page_frames = []

    ### THE API WILL ONLY RETURN 200 ARTICLES AT TIME, SO WE LOOP, PULLING 200 RESULTS AT A TIME AS LONG AS THE CURRENT RESPONSE LINKS TO A POTENTIAL 'next' PAGE.
    cursor = '*'  # '*' requests the first page; later pages use the previous response's '@next'
    nth_call = 0
    while True:
        nth_call += 1
        if nth_call > 1:
            print("cursor next hash value: {}".format(cursor))
        call_query = {
            'httpAccept' : 'application/json',
            'query' : human_query_issn,
            'date' : human_query_date,
            'count' : '200',
            'cursor' : cursor
        }

        # THIS IS WHERE WE CALL THE API TO GET THE LIST OF ARTICLES (200 AT A TIME BY USING 'cursor/@next')
        r = requests.get(search_url,
                         headers=req_headers,
                         params=call_query,
                         timeout=request_timeout)
        print("---------------------------------------------")
        print("CALL {n} FOR {pub_name} AND {count} ARTICLES RETURNING STATUS CODE: {code}".format(pub_name=journal_name,
                                                                                    n=nth_call,
                                                                                    count=call_query['count'],
                                                                                    code=r.status_code))

        r_json = r.json()['search-results']

        # NOTE(review): the page on which '@current' equals '@next' is dropped without
        # being unpacked - confirm the API only does this on an empty trailing page.
        if r_json['cursor']['@current'] == r_json['cursor']['@next']:
            print("We have reached the end of the 'cursor-next' chain. breaking out of this pub")
            break
        print("Going to finish collecting this page and then there still at least one more to go.")
        cursor = r_json['cursor']['@next']

        # NOW WE BEGIN UNPACKING EACH BATCH OF 200 TO STORE IN A TEMP DF
        sc_query_used = r_json['link'][0]['@href']
        r_results = r_json.get('entry', [])  # tolerate a page that carries no 'entry' list
        # Same call number as the status line above (the two used to disagree by one).
        print("{pub_name} CALL {n} FOR {count} ARTICLES FOUND RESULTS: {num}".format(pub_name=journal_name,
                                                                                    n=nth_call,
                                                                                    count=call_query['count'],
                                                                                    num=len(r_results)))

        article_frames = []
        for article_obj in r_results:
            # CONSTRUCT A TEMP_DF THAT CONTAINS A SINGLE ARTICLE
            temp_df = pd.DataFrame({
                'doi' : [None],
                'sc_title' : [None],
                'sc_issn' : [None],
                'sc_pub_name' : [None],
                'sc_vol' : [None],
                'sc_issue' : [None],
                'sc_page_range' : [None],
                'sc_abstract_api_endpoint' : [None],
                'sc_human_url' : [None]
            })
            for field in article_obj_nav_dict:
                # TRY TO READ EACH FIELD ACCORDING TO ITS SUBSCRIPT. IF UNAVAILABLE, field_population RECORDS THE FAILURE AND WE CONTINUE
                temp_df = field_population(temp_df, article_obj, article_obj_nav_dict, field)
            article_frames.append(temp_df)

        call_df = pd.concat(article_frames, ignore_index=True) if article_frames else pd.DataFrame()

        # ADD TO THE call_df THE CONSTRUCTED API-ENDPOINT QUERY THAT WAS USED TO GENERATE ALL OF THE RESULTS (ARTICLES) FOR THIS call_df
        call_df['sc_query_used'] = sc_query_used
        page_frames.append(call_df)

        ### TIMER HERE TO ENSURE WE DON'T EXCEED THE SCOPUS API'S QUERY THROTTLE
        time.sleep(0.15)

    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)

In [None]:
#### MASTER RUN BLOCK HERE. BE CAREFUL
# Running this cell calls publication_collect (and therefore the Scopus API) for
# every journal that is enabled in collection_dict. Un-comment an entry to include it.
collection_dict = {
    'JPE' : {
        'issn' : '00223808',
        'start_year' : '1990',
        'print_name' : 'Journal of Political Economy'
    },
    # 'QJE' : {
    #     'issn' : '00335533',
    #     'start_year' : '1990',
    #     'print_name' : 'Quarterly Journal of Economics'
    # }, 
    # 'AER' : {
    #     'issn' : '00028282',
    #     'start_year' : '1990',
    #     'print_name' : 'American Economic Review'
    # },
    # 'RST' : {
    #     'issn' : '00346527',
    #     'start_year' : '1990',
    #     'print_name' : 'Review of Economic Studies' 
    # },
    # 'ECN' : {
    #     'issn' : '00129682',
    #     'start_year' : '1990',
    #     'print_name' : 'Econometrica'
    # }
}

# This is the dictionary that we are going to use to access the publication-specific dfs
pub_dict = {}

for pub_code, pub_info in collection_dict.items():
    # Call the collector directly instead of exec-ing a formatted source string;
    # the exec version broke on any value containing a double quote.
    collected_df = publication_collect(pub_info['issn'],
                                       pub_info['start_year'],
                                       pub_info['print_name'])
    pub_dict[pub_code] = collected_df
    # Backward compatibility: the exec-based version also left a '<code>_df'
    # global (e.g. JPE_df) behind, so keep publishing it under that name.
    globals()['{}_df'.format(pub_code)] = collected_df


# pub_dict


# pub_dict


In [None]:
def _as_list(obj):
    """Return ``obj`` as a list: None -> [], a single dict -> [dict], anything else -> list(obj)."""
    if obj is None:
        return []
    if isinstance(obj, dict):
        return [obj]
    return list(obj)


def _author_group_record(aa_json):
    """Flatten one 'author-group' entry into the author / affiliation columns.

    NOTE(review): only the first element of the group's 'author' list is read, as
    in the original code - confirm a group never carries more than one author.
    """
    first_author = aa_json['author'][0]
    preferred_name = first_author['preferred-name']
    affiliation = aa_json['affiliation']
    affil_id_node = affiliation['affiliation-id']

    # SOMETIMES AN AUTHOR HAS MULTIPLE AFFILIATIONS (EG NBER AND HARVARD). IN THIS CASE, WE CONCATENATE ALL OF THE AFFILIATIONS, DELIMITED BY '+' SYMBOLS AND STORE THEM AS A STRING (TO BE UNNESTED LATER)
    if isinstance(affil_id_node, list):
        # The joined string used to be computed and then thrown away.
        sc_author_affil_id = '+'.join(obj['@afid'] for obj in affil_id_node)
    else:
        sc_author_affil_id = affil_id_node['@afid']

    return {
        'sc_author_id' : first_author['@auid'],
        'sc_author_given_name' : preferred_name['ce:given-name'],
        'sc_author_last_name' : preferred_name['ce:surname'],
        'sc_author_indexed_name' : preferred_name['ce:indexed-name'],
        'sc_author_affil_id' : sc_author_affil_id,
        'sc_author_affil_indexed' : affiliation['ce:source-text']
    }


def abstract_references_collect(pub_code, max_articles=10):
    """Attach author/abstract rows and reference rows to a publication's articles.

    Given a pub_code (see the 'collection_dict' keys), this looks up the
    publication-specific df stored in ``pub_dict``, requests each article's
    'sc_abstract_api_endpoint', and builds two frames: 1) authors and abstract,
    and 2) the citations that the article makes. Both are merged back onto the
    publication df (1:m) on 'doi'.

    Parameters
    ----------
    pub_code : str
        Key into ``pub_dict``.
    max_articles : int or None, default 10
        Only the first ``max_articles`` rows of the publication df are
        requested (10 reproduces the previous hard-coded ``head(10)``).
        Pass None to process every article.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(author_abstract_funding_df, cites_df)``: the full publication df
        left-merged with the author/abstract rows and with the reference rows.

    Raises
    ------
    KeyError
        If ``pub_code`` has no entry in ``pub_dict``.
    """
    if pub_code not in pub_dict:
        raise KeyError("No collected publication df for pub_code {!r}".format(pub_code))
    pub_df = pub_dict[pub_code]

    abstract_query = {
        'httpAccept' : 'application/json'
    }

    articles = pub_df[['doi', 'sc_abstract_api_endpoint']]
    if max_articles is not None:
        articles = articles.head(max_articles)

    # Explicit column lists keep both frames mergeable on 'doi' even when no
    # article could be processed.
    author_columns = [
        'doi', 'abstract',
        'sc_author_id', 'sc_author_given_name', 'sc_author_last_name', 'sc_author_indexed_name',
        'sc_author_affil_id', 'sc_author_affil_indexed',
        'sc_funding_inst_id', 'sc_funding_inst_name', 'sc_funding_text'
    ]
    cites_columns = ['doi', 'sc_article_cites_scopus_group_id', 'sc_article_cites_api_endpoint_list']

    author_records = []
    cite_records = []

    # WE ARE GOING TO GO THROUGH EVERY SELECTED ARTICLE IN THIS PUBLICATION
    for doi, url in zip(articles['doi'], articles['sc_abstract_api_endpoint']):
        # field_population stores 'SCOPUS FAILURE' when the endpoint could not be
        # read; requesting that would raise, so such rows are skipped.
        if not isinstance(url, str) or not url.startswith('http'):
            print("SKIPPING {}: no usable abstract endpoint ({!r})".format(doi, url))
            continue

        abstract_r = requests.get(url, headers=req_headers, params=abstract_query, timeout=60)
        if abstract_r.status_code != 200:
            continue

        bibrecord = abstract_r.json()['abstracts-retrieval-response']['item']['bibrecord']

        ###################################
        # WE DO AUTHORS AND ABSTRACTS FIRST
        ###################################
        head = bibrecord['head']
        abstract_text = head['abstracts']

        # 'author-group' is a single dict or a list of dicts; _as_list lets both
        # shapes share one code path.
        for aa_json in _as_list(head['author-group']):
            record = {'doi' : doi, 'abstract' : abstract_text}
            record.update(_author_group_record(aa_json))
            # Funding is not extracted yet. The columns are kept as placeholders;
            # the former empty lists made the DataFrame constructor fail with a
            # length mismatch as soon as an article had an author.
            record['sc_funding_inst_id'] = None
            record['sc_funding_inst_name'] = None
            record['sc_funding_text'] = None
            author_records.append(record)

        #################################
        # WE DO REFERENCES AND CITES NEXT
        #################################
        references_object = bibrecord['tail']['bibliography']
        print(references_object['@refcount'])

        # NOTE(review): _as_list is applied defensively in case a lone reference
        # comes back as a dict like 'author-group' does - confirm against the API.
        for ref in _as_list(references_object['reference']):
            # The bibliography returns itemids of scopus-group id types rather than scopus id type. I don't think it should matter long-run because the articles can still be accessed by the 'https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}' API. For more info see: https://silo.tips/download/sciverse-scopus-custom-data-documentation page 59.
            sc_article_cites_scopus_group_id = ref['ref-info']['refd-itemidlist']['itemid']['$']
            cite_records.append({
                'doi' : doi,
                'sc_article_cites_scopus_group_id' : sc_article_cites_scopus_group_id,
                # One endpoint per reference (previously every row received the last one).
                'sc_article_cites_api_endpoint_list' : 'https://api.elsevier.com/content/abstract/scopus_id/{}'.format(sc_article_cites_scopus_group_id)
            })

        ### TIMER HERE TO ENSURE WE DON'T EXCEED THE SCOPUS API'S QUERY THROTTLE
        time.sleep(0.15)

    authors_abstracts_df = pd.DataFrame(author_records, columns=author_columns)
    article_cites_df = pd.DataFrame(cite_records, columns=cites_columns)

    # MERGE THE AUTHORS/ABSTRACTS AND REFERENCES DATA ONTO THE THE CORE ARTICLE DATA (AT THE pub_df LEVEL)
    author_abstract_funding_df = pd.merge(pub_df, authors_abstracts_df, how='left', on='doi')
    cites_df = pd.merge(pub_df, article_cites_df, how='left', on='doi')

    return author_abstract_funding_df, cites_df
        

    

# Build the JPE author/abstract frame and the JPE citations frame in one call.
(jpe_author_abstract_funding_df,
 jpe_cites_df) = abstract_references_collect("JPE")



In [363]:
# https://api.elsevier.com/content/search/scopus?cursor=AoJR%2FoJNMjItczIuMC04NTA2NDEzMTMxNg%3D%3D&count=200&query=ISSN%2800223808%29&date=1990-2021
# https://api.elsevier.com/content/abstract/scopus_id/85117518428

# test_hash = 'AoJV7PRLMTItczIuMC0wMDI1NTg2MTMy'

# human_query_issn = 'ISSN({issn_str})'.format(issn_str='00223808')
# human_query_date = '{start_year_str}-{current_year_str}'.format(start_year_str=1990,
#                                                                     current_year_str=2021)


# call_query = {
#                 'httpAccept' : 'application/json',
#                 'query' : human_query_issn,
#                 'date' : human_query_date,
#                 'count' : '10',
#                 'cursor' : '*',
#                 'view' : 'COMPLETE'
#             }
        
## Grants? : https://api.elsevier.com/content/abstract/doi/10.1086/708815
## Funds? : https://api.elsevier.com/content/abstract/scopus_id/85117518428


# Scratch request: fetch the FULL-view abstract record of a single article by DOI,
# restricted to the listed fields, so the raw response structure can be inspected.
# The timeout keeps a stalled connection from hanging the kernel.
test = requests.get('https://api.elsevier.com/content/abstract/doi/10.1086/717891',
                headers=req_headers,
                params= {
                    'httpAccept' : 'application/json',
                    'view' : 'FULL',
                    'field' : 'dc:title,item,prism:doi,authors,xocs:meta,dc:description'
                },
                timeout=60)


# test.json()['abstracts-retrieval-response']['item']['bibrecord']['head']['author-group']
test.json()



{'abstracts-retrieval-response': {'coredata': {'dc:description': '© 2022 The University of Chicago. All rights reserved.The vast majority of the pay inequality in organizations comes from differences in pay between employees and their bosses. But are employees aware of these pay disparities? Are employees demotivated by this inequality? To address these questions, we conducted a natural field experiment with a sample of 2,060 employees from a multibillion-dollar corporation in Southeast Asia. We document large misperceptions among employees about the salaries of their managers and smaller but still significant misperceptions of the salaries of their peers, and we show that these perceptions have a significant causal effect on the employees’ own behavior.',
   'dc:title': 'How Much Does Your Boss Make? The Effects of Salary Comparisons',
   'prism:doi': '10.1086/717891'},
  'authors': {'author': [{'ce:given-name': 'Zoë',
     'preferred-name': {'ce:given-name': 'Zoë',
      'ce:initials