In [102]:
import requests
import math
import pandas as pd
import re

In [None]:
# STEP 0: Configuration. Add one sub-dictionary per NBER program of interest.
# Each entry is keyed by a short program code of your choosing and holds:
#   - 'program_name'          : the program name exactly as NBER spells it.
#   - 'program_api_start_url' : the page-1 URL of the scholar-listing API for that program.
# Keep `page=1` in the start URL (STEP 1 rewrites the page number to walk through later pages) and use the
# same `perPage` value for every program so the page arithmetic in STEP 1 stays consistent.
instructions_dict = {
    # 'PERSONAL PROGRAM CODE' : {
    #     'program_name' : 'Program name as NBER uses',
    #     'program_api_start_url' : 'url is revealed in the way that the NBER site dynamically hydrates the page. Check the inspector and the XHR requests to see the format and substitute the program name for the program of interest.'
    # },
    'LABOR': {
        'program_name' : 'Labor Studies',
        'program_api_start_url' : 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3ALabor%20Studies&page=1&perPage=100&sortBy=alpha'
    },
    'FINANCE' : {
        'program_name' : 'Corporate Finance',
        'program_api_start_url' : 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3ACorporate%20Finance&page=1&perPage=100&sortBy=alpha'
    },
    'IO' : {
        'program_name' : 'Industrial Organization',
        'program_api_start_url' : 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3AIndustrial%20Organization&page=1&perPage=100&sortBy=alpha'
    },
    'LAWECON' : {
        'program_name' : 'Law and Economics',
        # perPage was 50 here while every other program (and STEP 1's page arithmetic) uses 100.
        'program_api_start_url' : 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3ALaw%20and%20Economics&page=1&perPage=100&sortBy=alpha'
    }
}
# Filled in by STEP 1: maps each program code above to its program-level DataFrame of scholars.
storage_dict = {}

In [None]:
# HELPER FUNCTION: Takes in a JSON/dict (API response) and returns the 'totalResults' field as an int.
def get_total_expected_scholars(json_response):
    """Return the total number of scholars the API reports for the search.

    Note that 'totalResults' can be bigger than the number of results carried by
    this one response object, because each response is capped to a single page
    of results. (Hence we iterate over subsequent pages later.)

    Args:
        json_response: Decoded JSON body (a dict) of a scholar-listing API response.

    Returns:
        The 'totalResults' value coerced to int, or None when the field is absent
        (or null) in the response.

    Raises:
        ValueError, TypeError: If 'totalResults' is present but cannot be
            converted to an int.
    """
    total_results = json_response.get('totalResults')
    if total_results is None:
        return None
    # Coerce so callers can safely do arithmetic even if the API serialises the count as a string.
    return int(total_results)


In [None]:
# HELPER FUNCTION: Takes in an existing df and a JSON/dict (API response). This function unpacks the JSON and reformats it as a pd.DataFrame. It then concatenates the new df to the extant one. Returns the concatenated (most up-to-date) df.
def response_to_df(input_df, json_response):
    """Append the records found under 'results' in an API response to a DataFrame.

    Args:
        input_df: DataFrame of the scholars collected so far (may be empty).
        json_response: Decoded JSON body (a dict) of an API response; its
            'results' entry is read as a list of records.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the rows of
        `input_df` followed by the rows built from 'results'. `input_df` itself
        is not modified. When 'results' is missing, null or empty, the rows of
        `input_df` are returned unchanged.
    """
    api_results = json_response.get('results') or []
    if not api_results:
        # Nothing to append: avoid handing None to from_records and keep the index contract.
        return input_df.reset_index(drop=True)
    temp_df = pd.DataFrame.from_records(api_results)
    return pd.concat([input_df, temp_df], ignore_index=True)

In [None]:
# HELPER FUNCTION: Given a URL and an existing df of scholars, this function queries the URL (an API-endpoint) and returns an updated df of scholars.
def get_scholars_by_api(url, existing_df):
    """Fetch one page from the API and append its scholars to `existing_df`.

    Args:
        url: Fully formed API endpoint URL for the page to fetch.
        existing_df: DataFrame of the scholars collected so far.

    Returns:
        The DataFrame produced by `response_to_df(existing_df, <decoded JSON>)`.

    Raises:
        RuntimeError: If the endpoint answers with a status code other than 200.
            (Previously this path only printed a message and implicitly returned
            None, which made the caller fail later on `len(None)`.)
        requests.exceptions.RequestException: On connection problems or timeout.
    """
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print('FAILING OUT JSON RESPONSE FAILED ON: {}'.format(url))
        raise RuntimeError(
            'API request failed with status {} for {}'.format(response.status_code, url)
        )

    new_df = response_to_df(existing_df, response.json())
    print("\tSUCCESS")
    return new_df

In [None]:
# STEP 1: For each program identified in `instructions_dict`, this loop identifies the number of affiliated scholars and generates a program-level df with scholar-level observations.

for program, program_config in instructions_dict.items():
    print('WORKING ON: {}'.format(program))
    program_name = program_config.get('program_name')
    program_api_start_url = program_config.get('program_api_start_url')

    # A. Check that the API has responded correctly *before* decoding the body: an error page may not be
    #    valid JSON, and continuing after a failure would reuse values left over from the previous program.
    init_response = requests.get(program_api_start_url, timeout=30)
    if init_response.status_code != 200:
        print("FAILING OUT JSON RESPONSE FAILED ON: {}".format(program))
        raise RuntimeError(
            'Initial API request for {} ({}) failed with status {}'.format(
                program, program_name, init_response.status_code
            )
        )
    init_response_json = init_response.json()

    # Figure out if we will need to make multiple calls to collect all scholars (each call returns one page).
    expected_total_scholars = get_total_expected_scholars(init_response_json)
    if expected_total_scholars is None:
        raise RuntimeError('API response for {} did not report "totalResults"'.format(program))
    expected_total_scholars = int(expected_total_scholars)
    print('NUMBER OF SCHOLARS IDENTIFIED AS BEING AFFILIATED WITH {} PROGRAM: {}'.format(program, expected_total_scholars))

    # The page size comes from the start URL's `perPage` parameter; fall back to 100 (the value this
    # notebook previously hard-coded) when the URL does not carry one.
    per_page_match = re.search(r'[?&]perPage=(\d+)', program_api_start_url)
    page_size = int(per_page_match.group(1)) if per_page_match else 100

    # B. For each page, format the API response and concatenate the results to the existing df of scholars.
    #    Keep doing this until we have all scholars (the df is no shorter than the expected number of
    #    scholars). Store the resulting program-level df.
    program_df = response_to_df(pd.DataFrame(), init_response_json)
    page_number = 1
    expected_total_pages = int(math.ceil(expected_total_scholars / page_size))
    print('TOTAL PAGES TO CALL FOR {} :{}\nSUCCESS ON CALLING PAGE 1'.format(program, expected_total_pages))
    current_df_length = len(program_df)
    while current_df_length < expected_total_scholars:
        page_number += 1
        # `\d+` so the substitution stays correct even if the start URL's page number has several digits.
        new_url = re.sub(r'&page=\d+', '&page={}'.format(page_number), program_api_start_url, count=1)
        print('ATTEMPTING TO CALL ON {} PAGE {this}/{total}'.format(program, this=page_number, total=expected_total_pages))
        updated_df = get_scholars_by_api(new_url, program_df)
        if updated_df is None or len(updated_df) <= current_df_length:
            # A page that adds nothing means further pages will not help either; stop instead of looping forever.
            print('STOPPING {}: PAGE {} RETURNED NO NEW SCHOLARS ({}/{} COLLECTED)'.format(
                program, page_number, current_df_length, expected_total_scholars))
            break
        program_df = updated_df
        current_df_length = len(program_df)

    storage_dict[program] = program_df
    # Still expose `<PROGRAM>_df` as a notebook-level variable (as the earlier exec-based version did), but
    # without exec, so program codes that are not valid Python identifiers no longer break the loop.
    globals()['{}_df'.format(program)] = program_df
    

        

In [None]:
# STEP 2: Do some clean up and then concatenate each of the program-level dfs into a single, large scholar-level df with scholar-level observations for subsequent use.

if not storage_dict:
    # Fail with a clear message instead of an obscure KeyError on 'programs' further down.
    raise RuntimeError('storage_dict is empty -- run STEP 1 before STEP 2')

# Collect the program-level dfs and concatenate them once (concatenating inside the loop re-copies all
# previously accumulated rows on every iteration).
program_dfs = []
for program, program_df in storage_dict.items():
    print('CONCATENATING DF FOR {}'.format(program))
    program_dfs.append(program_df)
complete_df = pd.concat(program_dfs, ignore_index=True)

# A. For each scholar, convert the link-formatted programs that each scholar is affiliated with, into a list of readable strings (non-link-format).
#    The pattern captures the text between `">` and `</a>`; values without such links (e.g. missing values,
#    which str() renders as 'nan') become an empty list.
program_link_text_pattern = re.compile(r'(?<=\">).*?(?=</a>)')
complete_df['programs'] = complete_df['programs'].map(
    lambda cell: program_link_text_pattern.findall(str(cell))
)

# B. Identify all of the programs that we want to keep. (I.e. we don't want an observation of Scholar X in Program Alpha if we are only interested in Program Beta even if Scholar X is in Programs Alpha and Beta)
programs_of_interest_list = [
    program_config.get('program_name') for program_config in instructions_dict.values()
]

# Filter out the scholar-program observations that are of programs that we have no interest in.
# explode() yields one row per (scholar, program) pair.
complete_df = complete_df.explode('programs', ignore_index=True)
# NOTE(review): reset_index() without drop=True adds an 'index' column (the exploded row number) that ends
# up in the CSV. It is kept so the output schema is unchanged -- confirm whether any consumer needs it.
complete_df = complete_df.reset_index()

complete_df = complete_df[complete_df['programs'].isin(programs_of_interest_list)]
# A scholar can be returned by several program queries; keep one row per (scholar id, program).
complete_df = complete_df.drop_duplicates(subset=['id', 'programs'])
complete_df = complete_df.rename(columns={
    'title' : 'name'
})
# C. Output to .csv
complete_df.to_csv('nber_affiliated_scholars.csv', index=False, encoding='utf-8')