In [None]:
import requests
import math
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import re

In [None]:
# Per-program download instructions, keyed by a short program code.
#   program_name          -- the program's display name; it is also the value used
#                            in the URL's `user_programs` facet and the value the
#                            export cell filters the parsed `programs` column on.
#   program_api_start_url -- page 1 of the scholar-listing API for that program.
# Every program requests the same page size (perPage=100) so that paging is
# uniform across programs.
instructions_dict = {
    'LABOR': {
        'program_name': 'Labor Studies',
        'program_api_start_url': 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3ALabor%20Studies&page=1&perPage=100&sortBy=alpha'
    },
    'FINANCE': {
        'program_name': 'Corporate Finance',
        'program_api_start_url': 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3ACorporate%20Finance&page=1&perPage=100&sortBy=alpha'
    },
    'IO': {
        'program_name': 'Industrial Organization',
        'program_api_start_url': 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3AIndustrial%20Organization&page=1&perPage=100&sortBy=alpha'
    },
    'LAWECON': {
        'program_name': 'Law and Economics',
        'program_api_start_url': 'https://www.nber.org/api/v1/scholar_listing/_/_/_/_/search?facet=user_programs%3ALaw%20and%20Economics&page=1&perPage=100&sortBy=alpha'
    }
}

In [None]:
# Filled by the download cell: program key -> DataFrame of that program's scholars.
storage_dict = dict()

In [None]:
def get_total_expected_scholars(json_response):
    """Return the value stored under 'totalResults' in a parsed API response.

    Returns None when the key is absent.
    """
    total_results = json_response.get('totalResults')
    return total_results


In [None]:
def response_to_df(input_df, json_response):
    """Append the records under the response's 'results' key to ``input_df``.

    A new DataFrame with a fresh 0..n-1 index is returned; ``input_df`` itself
    is left untouched.
    """
    page_df = pd.DataFrame.from_records(json_response.get('results'))
    return pd.concat([input_df, page_df], ignore_index=True)

In [None]:
def get_scholars_by_api(url, existing_df, timeout=30):
    """Fetch one page of the scholar listing and append it to ``existing_df``.

    Parameters
    ----------
    url : str
        Fully formed listing-API URL for the page to fetch.
    existing_df : pandas.DataFrame
        Rows collected so far. It is not modified; a new frame is returned.
    timeout : float, optional
        Seconds to wait on the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        ``existing_df`` with the fetched page's records appended.

    Raises
    ------
    RuntimeError
        If the server answers with any status other than HTTP 200. Previously
        this case printed a message and implicitly returned None, which only
        surfaced later as a TypeError in the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print('FAILING OUT JSON RESPONSE FAILED ON: {}'.format(url))
        raise RuntimeError(
            'Request returned HTTP {} for {}'.format(response.status_code, url)
        )

    new_df = response_to_df(existing_df, response.json())
    print("\tSUCCESS")
    return new_df

In [None]:
# Download every page of the scholar listing for each configured program and
# store the combined frame in storage_dict[program].
REQUEST_TIMEOUT_SECONDS = 30   # do not let a stalled connection hang the cell
DEFAULT_RESULTS_PER_PAGE = 100  # used only when the start URL carries no perPage

for program, program_config in instructions_dict.items():
    print('WORKING ON: {}'.format(program))
    program_name = program_config.get('program_name')
    program_api_start_url = program_config.get('program_api_start_url')

    init_response = requests.get(program_api_start_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Check the status *before* parsing: an error page may not be JSON, and
    # continuing without a total would reuse the previous program's count.
    if init_response.status_code != 200:
        print("FAILING OUT JSON RESPONSE FAILED ON: {}".format(program))
        raise RuntimeError(
            'Initial request for {} ({}) returned HTTP {}'.format(
                program, program_name, init_response.status_code
            )
        )
    init_response_json = init_response.json()
    expected_total_scholars = get_total_expected_scholars(init_response_json)
    if expected_total_scholars is None:
        raise RuntimeError(
            'First response for {} ({}) has no totalResults'.format(program, program_name)
        )
    print('NUMBER OF SCHOLARS IDENTIFIED AS BEING AFFILIATED WITH {} PROGRAM: {}'.format(program, expected_total_scholars))

    program_df = response_to_df(pd.DataFrame(), init_response_json)

    # Derive the page size from the URL itself so the page count is right even
    # when a program is configured with a different perPage value.
    per_page_match = re.search(r'[?&]perPage=(\d+)', program_api_start_url)
    results_per_page = int(per_page_match.group(1)) if per_page_match else DEFAULT_RESULTS_PER_PAGE

    page_number = 1
    expected_total_pages = int(math.ceil(expected_total_scholars / results_per_page))
    print('TOTAL PAGES TO CALL FOR {} :{}\nSUCCESS ON CALLING PAGE 1'.format(program, expected_total_pages))
    current_df_length = len(program_df)

    # Bounded by the page count as well as the row count, so a listing that
    # returns fewer rows than totalResults cannot loop forever.
    while current_df_length < expected_total_scholars and page_number < expected_total_pages:
        page_number += 1
        # \d+ so the substitution stays correct for multi-digit page numbers.
        new_url = re.sub(r'&page=\d+', '&page={}'.format(page_number), program_api_start_url)
        print('ATTEMPTING TO CALL ON {} PAGE {this}/{total}'.format(program, this=page_number, total=expected_total_pages))
        program_df = get_scholars_by_api(new_url, program_df)
        if program_df is None:
            # The page helper signalled a failed request by returning nothing.
            raise RuntimeError(
                'Page {} of {} ({}) could not be downloaded'.format(page_number, program, program_name)
            )
        current_df_length = len(program_df)

    if current_df_length < expected_total_scholars:
        print('WARNING: {} RETURNED {} OF {} EXPECTED SCHOLARS'.format(program, current_df_length, expected_total_scholars))

    # Keep publishing the per-program `<KEY>_df` notebook globals (e.g. LABOR_df)
    # that this cell has always created, without resorting to exec().
    globals()['{}_df'.format(program)] = program_df
    storage_dict[program] = program_df
    

        

In [97]:
# --- Combine the per-program downloads into one frame -----------------------
for program in storage_dict:
    print('CONCATENATING DF FOR {}'.format(program))
# One concat over all frames instead of growing the frame inside the loop.
program_frames = list(storage_dict.values())
complete_df = pd.concat(program_frames, ignore_index=True) if program_frames else pd.DataFrame()

# A scholar affiliated with several programs of interest is returned by each of
# those program queries. Drop the repeated records so that exploding below
# yields one row per (scholar, program) rather than one per download.
# Comparing string forms keeps this safe if a column holds lists or dicts.
complete_df = complete_df.loc[~complete_df.astype(str).duplicated()].reset_index(drop=True)

# --- Parse the program names out of the `programs` markup --------------------
# Each program name is the text between `">` and `</a>`; values with no such
# text (including missing values) become an empty list.
program_link_text = re.compile(r'(?<=\">).*?(?=</a>)')
complete_df['programs'] = complete_df['programs'].map(
    lambda programs_markup: program_link_text.findall(str(programs_markup))
)

# --- Keep only the configured programs ---------------------------------------
programs_of_interest_list = [
    program_config.get('program_name') for program_config in instructions_dict.values()
]

# One row per (scholar, program). reset_index() is kept (non-inplace) so the
# exported CSV still carries the leading `index` column it has always had.
complete_df = complete_df.explode('programs', ignore_index=True).reset_index()
complete_df = complete_df[complete_df['programs'].isin(programs_of_interest_list)]
complete_df = complete_df.rename(columns={'title': 'name'})

complete_df.to_csv('nber_affiliated_scholars.csv', index=False, encoding='utf8')

CONCATENATING DF FOR LABOR
CONCATENATING DF FOR FINANCE
CONCATENATING DF FOR IO
CONCATENATING DF FOR LAWECON
