In [33]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import re
import math
import time
import random
from PyPDF2 import PdfFileReader
import os

In [None]:
# STEP 0: Read in a df of scholar-level observations
# The 'index' column stored in the CSV is dropped right after loading.
scholars_df = (
    pd.read_csv('nber_scholars/nber_scholars.csv', index_col=None)
    .drop(columns=['index'])
)

In [2]:
# STEP 0: Generate a list of available proxies. (Order is scrambled to prevent other people using them)
# Each line of proxies.txt is split on ':' into four fields which are rearranged into
# 'http://{field 3}:{field 4}@{field 1}:{field 2}' (presumably host:port:user:password -> user:password@host:port).
with open('proxies.txt', 'r') as proxies_file:
    # The context manager closes the file handle once the raw lines are read.
    proxies_list_scrambled = proxies_file.readlines()
# Strip the line terminator so it does not end up inside the last field (and therefore inside the proxy URL),
# and skip blank lines, which have no fields to index.
proxies_list_split = [line.strip().split(':') for line in proxies_list_scrambled if line.strip()]
proxies_list_cleaned = ['http://{}:{}@{}:{}'.format(x[2], x[3], x[0], x[1]) for x in proxies_list_split]

In [None]:
# HELPER FUNCTION: Takes in a JSON/dict (API response) and returns the 'totalResults' field as an int.
# Note that 'totalResults' can be bigger than the number of results in this response object as the response object is capped to a max of 100 results (Hence we iterate over subsequent pages later.)
def get_total_expected_papers(response_json):
    """Look up the 'totalResults' entry of an API response.

    Args:
        response_json: Mapping parsed from the API response body.

    Returns:
        The value stored under 'totalResults', passed through unchanged
        (no type conversion is applied), or None when the key is absent.
    """
    return response_json.get('totalResults')

In [None]:
# HELPER FUNCTION: Takes in an existing df and a JSON/dict (API response). This function unpacks the JSON and reformats it as a pd.df. It then concatenates the new df to the extant one. Returns the concatenated (most up-to-date) df.
def response_to_df(input_df, json_response):
    """Append the records of one API response to an existing DataFrame.

    Args:
        input_df: DataFrame of observations collected so far (left unmodified).
        json_response: Mapping parsed from the API response; its 'results'
            entry is read as a sequence of records.

    Returns:
        A new DataFrame with input_df's rows followed by the response's
        records, re-indexed from 0.
    """
    page_records = pd.DataFrame.from_records(json_response.get('results'))
    return pd.concat([input_df, page_records], ignore_index=True)

In [None]:
# HELPER FUNCTION: Given a URL and an existing df of papers, this function queries the URL (an API endpoint) and returns an updated df of papers (or None when the API does not answer with status 200).
def get_papers_by_api(url, existing_df):
    """Fetch one page of API results through a random proxy and append it to existing_df.

    Args:
        url: API endpoint to query.
        existing_df: DataFrame of results collected so far; it is not modified.

    Returns:
        A new DataFrame holding existing_df's rows followed by this page's
        results when the endpoint answers with HTTP 200. For any other status
        a failure message is printed and None is returned, so callers must
        check for None before using the result.
    """
    response = requests.get(url, proxies=generate_random_proxy_tunnel())
    if response.status_code != 200:
        print('FAILING OUT JSON RESPONSE FAILED ON: {}'.format(url))
        return None

    # Only decode the body once we know the call succeeded.
    new_df = response_to_df(existing_df, response.json())
    print("\tSUCCESS")
    return new_df

In [4]:
# HELPER FUNCTION: Before actually making a call (to the API or a download), randomly select one of the first 100 proxies generated above. This reduces the site's ability to track our downloading/calling behavior. All internet traffic (for this query/download) is passed through this proxy as an intermediary.
def generate_random_proxy_tunnel(proxy_pool=None, max_working_proxies=100):
    """Pick a random proxy and wrap it in the mapping format `requests` expects.

    Args:
        proxy_pool: Sequence of proxy URLs to choose from. Defaults to the
            module-level `proxies_list_cleaned`.
        max_working_proxies: Only the first this-many entries of the pool are
            eligible (see the note below). Defaults to 100.

    Returns:
        dict: {'https': <proxy url>} for use as the `proxies=` argument.

    Raises:
        ValueError: If there is no eligible proxy to choose from.
    """
    ### IT APPEARS THAT ONLY THE FIRST 100 PROXY SERVERS ARE WORKING?
    if proxy_pool is None:
        proxy_pool = proxies_list_cleaned
    # Cap at the pool size so a short proxy list cannot be indexed out of range.
    candidate_count = min(len(proxy_pool), max_working_proxies)
    if candidate_count <= 0:
        raise ValueError('no proxies available to choose from')
    # randint(n) draws from 0..n-1, so the very first proxy is eligible too.
    rand_proxy = proxy_pool[np.random.randint(candidate_count)]
    proxy_obj = {'https': rand_proxy}
    return proxy_obj
    

In [None]:
# STEP 1: For each scholar in scholars_df that we read in, we generate a temporary scholar-level df with paper-level observations. Those scholar-level papers are then concatenated to get a complete paper-level df.
api_call_template_url = 'https://www.nber.org/api/v1/generic_listing/uid/{}/contentType,contentType,contentType,contentType,contentType,contentType,contentType,contentType/working_paper,book,chapter,dataset,interview,lecture,center_paper,article/search?page=1&perPage=100'

# Scholar-level frames are collected in a list and concatenated once at the end
# (re-concatenating inside the loop copies the growing frame on every iteration).
scholar_paper_frames = []
# uids of scholars whose collection failed or stopped early, kept for a later retry.
failed_scholar_uids = []
total_scholars = len(scholars_df)

for position, row in enumerate(scholars_df.index.tolist(), start=1):
    scholar_name = scholars_df.loc[row, 'name']
    scholar_uid = scholars_df.loc[row, 'id']
    # The API is addressed by the first run of digits found in the id.
    effective_id_match = re.search(r'\d+', str(scholar_uid))
    if effective_id_match is None:
        print('NO NUMERIC ID FOUND FOR SCHOLAR: {}'.format(scholar_uid))
        failed_scholar_uids.append(scholar_uid)
        continue
    scholar_effective_id = effective_id_match.group()

    api_call_url = api_call_template_url.format(scholar_effective_id)
    # A. Check if the API has responded correctly (through the proxy)
    random_proxy = generate_random_proxy_tunnel()
    try:
        init_response = requests.get(api_call_url, proxies=random_proxy)
    except requests.RequestException:
        # Skip this scholar: carrying on would reuse the previous scholar's response (or none at all).
        print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))
        failed_scholar_uids.append(scholar_uid)
        continue

    if init_response.status_code != 200:
        # If not, report it and move on to the next scholar.
        print("FAILING OUT JSON RESPONSE FAILED ON: {}".format(api_call_url))
        failed_scholar_uids.append(scholar_uid)
        continue

    # If so, figure out if we will need to make multiple calls to collect all papers (for a single scholar.) Each call has a max. 100 papers in the response.
    init_response_json = init_response.json()
    expected_total_papers = get_total_expected_papers(init_response_json) or 0
    print('NUMBER OF TOTAL WORK PRODUCTS AS BEING AFFILIATED WITH {}: {}'.format(scholar_name.upper(), expected_total_papers))

    # B. For each page (of length=100) format the API response and concatenate the results to the existing scholar-level df of papers. Keep doing this until we have all papers or run out of expected pages.
    scholar_df = response_to_df(pd.DataFrame(), init_response_json)

    expected_total_pages = int(math.ceil(expected_total_papers / 100))
    # Bounded by the expected page count so an empty or failing page cannot loop forever.
    for page_number in range(2, expected_total_pages + 1):
        if len(scholar_df) >= expected_total_papers:
            break
        new_url = re.sub(r'page=\d+', r'page=' + str(page_number), api_call_url)
        print('ATTEMPTING TO CALL ON {} PAGE {this}/{total}'.format(scholar_name.upper(), this=page_number, total=expected_total_pages))
        updated_scholar_df = get_papers_by_api(new_url, scholar_df)
        if updated_scholar_df is None:
            # The helper signals failure with None; keep what was collected so far.
            failed_scholar_uids.append(scholar_uid)
            break
        scholar_df = updated_scholar_df

    scholar_df['assigned_author'] = scholar_name
    scholar_df['assigned_author_uid'] = scholar_uid
    # C. Queue the scholar-level df for the paper-level df of all papers.
    scholar_paper_frames.append(scholar_df)

    random_sleep_interval = random.randint(1,10)
    # print('SLEEPING FOR {} SECONDS'.format(random_sleep_interval))
    time.sleep(random_sleep_interval/10)
    print('COMPLETED {this}/{total} SCHOLARS (NB some scholars may require multiple calls)'.format(this=position, total=total_scholars))

if scholar_paper_frames:
    all_papers_df = pd.concat(scholar_paper_frames, ignore_index=True)
else:
    all_papers_df = pd.DataFrame()

print("NUMBER OF ALL NBER WORK PRODUCTS IDENTIFIED: {}".format(len(all_papers_df)))
# D. Drop all of the non-working paper observations (lectures, videos, blog posts etc.)
if 'displaytypename' in all_papers_df.columns:
    all_papers_df = all_papers_df[all_papers_df.displaytypename == 'Working Paper']
print("NUMBER OF ALL NBER WORKING PAPERS IDENTIFIED: {}".format(len(all_papers_df)))


In [8]:
# STEP 2: Modify the paper-level df a little bit to get ready for adding some supplementary data that will be scraped from the paper-level web page (the abstract, disclosures, doi, issue_date etc.)
### SET UP BEFORE ADVANCING TO DOWNLOADING PAPERS: 
# all_papers_df = pd.read_csv('nber_scholars/nber_affiliated_scholar_paper.csv')

# Work on an independent copy: when all_papers_df is the result of a row filter,
# adding columns to it directly can trigger pandas' SettingWithCopyWarning.
all_papers_df = all_papers_df.copy()

# Placeholder columns, filled in later from each paper's web page.
for placeholder_column in ['page_url', 'pdf_url', 'abstract', 'disclosures', 'doi', 'issue_date']:
    all_papers_df[placeholder_column] = None

try:
    # Moves the current index into a regular column and renumbers the rows from 0.
    all_papers_df = all_papers_df.reset_index()
except ValueError:
    # Raised by pandas when the column it wants to insert already exists (i.e. on a re-run of this cell).
    print("Index has already been rest... moving on")

all_papers_df.to_csv('nber_scholars/nber_affiliated_scholar_paper.csv', index=False, encoding='utf-8')


In [3]:
# HELPER FUNCTION: See if this page has been scraped once before (to account for co-authorships, multi-program authors, etc.) If so, return the already-recorded observation. If not, proceed.
def check_for_previous_collection(page_url, df, row):
    """Check whether `page_url` already appears in the rows that precede `row`.

    Args:
        page_url: URL of the paper page about to be scraped.
        df: Paper-level DataFrame with a 'page_url' column.
        row: Number of leading rows of `df` to search (the caller passes the
            label of the row being processed, which equals its position when
            the index is 0..n-1).

    Returns:
        (True, observation) where observation is the first earlier row whose
        'page_url' equals `page_url`, or (False, None) when there is none.
    """
    earlier_rows = df.head(row)
    matches = earlier_rows[earlier_rows.page_url == page_url]
    if matches.empty:
        return False, None
    # Hand back the earlier observation, not the row currently being processed,
    # so the caller can copy the fields that were already scraped.
    return True, matches.iloc[0]

In [None]:
# STEP 3: Adding additional data to the paper-level df. Some of the info is only available on the page itself (and not through the API). This includes the paper's doi, the abstract, some acknowledgements/disclosure data (different from the first-footnote thanks disclosure)
all_papers_df = pd.read_csv('nber_scholars/nber_affiliated_scholar_paper.csv')

# IF WE HAVE A FAILURE PART WAY THROUGH USE THE FOLLOWING READ INSTEAD OF THE ABOVE:
# all_papers_df = pd.read_csv('all_papers_df_temp_backup.csv')

# Columns that were saved empty are typically read back as all-NaN numeric columns; hold them
# as generic objects so the strings assigned below do not clash with the column dtype.
scraped_columns = ['page_url', 'pdf_url', 'abstract', 'disclosures', 'doi', 'issue_date']
all_papers_df = all_papers_df.astype({column: object for column in scraped_columns})

# Make sure the download folder exists before the first PDF is written.
os.makedirs('nber_working_papers', exist_ok=True)

# A row counts as collected once its 'page_url' holds a string. The URL is only written
# after the row's metadata has been obtained, so a row whose request failed is retried on a re-run.
for row in all_papers_df.index.tolist():
    print(row)
    if type(all_papers_df.loc[row, 'page_url']) is str:
        print('b) ALREADY DOWNLOADED ROW {}.... PROCEEDING'.format(row))
        continue
    print('HAVE YET TO DO DOWNLOAD FOR ROW {}'.format(row))

    url = all_papers_df.loc[row, 'url']
    print(url)

    code_match = re.search(r'(w|t|h)\d+', str(url))
    if code_match is None:
        print('NO WORKING PAPER CODE FOUND IN URL: {}'.format(url))
        continue
    working_paper_code = code_match.group()
    page_url = 'https://www.nber.org' + str(url)
    pdf_url = 'https://www.nber.org/system/files/working_papers/{}/{}.pdf'.format(working_paper_code, working_paper_code)

    previously_collected_status, observation = check_for_previous_collection(page_url, all_papers_df, row)
    if previously_collected_status:
        print('THIS PAPER HAS ALREADY BEEN COLLECTED ONCE BEFORE')
        abstract = observation.abstract
        disclosure = observation.disclosures
        doi = observation.doi
        issue_date = observation.issue_date
    else:
        # 1. first collect info from the page
        random_proxy = generate_random_proxy_tunnel()
        try:
            page = requests.get(page_url, proxies=random_proxy)
        except requests.RequestException:
            print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))
            continue
        if page.status_code != 200:
            print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))
            continue

        bs_content = bs(page.content, 'html')

        # Any element may be missing from a given page; record None rather than crash.
        abstract_node = bs_content.find('div', class_='page-header__intro-inner')
        abstract = abstract_node.text if abstract_node is not None else None
        disclosure_node = bs_content.find('div', id='accordion-body-guid1')
        disclosure = disclosure_node.text if disclosure_node is not None else None

        # Reset per paper so values from the previous row can never leak into this one.
        doi = None
        issue_date = None
        for item in bs_content.find_all('div', class_='page-header__citation-item'):
            doi_match = re.search(r'(?<=DOI )(.*)', item.text)
            issue_date_match = re.search(r'(?<=Issue Date )(.*)', item.text)
            if doi_match:
                doi = doi_match.group()
            elif issue_date_match:
                issue_date = issue_date_match.group()

        # 2. Then download the paper itself
        path_to_save = 'nber_working_papers/' + working_paper_code + '.pdf'
        # The file is created before the request on purpose: if the download fails, the empty
        # file stays behind and is picked up (and retried) by the PDF validity check in STEP 4.
        with open(path_to_save, 'wb') as file:
            random_proxy = generate_random_proxy_tunnel()
            try:
                pdf = requests.get(pdf_url, proxies=random_proxy)
                file.write(pdf.content)
            except requests.RequestException:
                print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))

    all_papers_df.loc[row, 'page_url'] = page_url
    all_papers_df.loc[row, 'pdf_url'] = pdf_url
    all_papers_df.loc[row, 'abstract'] = abstract
    all_papers_df.loc[row, 'disclosures'] = disclosure
    all_papers_df.loc[row, 'doi'] = doi
    all_papers_df.loc[row, 'issue_date'] = issue_date

    # Periodic checkpoint so a crash part way through does not lose everything.
    if row % 100 == 0:
        all_papers_df.to_csv('all_papers_df_temp_backup.csv', index=False, encoding='utf-8')

all_papers_df.to_csv('nber_scholars/nber_affiliated_scholar_paper.csv', index=False, encoding='utf-8')

In [32]:
# STEP 4: A final attempt at cleaning up any failed downloads. For every downloaded paper that is already stored locally, we try to open it. If the paper does not successfully open, then we try to download it again.

# List the folder once so the reported count and the loop below look at the same files.
nber_working_papers = os.listdir('nber_working_papers')
print('NUMBER OF IDENTIFIED LOCAL .PDF FILES TO CHECK: {}'.format(len(nber_working_papers)))

# A. For every paper in the local downloads folder
for paper in nber_working_papers:
    path = 'nber_working_papers//' + paper
    paper_id_match = re.search(r'(h|t|w)\d+', path)
    if paper_id_match is None:
        # Not named like a working paper (e.g. a stray system file); nothing to re-download.
        print('SKIPPING FILE WITHOUT A WORKING PAPER CODE: {}'.format(path))
        continue
    paper_id = paper_id_match.group()
    pdf_url = 'https://nber.org/system/files/working_papers/{}/'.format(paper_id) + paper

    # Try to read it in as a .pdf.
    try:
        PdfFileReader(path)
    # If it fails then try to download it again.
    except Exception as e:
        print('FAILED ON FILE:{}'.format(path))
        print(e)

        # Fetch first and only touch the local file once a real PDF has arrived,
        # so a failed retry neither truncates the file nor aborts the whole loop.
        random_proxy = generate_random_proxy_tunnel()
        try:
            pdf = requests.get(pdf_url, proxies=random_proxy)
        except requests.RequestException:
            print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))
            continue

        # A response without the PDF signature near its start (e.g. an HTML page) is not a paper.
        if b'%PDF-' not in pdf.content[:1024]:
            print('NO VALID PDF RETURNED FOR: {}'.format(pdf_url))
            continue

        with open(path, 'wb') as file:
            file.write(pdf.content)

    
### If the traceback error returns "PDF starts with '<!DOC', but '%PDF-' expected" then this means that the PDF has been removed from the NBER page (this can be due to a retraction or because of copyright agreements with the paper's eventual publisher). We build in additional error handling when trying to read the .pdf files.


8565




FAILED ON FILE:nber_working_papers//w12941.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w14094.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w23193.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28159.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28162.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28189.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28202.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28226.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28229.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28236.pdf
PDF starts with '<!DOC', but '%PDF-' expected
FAILED ON FILE:nber_working_papers//w28249.pdf
PDF starts with '<!DOC'