In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import re
import math
import time
import random

In [None]:
# Load the list of NBER-affiliated scholars and discard its 'index' column.
scholars_df = (
    pd.read_csv('nber_affiliated_scholars.csv', index_col=None)
    .drop(columns=['index'])
)

In [2]:
# Build the pool of proxy URLs from proxies.txt.
# Each non-blank line is split on ':' into four fields which are rearranged
# from field0:field1:field2:field3 into 'http://field2:field3@field0:field1'
# (i.e. credentials first, then host and port).
with open('proxies.txt', 'r') as proxies_file:
    # strip() drops the trailing newline; without it the newline ends up in
    # the middle of the URL (right after the last field). Blank lines are
    # skipped because they cannot be split into four fields.
    proxies_list_scrambled = [line.strip() for line in proxies_file if line.strip()]
proxies_list_split = [line.split(':') for line in proxies_list_scrambled]
proxies_list_cleaned = [
    'http://{}:{}@{}:{}'.format(parts[2], parts[3], parts[0], parts[1])
    for parts in proxies_list_split
]

In [None]:
def get_total_expected_papers(response_json):
    """Return the value stored under 'totalResults' in an API response.

    ``response_json`` is looked up with ``.get``, so a missing key yields
    ``None`` rather than raising.
    """
    return response_json.get('totalResults')

In [None]:
def response_to_df(input_df, json_response):
    """Append the records found under 'results' in ``json_response`` to ``input_df``.

    The records are turned into a DataFrame and concatenated below the rows
    of ``input_df``. A new frame with a renumbered index is returned;
    ``input_df`` itself is left untouched.
    """
    page_frame = pd.DataFrame.from_records(json_response.get('results'))
    return pd.concat([input_df, page_frame], ignore_index=True)

In [None]:
def get_papers_by_api(url, existing_df, timeout=30):
    """Fetch one page of API results and append it to ``existing_df``.

    Parameters
    ----------
    url : str
        Fully formatted API URL for the page to fetch.
    existing_df : pandas.DataFrame
        Rows collected so far; it is not modified.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that a dead proxy cannot hang
        the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        ``existing_df`` plus the rows of this page on success. ``None`` when
        the request raises, or when the status code is not 200; a message is
        printed in both cases. Callers must check for ``None``.
    """
    try:
        response = requests.get(url, proxies=generate_random_proxy_tunnel(), timeout=timeout)
    except requests.RequestException as e:
        # Connection / proxy / timeout problems are reported the same way as
        # a bad status code instead of aborting the whole run.
        print('FAILING OUT JSON RESPONSE FAILED ON: {} ({})'.format(url, e))
        return None

    if response.status_code != 200:
        print('FAILING OUT JSON RESPONSE FAILED ON: {}'.format(url))
        return None

    new_df = response_to_df(existing_df, response.json())
    print("\tSUCCESS")
    return new_df

In [4]:
def generate_random_proxy_tunnel(proxies=None, working_pool_size=100):
    """Pick a random proxy and wrap it in the mapping ``requests`` expects.

    Parameters
    ----------
    proxies : list of str, optional
        Proxy URLs to choose from. Defaults to the module-level
        ``proxies_list_cleaned``.
    working_pool_size : int, optional
        Only the first ``working_pool_size`` entries are candidates.

    Returns
    -------
    dict
        ``{'https': <proxy url>}``.

    Raises
    ------
    ValueError
        If there is no proxy to choose from.
    """
    if proxies is None:
        proxies = proxies_list_cleaned
    ### IT APPEARS THAT ONLY THE FIRST 100 PROXY SERVERS ARE WORKING?
    # Clamp to the list length so a short proxy file cannot cause an
    # IndexError, and start at index 0 so the first proxy is eligible too.
    pool_size = min(working_pool_size, len(proxies))
    if pool_size < 1:
        raise ValueError('no proxies available to choose from')
    rand_proxy = proxies[np.random.randint(pool_size)]
    proxy_obj = {'https': rand_proxy}
    return proxy_obj
    

In [None]:
# Collect every work product the NBER listing API reports for each scholar in
# scholars_df, tag each row with the scholar it was fetched for, then keep
# only the rows whose displaytypename is 'Working Paper'.
# NOTE(review): the result is only kept in memory here; nothing in this cell
# writes it to disk -- confirm where 'nber_affiliated_scholar_paper.csv' is
# produced.
api_call_template_url = 'https://www.nber.org/api/v1/generic_listing/uid/{}/contentType,contentType,contentType,contentType,contentType,contentType,contentType,contentType/working_paper,book,chapter,dataset,interview,lecture,center_paper,article/search?page=1&perPage=100'

# Per-scholar frames are gathered in a list and concatenated once at the end
# instead of re-copying the growing frame on every iteration.
scholar_frames = []
total_scholars = len(scholars_df)
for position, row in enumerate(scholars_df.index.tolist(), start=1):
    scholar_df = pd.DataFrame()
    scholar_name = scholars_df.loc[row, 'name']
    scholar_uid = scholars_df.loc[row, 'id']
    # The API is addressed by the numeric part of the scholar id.
    scholar_effective_id = re.search(r'\d+', scholar_uid).group()

    api_call_url = api_call_template_url.format(scholar_effective_id)

    random_proxy = generate_random_proxy_tunnel()
    try:
        init_response = requests.get(api_call_url, proxies=random_proxy, timeout=30)
    except requests.RequestException:
        # Skip this scholar: continuing would either hit an undefined
        # init_response or silently reuse the previous scholar's response.
        print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))
        continue

    if init_response.status_code != 200:
        print("FAILING OUT JSON RESPONSE FAILED ON: {}".format(api_call_url))
        continue

    init_response_json = init_response.json()
    expected_total_papers = get_total_expected_papers(init_response_json)
    print('NUMBER OF TOTAL WORK PRODUCTS AS BEING AFFILIATED WITH {}: {}'.format(scholar_name.upper(), expected_total_papers))
    if expected_total_papers is None:
        # No total reported: treat as "nothing further to page through".
        expected_total_papers = 0

    scholar_df = response_to_df(scholar_df, init_response_json)

    # The first page is already loaded; walk the remaining pages. The loop is
    # bounded by the expected page count so a short or failing page cannot
    # cause endless requests.
    expected_total_pages = int(math.ceil(expected_total_papers / 100))
    for page_number in range(2, expected_total_pages + 1):
        if len(scholar_df) >= expected_total_papers:
            break
        new_url = re.sub(r'page=\d', r'page=' + str(page_number), api_call_url)
        print('ATTEMPTING TO CALL ON {} PAGE {this}/{total}'.format(scholar_name.upper(), this=page_number, total=expected_total_pages))
        page_df = get_papers_by_api(new_url, scholar_df)
        if page_df is None:
            # get_papers_by_api signals failure with None (and has already
            # printed why); keep what was collected so far.
            continue
        scholar_df = page_df

    scholar_df['assigned_author'] = scholar_name
    scholar_df['assigned_author_uid'] = scholar_uid
    scholar_frames.append(scholar_df)

    # Short random pause (0.1 - 1.0 s) between scholars.
    random_sleep_interval = random.randint(1,10)
    # print('SLEEPING FOR {} SECONDS'.format(random_sleep_interval))
    time.sleep(random_sleep_interval/10)
    print('COMPLETED {this}/{total} SCHOLARS (NB some scholars may require multiple calls)'.format(this=position, total=total_scholars))

all_papers_df = pd.concat(scholar_frames, ignore_index=True) if scholar_frames else pd.DataFrame()

print("NUMBER OF ALL NBER WORK PRODUCTS IDENTIFIED: {}".format(len(all_papers_df)))
if 'displaytypename' in all_papers_df.columns:
    all_papers_df = all_papers_df[all_papers_df.displaytypename == 'Working Paper']
else:
    # Nothing was collected (or the column is absent), so no row qualifies.
    all_papers_df = all_papers_df.iloc[0:0]
print("NUMBER OF ALL NBER WORKING PAPERS IDENTIFIED: {}".format(len(all_papers_df)))


In [8]:
### SET UP BEFORE ADVANCING TO DOWNLOADING PAPERS: 
all_papers_df = pd.read_csv('nber_affiliated_scholar_paper.csv')

# Add the columns the download step fills in. Only missing columns are
# created: the download step saves its results back into this same CSV, so
# unconditionally resetting them to None would erase anything already
# collected if this cell is run again.
for column_name in ('page_url', 'pdf_url', 'abstract', 'disclosures', 'doi', 'issue_date'):
    if column_name not in all_papers_df.columns:
        all_papers_df[column_name] = None

# all_papers_df.reset_index(inplace=True)
all_papers_df.to_csv('nber_affiliated_scholar_paper.csv', index=False, encoding='utf-8')


In [3]:
def check_for_previous_collection(page_url, df, row):
    """Look for ``page_url`` among the rows that precede ``row``.

    Parameters
    ----------
    page_url : str
        Page URL of the paper currently being processed.
    df : pandas.DataFrame
        Frame with a 'page_url' column.
    row : int
        Number of leading rows to search (``df.head(row)``), so the current
        row itself is excluded.

    Returns
    -------
    tuple
        ``(True, earlier_row)`` where ``earlier_row`` is the first preceding
        row with the same page URL, or ``(False, None)`` when there is none.
    """
    earlier_rows = df.head(row)
    matches = earlier_rows[earlier_rows['page_url'] == page_url]
    if matches.empty:
        return False, None
    # Return the earlier matching row, not df.loc[row]: the current row has
    # not been filled in yet, so copying from it would copy nothing.
    return True, matches.iloc[0]

In [10]:
def _fetch_via_random_proxy(url, timeout=30):
    """GET ``url`` through a randomly chosen proxy; return the response, or None on failure."""
    random_proxy = generate_random_proxy_tunnel()
    try:
        response = requests.get(url, proxies=random_proxy, timeout=timeout)
    except requests.RequestException:
        print('FAILED A REQUEST WHEN USING PROXY SERVER: {}'.format(random_proxy))
        return None
    if response.status_code != 200:
        print('REQUEST FOR {} RETURNED STATUS {}'.format(url, response.status_code))
        return None
    return response


def _text_or_none(node):
    """Return ``node.text``, or None when the element was not found."""
    return node.text if node is not None else None


def _parse_paper_page(html):
    """Extract (abstract, disclosure, doi, issue_date) from a paper page; absent fields are None."""
    bs_content = bs(html, 'html')

    abstract = _text_or_none(bs_content.find('div', class_='page-header__intro-inner'))
    disclosure = _text_or_none(bs_content.find('div', id='accordion-body-guid1'))

    # Start from None for every page so a paper without a DOI / issue date
    # does not inherit the values of the previously parsed paper.
    doi = None
    issue_date = None
    for item in bs_content.find_all('div', class_='page-header__citation-item'):
        doi_match = re.search(r'(?<=DOI )(.*)', item.text)
        if doi_match:
            doi = doi_match.group()
            continue
        issue_date_match = re.search(r'(?<=Issue Date )(.*)', item.text)
        if issue_date_match:
            issue_date = issue_date_match.group()
    return abstract, disclosure, doi, issue_date


all_papers_df = pd.read_csv('nber_affiliated_scholar_paper.csv')
# IF WE HAVE A FAILURE PART WAY THROUGH USE THE FOLLOWING READ INSTEAD OF THE ABOVE:
# all_papers_df = pd.read_csv('all_papers_df_temp_backup.csv')

# Columns that are entirely empty in the CSV are read back as float; cast the
# ones filled in below to object so strings can be assigned into them.
all_papers_df = all_papers_df.astype({
    column_name: object
    for column_name in ('page_url', 'pdf_url', 'abstract', 'disclosures', 'doi', 'issue_date')
})

for row in all_papers_df.index.tolist():
    print(row)
    # A string page_url marks a row as finished (it is only written once the
    # row's data has been obtained), which is what makes resuming possible.
    if type(all_papers_df.loc[row, 'page_url']) is str:
        print('b) ALREADY DOWNLOADED ROW {}.... PROCEEDING'.format(row))
        continue
    print('HAVE YET TO DO DOWNLOAD FOR ROW {}'.format(row))

    url = all_papers_df.loc[row, 'url']
    print(url)

    code_match = re.search(r'(w|t|h)\d+', url)
    if code_match is None:
        print('COULD NOT FIND A WORKING PAPER CODE IN: {}'.format(url))
        continue
    working_paper_code = code_match.group()
    page_url = 'https://www.nber.org' + url
    pdf_url = 'https://www.nber.org/system/files/working_papers/{}/{}.pdf'.format(working_paper_code, working_paper_code)

    collected_fields = None
    previously_collected_status, observation = check_for_previous_collection(page_url, all_papers_df, row)
    if previously_collected_status:
        # Same paper listed earlier (e.g. under another scholar): reuse its data.
        print('THIS PAPER HAS ALREADY BEEN COLLECTED ONCE BEFORE')
        collected_fields = (
            observation.abstract,
            observation.disclosures,
            observation.doi,
            observation.issue_date,
        )
    else:
        print('else')

        # 1. first collect info from the page
        page = _fetch_via_random_proxy(page_url)
        if page is not None:
            collected_fields = _parse_paper_page(page.content)

            # 2. Then download the page itself
            # The file is only created once the PDF request has succeeded,
            # so a failed download no longer leaves an empty .pdf behind.
            pdf = _fetch_via_random_proxy(pdf_url)
            if pdf is not None:
                path_to_save = 'nber_working_papers/' + working_paper_code + '.pdf'
                with open(path_to_save, 'wb') as file:
                    file.write(pdf.content)

    if collected_fields is not None:
        abstract, disclosure, doi, issue_date = collected_fields
        all_papers_df.loc[row, 'abstract'] = abstract
        all_papers_df.loc[row, 'disclosures'] = disclosure
        all_papers_df.loc[row, 'doi'] = doi
        all_papers_df.loc[row, 'issue_date'] = issue_date
        # Written last: a row whose page request failed keeps an empty
        # page_url and is therefore retried on the next run.
        all_papers_df.loc[row, 'page_url'] = page_url
        all_papers_df.loc[row, 'pdf_url'] = pdf_url

    # Periodic checkpoint so a crash loses at most ~100 rows of work.
    if row % 100 == 0:
        all_papers_df.to_csv('all_papers_df_temp_backup.csv', index=False, encoding='utf-8')

all_papers_df.to_csv('nber_affiliated_scholar_paper.csv', index=False, encoding='utf-8')

0
HAVE YET TO DO DOWNLOAD FOR ROW 0
/papers/w24403
else
1
HAVE YET TO DO DOWNLOAD FOR ROW 1
/papers/w24003
else
2
HAVE YET TO DO DOWNLOAD FOR ROW 2
/papers/w20325
else
3
HAVE YET TO DO DOWNLOAD FOR ROW 3
/papers/w19742
else
4
HAVE YET TO DO DOWNLOAD FOR ROW 4
/papers/w17442
else
5
HAVE YET TO DO DOWNLOAD FOR ROW 5
/papers/w15301
else
6
HAVE YET TO DO DOWNLOAD FOR ROW 6
/papers/w14756
else
7
HAVE YET TO DO DOWNLOAD FOR ROW 7
/papers/t0335
else
8
HAVE YET TO DO DOWNLOAD FOR ROW 8
/papers/w12831
else
9
HAVE YET TO DO DOWNLOAD FOR ROW 9
/papers/w12678
else
10
HAVE YET TO DO DOWNLOAD FOR ROW 10
/papers/t0325
else
11
HAVE YET TO DO DOWNLOAD FOR ROW 11
/papers/w10859
else
12
HAVE YET TO DO DOWNLOAD FOR ROW 12
/papers/w10604
else
13
HAVE YET TO DO DOWNLOAD FOR ROW 13
/papers/t0283
else
14
HAVE YET TO DO DOWNLOAD FOR ROW 14
/papers/w8478
else
15
HAVE YET TO DO DOWNLOAD FOR ROW 15
/papers/t0260
else
16
HAVE YET TO DO DOWNLOAD FOR ROW 16
/papers/t0261
else
17
HAVE YET TO DO DOWNLOAD FOR ROW 17
/p

In [11]:
# Manual checkpoint: write the current state of all_papers_df to the backup CSV.
all_papers_df.to_csv(
    'all_papers_df_temp_backup.csv',
    encoding='utf-8',
    index=False,
)