In [1]:
''' 
---------------------------------------
# Using the American Economic Association's EconLit database (provided through EBSCOHost),
# this script scrapes metadata on all economics journal articles back to a given year.
# I have two plans for this data:
# 1) TODO: describe the planned uses
---------------------------------------
'''

In [1]:
# Here's what I'm going to need:

import concurrent.futures
import getpass
import os
import re
import sqlite3
import time
from sqlite3 import Error

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException




In [2]:
# Initialize the headless browser, open the first page and submit the login form.

# The PhantomJS location is machine-specific, so it can be overridden with the
# PHANTOMJS_PATH environment variable; the default is the path originally used here.
PHANTOMJS_PATH = os.environ.get(
    "PHANTOMJS_PATH",
    "C:/Users/240-370-8956/Desktop/STK/phantomjs-2.1.1-windows/phantomjs-2.1.1-windows/bin/phantomjs.exe",
)

sr_browser = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
sr_browser.set_window_size(1920, 1080)

# ---------------------------------------
# The URL below is a test search of all journal articles from 2019-01-01 to 2019-12-31.
# On EconLit, design your search (you can use SQL or their interface), and run it.
# Then, on the share tab, get a shareable link and start there.
# I can't use the URL of the search because UMD requires dual authentication.
# If your university doesn't, you can go with the URL of the search.
# ---------------------------------------
SEARCH_URL = "http://search.ebscohost.com.proxy-um.researchport.umd.edu/login.aspx?direct=true&db=ecn&bquery=&cli0=DT1&clv0=201901-201912&cli1=PT50&clv1=Journal+Article&type=1&searchMode=And&site=ehost-live"

sr_browser.get(SEARCH_URL)

# Form fields of the login page the proxy redirects to.
username = sr_browser.find_element_by_id("username")
password = sr_browser.find_element_by_id("password")

# Credentials are never stored in the notebook: they come from the UMD_USERNAME /
# UMD_PASSWORD environment variables, or are typed in when those are not set
# (getpass does not echo the password and leaves nothing in the cell source).
# NOTE: a password used to be written in plain text here; treat it as compromised
# and change it.
login_user = os.environ.get("UMD_USERNAME") or input("UMD username: ")
login_password = os.environ.get("UMD_PASSWORD") or getpass.getpass("UMD password: ")

username.send_keys(login_user)
password.send_keys(login_password)
del login_password  # do not keep the secret around in the kernel namespace

sr_browser.find_element_by_name("_eventId_proceed").click()





'''UMD REQUIRES DUAL AUTHENTICATION AT THIS POINT; NO WAY AROUND THAT. ONLY HAS TO BE DONE ONCE'''




'UMD REQUIRES DUAL AUTHENTICATION AT THIS POINT; NO WAY AROUND THAT. ONLY HAS TO BE DONE ONCE'

In [3]:
# Locate the element named "Result_1" on the page the browser is currently
# showing (the first hit of the targeted search) and click it.
sr_browser.find_element_by_name("Result_1").click()

In [4]:
# Finds the "Next Page" button on the current page and clicks it.

def load_next_page():
    """Click the top navigation "next" button of the page shown in ``sr_browser``.

    Uses the module-level ``sr_browser`` and returns nothing.

    Raises:
        NoSuchElementException: re-raised after reporting when the button (or
            the click) cannot be resolved. If the browser sits on
            ``about:blank`` a connection hint is printed; otherwise a
            screenshot is written to ``error.png`` and the current URL is
            printed.

    NOTE(review): nothing here waits for the following page to finish loading
    after the click -- confirm the caller does not read stale page source.
    """
    next_button_id = 'ctl00_ctl00_MainContentArea_MainContentArea_topNavControl_btnNext'
    try:
        sr_browser.find_element_by_id(next_button_id).click()
    except NoSuchElementException:
        if sr_browser.current_url != 'about:blank':
            # Some page did load, just not the expected one: keep evidence for debugging.
            sr_browser.save_screenshot('error.png')
            print('The page is not as expected, screenshot saved.')
            print('url: {}'.format(sr_browser.current_url))
        else:
            print('Please check your internet connection.')
        raise

In [5]:
# Accumulator for the scraped article metadata: starts out with no rows and
# no columns and is filled by the scraping loop further down.

articles = pd.DataFrame()

In [7]:
# ---------------------------------------
# The loop below parses the current article page with BeautifulSoup, keeps the fields I need,
# and stores them as one dataframe row per article.
# This runs at about 3 seconds per page, and because there are 30-50k articles per year, that's not really sustainable
# Most of the slowdown is the load_next_page function, I suspect because my wifi sucks and because PhantomJS isn't meant for this
# Still, at one second per page (optimistic), this will take 11 hours per year. So don't start this unless you got time
# ---------------------------------------

# ---------------------------------------
# These are the fields I need:
# Accession number is a unique EconLit identifier
# Author is a field with a list of authors separated by semicolons
# Author Affiliation is the affiliation of the above authors, respectively; also semicolon separated
# Descriptors is the JEL codes associated with the paper, semicolon separated
# Keywords are decided by the authors; also semicolon separated
# ---------------------------------------
NEEDED_FIELDS = ['Accession Number', 'Author', 'Author Affiliation', 'Source', 'Title', 'Descriptors', 'Keywords']

# A JEL code as it appears in the Descriptors text: one capital letter and two
# digits wrapped in parentheses; the group captures the code without the parentheses.
JEL_CODE_PATTERN = re.compile(r'\(([A-Z]\d\d)\)')


def _parse_article_page(page_source):
    """Extract one article's metadata record from an EBSCOHost detail page.

    Args:
        page_source: HTML text of the article page.

    Returns:
        dict with the keys 'an', 'author', 'aff', 'source', 'title', 'jel'
        and 'keywords'. A field that is absent from the page is returned as
        an empty string.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page_source, 'html.parser')

    label_tags = soup.find_all('dt', attrs={'data-auto': 'citation_field_label'})
    value_tags = soup.find_all('dd', attrs={'data-auto': 'citation_field_value'})

    # Labels carry a trailing colon. get_text() also copes with labels that
    # contain nested markup, where .string would be None.
    field_names = [tag.get_text(strip=True).rstrip(':') for tag in label_tags]
    field_values = [tag.text for tag in value_tags]

    # Labels and values are paired by position on the page.
    pagedict = dict(zip(field_names, field_values))
    for field in NEEDED_FIELDS:
        pagedict.setdefault(field, '')

    # Keep only the JEL codes from the descriptor text, and collapse every run
    # of whitespace in the keywords into a single space.
    descriptors = '; '.join(JEL_CODE_PATTERN.findall(pagedict['Descriptors']))
    keywords = ' '.join(pagedict['Keywords'].split())

    return {'an': pagedict['Accession Number'],
            'author': pagedict['Author'],
            'aff': pagedict['Author Affiliation'],
            'source': pagedict['Source'],
            'title': pagedict['Title'],
            'jel': descriptors,
            'keywords': keywords
            }


# Number of article pages to scrape in this run.
n = 10

# Rows are collected in a plain list and turned into a dataframe once:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row copies it on every iteration.
scraped_rows = []
try:
    while n > 0:
        scraped_rows.append(_parse_article_page(sr_browser.page_source))

        # Go to the next page
        load_next_page()

        n -= 1
finally:
    # Runs even when load_next_page() raises (e.g. no "next" button), so the
    # pages scraped so far are not lost.
    if scraped_rows:
        new_rows = pd.DataFrame(scraped_rows)
        if articles.empty:
            articles = new_rows
        else:
            articles = pd.concat([articles, new_rows], ignore_index=True)

In [8]:
#articles.head()

In [23]:
# Exports the dataframe to CSV for analysis in Stata.
# The destination is machine-specific, so it can be overridden with the
# ARTICLES_CSV environment variable; the default is the location originally used here.
ARTICLES_CSV = os.environ.get(
    'ARTICLES_CSV',
    r'C:\Users\240-370-8956\Dropbox\Research\JEL_codes\Data\articles.csv',
)

articles.to_csv(ARTICLES_CSV, index=False)
