In [1]:
from selenium import webdriver
from selenium.webdriver.common.proxy import *
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from openpyxl import Workbook, load_workbook
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

import os, urllib, re, time, random, winsound, sys, urllib.parse
import pandas as pd
from pathlib import Path
import numpy as np

# View all rows of a dataframe in Jupyter
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
macropaperspath = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\macro_papers\macro_papers.xlsx")
citationsdfpath = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\code\citations.csv")
startindexpath = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\code\startindex.csv")

In [12]:
# Captcha keyword list
keyword_list = ['Please show you\'re not a robot', 'recaptcha', 'your computer or network may be sending automated queries',
               'Our systems have detected unusual traffic from your computer']

# Captcha sound settings
Freq = 1500 # Set Frequency, e.g. 1500 or 2500 Hertz
Dur = 500 # Set Duration, 1000 ms == 1 second

# Pause for a variable number of seconds so that Google can't tell you're web scraping
# 5-10s is a good number because it's slow enough that you rarely get captchas
# Note: I tried 3-8s and I got blocked.
# 5-15s: ~175 searches before block
# 5-30s: 
sleep_second_min = 5
sleep_second_max = 10

In [4]:
def check_if_citations_df_has_existing_data(citationsdfpath):
    '''Checks if there is a citations_df file, and whether it's non-empty'''
    if os.path.exists(citationsdfpath):
        citations_df = pd.read_csv(citationsdfpath)
        if citations_df.shape[0] > 0:
            return True
    
    return False

def check_start_index(startindexpath):
    '''Checks if there is a start_index file, and whether it's non-empty.
    If it's empty, return 0, the default start_index. Else, return the start_index
    in the file.'''
    if os.path.exists(startindexpath):
        start_index_df = pd.read_csv(startindexpath, header=None)
        if start_index_df.shape[0] > 0:
            start_index = start_index_df.iloc[0, 0]
            return start_index
        
    # If there is no saved record of start_index, we assume that it's 0
    return 0

In [5]:
def find_css_element(browser, css_element, output=''):
    '''Find the first instance of the css_element in a bigger element, e.g. 
    (but not limited to) browser '''
    if output == 'text':
        return browser.find_element(By.CSS_SELECTOR, css_element).text
    elif output == 'href':
        return browser.find_element(By.CSS_SELECTOR, css_element).get_attribute('href') 
    elif output == 'css':
        return browser.find_element(By.CSS_SELECTOR, css_element)
    else:
        print("Error: find_css_element")

In [6]:
def solve_captcha():
    '''Creates a sound to tell you that you need to solve a captcha. 
    The program will pause for 10 seconds to give you time to do so.'''
    
    # Play sound
    winsound.Beep(Freq, Dur)
    
    # Pause program for 10s - Jupyter
    time.sleep(20)
    
    # Pause program until enter key is pressed - for Python terminal 
    # raw_input('Please, solve the CAPTCHA and press ENTER: ')


def get_url_and_check_website_is_loaded(url, element_id):
    ''' Checks whether the website is loaded properly - whether there are captchas and 
    whether certain elements are present.'''
    attempt = 1
    while attempt < 3:
        print("Attempt", attempt, ":", url)
        try:
            br.get(url)

            # Check if you can find the html path
            html = br.find_element(By.XPATH, '//html').get_attribute('innerHTML').encode('utf-8')
            html_str = html.decode()

            # Check if there are any captchas
            if any(keyword in html_str for keyword in keyword_list):
                solve_captcha()

            # Check if you can find the element_id element
            doc_list = br.find_element(By.CLASS_NAME, element_id)

            return 1
        
        except:
            attempt += 1
            
            # Pause for a variable number of seconds so that Google can't tell you're web scraping
            sleep_seconds = random.uniform(sleep_second_min, sleep_second_max)
            time.sleep(sleep_seconds)
    
    print('THE FOLLOWING URL COULDN\'T BE DOWNLOADED:')
    print(url)
    return 0

In [7]:
def generate_search_term(macropapers_df_col, get_first_word=False):
    
    if get_first_word:
        macropapers_df_col = macropapers_df_col.str.replace(r",.+","", regex=True)
        
    macropapers_df_col = macropapers_df_col.str.replace(r"[^\w\d\s-]"," ", regex=True)
    macropapers_df_col = macropapers_df_col.str.replace(r"[,;+]","", regex=True)
    macropapers_df_col = macropapers_df_col.str.replace(r" +"," ", regex=True)
    # macropapers_df_col = macropapers_df_col.str.replace(r" -"," ", regex=True)
    macropapers_df_col = macropapers_df_col.str.replace(r"^ ","", regex=True)
    macropapers_df_col = macropapers_df_col.str.replace(r" $","", regex=True)
    macropapers_df_col = macropapers_df_col.str.replace(r" ","+", regex=True)
    
    return macropapers_df_col

def get_search_terms(br, hyperlink):
    # Get url and check website is fully loaded
    if get_url_and_check_website_is_loaded(hyperlink, 'gs_rt') == 0:
        winsound.Beep(Freq, Dur)
        raise ValueError("Website failed to load")
        
    # Container where all needed data is located
    search_result = find_css_element(br, '.gs_ri', 'css')

    # Obtain individual search terms
    title = find_css_element(search_result, '.gs_rt', 'text')
    title_link = find_css_element(search_result, '.gs_rt a', 'href')
    publication_info = find_css_element(search_result, '.gs_a', 'text')
    cited_by_link = find_css_element(search_result, '#gs_res_ccl_mid .gs_nph+ a', 'href')
    number_of_citations = find_css_element(search_result, '.gs_scl:nth-child(1) .gs_or_btn.gs_nph+ a', 'text').replace("Cited by ","")
    
    return search_result, title, title_link, publication_info, cited_by_link, number_of_citations

In [8]:
# Read in macro_papers.xlsx, the excel file containing a df of macro papers
macropapers_df = pd.read_excel(Path(macropaperspath))

# Remove symbols like : , . so that the google scholar search hyperlink can be easily generated
# Then replace spaces with + to imitate the variable part of the hyperlink
macropapers_df['Title_Search'] = generate_search_term(macropapers_df['Title'])
macropapers_df['Author_Search'] = generate_search_term(macropapers_df['Author'])
macropapers_df['Author_Search_First_Word'] = generate_search_term(macropapers_df['Author'], True)
macropapers_df['Publication_Title_Search'] = generate_search_term(macropapers_df['Publication Title'])

# Generate the google search hyperlink
macropapers_df['search_query_full'] = macropapers_df['Title_Search'] + "+" + macropapers_df['Author_Search'] + "+" + macropapers_df['Publication_Title_Search'] + "+" + macropapers_df['Publication Year'].astype(str)
macropapers_df['search_query_title_and_author'] = macropapers_df['Title_Search'] + "+" + macropapers_df['Author_Search']

macropapers_df['hyperlink_full'] = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C14&q=" + macropapers_df['search_query_full'] + "&btnG="
macropapers_df['hyperlink_title_and_author'] = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C14&q=" + macropapers_df['search_query_title_and_author'] + "&btnG="
macropapers_df['hyperlink_title'] = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C14&q=" + macropapers_df['Title_Search'] + "&btnG="
macropapers_df['hyperlink_title_and_author_advanced_search'] = "https://scholar.google.com/scholar?as_q=" + macropapers_df['Title_Search'] + "&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=" + macropapers_df['Author_Search_First_Word'] + "&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14"

In [22]:
# Open Firefox/Tor instance - geckodriver executable must be downloaded (see link above) and placed in this folder
# For Tor, open an instance of the browser first
service_path = r"C:\Program Files (x86)\Tor Browser\Browser\firefox.exe"
binary = FirefoxBinary(service_path)
br = webdriver.Firefox(firefox_binary=binary)

# If just firefox:
# br = webdriver.Firefox()

# Open Firefox instance - geckodriver executable must be downloaded (see link above) and placed in this folder
#firefox_profile = webdriver.FirefoxProfile()
#firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
#br = webdriver.Firefox(firefox_profile=firefox_profile)

# Previous Attempts

# OG1 - Firefox
#br = webdriver.Firefox()

# OG2 - Firefox
#firefox_profile = webdriver.FirefoxProfile()
#firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
#br = webdriver.Firefox(firefox_profile=firefox_profile)

# Trying to do Tor

# profile_path = r"C:\Program Files (x86)\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default"
# service_path = r"C:\Program Files (x86)\Tor Browser\Browser\firefox.exe"

#options = Options()
#options.set_preference('profile', profile_path)
#service = Service(service_path)

# br = webdriver.Firefox(service=service, options=options)

# br = webdriver.Firefox()

# service = Service(service_path)
#options = Options()
#options.set_preference('profile', profile_path)
#options.set_preference('network.proxy.type', 5)
#options.set_preference('network.proxy.socks', '127.0.0.1')
#options.set_preference('network.proxy.socks_port', 9150)
#options.set_preference("debuggerAddress", "127.0.0.1:9222")
#driver = webdriver.Firefox(service=service, options=options)


#br.profile.set_preference('network.proxy.type', 1)
#br.profile.set_preference('network.proxy.socks', '127.0.0.1')
#br.profile.set_preference('network.proxy.socks_port', 9051)


  br = webdriver.Firefox(firefox_binary=binary)


In [23]:
# Wait for Tor to connect automatically - you need to tick the "connect automatically" checkbox first though
br.get('http://scholar.google.com')

# Main loop. It iterates through the list of URLs,
# and downloads all the content in HTML format.
# It also checks if there are more than one page
# of results for each journal, and downloads all the
# pages.

# Initialize df
df_compiled = pd.DataFrame()

start_index = check_start_index(startindexpath)
print("start_index:", start_index)
print("number of rows in macro_papers:", macropapers_df.shape[0])
    
for index, row in macropapers_df.iloc[start_index:].iterrows():
    print("index:", index)
    
    # Pause for a variable number of seconds so that Google can't tell you're web scraping
    sleep_seconds = random.uniform(sleep_second_min, sleep_second_max)
    print("Sleep:", sleep_seconds)
    time.sleep(sleep_seconds)
    
    # Save new start index
    start_index_df = pd.DataFrame([index])
    start_index_df.to_csv(startindexpath, header=None, index=None, mode='w')

    # Initialize df with original columns from macro_papers
    df = row.to_frame().T
        
    try:
        # Get hyperlink
        hyperlink = row['hyperlink_title_and_author_advanced_search']
        
        # Obtain individual search terms
        search_result, title, title_link, publication_info, cited_by_link, number_of_citations = get_search_terms(br, hyperlink)
        
        # Record successful query attempt
        query_attempt = 'title_and_author_advanced_search'
        
    except:
        
        try:
            print("Warning: Previous search query failed. Now trying search query = title and author (simple search).")
            hyperlink = macropapers_df['hyperlink_title_and_author'].iloc[index]

            # Obtain individual search terms
            search_result, title, title_link, publication_info, cited_by_link, number_of_citations = get_search_terms(br, hyperlink)
            
            # Record successful query attempt
            query_attempt = 'title_and_author'
            
        except:
            print("Warning: Previous search query failed. Now trying search query = title only.")
            hyperlink = macropapers_df['hyperlink_title'].iloc[index]

            # Obtain individual search terms
            search_result, title, title_link, publication_info, cited_by_link, number_of_citations = get_search_terms(br, hyperlink)
            
            # Record successful query attempt
            query_attempt = 'title'
            
    # Save search results as new columns under df
    df['title'] = title
    df['title_link'] = title_link
    df['publication_info'] = publication_info
    df['cited_by'] = cited_by_link
    df['number_of_citations'] = number_of_citations
    
    # Save the query attempt that successfully returned results as new column under df
    df['search_attempt'] = query_attempt
    
    # Append df (for this paper) to df_compiled
    df_compiled = df_compiled.append(df)

    
    
br.close()

print("Done!")

start_index: 1171
number of rows in macro_papers: 1229
index: 1171
Sleep: 8.04221666196801
Attempt 1 : https://scholar.google.com/scholar?as_q=Ability+Peer+Effects+in+University+Evidence+from+a+Randomized+Experiment&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=BOOIJ&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14
index: 1172
Sleep: 9.950258474137392
Attempt 1 : https://scholar.google.com/scholar?as_q=Old+Sick+Alone+and+Poor+A+Welfare+Analysis+of+Old-Age+Social+Insurance+Programmes&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=BRAUN&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14
index: 1173
Sleep: 5.450464315139572
Attempt 1 : https://scholar.google.com/scholar?as_q=The+Industrialization+and+Economic+Development+of+Russia+through+the+Lens+of+a+Neoclassical+Growth+Model&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=CHEREMUKHIN&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14
index: 1174
Sleep: 7.095138931430735
Attempt 1 : https://scholar.google.com/scholar?as_q=Optimal+Voti

index: 1200
Sleep: 9.422621862333616
Attempt 1 : https://scholar.google.com/scholar?as_q=Monetary+Shocks+in+Models+with+Inattentive+Producers&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=ALVAREZ&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14
index: 1201
Sleep: 5.129775919837821
Attempt 1 : https://scholar.google.com/scholar?as_q=Relationships+and+Growth+On+the+Dynamic+Interplay+between+Relational+Contracts+and+Competitive+Markets+in+Economic+Development&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=SHINGO+ISHIGURO&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14
index: 1202
Sleep: 8.184054307747829
Attempt 1 : https://scholar.google.com/scholar?as_q=Bailouts+and+Financial+Fragility&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=KEISTER&as_publication=&as_ylo=&as_yhi=&hl=en&as_sdt=0%2C14
index: 1203
Sleep: 7.299887925383426
Attempt 1 : https://scholar.google.com/scholar?as_q=Efficient+Coordination+in+Weakest-Link+Games&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=RIEDL&as_publi

Done!


In [24]:
# Save the column as txt file
# macropapers_df['hyperlink'].to_csv(txtoutputfilepath, header=None, index=None, sep=' ', mode='a')
if check_if_citations_df_has_existing_data(citationsdfpath):
    print("citations_df already exists, appending current df to existing citations_df")
    citations_df = pd.read_csv(citationsdfpath)
    print("previous shape:", citations_df.shape)
    citations_df = citations_df.append(df_compiled)
    print("current shape:", citations_df.shape)
else:
    print("citations_df doesn't yet exist, saving current df directly as new citations_df")
    print("previous shape: (0, 0)")
    print("current shape:", df_compiled.shape)
    citations_df = df_compiled

citations_df.to_csv(citationsdfpath, index=None, mode='w')

citations_df already exists, appending current df to existing citations_df
previous shape: (1171, 29)
current shape: (1229, 29)


## Tests

In [67]:
br = webdriver.Firefox()
br.get('http://scholar.google.com')
checkload(c_url, 'gs_rt')

https://scholar.google.com/scholar?hl=en&as_sdt=0%2C14&q=Sectoral+Media+Focus+and+Aggregate+Fluctuations&btnG=
Trying again https://scholar.google.com/scholar?hl=en&as_sdt=0%2C14&q=Sectoral+Media+Focus+and+Aggregate+Fluctuations&btnG=
https://scholar.google.com/scholar?hl=en&as_sdt=0%2C14&q=Sectoral+Media+Focus+and+Aggregate+Fluctuations&btnG=


b'<head><title>Sectoral Media Focus and Aggregate Fluctuations - Google Scholar</title><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="referrer" content="origin-when-cross-origin"><meta name="viewport" content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=2"><meta name="format-detection" content="telephone=no"><link rel="shortcut icon" href="/favicon.ico"><style>html,body,form,table,div,h1,h2,h3,h4,h5,h6,img,ol,ul,li,button{margin:0;padding:0;border:0;}table{border-collapse:collapse;border-width:0;empty-cells:show;}html,body{height:100%}#gs_top{position:relative;box-sizing:border-box;min-height:100%;min-width:964px;-webkit-tap-highlight-color:rgba(0,0,0,0);}#gs_top>*:not(#x){-webkit-tap-highlight-color:rgba(204,204,204,.5);}.gs_el_ph #gs_top,.gs_el_ta #gs_top{min-width:320px;}#gs_top.gs_nscl{position:fixed;width:100%;}body,td,input,button{font-size:13px;font-family:Arial,sans-serif;li

In [76]:
br.find_element(By.CSS_SELECTOR, '.gs_ri').text

'Sectoral media focus and aggregate fluctuations\nR Chahrour, K Nimark, S Pitschner - American Economic Review, 2021 - aeaweb.org\nWe formalize the editorial role of news media in a multisector economy and show that media\ncan be an independent source of business cycle fluctuations, even when they report\naccurate information. Public reporting about a subset of sectoral developments that are\nnewsworthy but unrepresentative causes firms across all sectors to hire too much or too little\nlabor. We construct historical measures of US sectoral news coverage and use them to\ncalibrate our model. Time-varying media focus generates demand-like fluctuations that are …\nSave Cite Cited by 24 Related articles All 11 versions'

In [None]:
title = result.select_one('.gs_rt').text
    title_link = result.select_one('.gs_rt a')['href']
    publication_info = result.select_one('.gs_a').text
    snippet = result.select_one('.gs_rs').text
    cited_by_link = result.select_one('#gs_res_ccl_mid .gs_nph+ a')['href']
    number_of_citations = result.select_one('#gs_res_ccl_mid .gs_nph+ a').text.replace("Cited by ","")
    related_articles = result.select_one('a:nth-child(4)')['href']

In [85]:
br.find_element(By.CSS_SELECTOR, '.gs_rt a').get_attribute('href') 

'https://www.aeaweb.org/doi/10.1257/aer.20191895'

In [97]:
# Container where all needed data is located
search_result = find_css_element(br, '.gs_ri', 'css')

# Obtain individual search terms
title = find_css_element(search_result, '.gs_rt', 'text')
title_link = find_css_element(search_result, '.gs_rt a', 'href')
publication_info = find_css_element(search_result, '.gs_a', 'text')
snippet = find_css_element(search_result, '.gs_rs', 'text')
cited_by_link = find_css_element(search_result, '#gs_res_ccl_mid .gs_nph+ a', 'href')
number_of_citations = find_css_element(search_result, '#gs_res_ccl_mid .gs_nph+ a', 'text').replace("Cited by ","")
related_articles = find_css_element(search_result, 'a:nth-child(4)', 'href')

df = df.append({
        'title': title,
        'title_link': title_link,
        'publication_info': publication_info,
        'snippet': snippet,
        'cited_by': f'https://scholar.google.com{cited_by_link}',
        'number_of_citations': number_of_citations,    
        'related_articles': f'https://scholar.google.com{related_articles}',
        'all_article_versions': f'https://scholar.google.com{all_article_versions}'}, ignore_index = True)