In [225]:
import re
import sys
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Empty accumulator table with a fixed schema; scraped hits are added to it later.
papers = pd.DataFrame(
    columns=[
        "journal",
        "title",
        "authors",
        "year",
        "citations",
        "url",
    ]
)

def gen_url(start_num, journal, from_year, to_year):
    """Build the Google Scholar search URL for one result page of a journal.

    The query searches for the exact phrase "Machine Learning" restricted to
    the given source (journal) and to the given publication-year range.

    Args:
        start_num: Offset of the first result on the page (the ``start``
            query parameter).
        journal: Journal name. Runs of whitespace are collapsed and the name
            is percent-encoded, so characters such as ``&`` or ``#`` cannot
            break the query string.
        from_year: Lower year bound, placed in ``as_ylo``.
        to_year: Upper year bound, placed in ``as_yhi``.

    Returns:
        The complete URL as a string.
    """
    # quote_plus turns spaces into "+" (as before) and escapes everything that
    # would otherwise be interpreted as URL syntax.
    source = quote_plus(" ".join(journal.split()))
    return (
        "https://scholar.google.com/scholar?start=" + str(start_num)
        + "&q=%22Machine+Learning%22+source:%22" + source
        + "%22&hl=de&as_sdt=0,5&as_ylo=" + str(from_year)
        + "&as_yhi=" + str(to_year)
    )

def _page_url(url, start_num):
    """Return ``url`` with its ``start`` query parameter set to ``start_num``."""
    updated, replaced = re.subn(r"([?&]start=)\d*", r"\g<1>" + str(start_num), url, count=1)
    if replaced:
        return updated
    # No ``start`` parameter present yet: append one.
    separator = "&" if "?" in url else "?"
    return url + separator + "start=" + str(start_num)


def _parse_article(article, journal):
    """Turn one search-hit element into a record dict (one row of ``papers``)."""
    title_element = article.find("h3", class_="gs_rt")
    title = title_element.get_text() if title_element is not None else ""

    # A heading may carry no link (presumably citation-only hits -- assumption);
    # store an empty URL for those instead of crashing on ``None``.
    link = title_element.find("a") if title_element is not None else None
    article_url = link.get("href", "") if link is not None else ""

    # The "gs_a" line holds authors, venue and year; the authors are the part
    # before the first " - " and the year is the first four-digit run.
    byline_element = article.find("div", class_="gs_a")
    byline = byline_element.get_text() if byline_element is not None else ""
    years = re.findall(r"\d{4}", byline)
    authors = byline.replace(u'\xa0', u' ').split(" - ")[0]

    # The citation count is read from the text of the "cited by" link; a
    # missing link or a link text without digits counts as zero citations.
    cites_element = article.select_one("a[href*='/scholar?cites=']")
    cite_counts = re.findall(r"\d.*", cites_element.get_text()) if cites_element is not None else []

    return {
        'journal': journal,
        'title': title,
        'authors': authors,
        'year': years[0] if years else "",
        'citations': cite_counts[0] if cite_counts else "0",
        'url': article_url,
    }


def iterate_over_results(url, journal, papers):
    """Scrape every result page of one Google Scholar query into ``papers``.

    Starting at ``url``, each page is downloaded and parsed, and one row per
    hit is collected. Follow-up pages are requested by advancing the ``start``
    parameter of ``url`` in steps of 10, so the query and year range of the
    URL passed in are kept for all pages. Paging stops when a page has no
    "next" link.

    Args:
        url: URL of the first result page.
        journal: Journal name, stored in the ``journal`` column of every row
            and used in the progress output.
        papers: DataFrame holding the rows collected so far. It is not
            modified in place.

    Returns:
        A DataFrame containing the rows of ``papers`` followed by the newly
        scraped rows, with a fresh 0..n-1 index. If nothing was scraped,
        ``papers`` itself is returned.

    Raises:
        SystemExit: If a downloaded page contains the text "captcha". Rows
            gathered during this call are lost in that case.
    """
    page_url = url
    start_num = 0
    # Rows are gathered in a plain list and turned into a frame once at the
    # end: ``DataFrame.append`` no longer exists in pandas 2.x and copied the
    # whole table for every single row.
    records = []

    print("##################################################")
    print("##### Current Journal: " + journal + " #####")
    print("##################################################")
    while True:
        # Without a timeout a stalled connection would block forever.
        page = requests.get(page_url, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")

        # NOTE(review): plain substring check -- a hit whose title or snippet
        # contains "captcha" would also trigger it.
        if "captcha" in str(soup):
            print("Google Scholar temporarily blocked you for crawling papers... Try again later...")
            sys.exit()

        print("URL of current page and journal: " + page_url)
        print("Index in current journal: " + str(start_num))
        for article in soup.find_all("div", class_="gs_r gs_or gs_scl"):
            records.append(_parse_article(article, journal))

        # The bold label inside the left-aligned navigation cell is only
        # rendered as a link when another page exists.
        if len(soup.select("td[align='left'] > a > b")) == 0:
            break
        time.sleep(1)  # short pause between requests
        start_num += 10
        page_url = _page_url(url, start_num)

    if records:
        new_rows = pd.DataFrame(records)
        if len(papers) == 0:
            # Concatenating with an empty frame is deprecated in recent pandas;
            # keep the caller's column order and add any columns it lacked.
            ordered = list(papers.columns) + [c for c in new_rows.columns if c not in papers.columns]
            papers = new_rows.reindex(columns=ordered)
        else:
            papers = pd.concat([papers, new_rows], ignore_index=True)

    print("Number of articles in current journal: " + str(len(papers[papers["journal"] == journal])))
    return papers
 
# Journals whose Google Scholar hits are collected, processed in this order.
journals = [
    "Political Science Research and Methods",
    "Political Analysis",
    "American Political Science Review",
]

# Scrape one journal after the other, starting at its first result page for
# 2016-2021, and thread the growing table through every call.
for journal in journals:
    url = gen_url(start_num=0, journal=journal, from_year=2016, to_year=2021)
    papers = iterate_over_results(url=url, journal=journal, papers=papers)

# Write the collected rows to disk ordered by journal, then year, then title.
papers_sorted = papers.sort_values(by=["journal", "year", "title"])
papers_sorted.to_csv("papers.csv", index=False)

##################################################
##### Current Journal: Political Science Research and Methods #####
##################################################
URL of current page and journal: https://scholar.google.com/scholar?start=0&q=%22Machine+Learning%22+source:%22Political+Science+Research+and+Methods%22&hl=de&as_sdt=0,5&as_ylo=2016&as_yhi=2021
Index in current journal: 0
URL of current page and journal: https://scholar.google.com/scholar?start=10&q=%22Machine+Learning%22+source:%22Political+Science+Research+and+Methods%22&hl=de&as_sdt=0,5&as_ylo=2016&as_yhi=2021
Index in current journal: 10
URL of current page and journal: https://scholar.google.com/scholar?start=20&q=%22Machine+Learning%22+source:%22Political+Science+Research+and+Methods%22&hl=de&as_sdt=0,5&as_ylo=2016&as_yhi=2021
Index in current journal: 20
Number of articles in current journal: 29
##################################################
##### Current Journal: Political Analysis #####
###################

In [226]:
# Show the accumulated results table as this cell's output.
papers

Unnamed: 0,journal,title,authors,year,citations,url
0,Political Science Research and Methods,A new geography of civil war: a machine learni...,K Kikuta,2020,3,https://www.cambridge.org/core/journals/politi...
1,Political Science Research and Methods,Point break: using machine learning to uncover...,"KD Funk, HL Paul, AQ Philips",2021,0,https://www.cambridge.org/core/journals/politi...
2,Political Science Research and Methods,We need to go deeper: measuring electoral viol...,"D Muchlinski, X Yang, S Birch…",2021,7,https://www.cambridge.org/core/journals/politi...
3,Political Science Research and Methods,Corpus-based dictionaries for sentiment analys...,"DR Rice, C Zorn",2021,79,https://www.cambridge.org/core/journals/politi...
4,Political Science Research and Methods,Measuring elite personality using speech,"AJ Ramey, JD Klingler, GE Hollibaugh",2019,35,https://www.cambridge.org/core/journals/politi...
...,...,...,...,...,...,...
133,American Political Science Review,How to make causal inferences with time-series...,"M Blackwell, AN Glynn",2018,66,https://www.cambridge.org/core/journals/americ...
134,American Political Science Review,"One person, one vote: Estimating the prevalenc...","S Goel, M Meredith, M Morse, D Rothschild…",2020,31,https://www.cambridge.org/core/journals/americ...
135,American Political Science Review,Democratization and the conditional dynamics o...,"MT Dorsch, P Maarek",2019,28,https://www.cambridge.org/core/journals/americ...
136,American Political Science Review,Effective for Whom? Ethnic Identity and Nonvio...,"D Manekin, T Mitts",2020,0,https://www.cambridge.org/core/journals/americ...
