In [72]:
#%pip install requests_html

In [73]:
# change to store dataset
STORE_PUB_GS = False

In [74]:
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup
import re
import time
import datetime

In [75]:
# get current year to save in applicable folder
current_year = datetime.date.today().year

# open session for html requests
SESSION = HTMLSession()

In [76]:
def get_source(url):
    """Return the source code for the provided URL. 
    Args: 
        url (string): URL of the page to scrape.
    Returns:
        response (object): HTTP response object from requests_html. 
    """
    time.sleep(20)

    try:
        response = SESSION.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)

In [77]:
def get_author_id_query(soup):
    
    if soup.find("h4", class_ = "gs_rt2"): # check if is profile
        # get user ID for query
        query_with_id = soup.find("h4", class_ = "gs_rt2").find("a").get('href')
        
    else:
        query_with_id = None
        print(f"{author_name} does not have a profile on Google Scholar.")
        
    return query_with_id

In [78]:
def has_profile(soup):   
    if soup.find("h4", class_ = "gs_rt2"): # check if is profile
        return True
    else:
        return False

In [79]:
def remove_numbers(string_with_numbers):
    if not string_with_numbers:
        return None
    
    tokens = string_with_numbers.split(" ")
    cleaned_tokens = []
    for token in tokens:
        if not re.search("[0-9]", token):
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens) 

In [80]:
def get_all_publication_entries(soup):
    
    query_with_id = get_author_id_query(soup)
    soup = None

    if query_with_id != None:
        # get HTML response for author profile
        response = get_source(f"https://scholar.google.com{query_with_id}&cstart=0&pagesize=1001") # the last part is to get all entries

        if response:
            soup = BeautifulSoup(response.html.raw_html)

            #get article entries
            all_article_entries = soup.find_all("tr", class_ = "gsc_a_tr")
        
    else:
        all_article_entries = None
        
    return all_article_entries

In [81]:
def get_publication_info(entry):
    
    publication_title = ""
    publication_year = ""
    authors_list = []
    number_citations = ""
    
    # get authors and journal
    if entry.find("div", class_ = "gs_gray"):
        string_with_authors = entry.find("div", class_ = "gs_gray").string
        authors_list = string_with_authors.split(",")
        
        publisher_string = entry.find_all("div", class_ = "gs_gray")[1].text
        publisher = publisher_string.split(",")[0] #also has ISBN
        publisher = remove_numbers(publisher)

    # get title
    if entry.find("a", class_="gsc_a_at"):
        publication_title = entry.find("a", class_="gsc_a_at").string

    # get year
    if entry.find("span", class_ = "gsc_a_h gsc_a_hc gs_ibl"):
        publication_year = entry.find("span", class_ = "gsc_a_h gsc_a_hc gs_ibl").string

    # get number of citations
    if entry.find("a", class_ = "gsc_a_ac gs_ibl"):
        number_citations = entry.find("a", class_ = "gsc_a_ac gs_ibl").string
        
    
    
    return publication_title, publication_year, authors_list, number_citations, publisher

In [82]:
def get_dataframe_with_publications(soup, author_name):
    
    publication_titles = []
    publication_years = []
    publishers = []
    authors_lists = []
    number_citations_list = []
    author_name_list = []
    
    all_publication_entries = get_all_publication_entries(soup)
    
    if all_publication_entries:
        
        for publication_entry in all_publication_entries:
            publication_title, publication_year, authors_list, number_citations, publisher = get_publication_info(publication_entry)

            publication_titles.append(publication_title)
            publication_years.append(publication_year)
            publishers.append(publisher)
            authors_lists.append(authors_list)
            number_citations_list.append(number_citations)
            author_name_list.append(author_name)


    data = {'PUB_TITLE': publication_titles, 
            'PUB_YEAR': publication_years,
            'PUB_PUBLISHER': publishers,
            'PUB_AUTHORS': authors_lists, 
            'PUB_CITATIONS': number_citations_list, 
            'AUTHOR_NAME': author_name_list}
    
    return pd.DataFrame(data)

In [83]:
def get_dataframe_with_publications_no_profile(soup, author_name):

    publication_titles = []
    publication_years = []
    publishers = []
    authors_lists = []
    number_citations_list = []
    author_name_list = []
    
    number_publications = 1

    search_result = soup.find("div", id="gs_ab_md").find("div", class_="gs_ab_mdw").text
    
    if re.search(" [0-9]* result", search_result):
        number_publications = int(re.search(" [0-9]* result", search_result).group().strip().split(" ")[0])
        
    # first page
    
    for entry in soup.find_all("div", class_="gs_ri"):

        # and journal
        if entry.find("div", class_="gs_a"):
            # get authors
            authors_list = entry.find("div", class_="gs_a").text.split("-")[0].replace("\xa0", "").split(",")

            # get year
            if re.search("[0-9]{4}", entry.find("div", class_="gs_a").text):
                publication_year = re.search("[0-9]{4}", entry.find("div", class_="gs_a").text).group()
            else:
                publication_year = None

            # get publisher
            journal_string = entry.find("div", class_="gs_a").text.split("-")[1]
            if "," in journal_string:
                publisher = journal_string.split(",")[0]
            else:
                publisher = None
                
            publisher = remove_numbers(publisher)

        # get title
        if entry.find("h3", class_="gs_rt"):
            publication_title = entry.find("h3", class_="gs_rt").text.replace("[PDF]", "").strip()

        # get number of citations
        footer_elements = entry.find("div", class_="gs_fl").find_all("a")
        number_citations = None

        for element in footer_elements:
            if "Cited by" in element.text:
                number_citations = int(re.search("[0-9]*$", element.text).group())


        publication_titles.append(publication_title)
        publication_years.append(publication_year)
        publishers.append(publisher)
        authors_lists.append(authors_list)
        number_citations_list.append(number_citations)
        author_name_list.append(author_name)

        

    # all other pages
    for current_page in range(10, number_publications, 10):
        next_page = get_source(f"https://scholar.google.cz/scholar?start={current_page}&hl=en&as_sdt=0%2C5&as_vis=1&q=author%3A%22{author_name_string}%22&btnG=")
        soup = BeautifulSoup(next_page.html.raw_html)

        for entry in soup.find_all("div", class_="gs_ri"):

            # and journal
            if entry.find("div", class_="gs_a"):
                # get authors
                authors_list = entry.find("div", class_="gs_a").text.split("-")[0].replace("\xa0", "").split(",")

                # get year
                if re.search("[0-9]{4}", entry.find("div", class_="gs_a").text):
                    publication_year = re.search("[0-9]{4}", entry.find("div", class_="gs_a").text).group()
                else:
                    publication_year = None

                # get publisher
                journal_string = entry.find("div", class_="gs_a").text.split("-")[1]
                if "," in journal_string:
                    publisher = journal_string.split(",")[0]
                else:
                    publisher = None
                    
                publisher = remove_numbers(publisher)

            # get title
            if entry.find("h3", class_="gs_rt"):
                publication_title = entry.find("h3", class_="gs_rt").text.replace("[PDF]", "").strip()

            # get number of citations
            footer_elements = entry.find("div", class_="gs_fl").find_all("a")
            number_citations = None
            
            for element in footer_elements:
                if "Cited by" in element.text:
                    number_citations = int(re.search("[0-9]*$", element.text).group())


            publication_titles.append(publication_title)
            publication_years.append(publication_year)
            publishers.append(publisher)
            authors_lists.append(authors_list)
            number_citations_list.append(number_citations)
            author_name_list.append(author_name)

    data = {'PUB_TITLE': publication_titles, 
            'PUB_YEAR': publication_years,
            'PUB_PUBLISHER': publishers,
            'PUB_AUTHORS': authors_lists, 
            'PUB_CITATIONS': number_citations_list, 
            'AUTHOR_NAME': author_name_list}

    return pd.DataFrame(data)

In [84]:
# get HTML response for author
author_names = ["Stephan Daurer", "Gerhard Hellstern", "Michael Bächle", "Thomas Asche", "Oliver Bährle", "Wolfgang Bihler"]

pd_result = pd.DataFrame(columns=['PUB_TITLE', 'PUB_YEAR', 'PUB_AUTHORS', 'PUB_PUBLISHER', 'AUTHOR_NAME'])

for author_name in author_names:
    
    # get HTML response for author
    author_name_string = author_name.replace(" ", "+")
    response = get_source(f"https://scholar.google.cz/scholar?hl=en&as_sdt=0%2C5&as_vis=1&q=author%3A%22{author_name_string}%22&btnG=")
    soup = BeautifulSoup(response.html.raw_html)
    
    if has_profile(soup):
        pd_publications = get_dataframe_with_publications(soup, author_name)

    else:
        pd_publications = get_dataframe_with_publications_no_profile(soup, author_name)
        
    if pd_publications.empty:
        print(f"No publications found for {author_name}.")
    
    else:
        pd_result = pd_result.append(pd_publications)

pd_result = pd_result.reset_index(drop=True)

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
pd_result = pd_result.reset_index(drop=True)
pd_result

In [None]:
if STORE_PUB_GS:
    pd_result.to_csv(f'../data/{current_year}/publications_test_gs.csv')