In [18]:
#%pip install requests_html

In [19]:
# change to store dataset
STORE_PUB_GS = True

# change for different location

# locations: ravensburg, mannheim, heidenheim, karlsruhe, campus-horb, stuttgart
# locations: heilbronn, loerrach, mosbach, villingen-schwenningen

location = "ravensburg"

In [20]:
import requests
import urllib
from pathlib import Path
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup
import re
import time
import datetime
import random

In [21]:
# get current year to save in applicable folder
current_year = datetime.date.today().year

# open session for html requests
SESSION = HTMLSession()

In [22]:
# get author names
pd_employees = pd.read_csv(f'../data/{current_year}/employees_{location}.csv')
author_names = pd_employees['employee_name_clean'].tolist()

In [23]:
def get_source(url):
    """Return the source code for the provided URL. 
    Args: 
        url (string): URL of the page to scrape.
    Returns:
        response (object): HTTP response object from requests_html. 
    """
    random_wait_time = random.randint(1,3)
    time.sleep(random_wait_time)

    try:
        response = SESSION.get(url)
        
        if response.status_code == 429:
            raise requests.exceptions.RequestException(f"Too many requests have been made. Stopped at url: {url}")
        
        return response

    except requests.exceptions.RequestException as e:
        print(e)

In [24]:
def get_author_id_query(soup):
    
    if soup.find("h4", class_ = "gs_rt2"): # check if is profile
        # get user ID for query
        query_with_id = soup.find("h4", class_ = "gs_rt2").find("a").get('href')
        
    else:
        query_with_id = None
        print(f"{author_name} does not have a profile on Google Scholar.")
        
    return query_with_id

In [25]:
def has_profile(soup):   
    if soup.find("h4", class_ = "gs_rt2"): # check if is profile
        return True
    else:
        return False

In [26]:
def remove_numbers(string_with_numbers):
    if not string_with_numbers:
        return None
    
    tokens = string_with_numbers.split(" ")
    cleaned_tokens = []
    for token in tokens:
        if not re.search("[0-9]", token):
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens) 

In [27]:
def get_all_publication_entries(soup):
    
    query_with_id = get_author_id_query(soup)
    soup = None

    if query_with_id != None:
        # get HTML response for author profile
        response = get_source(f"https://scholar.google.com{query_with_id}&cstart=0&pagesize=1001") # the last part is to get all entries

        if response:
            soup = BeautifulSoup(response.html.raw_html)

            #get article entries
            all_article_entries = soup.find_all("tr", class_ = "gsc_a_tr")
        
    else:
        all_article_entries = None
        
    return all_article_entries

In [28]:
def get_publication_info(entry):
    
    publication_title = ""
    publication_year = ""
    authors_list = []
    number_citations = ""
    
    # get authors and journal
    if entry.find("div", class_ = "gs_gray"):
        string_with_authors = entry.find("div", class_ = "gs_gray").string
        authors_list = string_with_authors.split(",")
        
        publisher_string = entry.find_all("div", class_ = "gs_gray")[1].text
        publisher = publisher_string.split(",")[0] #also has ISBN
        publisher = remove_numbers(publisher)

    # get title
    if entry.find("a", class_="gsc_a_at"):
        publication_title = entry.find("a", class_="gsc_a_at").string

    # get year
    if entry.find("span", class_ = "gsc_a_h gsc_a_hc gs_ibl"):
        publication_year = entry.find("span", class_ = "gsc_a_h gsc_a_hc gs_ibl").string

    # get number of citations
    if entry.find("a", class_ = "gsc_a_ac gs_ibl"):
        number_citations = entry.find("a", class_ = "gsc_a_ac gs_ibl").string
        
    
    
    return publication_title, publication_year, authors_list, number_citations, publisher

In [29]:
def get_dataframe_with_publications(soup, author_name):
    
    publication_titles = []
    publication_years = []
    publishers = []
    authors_lists = []
    number_citations_list = []
    author_name_list = []
    updated_list = []
    
    all_publication_entries = get_all_publication_entries(soup)
    
    if all_publication_entries:
        
        for publication_entry in all_publication_entries:
            publication_title, publication_year, authors_list, number_citations, publisher = get_publication_info(publication_entry)

            publication_titles.append(publication_title)
            publication_years.append(publication_year)
            publishers.append(publisher)
            authors_lists.append(authors_list)
            number_citations_list.append(number_citations)
            author_name_list.append(author_name)
            updated_list.append(datetime.date.today())


    data = {'PUB_TITLE': publication_titles, 
            'PUB_YEAR': publication_years,
            'PUB_PUBLISHER': publishers,
            'PUB_AUTHORS': authors_lists, 
            'PUB_CITATIONS': number_citations_list, 
            'AUTHOR_NAME': author_name_list,
            'UPDATED': updated_list}
    
    return pd.DataFrame(data)

In [30]:
def get_dataframe_with_publications_no_profile(soup, author_name):

    publication_titles = []
    publication_years = []
    publishers = []
    authors_lists = []
    number_citations_list = []
    author_name_list = []
    updated_list = []
    
    number_publications = 1
    
    if not soup or not soup.find("div", id="gs_ab_md"):
        return pd.DataFrame(columns=['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 'PUB_AUTHORS', 'PUB_PUBLISHER', 'PUB_CITATIONS', 'UPDATED'])

    search_result = soup.find("div", id="gs_ab_md").find("div", class_="gs_ab_mdw").text
    
    if re.search(" [0-9]* result", search_result):
        number_publications = int(re.search(" [0-9]* result", search_result).group().strip().split(" ")[0])
        
    # first page
    
    for entry in soup.find_all("div", class_="gs_ri"):

        # and journal
        if entry.find("div", class_="gs_a"):
            # get authors
            authors_list = entry.find("div", class_="gs_a").text.split("-")[0].replace("\xa0", "").split(",")

            # get year
            if re.search("[0-9]{4}", entry.find("div", class_="gs_a").text):
                publication_year = re.search("[0-9]{4}", entry.find("div", class_="gs_a").text).group()
            else:
                publication_year = None

            # get publisher
            journal_string = entry.find("div", class_="gs_a").text.split("-")[1]
            if "," in journal_string:
                publisher = journal_string.split(",")[0]
            else:
                publisher = None
                
            publisher = remove_numbers(publisher)

        # get title
        if entry.find("h3", class_="gs_rt"):
            publication_title = entry.find("h3", class_="gs_rt").text.replace("[PDF]", "").strip()

        # get number of citations
        footer_elements = entry.find("div", class_="gs_fl").find_all("a")
        number_citations = None

        for element in footer_elements:
            if "Cited by" in element.text:
                number_citations = int(re.search("[0-9]*$", element.text).group())


        publication_titles.append(publication_title)
        publication_years.append(publication_year)
        publishers.append(publisher)
        authors_lists.append(authors_list)
        number_citations_list.append(number_citations)
        author_name_list.append(author_name)
        updated_list.append(datetime.date.today())

        

    # all other pages
    for current_page in range(10, number_publications, 10):
        next_page = get_source(f"https://scholar.google.cz/scholar?start={current_page}&hl=en&as_sdt=0%2C5&as_vis=1&q=author%3A%22{author_name_string}%22&btnG=")
        soup = BeautifulSoup(next_page.html.raw_html)

        for entry in soup.find_all("div", class_="gs_ri"):

            # and journal
            if entry.find("div", class_="gs_a"):
                # get authors
                authors_list = entry.find("div", class_="gs_a").text.split("-")[0].replace("\xa0", "").split(",")

                # get year
                if re.search("[0-9]{4}", entry.find("div", class_="gs_a").text):
                    publication_year = re.search("[0-9]{4}", entry.find("div", class_="gs_a").text).group()
                else:
                    publication_year = None

                # get publisher
                journal_string = entry.find("div", class_="gs_a").text.split("-")[1]
                if "," in journal_string:
                    publisher = journal_string.split(",")[0]
                else:
                    publisher = None
                    
                publisher = remove_numbers(publisher)

            # get title
            if entry.find("h3", class_="gs_rt"):
                publication_title = entry.find("h3", class_="gs_rt").text.replace("[PDF]", "").strip()

            # get number of citations
            footer_elements = entry.find("div", class_="gs_fl").find_all("a")
            number_citations = None
            
            for element in footer_elements:
                if "Cited by" in element.text:
                    number_citations = int(re.search("[0-9]*$", element.text).group())


            publication_titles.append(publication_title)
            publication_years.append(publication_year)
            publishers.append(publisher)
            authors_lists.append(authors_list)
            number_citations_list.append(number_citations)
            author_name_list.append(author_name)
            updated_list.append(datetime.date.today())

    data = {'PUB_TITLE': publication_titles, 
            'PUB_YEAR': publication_years,
            'PUB_PUBLISHER': publishers,
            'PUB_AUTHORS': authors_lists, 
            'PUB_CITATIONS': number_citations_list, 
            'AUTHOR_NAME': author_name_list,
            'UPDATED': updated_list}

    return pd.DataFrame(data)

In [31]:
# get the names that are alrady in the database
outcome_file = Path(f'../data/{current_year}/publications_{location}_gs.csv', index=False)

pd_result = pd.DataFrame(columns=['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 'PUB_AUTHORS', 'PUB_PUBLISHER', 'PUB_CITATIONS', 'UPDATED'])

# don't use names that are recent enough
if outcome_file.is_file():
    pd_result = pd.read_csv(f'../data/{current_year}/publications_{location}_gs.csv')
    authors_already_recent = []
    
    # update authors that have been updated more than 30 days ago
    for author in pd_result["AUTHOR_NAME"].unique():
        if datetime.datetime.strptime(pd_result[pd_result['AUTHOR_NAME'] == author].iloc[0]["UPDATED"], "%Y-%m-%d").date() > (datetime.date.today() - datetime.timedelta(days=60)):
            authors_already_recent.append(author)
    
    author_names = list(set(author_names) - set(authors_already_recent))

In [32]:
# loop through authors and collect publications in dataframe

for author_name in author_names:

    # get HTML response for author
    author_name_string = author_name.replace(" ", "+")
    response = get_source(f"https://scholar.google.cz/scholar?hl=en&as_sdt=0%2C5&as_vis=1&q=author%3A%22{author_name_string}%22")
    soup = BeautifulSoup(response.html.raw_html)
    
    if has_profile(soup):
        pd_publications = get_dataframe_with_publications(soup, author_name)

    else:
        pd_publications = get_dataframe_with_publications_no_profile(soup, author_name)
        
    if pd_publications.empty:
        print(f"No publications found for {author_name}.")
    
    else:
        pd_result = pd_result.append(pd_publications, ignore_index=True, sort=False)

pd_result = pd_result.reset_index(drop=True)

No publications found for Bernd Radtke.
No publications found for Michaela Bergmann.
No publications found for Thomas Mannchen.
No publications found for Conny Mayer-Bonde.
No publications found for Joachim Güntzel.
No publications found for Heike Stahl.
No publications found for Mathias Hassenstein.
No publications found for Harald Pfab.
No publications found for Alexandra Ottler.
No publications found for Karin Reinhard.
No publications found for Udo Klaiber.
No publications found for Markus Schatz.
No publications found for Paul Kirchberg.
No publications found for Markus Rathgeb.
No publications found for Wilhelm Ruckdeschel.
No publications found for Lars Ruhbach.
No publications found for Alexander Dingeldey.
No publications found for Petra Kroflin.
No publications found for Friedrich Then Bergh.
No publications found for Anja Brittner-Widmann.
No publications found for Claudia Lembach.
No publications found for Bhagyaraj Dharmana.
Too many requests have been made. Stopped at url

AttributeError: 'NoneType' object has no attribute 'html'

In [None]:
if STORE_PUB_GS:
    pd_result.to_csv(f'../data/{current_year}/publications_{location}_gs.csv', index=False)

In [35]:
pd_result[pd_result["AUTHOR_NAME"] == "Stephan Daurer"]

Unnamed: 0,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_PUBLISHER,PUB_CITATIONS,UPDATED
586,Stephan Daurer,Einführung in die Wirtschaftsinformatik: Ein f...,2021.0,"['MA Bächle', ' S Daurer', ' A Kolb']",De Gruyter Oldenbourg,41.0,2022-03-19
587,Stephan Daurer,Consumer Search Behavior on the Mobile Interne...,2016.0,"['S Daurer', ' D Molitor', ' M Spann', ' P Man...",Ross School of Business Paper,23.0,2022-03-19
588,Stephan Daurer,Digitalisierung und Konvergenz von Online-und ...,2012.0,"['S Daurer', ' D Molitor', ' M Spann']",Zeitschrift für Betriebswirtschaft (ZfB),22.0,2022-03-19
589,Stephan Daurer,Assistive technology for independent living wi...,2018.0,"['M Bächle', ' S Daurer', ' A Judt', ' T Mettl...",Health Policy and Technology,19.0,2022-03-19
590,Stephan Daurer,Tell Me Where You Are and I’ll Tell You What Y...,2016.0,"['M Spann', ' D Molitor', ' S Daurer']",GfK Marketing Intelligence Review,18.0,2022-03-19
591,Stephan Daurer,Measuring Individual Search Costs on the Mobil...,2012.0,"['S Daurer', ' D Molitor', ' M Spann']",ECIS,14.0,2022-03-19
592,Stephan Daurer,Potenziale integrierter Social Software-das Be...,2006.0,"['M Bächle', ' S Daurer']",HMD-Praxis der Wirtschaftsinformatik,10.0,2022-03-19
593,Stephan Daurer,"The Impact of Smartphones, Barcode Scanning, a...",2013.0,"['S Daurer', ' D Molitor', ' M Spann', ' P Man...",,9.0,2022-03-19
594,Stephan Daurer,Parental control reversed: Using ADR for desig...,2017.0,"['T Mettler', ' M Bächle', ' S Daurer', ' A Ju...",ICIS Proceedings,4.0,2022-03-19
595,Stephan Daurer,Application of Media Synchronicity Theory to C...,2019.0,"['L Furmanek', ' S Daurer']",Proceedings of the International Conference on...,3.0,2022-03-19
