In [4]:
#%pip install requests_html

In [1]:
import re
import time
import urllib
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import HTML
from requests_html import HTMLSession



In [2]:
def get_source(url, delay=3, timeout=30):
    """Return the HTTP response for the provided URL.

    Args:
        url (string): URL of the page to scrape.
        delay (float): Seconds to sleep before sending the request.
            Defaults to 3, the pause the function has always applied.
        timeout (float): Seconds to wait for the server before giving up,
            so a stalled connection cannot block the notebook forever.
    Returns:
        response (object): HTTP response object from requests_html, or
        None when the request raised a requests exception (the exception
        is printed).
    """
    # Pause before every request (presumably to throttle the scraping;
    # the original code does not state the reason).
    time.sleep(delay)

    try:
        # The context manager closes the session (and its connection pool)
        # on exit. The response body is already downloaded at that point
        # because the request is not streamed.
        with HTMLSession() as session:
            return session.get(url, timeout=timeout)

    except requests.exceptions.RequestException as e:
        print(e)
        return None

In [4]:
def get_author_id_query(author_name):
    """Search Google Scholar for an author and return their profile link.

    Args:
        author_name (string): Name to search for.
    Returns:
        string or None: The ``href`` of the link inside the first
        ``<h4 class="gs_rt2">`` element of the search result page, or None
        when the page could not be fetched or contains no such link.
    """
    # get HTML response for author
    query = urllib.parse.quote_plus(author_name)
    response = get_source("https://scholar.google.com/scholar?q=" + query)

    # get_source returns None on request errors, and a requests Response is
    # falsy for 4xx/5xx status codes; neither carries a usable result page.
    if not response:
        print(f"Could not retrieve Google Scholar results for {author_name}.")
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and bs4 does not warn about guessing).
    soup = BeautifulSoup(response.html.raw_html, "html.parser")

    # The code treats an <h4 class="gs_rt2"> heading as the marker of a profile hit.
    profile_heading = soup.find("h4", class_="gs_rt2")
    profile_link = profile_heading.find("a") if profile_heading else None

    if profile_link is None:
        print(f"{author_name} does not have a profile on Google Scholar.")
        return None

    # get user ID for query
    return profile_link.get('href')

In [5]:
def get_author_profile(author_name):
    """Fetch and parse the Google Scholar profile page of an author.

    Args:
        author_name (string): Name passed on to get_author_id_query.
    Returns:
        BeautifulSoup or None: Parsed profile page, or None when no profile
        link was found or the profile page could not be fetched.
    """
    query_with_id = get_author_id_query(author_name)

    # Without a profile link there is nothing to request; building the URL
    # anyway would embed the literal text "None" in it.
    if not query_with_id:
        return None

    # get HTML response for author profile
    # (cstart/pagesize are appended with the intent of getting all entries at once)
    response = get_source(f"https://scholar.google.com{query_with_id}&cstart=0&pagesize=1001")

    # None on request errors; a requests Response is falsy for 4xx/5xx codes.
    if not response:
        return None

    # Explicit parser keeps parsing independent of optional installed parsers.
    return BeautifulSoup(response.html.raw_html, "html.parser")

In [6]:
def get_all_publication_entries(author_name):
    """Collect the publication rows from an author's profile page.

    Args:
        author_name (string): Name passed on to get_author_profile.
    Returns:
        The result of searching the profile page for ``<tr class="gsc_a_tr">``
        elements, or None when get_author_profile yields no page.
    """
    profile_page = get_author_profile(author_name)

    # No profile page -> no entries to look for.
    if not profile_page:
        return None

    # Each article entry is a table row carrying the class "gsc_a_tr".
    return profile_page.find_all("tr", class_="gsc_a_tr")

In [7]:
def get_publication_info(entry):
    """Extract the fields of one publication entry.

    Args:
        entry: Parsed element for one publication row; it must offer
            ``find`` and ``find_all`` in the style of a BeautifulSoup tag.
    Returns:
        tuple: ``(publication_title, publication_year, authors_list,
        number_citations, publisher)``. Any field whose element is missing
        keeps its empty default (``""``, or ``[]`` for the authors), so the
        function never fails on incomplete rows.
    """
    publication_title = ""
    publication_year = ""
    authors_list = []
    number_citations = ""
    publisher = ""  # default so the return below is always defined

    # get authors and journal
    # The first "gs_gray" div is treated as the author list, the second as
    # the publisher line.
    gray_divs = entry.find_all("div", class_="gs_gray")
    if gray_divs:
        string_with_authors = gray_divs[0].string
        # .string can be None (e.g. when the div has several children).
        if string_with_authors:
            # Strip so names do not keep the space that follows each comma.
            authors_list = [name.strip() for name in string_with_authors.split(",")]

        if len(gray_divs) > 1:
            # Keep only the part before the first comma; the remainder holds
            # further details (the original author noted it "also has ISBN").
            publisher = gray_divs[1].text.split(",")[0]

    # get title
    title_link = entry.find("a", class_="gsc_a_at")
    if title_link:
        publication_title = title_link.string

    # get year
    year_span = entry.find("span", class_="gsc_a_h gsc_a_hc gs_ibl")
    if year_span:
        publication_year = year_span.string

    # get number of citations
    citations_link = entry.find("a", class_="gsc_a_ac gs_ibl")
    if citations_link:
        number_citations = citations_link.string

    return publication_title, publication_year, authors_list, number_citations, publisher

In [8]:
def get_dataframe_with_publications(author_name):
    """Build a DataFrame listing the publications found for an author.

    Args:
        author_name (string): Name passed on to get_all_publication_entries;
            it is also written into the AUTHOR_NAME column of every row.
    Returns:
        pandas.DataFrame: One row per publication entry with the columns
        PUB_TITLE, PUB_YEAR, PUB_PUBLISHER, PUB_AUTHORS, PUB_CITATIONS and
        AUTHOR_NAME. The frame has no rows when no entries were found.
    """
    # One list per output column, filled in lockstep below.
    columns = {
        'PUB_TITLE': [],
        'PUB_YEAR': [],
        'PUB_PUBLISHER': [],
        'PUB_AUTHORS': [],
        'PUB_CITATIONS': [],
        'AUTHOR_NAME': [],
    }

    entries = get_all_publication_entries(author_name)

    # `entries` may be None or empty; both mean there is nothing to add.
    for entry in entries or []:
        title, year, authors, citations, publisher = get_publication_info(entry)

        columns['PUB_TITLE'].append(title)
        columns['PUB_YEAR'].append(year)
        columns['PUB_PUBLISHER'].append(publisher)
        columns['PUB_AUTHORS'].append(authors)
        columns['PUB_CITATIONS'].append(citations)
        columns['AUTHOR_NAME'].append(author_name)

    return pd.DataFrame(columns)

In [11]:
# Example run: build the publications table for a single author and display it.
# This performs live HTTP requests to scholar.google.com via the functions above.
get_dataframe_with_publications("Stephan Daurer")

Unnamed: 0,PUB_TITLE,PUB_YEAR,PUB_PUBLISHER,PUB_AUTHORS,PUB_CITATIONS,AUTHOR_NAME
0,Einführung in die Wirtschaftsinformatik: Ein f...,2021,De Gruyter Oldenbourg,"[MA Bächle, S Daurer, A Kolb]",38.0,Stephan Daurer
1,Digitalisierung und Konvergenz von Online-und ...,2012,Zeitschrift für Betriebswirtschaft (ZfB) 82 (4),"[S Daurer, D Molitor, M Spann]",22.0,Stephan Daurer
2,Consumer Search Behavior on the Mobile Interne...,2016,Ross School of Business Paper,"[S Daurer, D Molitor, M Spann, P Manchanda]",20.0,Stephan Daurer
3,Assistive technology for independent living wi...,2018,Health Policy and Technology 7 (1),"[M Bächle, S Daurer, A Judt, T Mettler]",19.0,Stephan Daurer
4,Tell Me Where You Are and I’ll Tell You What Y...,2016,GfK Marketing Intelligence Review 8 (2),"[M Spann, D Molitor, S Daurer]",14.0,Stephan Daurer
5,Measuring Individual Search Costs on the Mobil...,2012,ECIS,"[S Daurer, D Molitor, M Spann]",13.0,Stephan Daurer
6,Potenziale integrierter Social Software-das Be...,2006,HMD-Praxis der Wirtschaftsinformatik 252,"[M Bächle, S Daurer]",11.0,Stephan Daurer
7,"The Impact of Smartphones, Barcode Scanning, a...",2013,,"[S Daurer, D Molitor, M Spann, P Manchanda]",9.0,Stephan Daurer
8,Parental control reversed: Using ADR for desig...,2017,ICIS 2017 Proceedings,"[T Mettler, M Bächle, S Daurer, A Judt]",4.0,Stephan Daurer
9,Consumer Preferences for Product Information a...,2017,Proceedings of the 13th International Conferen...,"[J Fölting, S Daurer, M Spann]",4.0,Stephan Daurer
