In [65]:
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup
import re
import time

In [82]:
def get_source(url, delay=5, timeout=30):
    """Return the HTTP response for the provided URL.

    Args:
        url (string): URL of the page to scrape.
        delay (float): Seconds to sleep before sending the request.
            Defaults to 5, the pause the scraper has always used.
        timeout (float): Seconds to wait for the server before giving up.
            A timeout is reported like any other request failure.
    Returns:
        response (object): HTTP response object from requests_html, or
        None if the request failed (the error is printed).
    """
    time.sleep(delay)  # throttle requests; prevents captcha hopefully

    try:
        # The context manager closes the session (and its connection pool)
        # once the response body has been downloaded.
        with HTMLSession() as session:
            return session.get(url, timeout=timeout)

    except requests.exceptions.RequestException as e:
        print(e)
        return None  # explicit: callers must check for a failed request

In [117]:
def get_author_profile(author_name):
    """Fetch and parse the ResearchGate profile page of an author.

    Args:
        author_name (string): Author name, e.g. "Stephan Daurer".
    Returns:
        soup (object): BeautifulSoup tree of the profile page, or None if
        the page could not be fetched.
    """
    # Transliterate German umlauts/sharp s before building the profile URL.
    # NOTE(review): upper-case umlauts (Ä, Ö, Ü) are passed through unchanged,
    # as before — confirm how ResearchGate spells those in profile URLs.
    transliterated = author_name.translate(
        str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})
    )

    # Join the name parts with hyphens; split() also drops leading, trailing
    # and repeated whitespace that would otherwise yield stray hyphens.
    query = "-".join(transliterated.split())

    response = get_source("https://www.researchgate.net/profile/" + query)
    if response is None:  # request failed; get_source already printed why
        return None

    # Name the parser explicitly: avoids bs4's "no parser specified" warning
    # and keeps parsing independent of which parsers happen to be installed.
    # lxml is imported by requests_html itself, so it should be available.
    soup = BeautifulSoup(response.html.raw_html, "lxml")

    return soup

In [68]:
def get_all_publications(author_profile):
    """Collect all publication entries from a parsed author profile.

    Args:
        author_profile (object): Parsed profile page exposing ``find_all``.
    Returns:
        Whatever ``find_all`` yields for <div> elements carrying the
        publication-item class string below.
    """
    # Class attribute value identifying one publication entry on the page.
    publication_item_classes = (
        "nova-legacy-v-publication-item "
        "nova-legacy-v-publication-item--size-m "
        "gtm-research-item"
    )
    publication_entries = author_profile.find_all("div", class_=publication_item_classes)
    return publication_entries

In [122]:
def _labelled_metadata_value(metadata, label):
    """Return the text following `label` in the metadata list items, or None.

    If several list items contain the label, the last one wins.
    """
    value = None
    # there are multiple items with that class
    for item in metadata.find_all("li", class_="nova-legacy-e-list__item"):
        text = item.text
        # Test for the full label (including ": ") so the split below always
        # produces a second element.
        if label in text:
            value = text.split(label, 1)[1]
    return value


def get_publisher(publication_link, publication_type):
    """Look up the publisher (or journal / conference) of a publication.

    Args:
        publication_link (string): URL of the publication's detail page.
        publication_type (string): Type badge text of the publication.
            'Article' yields the linked journal name, 'Book'/'Chapter' the
            "Publisher: " entry, 'Poster'/'Conference Paper' the
            "Conference: " entry. Other types are not looked up.
    Returns:
        publisher (string): The extracted name, or None if the page could not
        be fetched, has no metadata section, or holds no matching entry.
    """
    response = get_source(publication_link)
    if response is None:  # request failed; nothing to parse
        return None

    # Explicit parser: avoids bs4's "no parser specified" warning. lxml is
    # imported by requests_html itself, so it should be available.
    publication_profile = BeautifulSoup(response.html.raw_html, "lxml")

    metadata = publication_profile.find("div", class_="research-detail-header-section__metadata")
    if metadata is None:  # don't search for publisher if no metadata
        return None

    if publication_type == 'Article':
        journal_link = metadata.find(
            "a",
            class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-decorated",
        )
        return journal_link.text if journal_link is not None else None

    if publication_type in ('Book', 'Chapter'):
        return _labelled_metadata_value(metadata, "Publisher: ")

    if publication_type in ('Poster', 'Conference Paper'):
        return _labelled_metadata_value(metadata, "Conference: ")

    return None

In [119]:
def get_publication_info(publication):
    """Extract the details of one publication entry of an author profile.

    Each piece of information is optional: if its element is missing from
    the entry, the corresponding default ("" or []) is returned instead.

    Args:
        publication (object): Parsed publication entry (one element of the
            result of get_all_publications).
    Returns:
        tuple: (publication_title, publication_year, authors_list,
        publication_type, publisher). The year is the first 4-digit number
        in the entry's first meta-data item. The publisher is whatever
        get_publisher returns for the title link (possibly None), or ""
        if the entry has no title link.
    """
    # Class attribute values of the elements holding each piece of data.
    title_class = "nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title"
    year_class = "nova-legacy-e-list__item nova-legacy-v-publication-item__meta-data-item"
    author_class = "nova-legacy-v-person-inline-item__fullname"
    type_class = "nova-legacy-e-badge nova-legacy-e-badge--color-green nova-legacy-e-badge--display-block nova-legacy-e-badge--luminosity-high nova-legacy-e-badge--size-l nova-legacy-e-badge--theme-solid nova-legacy-e-badge--radius-m nova-legacy-v-publication-item__badge"
    link_class = "nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-bare"

    publication_title = ""
    publication_year = ""
    publication_type = ""
    publisher = ""

    # publication title
    title_div = publication.find("div", class_=title_class)
    if title_div is not None:
        # .string is None when the element has several children; fall back
        # to the concatenated text in that case.
        publication_title = title_div.string or title_div.get_text()

    # publication year
    year_item = publication.find("li", class_=year_class)
    if year_item is not None:
        year_match = re.search(r"([0-9]{4})", year_item.get_text())  # only get year
        if year_match:  # entries without a 4-digit year keep the "" default
            publication_year = year_match.group(1)

    # authors (find_all yields nothing when there are no author elements)
    authors_list = [x.text for x in publication.find_all("span", class_=author_class)]

    # publication type
    type_badge = publication.find("span", class_=type_class)
    if type_badge is not None:
        publication_type = type_badge.text

    # publisher: only retrievable through the detail page linked in the title
    if title_div is not None:
        title_link = title_div.find("a", class_=link_class)
        publication_link = title_link.get("href") if title_link is not None else None
        if publication_link:
            publisher = get_publisher(publication_link, publication_type)

    return publication_title, publication_year, authors_list, publication_type, publisher

In [112]:
def get_dataframe_with_publications(author_name):
    """Build a DataFrame of the publications listed on an author's profile.

    Args:
        author_name (string): Author name used to locate the profile.
    Returns:
        pandas.DataFrame: One row per publication entry with the columns
        PUB_TITLE, PUB_YEAR, PUB_AUTHORS, PUB_TYPE, PUB_PUBLISHER and
        AUTHOR_NAME. The frame is empty (but keeps these columns) when no
        profile or no publication entries are available.
    """
    columns = ['PUB_TITLE', 'PUB_YEAR', 'PUB_AUTHORS',
               'PUB_TYPE', 'PUB_PUBLISHER', 'AUTHOR_NAME']

    author_profile = get_author_profile(author_name)

    # Without a profile there is nothing to iterate over; fall back to an
    # empty list so the loop is skipped instead of hitting an unbound name.
    if author_profile is not None:
        all_publication_entries = get_all_publications(author_profile)
    else:
        all_publication_entries = []

    # Collect one record per publication and build the frame in one go.
    records = []
    for publication_entry in all_publication_entries:
        publication_title, publication_year, authors_list, publication_type, publisher = get_publication_info(publication_entry)

        records.append({'PUB_TITLE': publication_title,
                        'PUB_YEAR': publication_year,
                        'PUB_AUTHORS': authors_list,
                        'PUB_TYPE': publication_type,
                        'PUB_PUBLISHER': publisher,
                        'AUTHOR_NAME': author_name})

    # Passing `columns` fixes the column order and keeps the columns present
    # even when there are no records.
    return pd.DataFrame(records, columns=columns)

In [121]:
# Example: build the publication table for a single author.
# This performs live HTTP requests: the author's profile page plus one detail
# page per publication entry that has a title link. Each request is preceded
# by a pause, so the cell takes a while to finish.
get_dataframe_with_publications("Stephan Daurer")

Unnamed: 0,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_TYPE,PUB_PUBLISHER,AUTHOR_NAME
0,Einführung in die Wirtschaftsinformatik: Ein f...,2021,"[Michael A. Bächle, Stephan Daurer, Arthur Kolb]",Book,De Gruyter Oldenbourg,Stephan Daurer
1,Do‐It‐Yourself as a Means for Making Assistive...,2021,"[Tobias Mettler, Stephan Daurer, Michael A. Bä...",Article,Information Systems Journal,Stephan Daurer
2,Zentrum für Digitale Innovationen,2020,"[Stephan Daurer, Gerhard Hellstern, Petra Radke]",Poster,"DHBW Forschungstag 2020, Mosbach",Stephan Daurer
3,Die Aufklärung und das Web 2.0,2019,"[Michael A. Bächle, Stephan Daurer]",Article,Wirtschaftsinformatik & Management,Stephan Daurer
4,Application of Media Synchronicity Theory to C...,2019,"[Lukas Furmanek, Stephan Daurer]",Conference Paper,14. International Conference on Wirtschaftsinf...,Stephan Daurer
5,Einführung in die Wirtschaftsinformatik: Ein f...,2018,"[Michael A. Bächle, Stephan Daurer, Arthur Kolb]",Book,De Gruyter Oldenbourg,Stephan Daurer
6,Chatbots as a User Interface for Assistive Tec...,2018,"[Michael Baechle, Stephan Daurer, Andreas Judt...",Conference Paper,Usability Day XVI,Stephan Daurer
7,Assistive Technology for Independent Living wi...,2018,"[Michael Bächle, Stephan Daurer, Andreas Judt,...",Article,Health Policy and Technology,Stephan Daurer
8,Parental control reversed: Using ADR for desig...,2017,"[Tobias Mettler, Michael Bächle, Stephan Daure...",Conference Paper,Thirty Eighth International Conference on Info...,Stephan Daurer
9,iCare-Do-It-Yourself Architektur ambienter Ass...,2017,"[Andreas Judt, Michael Bächle, Stephan Daurer,...",Conference Paper,Smart-Future-Living-Bodensee,Stephan Daurer
