In [23]:
#!pip install nest_asyncio

In [36]:
# set to True to write the collected publications to a CSV file
# (the flag is only read by the `if STORE_PUB_RG:` export statement)
STORE_PUB_RG = False

In [24]:
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup
import re
import time
import random
import datetime

In [None]:
# get current year to save in applicable folder (it is part of the CSV output path)
current_year = datetime.date.today().year

# open one shared session for html requests; get_source sends every request through it
SESSION = HTMLSession()

In [25]:
def get_source(url, min_wait=20, max_wait=60):
    """Return the HTTP response for the provided URL after a random pause.

    Args:
        url (string): URL of the page to scrape.
        min_wait (int): Lower bound (inclusive, in seconds) of the random
            pause before the request is sent.
        max_wait (int): Upper bound (inclusive, in seconds) of that pause.
    Returns:
        response (object): HTTP response object from requests_html, or None
            if the request raised a RequestException or the server answered
            with status 429 (too many requests). In both cases the reason is
            printed.
    Raises:
        ValueError: If min_wait is greater than max_wait or the bounds are negative.
    """
    # random pause before every request; prevents captcha hopefully
    time.sleep(random.randint(min_wait, max_wait))

    try:
        response = SESSION.get(url)
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    # rate limited: report where scraping stopped and hand back "no response"
    if response.status_code == 429:
        print(f"Too many requests have been made. Stopped at url: {url}")
        return None

    return response

In [26]:
def get_author_profile(author_name):
    """Fetch and parse the ResearchGate profile page of an author.

    The characters "ä", "ö", "ü" and "ß" are transliterated and spaces are
    replaced by "-" to build the profile URL.

    Args:
        author_name (string): Full name of the author, e.g. "Stephan Daurer".
    Returns:
        soup (BeautifulSoup): Parsed profile page, or None if the page could
            not be fetched.
    """
    # get HTML response for author
    author_name = author_name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
    query = author_name.replace(" ", "-")
    response = get_source("https://www.researchgate.net/profile/" + query)

    # get_source returns None when the request failed; there is nothing to parse then
    if response is None:
        return None

    return BeautifulSoup(response.html.raw_html)

In [27]:
def get_author_publication_list(author_name, page):
    """Fetch and parse one page of the ResearchGate publication search for an author.

    The characters "ä", "ö", "ü" and "ß" are transliterated and spaces are
    replaced by "+"; the name is searched as a quoted phrase.

    Args:
        author_name (string): Full name of the author, e.g. "Stephan Daurer".
        page (int): Number of the result page to request.
    Returns:
        soup (BeautifulSoup): Parsed result page, or None if the page could
            not be fetched.
    """
    # get HTML response for author
    author_name = author_name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
    query = author_name.replace(" ", "+")
    response = get_source(f"https://www.researchgate.net/search/publication?q=\"{query}\"&page={page}")

    # get_source returns None when the request failed; there is nothing to parse then
    if response is None:
        return None

    return BeautifulSoup(response.html.raw_html)

In [28]:
def get_all_publications(page):
    """Return all publication entries found on a parsed page.

    Args:
        page (BeautifulSoup): Parsed page, or None if the page could not be loaded.
    Returns:
        list: All "div" elements with the class
            "nova-legacy-v-publication-item__body"; an empty list if page is None.
    """
    # a page that could not be loaded simply contains no publications
    if page is None:
        return []

    return page.find_all("div", class_="nova-legacy-v-publication-item__body")

In [29]:
def get_publisher(publication_profile, publication_type):
    """Extract the publisher (or journal / conference) from a publication page.

    Args:
        publication_profile (BeautifulSoup): Parsed detail page of one publication.
        publication_type (string): Type of the publication as shown on its badge.
    Returns:
        publisher (string): For 'Article' the text of the decorated link in the
            metadata section; for 'Book' / 'Chapter' the text following
            "Publisher: "; for 'Poster' / 'Conference Paper' the text following
            "Conference: ". None if the metadata section or the value is
            missing, or for any other publication type.
    """
    metadata = publication_profile.find("div", class_="research-detail-header-section__metadata")

    if not metadata:  # don't search for publisher if no metadata
        return None

    if publication_type == 'Article':
        journal_link = metadata.find("a", class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-decorated")
        return journal_link.text if journal_link else None

    # for the remaining types the value sits in a list item behind a label
    label_by_type = {
        'Book': "Publisher: ",
        'Chapter': "Publisher: ",
        'Poster': "Conference: ",
        'Conference Paper': "Conference: ",
    }
    label = label_by_type.get(publication_type)
    if label is None:
        return None

    publisher = None

    # there are multiple items with that class; the last matching one wins
    for meta in metadata.find_all("li", class_="nova-legacy-e-list__item"):
        # test for the complete label so the split below always yields two parts
        if label in meta.text:
            publisher = meta.text.split(label, 1)[1]

    return publisher

In [30]:
def get_identifier(publication_profile):
    """Extract ISBN and DOI from a publication page.

    Args:
        publication_profile (BeautifulSoup): Parsed detail page of one publication.
    Returns:
        tuple: (isbn, doi). Each value is the text following "ISBN:" / "DOI:"
            in the metadata section (not stripped), or None if it is not present.
    """
    metadata = publication_profile.find("div", class_="research-detail-header-section__metadata")

    isbn = None
    doi = None

    if not metadata:  # don't search for identifier if no metadata
        return isbn, doi

    # there are multiple items with that class
    for meta in metadata.find_all("div", class_="nova-legacy-e-text nova-legacy-e-text--size-m nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-xxs nova-legacy-e-text--color-grey-700"):
        text = meta.text

        # test for the label including the colon so the split always yields two parts
        if "ISBN:" in text:
            isbn = text.split("ISBN:", 1)[1]

        elif "DOI:" in text:
            doi = text.split("DOI:", 1)[1]

    return isbn, doi

In [31]:
def get_publication_info(publication):
    """Collect the details of one publication entry.

    Title, year, authors and type are read from the entry itself. Publisher,
    ISBN and DOI are read from the publication's own page, which is requested
    through get_source when the title contains a link.

    Args:
        publication (BeautifulSoup): One publication entry of a list page.
    Returns:
        tuple: (publication_title, publication_year, authors_list,
            publication_type, publisher, isbn, doi). Values that are not found
            keep their defaults: "" for title, year, type and publisher, []
            for the authors and None for isbn and doi.
    """
    title_class = "nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title"
    year_class = "nova-legacy-e-list__item nova-legacy-v-publication-item__meta-data-item"
    author_class = "nova-legacy-v-person-inline-item__fullname"
    badge_class = "nova-legacy-e-badge nova-legacy-e-badge--color-green nova-legacy-e-badge--display-block nova-legacy-e-badge--luminosity-high nova-legacy-e-badge--size-l nova-legacy-e-badge--theme-solid nova-legacy-e-badge--radius-m nova-legacy-v-publication-item__badge"
    link_class = "nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-bare"

    publication_title = ""
    publication_year = ""
    publication_type = ""
    publisher = ""
    # defaults are needed because the detail page is only visited if a link exists
    isbn = None
    doi = None

    # publication title
    title_div = publication.find("div", class_=title_class)
    if title_div:
        publication_title = title_div.string

    # publication year
    year_item = publication.find("li", class_=year_class)
    if year_item:
        # .string may be None and the text may contain no four-digit number
        year_match = re.search(r"([0-9]{4})", year_item.string or "")
        if year_match:
            publication_year = year_match.group(1)  # only get year

    # authors (find_all yields nothing if there are none)
    authors_list = [x.text for x in publication.find_all("span", class_=author_class)]

    # publication type
    type_badge = publication.find("span", class_=badge_class)
    if type_badge:
        publication_type = type_badge.text

    # publisher and identifiers: follow the link inside the title, if there is one
    title_link = title_div.find("a", class_=link_class) if title_div else None
    if title_link:
        publication_link = title_link['href']

        # sometimes the link is relative, so the host has to be prepended
        if not publication_link.startswith("https://www.researchgate.net/"):
            publication_link = "https://www.researchgate.net/" + publication_link

        response = get_source(publication_link)

        # get_source returns None when the request failed; keep the defaults then
        if response is not None:
            publication_profile = BeautifulSoup(response.html.raw_html)

            publisher = get_publisher(publication_profile, publication_type)
            isbn, doi = get_identifier(publication_profile)

    return publication_title, publication_year, authors_list, publication_type, publisher, isbn, doi

In [32]:
def get_dataframe_with_publications(author_name):
    """Collect the publications listed on an author's profile page.

    Args:
        author_name (string): Full name of the author, e.g. "Stephan Daurer".
    Returns:
        pd.DataFrame: One row per publication with the columns AUTHOR_NAME,
            PUB_TITLE, PUB_YEAR, PUB_AUTHORS, PUB_TYPE, PUB_PUBLISHER,
            PUB_ISBN and PUB_DOI. The frame is empty (but has all columns) if
            no profile page or no publication entries were found.
    """
    data = {
            'AUTHOR_NAME': [],
            'PUB_TITLE': [],
            'PUB_YEAR': [],
            'PUB_AUTHORS': [],
            'PUB_TYPE': [],
            'PUB_PUBLISHER': [],
            'PUB_ISBN': [],
            'PUB_DOI': []
           }

    author_profile = get_author_profile(author_name)

    # without a profile page there are no entries to iterate over
    all_publication_entries = get_all_publications(author_profile) if author_profile else []

    for publication_entry in all_publication_entries:
        publication_title, publication_year, authors_list, publication_type, publisher, isbn, doi = get_publication_info(publication_entry)

        data['AUTHOR_NAME'].append(author_name)
        data['PUB_TITLE'].append(publication_title)
        data['PUB_YEAR'].append(publication_year)
        data['PUB_AUTHORS'].append(authors_list)
        data['PUB_TYPE'].append(publication_type)
        data['PUB_PUBLISHER'].append(publisher)
        data['PUB_ISBN'].append(isbn)
        data['PUB_DOI'].append(doi)

    return pd.DataFrame(data)

In [33]:
def get_dataframe_with_publications_no_profile(author_name):
    """Collect an author's publications via the publication search.

    The result pages are requested one after another, starting with page 1,
    until a page without publication entries is reached.

    Args:
        author_name (string): Full name of the author, e.g. "Stephan Daurer".
    Returns:
        pd.DataFrame: One row per publication with the columns AUTHOR_NAME,
            PUB_TITLE, PUB_YEAR, PUB_AUTHORS, PUB_TYPE, PUB_PUBLISHER,
            PUB_ISBN and PUB_DOI.
    """
    column_names = ('AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 'PUB_AUTHORS',
                    'PUB_TYPE', 'PUB_PUBLISHER', 'PUB_ISBN', 'PUB_DOI')
    data = {name: [] for name in column_names}

    page_number = 1

    while True:
        result_page = get_author_publication_list(author_name, page_number)
        entries = get_all_publications(result_page)

        # an empty page marks the end of the search results
        if not entries:
            break

        page_number += 1  # go to next page

        for entry in entries:
            title, year, authors, pub_type, publisher, isbn, doi = get_publication_info(entry)
            row = (author_name, title, year, authors, pub_type, publisher, isbn, doi)

            # row is ordered like column_names
            for name, value in zip(column_names, row):
                data[name].append(value)

    return pd.DataFrame(data)

In [34]:
# authors whose publications are collected
author_names = ["Stephan Daurer", "Gerhard Hellstern", "Michael Bächle", "Thomas Asche", "Oliver Bährle", "Wolfgang Bihler"]

result_columns = ['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR',
                  'PUB_AUTHORS', 'PUB_TYPE', 'PUB_PUBLISHER',
                  'PUB_ISBN', 'PUB_DOI']

# one frame per author; they are combined once after the loop
publication_frames = []

for author_name in author_names:
    # to pretend this is not a robot: run a researcher search first and ignore the result
    # (a space has to be encoded as "%20"; a bare "%" is not a valid escape sequence)
    author_query = author_name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss").replace(" ", "%20")
    get_source(f"https://www.researchgate.net/search.Search.html?type=researcher&query={author_query}")

    # real query: profile page first, publication search as fallback
    pd_publications = get_dataframe_with_publications(author_name)

    if pd_publications.empty:
        pd_publications = get_dataframe_with_publications_no_profile(author_name)

        if pd_publications.empty:
            print(f"No publications found for {author_name}.")
            continue

    publication_frames.append(pd_publications)

# DataFrame.append was removed in pandas 2.0; like append, concat keeps the
# row index of every author's frame (each starts at 0 again)
if publication_frames:
    pd_result = pd.concat(publication_frames)
else:
    pd_result = pd.DataFrame(columns=result_columns)
        

No publications found for Oliver Bährle.


In [35]:
# show the combined publications of all authors
pd_result

Unnamed: 0,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_TYPE,PUB_PUBLISHER,PUB_ISBN,PUB_DOI
0,Stephan Daurer,Einführung in die Wirtschaftsinformatik: Ein f...,2021,"[Michael A. Bächle, Stephan Daurer, Arthur Kolb]",Book,De Gruyter Oldenbourg,9783110722253,10.1515/9783110722260
1,Stephan Daurer,Do‐It‐Yourself as a Means for Making Assistive...,2021,"[Tobias Mettler, Stephan Daurer, Michael A. Bä...",Article,Information Systems Journal,,10.1111/isj.12352
2,Stephan Daurer,Zentrum für Digitale Innovationen,2020,"[Stephan Daurer, Gerhard Hellstern, Petra Radke]",Poster,"DHBW Forschungstag 2020, Mosbach",,
3,Stephan Daurer,Die Aufklärung und das Web 2.0,2019,"[Michael A. Bächle, Stephan Daurer]",Article,Wirtschaftsinformatik & Management,,10.1365/s35764-019-00172-y
4,Stephan Daurer,Application of Media Synchronicity Theory to C...,2019,"[Lukas Furmanek, Stephan Daurer]",Conference Paper,14. International Conference on Wirtschaftsinf...,,
...,...,...,...,...,...,...,...,...
0,Thomas Asche,Atomistic simulation of sol-gel-derived hybrid...,2018,"[Thomas Sebastian Asche, M. Duderstaedt, P. Be...",Chapter,,,10.1007/978-3-319-32101-1_109
1,Thomas Asche,Validation of the COMPASS force field for comp...,2017,"[Thomas Sebastian Asche, Peter Behrens, Andrea...",Article,Journal of Sol-Gel Science and Technology,,10.1007/s10971-016-4185-y
2,Thomas Asche,Atomistic Simulation of Sol–Gel-Derived Hybrid...,2016,"[Thomas Sebastian Asche, Mirja Duderstaedt, Pe...",Chapter,,,10.1007/978-3-319-19454-7_109-1
3,Thomas Asche,Two-photon polymerization of inorganic-organic...,2015,"[F. Burmeister, Sönke Steenhusen, Ruth Houbert...",Chapter,,,10.1515/9783110354324-016


In [None]:
# only write the CSV when the STORE_PUB_RG flag is enabled;
# the file goes into the data folder of the current year
# NOTE(review): presumably ../data/<year>/ has to exist before this runs — confirm
if STORE_PUB_RG:
    pd_result.to_csv(f'../data/{current_year}/publications_test_rg.csv')