In [1]:
#!pip install nest_asyncio

In [2]:
# change to store dataset
STORE_PUB_RG = True

# change for different location
# locations: "ravensburg", "mannheim", "heidenheim", "karlsruhe", "campus-horb", "stuttgart"

location = "stuttgart"

In [3]:
import requests
import urllib
from pathlib import Path
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup
import re
import time
import random
import datetime

In [4]:
# get current year to save in applicable folder
current_year = datetime.date.today().year - 1

# open session for html requests - deprecated
#SESSION = HTMLSession()

In [5]:
# get author names
pd_employees = pd.read_csv(f'../data/{current_year}/employees_{location}.csv')
author_names = pd_employees['employee_name_clean'].tolist()

In [6]:
def get_source(url):
    """Return the source code for the provided URL. 
    Args: 
        url (string): URL of the page to scrape.
    Returns:
        response (object): HTTP response object from requests_html. 
    """
    random_wait_time = random.randint(5,15)
    time.sleep(random_wait_time) # prevents captcha hopefully

    try:
        # reopen session each time to prevent timeout
        SESSION = HTMLSession()
        response = SESSION.get(url)
        
        if response.status_code == 429:
            raise requests.exceptions.RequestException(f"Too many requests have been made. Stopped at url: {url}")
        
        return response

    except requests.exceptions.RequestException as e:
        print(e)

In [7]:
def get_author_profile(author_name):
    
    # get HTML response for author
    author_name = author_name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
    query = author_name.replace(" ", "-")
    response = get_source("https://www.researchgate.net/profile/" + query)
    soup = BeautifulSoup(response.html.raw_html)
        
    return soup

In [8]:
def get_author_publication_list(author_name, page):
    
    # get HTML response for author
    author_name = author_name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
    query = author_name.replace(" ", "+")
    response = get_source(f"https://www.researchgate.net/search/publication?q=\"{query}\"&page={page}")
    soup = BeautifulSoup(response.html.raw_html)
    
    return soup

In [9]:
def get_all_publications(page):
    return page.find_all("div", class_="nova-legacy-v-publication-item__body")

In [10]:
def get_publisher(publication_profile, publication_type):
    
    metadata = publication_profile.find("div", class_="research-detail-header-section__metadata")

    publisher = None
    
    if not metadata: # don't search for publisher if no metadata
        pass
    
    elif publication_type == 'Article':

        if metadata.find("a", class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-decorated"):
            publisher = metadata.find("a", class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-decorated").text

    elif publication_type in ['Book', 'Chapter']:

        if metadata.find("li", class_="nova-legacy-e-list__item"):

            # there are multiple items with that class
            for meta in metadata.find_all("li", class_="nova-legacy-e-list__item"):
                if re.match("^Publisher: [ \wäöüß]+$", meta.text):
                    publisher = meta.text.split("Publisher: ",1)[1]

    elif publication_type in ['Poster', 'Conference Paper']:

        if metadata.find("li", class_="nova-legacy-e-list__item"):

            # there are multiple items with that class
            for meta in metadata.find_all("li", class_="nova-legacy-e-list__item"):
                if re.match("^Conference: [ \wäöüß]+$", meta.text):
                    publisher = meta.text.split("Conference: ",1)[1]
                    
    return publisher

In [11]:
def get_identifier(publication_profile):
    
    metadata = publication_profile.find("div", class_="research-detail-header-section__metadata")

    isbn = None
    doi = None
    
    if not metadata: # don't search for identifier if no metadata
        pass
    

    if metadata.find("div", class_="nova-legacy-e-text nova-legacy-e-text--size-m nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-xxs nova-legacy-e-text--color-grey-700"):

        # there are multiple items with that class
        for meta in metadata.find_all("div", class_="nova-legacy-e-text nova-legacy-e-text--size-m nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-xxs nova-legacy-e-text--color-grey-700"):
            if "ISBN" in meta.text:
                isbn = meta.text.split("ISBN:",1)[1]
                
            elif "DOI" in meta.text:
                doi = meta.text.split("DOI:",1)[1]
       
    return isbn, doi

In [12]:
def get_publication_info(publication):
    
    publication_title = ""
    publication_year = ""
    authors_list = []
    publication_type = ""
    publisher = ""
    
    # publication title
    if publication.find("div", class_="nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title"):
        publication_title = publication.find("div", class_="nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title").string

    # publication year
    if publication.find("li", class_="nova-legacy-e-list__item nova-legacy-v-publication-item__meta-data-item"):
        publication_year_string = publication.find("li", class_="nova-legacy-e-list__item nova-legacy-v-publication-item__meta-data-item").string
        publication_year = re.search(r"([0-9]{4})", publication_year_string).group(1) # only get year
    
    # authors
    if publication.find("span", class_="nova-legacy-v-person-inline-item__fullname"):
        publication_author_list = publication.find_all("span", class_="nova-legacy-v-person-inline-item__fullname")
        authors_list = [x.text for x in publication_author_list]
    
    # publication type
    if publication.find("span", class_= "nova-legacy-e-badge nova-legacy-e-badge--color-green nova-legacy-e-badge--display-block nova-legacy-e-badge--luminosity-high nova-legacy-e-badge--size-l nova-legacy-e-badge--theme-solid nova-legacy-e-badge--radius-m nova-legacy-v-publication-item__badge"):
        publication_type = publication.find("span", class_= "nova-legacy-e-badge nova-legacy-e-badge--color-green nova-legacy-e-badge--display-block nova-legacy-e-badge--luminosity-high nova-legacy-e-badge--size-l nova-legacy-e-badge--theme-solid nova-legacy-e-badge--radius-m nova-legacy-v-publication-item__badge").text
    
    # publisher
    if publication.find("div", class_="nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title")\
    .find("a", class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-bare"):

        publication_link = publication.find("div", class_="nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title")\
        .find("a", class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-bare")['href']
        
        # sometimes the 
        if not publication_link.startswith("https://www.researchgate.net/"):
            publication_link = "https://www.researchgate.net/" + publication_link
            
        publication_profile = BeautifulSoup(get_source(publication_link).html.raw_html)
        
        publisher = get_publisher(publication_profile, publication_type)
        isbn, doi = get_identifier(publication_profile)
    
    return publication_title, publication_year, authors_list, publication_type, publisher, isbn, doi

In [13]:
def get_dataframe_with_publications(author_name):
    author_profile = get_author_profile(author_name)
    
    publication_titles = []
    publication_years = []
    authors_lists = []
    publication_types = []
    publisher_list = []
    author_name_list = []
    isbn_list = []
    doi_list = []
    updated_list = []

    
    if author_profile:
        all_publication_entries = get_all_publications(author_profile)
        
    for publication_entry in all_publication_entries:
        publication_title, publication_year, authors_list, publication_type, publisher, isbn, doi = get_publication_info(publication_entry)

        publication_titles.append(publication_title)
        publication_years.append(publication_year)
        authors_lists.append(authors_list)
        publication_types.append(publication_type)
        author_name_list.append(author_name)
        publisher_list.append(publisher)
        isbn_list.append(isbn)
        doi_list.append(doi)
        updated_list.append(datetime.date.today())
    
    
    data = {
            'AUTHOR_NAME': author_name_list,
            'PUB_TITLE': publication_titles, 
            'PUB_YEAR': publication_years, 
            'PUB_AUTHORS': authors_lists,  
            'PUB_TYPE': publication_types,
            'PUB_PUBLISHER': publisher_list,
            'PUB_ISBN': isbn_list,
            'PUB_DOI': doi_list,
            'UPDATED': updated_list
           }
    
    return pd.DataFrame(data)

In [14]:
def get_dataframe_with_publications_no_profile(author_name):
    
    publication_titles = []
    publication_years = []
    authors_lists = []
    publication_types = []
    publisher_list = []
    author_name_list = []
    isbn_list = []
    doi_list = []
    updated_list = []
    
    keep_going = True
    page_number = 1
    
    while keep_going:
        page_with_publications = get_author_publication_list(author_name, page_number)
        page_number = page_number + 1 # go to next page
        
        all_publication_entries = get_all_publications(page_with_publications)
        
        if not all_publication_entries:
            keep_going = False
            continue
        
        for publication_entry in all_publication_entries:
            publication_title, publication_year, authors_list, publication_type, publisher, isbn, doi = get_publication_info(publication_entry)
        
            publication_titles.append(publication_title)
            publication_years.append(publication_year)
            authors_lists.append(authors_list)
            publication_types.append(publication_type)
            author_name_list.append(author_name)
            publisher_list.append(publisher)
            isbn_list.append(isbn)
            doi_list.append(doi)
            updated_list.append(datetime.date.today())
    
    
    data = {
            'AUTHOR_NAME': author_name_list,
            'PUB_TITLE': publication_titles, 
            'PUB_YEAR': publication_years, 
            'PUB_AUTHORS': authors_lists,  
            'PUB_TYPE': publication_types,
            'PUB_PUBLISHER': publisher_list,
            'PUB_ISBN': isbn_list,
            'PUB_DOI': doi_list,
            'UPDATED': updated_list
           }
    
    return pd.DataFrame(data)

In [15]:
# get the names that are alrady in the database
outcome_file = Path(f'../data/{current_year}/publications_{location}_rg.csv', index=False)

pd_result = pd.DataFrame(columns=['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 
                                                  'PUB_AUTHORS', 'PUB_PUBLISHER', 'PUB_TYPE',
                                                  'PUB_ISBN', 'PUB_DOI', 'UPDATED'])

# don't use names that are recent enough
if outcome_file.is_file():
    pd_result = pd.read_csv(f'../data/{current_year}/publications_{location}_rg.csv')
    authors_already_recent = []
    
    # update authors that have been updated more than 30 days ago
    for author in pd_result["AUTHOR_NAME"].unique():
        if datetime.datetime.strptime(pd_result[pd_result['AUTHOR_NAME'] == author].iloc[0]["UPDATED"], "%Y-%m-%d").date() > (datetime.date.today() - datetime.timedelta(days=60)):
            authors_already_recent.append(author)
    
    author_names = list(set(author_names) - set(authors_already_recent))

In [16]:
# loop through authors and collect publications in dataframe

for author_name in author_names:
    
    # real query
    pd_publications = get_dataframe_with_publications(author_name)
    
    if pd_publications.empty:
        pd_publications = get_dataframe_with_publications_no_profile(author_name)
        
        if pd_publications.empty:
            print(f"No publications found for {author_name}.")
            
    pd_result = pd_result.append(pd_publications, ignore_index=True, sort=False)

    # save each time since it breaks all the time
    pd_result = pd_result.reset_index(drop=True)
    pd_result.to_csv(f'../data/{current_year}/publications_{location}_rg.csv', index=False)

No publications found for Jürgen Gundrum.
Too many requests have been made. Stopped at url: https://www.researchgate.net/publication/273538940_Characterization_of_a_continuous_micro-scale_pilot_unit_for_petroleum_residue_hydroconversion_with_dispersed_catalysts_Hydrodynamics_and_performances_in_once-through_and_recycling_mode?_sg=ZwwhzVmJQ49zPFG0sDogkOct-ovAZm2tri8dD3uye3VK9CUm4L6ztbKerx3Z6Nl4ssxfGiq9Fu62NXU


AttributeError: 'NoneType' object has no attribute 'html'

In [17]:
if STORE_PUB_RG:
    pd_result.to_csv(f'../data/{current_year}/publications_{location}_rg.csv', index=False)

In [18]:
pd_result

Unnamed: 0,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_PUBLISHER,PUB_TYPE,PUB_ISBN,PUB_DOI,UPDATED
0,Wolf Burger,Heparin Inhibits Leukocyte Rolling in Pial Ves...,1997.0,"[Joerg R Weber, Klemens Angstwurm, Dorette Fre...",Journal of cerebral blood flow and metabolism:...,Article,,10.1097/00004647-199711000-00011,2022-03-25
1,Wolf Burger,Histamine (H1) receptor antagonist inhibits le...,1997.0,"[Joerg R Weber, Klemens Angstwurm, Wolf Bürger...",Neuroscience Letters,Article,,10.1016/S0304-3940(97)00233-4,2022-03-25
2,Wolf Burger,Pneumococcal cell wall components induce nitri...,1996.0,"[Joerg R Weber, Wolf Bürger, Dorette Freyer, M...",Glia,Article,,10.1002/(SICI)1098-1136(199601)16:1<1::AID-GLI...,2022-03-25
3,Wolf Burger,Anti ICAM-1 (CD 54) monoclonal antibody reduce...,1996.0,"[Joerg R Weber, Klemens Angstwurm, Wolf Bürger...",Journal of Neuroimmunology,Article,,10.1016/0165-5728(95)00131-X,2022-03-25
4,Wolf Burger,[Determination of human serum CRP using a chic...,1996.0,"[Rüdiger Schade, W Bürger, Antje Rieger, Falk ...",ALTEX,Article,,,2022-03-25
5,Wolf Burger,[Avian egg yolk antibodies. The egg laying cap...,1994.0,"[Rüdiger Schade, W Bürger, Torsten Schöneberg,...",ALTEX,Article,,,2022-03-25
6,Wolf Burger,Über substituierte Methan-phosphonsäureester.,2022.0,[Wolf. Bürger],,Article,,,2022-03-25
7,Wolf Burger,Das C-reaktive Protein : Bindungsverhalten und...,2022.0,[Wolf. Bürger],,Article,,,2022-03-25
8,Wolf Burger,"α-Halogenäther, XXXI. Michaelis-Arbusow-Reakti...",1967.0,"[H. GROSS, Günter Engelhardt, Jürgen Freiberg,...",European Journal of Organic Chemistry,Article,,10.1002/jlac.19677070108,2022-03-25
9,Alfred Geisel,A MATLAB Toolbox for Structural Analysis of Li...,2019.0,"[Alfred Geisel, Ferdinand Svaricek]",IFAC-PapersOnLine,Article,,10.1016/j.ifacol.2019.11.018,2022-03-25
