In [1]:
import requests
from requests_html import HTML
from requests_html import HTMLSession

import pandas as pd

import os

import datetime

from bs4 import BeautifulSoup
from bs4 import NavigableString

In [2]:
def get_source(url):
    """Return the source code for the provided URL. 
    Args: 
        url (string): URL of the page to scrape.
    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)

In [3]:
def get_content(page_num):
    # get website content
    response = get_source("https://www.karlsruhe.dhbw.de/forschung-transfer/publikationen.html?tx_nwcitavife_citavilist%5Baction%5D=list&tx_nwcitavife_citavilist%5Bcontroller%5D=Reference&tx_nwcitavife_citavilist%5BcurrentPage%5D=" + page_num)
    soup = BeautifulSoup(response.html.raw_html)
    return soup

In [4]:
def get_content_after_tag(first_tag):
    content_after_tag = []
    for element in first_tag.next_elements:
        content_after_tag.append(repr((element)))
    return content_after_tag

In [5]:
def add_publication_to_df(df, pub_author, pub_title, pub_year, pub_authors, pub_type, pub_publisher, pub_isbn, pub_doi):
    
    new_pub = {'AUTHOR_NAME': pub_author,
               'PUB_TITLE': pub_title,
               'PUB_YEAR': pub_year,
               'PUB_AUTHORS': pub_authors,
               'PUB_TYPE': pub_type,
               'PUB_PUBLISHER': pub_publisher,
               'PUB_ISBN': pub_isbn,
               'PUB_DOI': pub_doi,
               'PUB_CITATIONS': '',
               'JOU_RATING_VHB': '',
               'JOU_RATING_SCIMAGO': '',
               'UPDATED': ''
              }
    df = df.append(new_pub, ignore_index=True)
    return df

In [6]:
def clean_authors(authors):
    cleaned_authors = []
    splitted_authors = authors.split(';')
    for s_a in splitted_authors:
        if(',' in s_a):
            clean_author = s_a.split(',')
            c_a = clean_author[1].strip() + " " + clean_author[0].strip()
            cleaned_authors.append(c_a)
        else:
            cleaned_authors.append(s_a)
    return cleaned_authors

In [7]:
def get_matched_author(clean_authors):
    karlsruhe_employees = pd.read_csv (f'../data/2022/employees_karlsruhe.csv')
    employee_list = karlsruhe_employees['employee_name_clean'].tolist()
    
    for author in clean_authors:
        if (author in employee_list):
            return author


In [8]:
def check_dir(file_name):
    directory = os.path.dirname(file_name)
    if not os.path.exists(directory):
        os.makedirs(directory)

In [9]:
# get current year to save in applicable folder
current_year = datetime.date.today().year

In [10]:
df = pd.DataFrame(data=None, columns=['AUTHOR_NAME','PUB_TITLE','PUB_YEAR','PUB_AUTHORS','PUB_TYPE','PUB_PUBLISHER','PUB_ISBN','PUB_DOI','PUB_CITATIONS','JOU_RATING_VHB','JOU_RATING_SCIMAGO','UPDATED'])

i = 1
previous_page_pubs = []

while True:
    soup = get_content(str(i))
    content = get_content_after_tag(soup.find('li', class_='last'))
    
    content_html = BeautifulSoup(content[7], 'html.parser')
    pubs = content_html.find_all('li')
    
    if(pubs == previous_page_pubs):
        break
    else:
        previous_page_pubs = pubs
    
    i += 1


    for pub in pubs:
        pub_authors = ''
        pub_title = ''
        pub_subtitle = ''
        pub_year = ''
        pub_doi = ''
        pub_publishers = ''

        end_pub_type = str(pub).find('"', 11)
        pub_type = str(pub)[11:end_pub_type]
        if pub.find('span', class_='authors'): pub_authors = pub.find('span', class_='authors').get_text()
        if pub.find('span', class_='title'): pub_title = pub.find('span', class_='title').get_text()[2:]
        if pub.find('span', class_='subtitle'): pub_subtitle = pub.find('span', class_='subtitle').get_text()
        if pub.find('span', class_='bookYear'): pub_year = pub.find('span', class_='bookYear').get_text()
        elif pub.find('span', class_='sortDate'): pub_year = pub.find('span', class_='sortDate').get_text()
        elif pub.find('span', class_='bookDate'): pub_year = pub.find('span', class_='bookDate').get_text()
        if pub.find('a', class_='doi'): pub_doi = pub.find('a', class_='doi').get_text()
        if pub.find('span', class_='publishers'): pub_publishers = pub.find('span', class_='publishers').get_text()[2:]
        
        pub_year = pub_year.strip()[1:5]
        
        df = add_publication_to_df(df, '', pub_title, pub_year, pub_authors, pub_type, pub_publishers, '', pub_doi)

df['PUB_AUTHORS_LIST'] = df.apply(lambda row: clean_authors(row.PUB_AUTHORS), axis=1)
df['AUTHOR_NAME'] = df.apply(lambda row: get_matched_author(row.PUB_AUTHORS_LIST), axis=1)

file_name = f'../data/{current_year}/karlsruhe_publications.csv'
check_dir(file_name)
df.to_csv(file_name, index=False)

In [11]:
df.PUB_YEAR.unique()

array(['2021', '', '2020', '2019', '2018', '2017', '2016', '2015', '2014',
       '2013', '02.0', '22.0', '2012', '2011', '2010', '2009', '2008',
       '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000',
       '1999', '1998', '1997', '1996', '1995', '1994', '1993', '1992',
       '1991', '1990', '1989', '1987', '10.J'], dtype=object)

In [12]:
df.PUB_PUBLISHER.unique()

array(['', 'Springer Fachmedien Wiesbaden GmbH', 'Springer',
       'Springer Spektrum', 'Waxmann',
       'Karlsruher Institut für Technologie',
       'Carl Hanser Verlag GmbH & Co. KG',
       'Schäffer-Poeschel Verlag, Schäffer-Poeschel Verlag für Wirtschaft Steuern Recht GmbH',
       'WOTech Verlag', 'Peter Lang', 'Springer Gabler',
       'International Speech Communication Association (ISCA)', 'ISCA',
       'Mathematisches Institut der Universität Göttingen', 'Logos',
       'European Commission', 'Hanser', 'Haufe-Lexware GmbH & Co. KG',
       'Hochschule Fresenius', 'DHBW Karlsruhe',
       'VS Verl. für Sozialwiss', 'IF Verlag', 'Haufe-Mediengruppe',
       'Haufe Verlag', 'InWent', 'Springer International',
       'Verlag Dr. Kovac', 'Verlag Versicherungswirtschaft',
       'Handelsblatt Media Group (Deutschland)', 'Lang', 'Gabler Verlag',
       'Verl. für Dt. Steuerberater', 'Mohr Siebeck Verlag',
       'Springer Verlag', 'Gesellschaft für Informatik e.V.',
       'Bert

In [13]:
#df.loc[~ df['PUB_TITLE'].str.startswith(': ', na=False)]
#df.sample(25)
df.head()

Unnamed: 0,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_TYPE,PUB_PUBLISHER,PUB_ISBN,PUB_DOI,PUB_CITATIONS,JOU_RATING_VHB,JOU_RATING_SCIMAGO,UPDATED,PUB_AUTHORS_LIST
0,Marcus Strand,Special Issue on Intelligent Autonomous Systems,2021.0,"Ang, Marcelo H.; Miura, Jun; Strand, Marcus",JournalArticle,,,DOI: https://doi.org/10.1080/01691864.2021.187...,,,,,"[Marcelo H. Ang, Jun Miura, Marcus Strand]"
1,Angela Diehl-Becker,Comparative Analyses of European Identities in...,2021.0,"Diehl-Becker, Angela",ContributionConferenceProceedings,,,,,,,,[Angela Diehl-Becker]
2,,Ein Schauspiel von Boris Johnson,2021.0,"Enderle, Falk; Interview mit Prof. Dr. Andrew Lee",JournalArticle,,,,,,,,"[Falk Enderle, Interview mit Prof. Dr. Andrew..."
3,Torsten Harms,Generation Z als Anleger - Verhalten und Umgan...,,"Harms, Torsten",Unknown,,,,,,,,[Torsten Harms]
4,Torsten Harms,Equity Branding im Zeitalter der digitalen Kom...,2021.0,"Harms, Torsten",ContributionConferenceProceedings,,,,,,,,[Torsten Harms]
