In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd
import requests_html
import os
from requests_html import AsyncHTMLSession
from requests_html import HTMLSession
import glob
import datetime
import numpy as np
import re

In [2]:
# Calendar year of this run; it names the per-year output folder and fills the year_stored column.
current_year = datetime.datetime.now().year

In [3]:
def add_new_employee_to_employees(employees, name, job_title, location):
    """Append one employee as a new last row of ``employees``.

    The frame is modified in place and the same object is returned.

    Args:
        employees: DataFrame holding the employees collected so far.
        name: Employee name as scraped (may still contain titles).
        job_title: Keyword that made this employee relevant.
        location: Site the employee belongs to.

    Returns:
        The ``employees`` frame including the new row.
    """
    row = {
        "employee_name": name,
        "first_matched_job_title": job_title,
        # name in the format "first_name surname", without titles
        "employee_name_clean": get_name_without_title(name, location),
        "location": location,
        "year_stored": current_year,
    }

    # An empty frame has no maximum label (NaN), so numbering starts at 0.
    last_label = employees.index.max()
    if pd.isnull(last_label):
        next_label = 0
    else:
        next_label = last_label + 1

    employees.loc[next_label] = row
    return employees

In [4]:
def get_url(location):
    """Build the staff-directory URL for a location.

    Args:
        location: Lower-case site name, e.g. "stuttgart" or "campus-horb".

    Returns:
        A tuple ``(url, base_url)``: the address of the contact-persons page
        and the scheme-plus-host prefix it was built from.
    """
    # Default host pattern, used by every site without its own domain.
    subdomain_base = "https://www." + location + ".dhbw.de"
    stuttgart_base = "https://www.dhbw-stuttgart.de"

    # location -> (base_url, path of the contact-persons page)
    special_sites = {
        "stuttgart": (stuttgart_base, "/dhbw-stuttgart/ansprechpersonen/"),
        # campus-horb is served from the stuttgart domain
        "campus-horb": (stuttgart_base, "/horb/campus-horb/ansprechpersonen/"),
        "loerrach": ("https://www.dhbw-loerrach.de", "/ansprechpersonen/"),
        "mosbach": (subdomain_base, "/dhbw-mosbach/who-is-who"),
        "villingen-schwenningen": ("https://www.dhbw-vs.de", "/hochschule/mitarbeitende.html"),
        "heilbronn": (subdomain_base, "/ueber-uns/team.html"),
    }
    fallback = (subdomain_base, "/dhbw-" + location + "/ansprechpersonen")

    base_url, path = special_sites.get(location, fallback)
    return base_url + path, base_url

In [5]:
def get_name_without_title(name, location):
    """Return ``name`` as "first_name surname" with titles removed.

    Every space-separated part that contains a "." (e.g. "Prof.", "Dr.") is
    treated as a title and dropped.

    Args:
        name: Employee name as scraped from the site.
        location: Site the name comes from; it decides the expected name order.

    Returns:
        The cleaned name. For the sites listed below the remaining parts are
        kept in their order and commas are removed. For all other sites the
        name is expected as "surname, first_name" and the two parts are
        swapped; if such a name contains no comma it is returned unswapped
        instead of raising an IndexError.
    """
    # Sites handled without swapping: only commas are removed from the name.
    first_name_first_locations = ["stuttgart", "campus-horb", "loerrach", "mosbach", "villingen-schwenningen", "heilbronn"]

    kept_parts = [part.strip() for part in name.split(" ") if "." not in part]
    name_without_title = " ".join(kept_parts)

    if location in first_name_first_locations:
        return name_without_title.replace(",", "").strip()

    segments = name_without_title.split(",")
    if len(segments) < 2:
        # No comma, so there is nothing to swap; keep the name as it is.
        return name_without_title.strip()

    # Only the first two comma-separated segments are used (surname, first name).
    # The outer strip() avoids a stray space when one of them is empty.
    return (segments[1].strip() + " " + segments[0].strip()).strip()

In [6]:
def get_matched_title_of_name(employee_name):
    """Return the first keyword of ``name_title_match`` found in the name.

    The comparison is a case-insensitive substring test against the
    lower-cased name; keywords are tried in list order.

    Args:
        employee_name: Employee name as scraped, titles included.

    Returns:
        The matching keyword with surrounding whitespace removed, or ''
        when no keyword occurs in the name.
    """
    hits = (
        keyword.strip()
        for keyword in name_title_match
        if keyword in employee_name.lower()
    )
    # The generator is lazy, so the search stops at the first hit.
    return next(hits, '')

In [7]:
def get_matched_job_title(job_title):
    """Return the first keyword of ``job_title_match`` found in the job title.

    The comparison is a case-insensitive substring test against the stripped,
    lower-cased job title; keywords are tried in list order.

    Args:
        job_title: One job-title line as scraped.

    Returns:
        The matching keyword with surrounding whitespace removed, or ''
        when no keyword occurs in the job title.
    """
    for keyword in job_title_match:
        if keyword in job_title.strip().lower():
            return keyword.strip()
    return ''

In [8]:
def add_others_to_employees(results, employees, location):
    """Add every relevant person of a listing page to ``employees``.

    A person is relevant when one of their job-title lines matches a keyword
    (see get_matched_job_title) or, failing that, when their name carries a
    matching title (see get_matched_title_of_name).

    Args:
        results: One entry per person. Plain strings for mosbach,
            villingen-schwenningen and heilbronn; otherwise objects offering
            ``get_text()``. The first text line is the name.
        employees: Collection the matches are added to.
        location: Site the persons belong to.

    Returns:
        ``employees`` after all relevant persons have been added.
    """
    # For these sites the entries are already plain strings.
    plain_text_locations = ["mosbach", "villingen-schwenningen", "heilbronn"]

    for person in results:
        # split the text to get the name and the job titles for the person
        person_text = person if location in plain_text_locations else person.get_text()
        lines = person_text.strip().split("\n")

        # a single line means there is no job title --> therefore irrelevant
        if len(lines) <= 1:
            continue

        employee_name = lines[0].strip()

        # Reset for every person. Without this, a person with no job-title
        # lines inherited the previous person's match (or raised a NameError
        # when they were the first entry).
        title_match = ''

        # Job titles are read from the third line onwards; empty lines are skipped.
        # NOTE(review): the second line is never inspected. Confirm that it is
        # always blank or irrelevant for every site.
        for job_title in filter(None, lines[2:]):
            title_match = get_matched_job_title(job_title)
            if title_match:
                break

        # always prefer the job title over a title match in the name
        if not title_match:
            title_match = get_matched_title_of_name(employee_name)

        if title_match:
            employees = add_new_employee_to_employees(employees, employee_name, title_match, location)

    return employees

In [9]:
def check_dir(file_name):
    """Make sure the directory that will contain ``file_name`` exists.

    Missing parent directories are created as well. The file itself is not
    created.

    Args:
        file_name: Path of the file that is about to be written.

    Raises:
        OSError: If the directory cannot be created, e.g. because a regular
            file already occupies that path.
    """
    directory = os.path.dirname(file_name)
    # A bare file name has no directory part, and os.makedirs('') would raise.
    if directory:
        # exist_ok avoids the race between checking for the directory and creating it.
        os.makedirs(directory, exist_ok=True)

### Employees for Ravensburg, Mannheim, Heidenheim, Karlsruhe, Lörrach, Mosbach, Villingen-Schwenningen, Heilbronn, Campus Horb, and Stuttgart

In [10]:
# Sites to scrape, in processing order.
locations = [
    "ravensburg",
    "mannheim",
    "heidenheim",
    "karlsruhe",
    "loerrach",
    "mosbach",
    "villingen-schwenningen",
    "heilbronn",
    "campus-horb",
    "stuttgart",
]

# Keywords that mark a job title as relevant.
# NOTE(review): get_matched_job_title compares these as plain substrings, so
# "prof.*" only matches that literal text and is not evaluated as a regular
# expression. Confirm whether that is intended.
job_title_match = [
    "akademisch",
    "professor",
    "studiengangsleiter",
    "wissenschaftlich",
    "studiengangsleitung",
    "prof.*",
    "researcher",
]

# Keywords that mark a title inside a name as relevant.
name_title_match = ["prof.", "dr."]

# The async session is needed to retrieve the rendered html; otherwise the
# data sits inside a comment section and cannot be navigated.
session = AsyncHTMLSession()

In [11]:
for location in locations:
    #define the employees data frame where all employees of one location are stored in
    employees = pd.DataFrame(data=None, columns=["employee_name", "first_matched_job_title", "employee_name_clean", "location", "year_stored"])
    
    url, base_url = get_url(location)
    
    r = await session.get(url)
    await r.html.arender()
    
    page_soup = BeautifulSoup(r.html.html, "html.parser")
    
    #for stuttgart and campus-horb the job title is only available on an extra site for the person
    if((location == 'stuttgart') | (location == "campus-horb")):
        results = page_soup.find_all('span', class_="name")
        
        for result in results:
            s = result.find('a', href=True)

            if s['href'].startswith(base_url):
                person_url = s['href']
            else:
                person_url = base_url + s['href']    
            
            r = await session.get(person_url)
            await r.html.arender()
            soup = BeautifulSoup(r.html.html, "html.parser")

            try:
                person_name = soup.find(attrs={"itemprop":"name"}).string
                person_job_title = soup.find(attrs={"itemprop":"jobTitle"}).string
            except:
                print("exception for url: " + person_url)
                continue

            title_match = get_matched_job_title(person_job_title)
            title_match_name = get_matched_title_of_name(person_name)
        
            if (title_match):
                employees = add_new_employee_to_employees(employees, person_name, title_match, location)
            elif (title_match_name):
                employees = add_new_employee_to_employees(employees, person_name, title_match_name, location)
        
    else:
        if((location == 'ravensburg') | (location == 'heidenheim')):
            people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed", "data-parent":"#accordion-dhbwcontacts-az"})
        elif(location == 'loerrach'):
            people = page_soup.find_all('div', class_="panel-title")
        elif(location == 'mosbach'):    
            result_names = page_soup.find_all('div', class_="card-name")
            result_titles = page_soup.find_all('div', class_="card-extra")
            people = [name.get_text() + "\n\n" + title.get_text("\n") for name, title in zip(result_names, result_titles)]
        elif(location == 'villingen-schwenningen'):
            results = page_soup.find_all('div', class_ = "textcontainer")
            people = [result.get_text().strip() for result in results]
        elif(location == 'heilbronn'):
            results = page_soup.find_all('div', attrs={"class": "box-0 content-textpic plugin-"})
            people = [result.get_text("\n").strip() for result in results]
        else:
            people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed"})
        employees = add_others_to_employees(people, employees, location)

    file_name = f'../data/{current_year}/employees_{location}.csv'
    check_dir(file_name)
    employees.to_csv(file_name, index=False)

### Create a CSV file with all relevant employees of all locations and years

In [104]:
files_in_data = os.listdir("../data")

# A year folder is a directory whose complete name consists of exactly four digits.
p = re.compile('[0-9]{4}')
# fullmatch() rejects entries that merely start with four digits, isdir() rejects
# files, and sorted() makes the order independent of the filesystem.
year_list = sorted(
    s for s in files_in_data
    if p.fullmatch(s) and os.path.isdir(os.path.join("../data", s))
)
year_list

['2021', '2022']

In [119]:
# Columns every per-location file is expected to provide, in output order.
employee_columns = ["employee_name", "first_matched_job_title", "employee_name_clean", "location", "year_stored"]

# Collect all frames first and concatenate once, instead of growing a frame inside the loop.
employee_frames = []

for year in year_list:
    year_path = f'../data/{year}'

    # glob returns matches in arbitrary order; sort them for a reproducible row order
    all_employee_files = sorted(glob.glob(year_path + "/employees_*.csv"))

    for filename in all_employee_files:
        employee_df = pd.read_csv(filename, index_col=None, header=0)
        # files without rows add nothing to the result
        if not employee_df.empty:
            employee_frames.append(employee_df)

if employee_frames:
    all_employees = pd.concat(employee_frames, ignore_index=True)
    # Keep the standard columns first; any additional columns follow.
    extra_columns = [column for column in all_employees.columns if column not in employee_columns]
    all_employees = all_employees.reindex(columns=employee_columns + extra_columns)
else:
    # Nothing to combine: still write a file that carries the expected header.
    all_employees = pd.DataFrame(data=None, columns=employee_columns)

file_name_all_employees = '../data/employees_all_sites_all_years.csv'
check_dir(file_name_all_employees)
all_employees.to_csv(file_name_all_employees, index=False)