In [None]:
#!pip install urllib3 --user
#!pip install requests_html --user
#!pip install cffi==1.12.3 --user

In [8]:
import datetime
import glob
import os

import bs4
import pandas as pd
import requests
import requests_html
from bs4 import BeautifulSoup
from bs4 import Comment
from requests_html import AsyncHTMLSession
from requests_html import HTMLSession

In [11]:
def add_new_employee_to_employees(employees, name, job_title, location):
    """Return ``employees`` extended by one row describing the given person.

    Args:
        employees: DataFrame holding the employees collected so far.
        name: Name as scraped; stored unchanged in ``employee_name``.
        job_title: Job title stored in ``first_matched_job_title``.
        location: Location key; stored in ``location`` and forwarded to
            ``get_name_without_title`` to derive ``employee_name_clean``.

    Returns:
        A new DataFrame with the additional row appended and a fresh
        0..n-1 index. The frame passed in is not modified.
    """
    clean_name = get_name_without_title(name, location)
    new_employee = {
        "employee_name": name,
        "first_matched_job_title": job_title,
        "employee_name_clean": clean_name,
        "location": location,
    }
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with a one-row frame yields the same result on all versions.
    return pd.concat([employees, pd.DataFrame([new_employee])], ignore_index=True)

In [12]:
def get_url(location):
    """Build the contact-page URL and the site root for a location key.

    "stuttgart" and "campus-horb" are both served from the Stuttgart host;
    every other key is used as the sub-domain of ``dhbw.de``.

    Args:
        location: Location key such as "stuttgart" or "mannheim".

    Returns:
        Tuple ``(url, base_url)``: the contact-page URL and the scheme+host
        prefix it was built from.
    """
    stuttgart_root = "https://www.dhbw-stuttgart.de"

    if location == "stuttgart":
        return f"{stuttgart_root}/dhbw-stuttgart/ansprechpersonen/", stuttgart_root

    if location == "campus-horb":
        return f"{stuttgart_root}/horb/{location}/ansprechpersonen/", stuttgart_root

    # Generic sites: note that this URL carries no trailing slash.
    site_root = f"https://www.{location}.dhbw.de"
    return f"{site_root}/dhbw-{location}/ansprechpersonen", site_root

In [13]:
def get_name_without_title(name, location):
    """Strip title-like tokens from a scraped name and return "First Last".

    Every whitespace-separated token that contains a period is dropped
    (e.g. "Prof.", "Dr.-Ing.", "M.Sc."). For "stuttgart" and "campus-horb"
    the remaining tokens are kept in their original order with commas
    removed. For all other locations the remainder is treated as
    "Last, First" and swapped; if it contains no comma it is returned as is
    instead of raising ``IndexError``.

    Args:
        name: Name as scraped, possibly including titles and degrees.
        location: Location key that selects the name layout.

    Returns:
        The cleaned name without surrounding whitespace.
    """
    # split() without an argument also handles tabs, newlines and repeated
    # spaces, so a title can never stay glued to a real name part.
    name_parts = [part for part in name.split() if "." not in part]
    employee_name_clean = " ".join(name_parts)

    if location in ("stuttgart", "campus-horb"):
        return employee_name_clean.replace(",", "").strip()

    comma_parts = [piece.strip() for piece in employee_name_clean.split(",")]
    if len(comma_parts) < 2:
        # No "Last, First" layout present: nothing to swap.
        return employee_name_clean.strip()

    # Only the first two comma-separated pieces are used, as before; the
    # final strip() avoids a stray space when one of them is empty.
    return f"{comma_parts[1]} {comma_parts[0]}".strip()

In [14]:
def add_others_to_employees(results, employees, location):
    """Add every entry of ``results`` that has a matching job title.

    The text of each entry is split into lines. Line 0 is used as the
    person's name, line 1 is never inspected, and the lines from index 2 on
    are scanned as job titles. The first title that contains one of the
    fragments in the global ``job_title_match`` (compared in lower case) is
    recorded via ``add_new_employee_to_employees``; entries without a match
    are skipped.

    Args:
        results: Iterable of objects exposing ``get_text()``.
        employees: DataFrame collecting the matched employees so far.
        location: Location key forwarded to ``add_new_employee_to_employees``.

    Returns:
        The employees frame including all newly matched entries.
    """
    for entry in results:
        text_lines = entry.get_text().strip().split("\n")

        # Pick the first candidate line that contains any search fragment.
        first_match = next(
            (
                candidate
                for candidate in text_lines[2:]
                if any(fragment in candidate.strip().lower() for fragment in job_title_match)
            ),
            None,
        )

        if first_match is not None:
            employees = add_new_employee_to_employees(
                employees, text_lines[0], first_match.strip(), location
            )

    return employees

### employees for Ravensburg, Mannheim, Heidenheim, Karlsruhe, Campus-Horb, Stuttgart

In [15]:
# Location keys to scrape; each one is passed to get_url().
# For debugging a single site, shrink the list, e.g. ["campus-horb"].
locations = [
    "ravensburg",
    "mannheim",
    "heidenheim",
    "karlsruhe",
    "campus-horb",
    "stuttgart",
]

# Lower-case fragments looked up with a plain substring test (`x in title`),
# so "prof.*" is matched literally and is not a regular expression.
job_title_match = [
    "akademisch",
    "professor",
    "studiengangsleiter",
    "wissenschaftlich",
    "studiengangsleitung",
    "prof.*",
]

# Async session, because the scraping cells use top-level `await`
# (HTMLSession would be the synchronous counterpart).
session = AsyncHTMLSession()

# Current year; used as the folder name under ../data/ for the CSV output.
current_year = datetime.date.today().year

In [16]:
for location in locations:
    employees = pd.DataFrame(data=None, columns=["employee_name", "first_matched_job_title", "employee_name_clean"])
    
    url, base_url = get_url(location)
    
    r = await session.get(url)
    await r.html.arender()
    
    page_soup = BeautifulSoup(r.html.html, "html.parser")
    
    if((location == 'stuttgart') | (location == "campus-horb")):
        results = page_soup.find_all('span', class_="name")
        
        for result in results:
            s = result.find('a', href=True)

            if s['href'].startswith(base_url):
                person_url = s['href']
            else:
                person_url = base_url + s['href']    
            
            r = await session.get(person_url)
            await r.html.arender()
            soup = BeautifulSoup(r.html.html, "html.parser")

            try:
                person_name = soup.find(attrs={"itemprop":"name"}).string
                person_job_title = soup.find(attrs={"itemprop":"jobTitle"}).string
            except:
                print("exception for url: " + person_url)
                continue

            if any(x in person_job_title.lower() for x in job_title_match):
                employees = add_new_employee_to_employees(employees, person_name, person_job_title.strip(), location)
        
    else:
        if((location == 'ravensburg') | (location == 'heidenheim')):
            people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed", "data-parent":"#accordion-dhbwcontacts-az"})
        else:
            people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed"})
        employees = add_others_to_employees(people, employees, location)

    employees.to_csv(f'../data/{current_year}/employees_{location}.csv', index=False)

exception for url: https://www.dhbw-stuttgart.de/dhbw-stuttgart/organisation/rektorat/


In [17]:
# Combine the per-location CSV files of the current year into one DataFrame.
path = f'../data/{current_year}' # use your path
# glob returns matches in a filesystem-dependent order; sorting keeps the row
# order of the combined frame reproducible between runs and machines.
all_files = sorted(glob.glob(path + "/employees_*.csv"))

li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

frame = pd.concat(li, axis=0, ignore_index=True)
frame.head()

Unnamed: 0,employee_name,first_matched_job_title,employee_name_clean,location
0,Prof. Dipl.-Ing. Wolf Burger,Prof.* für Lehraufgaben,Wolf Burger,campus-horb
1,"Prof. Dipl.-Ing. Alfred Geisel, M.Sc.",Studiengangsleitung,Alfred Geisel,campus-horb
2,Prof. Dr.-Ing. Joachim Grill,Studiengangsleitung,Joachim Grill,campus-horb
3,Prof. Dr.-Ing. Jürgen Gundrum,Studiengangsleitung,Jürgen Gundrum,campus-horb
4,Dr.-Ing. Jens Häcker,Vertretungsprofessor,Jens Häcker,campus-horb


### test purposes

In [None]:
base_url = "https://www.dhbw-stuttgart.de"
test_url = base_url + "/dhbw-stuttgart/ansprechpersonen/"

page_soup = get_page_of_url(test_url)
results = page_soup.find_all('span', class_="name")

print(results)


In [None]:
employees = pd.DataFrame(columns=["employee", "first_matched_job_title"])
for result in results:
    s = result.find('a', href=True)
    
    if s['href'].startswith(base_url):
        person_url = s['href']
    else:
        person_url = base_url + s['href']
    print(person_url)
    
    
    soup = get_page_of_url(person_url)
    
    try:
        person_name = soup.find(attrs={"itemprop":"name"}).string
        person_job_title = soup.find(attrs={"itemprop":"jobTitle"}).string
    except:
        print("exception")
        continue
    
    if any(x in person_job_title.lower() for x in job_title_match):        
        new_employee = {"employee_name" : person_name, "first_matched_job_title" : person_job_title.strip()}
        employees = employees.append(new_employee, ignore_index=True)
    
employees.head()
employees.to_csv(f'../data/{current_year}/employees_{location}.csv', index=False)

In [None]:
# Show the first rows of the `employees` frame currently held in the kernel.
employees.head()

In [None]:
# Test run of the listing-based matching: scan the text lines of every entry
# in `results` and keep the first line that contains a search fragment.
# A cell-local name is used so this experiment does not overwrite the global
# job_title_match that the scraping pipeline relies on; the misspelled
# fragment "wisschenschaftl" (which could never match) is corrected.
test_job_title_match = ["akademisch", "professor", "studiengangsleiter", "wissenschaftl"]

test_records = []

for result in results:
    s = result.get_text().strip().split("\n")
    print(s)
    # Line 0 is used as the name, line 1 is skipped, the rest are candidates.
    for item in s[2:]:
        item_formatted = item.strip().lower()
        # str.split() never yields None, so no None check is needed here.
        if any(x in item_formatted for x in test_job_title_match):
            test_records.append({"employee": s[0], "first_matched_job_title": item.strip()})
            break

# Build the frame once from the collected records (DataFrame.append no longer
# exists in pandas 2.0).
employees = pd.DataFrame(test_records, columns=["employee", "first_matched_job_title"])
employees.head()