In [None]:
#!pip install urllib3 --user
#!pip install requests_html --user
#!pip install cffi==1.12.3 --user

In [72]:
import requests
import bs4
from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd
import requests_html
import os
from requests_html import AsyncHTMLSession
from requests_html import HTMLSession
import glob
import datetime

In [2]:
def add_new_employee_to_employees(employees, name, job_title, location):
    """Return ``employees`` extended by one row for the given person.

    Args:
        employees: DataFrame collected so far. It is not modified in place.
        name: Name as scraped; stored under ``employee_name``.
        job_title: Stored under ``first_matched_job_title``.
        location: Location key; stored under ``location`` and forwarded to
            ``get_name_without_title`` to derive ``employee_name_clean``.

    Returns:
        A new DataFrame containing the previous rows plus the new one, with a
        fresh 0..n-1 index.
    """
    clean_name = get_name_without_title(name, location)
    new_employee = {
        "employee_name": name,
        "first_matched_job_title": job_title,
        "employee_name_clean": clean_name,
        "location": location,
    }
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame is the supported equivalent and keeps the same column order.
    return pd.concat([employees, pd.DataFrame([new_employee])], ignore_index=True)

In [180]:
def get_url(location):
    """Build the page URL and site root for a location key.

    Locations whose site does not follow the default
    ``https://www.<location>.dhbw.de/dhbw-<location>/ansprechpersonen``
    layout are listed explicitly; every other key falls back to that layout.

    Returns:
        Tuple ``(url, base_url)`` where ``url`` is ``base_url`` plus the
        location-specific path.
    """
    # location key -> (site root, path of the page to fetch)
    special_sites = {
        "stuttgart": ("https://www.dhbw-stuttgart.de", "/dhbw-stuttgart/ansprechpersonen/"),
        "campus-horb": ("https://www.dhbw-stuttgart.de", "/horb/campus-horb/ansprechpersonen/"),
        "loerrach": ("https://www.dhbw-loerrach.de", "/ansprechpersonen/"),
        "mosbach": ("https://www.mosbach.dhbw.de", "/dhbw-mosbach/who-is-who"),
        "villingen-schwenningen": ("https://www.dhbw-vs.de", "/hochschule/mitarbeitende.html"),
        "heilbronn": ("https://www.heilbronn.dhbw.de", "/ueber-uns/team.html"),
    }

    if location in special_sites:
        site_root, page_path = special_sites[location]
    else:
        site_root = "https://www." + location + ".dhbw.de"
        page_path = "/dhbw-" + location + "/ansprechpersonen"

    return site_root + page_path, site_root

In [181]:
def get_name_without_title(name, location):
    """Strip title tokens from a scraped name and return "First Last".

    Every whitespace-separated token containing a "." (e.g. "Prof.",
    "Dr.-Ing.", "M.Sc.") is dropped. For the locations listed below the
    remaining tokens are already in "First Last" order and only commas are
    removed. For all other locations the name is expected as
    "Last, First[, ...]" and the first two comma-separated parts are swapped.

    Args:
        name: Name string as scraped, possibly containing titles.
        location: Location key that decides which name layout is assumed.

    Returns:
        The cleaned name without leading or trailing whitespace. If a
        "Last, First" name has no usable second part (no comma, or nothing
        left after the comma), the remaining text is returned unchanged
        instead of raising ``IndexError``.
    """
    first_name_first_locations = (
        "stuttgart",
        "campus-horb",
        "loerrach",
        "mosbach",
        "villingen-schwenningen",
        "heilbronn",
    )

    # split() without an argument collapses runs of whitespace, so repeated
    # spaces in the scraped text do not end up inside the cleaned name.
    kept_tokens = [token for token in name.split() if "." not in token]
    name_without_titles = " ".join(kept_tokens)

    if location in first_name_first_locations:
        return name_without_titles.replace(",", "").strip()

    comma_parts = [part.strip() for part in name_without_titles.split(",")]
    if len(comma_parts) < 2 or not comma_parts[1]:
        # No "Last, First" structure to swap: keep whatever text remains.
        return " ".join(part for part in comma_parts if part)
    return (comma_parts[1] + " " + comma_parts[0]).strip()

In [182]:
def add_others_to_employees(results, employees, location):
    """Add every person from ``results`` that matches the title filters.

    Each entry is split into lines: the first line is taken as the name and
    the lines from the third one onward as candidate job titles. A person is
    added once, with the first non-blank candidate line, if that line
    contains one of ``job_title_match`` or if the name contains one of
    ``name_title_match``.

    Args:
        results: Iterable of people. Plain strings for "mosbach",
            "villingen-schwenningen" and "heilbronn"; for every other
            location, objects providing ``get_text()``.
        employees: DataFrame collected so far.
        location: Location key, forwarded to ``add_new_employee_to_employees``.

    Returns:
        The employees frame including all newly matched people.
    """
    for person in results:
        if location in ("mosbach", "villingen-schwenningen", "heilbronn"):
            splitted_person = person.strip().split("\n")
        else:
            splitted_person = person.get_text().strip().split("\n")

        person_name = splitted_person[0].strip()
        # The name check does not depend on the job-title line, so compute it
        # once per person instead of once per candidate line.
        name_has_title = any(x in person_name.lower() for x in name_title_match)

        for raw_title in splitted_person[2:]:
            job_title = raw_title.strip()
            if not job_title:
                # Blank or whitespace-only lines are layout residue, not titles.
                continue
            if name_has_title or any(x in job_title.lower() for x in job_title_match):
                employees = add_new_employee_to_employees(employees, person_name, job_title, location)
                break
    return employees

In [70]:
def check_dir(file_name):
    """Make sure the directory that will contain ``file_name`` exists.

    Args:
        file_name: Path of a file that is about to be written.

    Raises:
        FileExistsError: If the directory path exists but is not a directory.
    """
    directory = os.path.dirname(file_name)
    # dirname() is "" for a bare file name; the file then goes into the
    # current directory and os.makedirs("") would raise.
    if directory:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(directory, exist_ok=True)

### Employees for all configured locations (Ravensburg, Mannheim, Heidenheim, Karlsruhe, Lörrach, Mosbach, Villingen-Schwenningen, Heilbronn, Campus Horb, Stuttgart)

In [188]:
# Location keys to scrape; each one is passed to get_url() and used in the
# name of the CSV written for it.
# For a quick single-site run use e.g.: locations = ["heilbronn"]
locations = [
    "ravensburg",
    "mannheim",
    "heidenheim",
    "karlsruhe",
    "loerrach",
    "mosbach",
    "villingen-schwenningen",
    "heilbronn",
    "campus-horb",
    "stuttgart",
]

# Substrings looked up in the lower-cased job title with `in`. They are plain
# text, not regular expressions: "prof.*" only matches that literal sequence.
job_title_match = [
    "akademisch",
    "professor",
    "studiengangsleiter",
    "wissenschaftlich",
    "studiengangsleitung",
    "prof.*",
    "researcher",
]

# Substrings looked up in the lower-cased name.
name_title_match = ["prof.", "dr."]

# Async session, because the scraping cells `await` its requests.
# (Synchronous alternative: session = HTMLSession())
session = AsyncHTMLSession()

# Year used as the sub-folder name for the CSV output.
current_year = datetime.date.today().year

In [189]:
for location in locations:
    employees = pd.DataFrame(data=None, columns=["employee_name", "first_matched_job_title", "employee_name_clean"])
    
    url, base_url = get_url(location)
    
    r = await session.get(url)
    await r.html.arender()
    
    page_soup = BeautifulSoup(r.html.html, "html.parser")
    
    if((location == 'stuttgart') | (location == "campus-horb")):
        results = page_soup.find_all('span', class_="name")
        
        for result in results:
            s = result.find('a', href=True)

            if s['href'].startswith(base_url):
                person_url = s['href']
            else:
                person_url = base_url + s['href']    
            
            r = await session.get(person_url)
            await r.html.arender()
            soup = BeautifulSoup(r.html.html, "html.parser")

            try:
                person_name = soup.find(attrs={"itemprop":"name"}).string
                person_job_title = soup.find(attrs={"itemprop":"jobTitle"}).string
            except:
                print("exception for url: " + person_url)
                continue

            if any(x in person_job_title.lower() for x in job_title_match):
                employees = add_new_employee_to_employees(employees, person_name, person_job_title.strip(), location)
        
    else:
        if((location == 'ravensburg') | (location == 'heidenheim')):
            people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed", "data-parent":"#accordion-dhbwcontacts-az"})
        elif(location == 'loerrach'):
            people = page_soup.find_all('div', class_="panel-title")
        elif(location == 'mosbach'):    
            result_names = page_soup.find_all('div', class_="card-name")
            result_titles = page_soup.find_all('div', class_="card-extra")
            people = [name.get_text() + "\n\n" + title.get_text("\n") for name, title in zip(result_names, result_titles)]
        elif(location == 'villingen-schwenningen'):
            results = page_soup.find_all('div', class_ = "textcontainer")
            people = [result.get_text().strip() for result in results]
        elif(location == 'heilbronn'):
            results = page_soup.find_all('div', attrs={"class": "box-0 content-textpic plugin-"})
            people = [result.get_text("\n").strip() for result in results]
        else:
            people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed"})
        employees = add_others_to_employees(people, employees, location)

    file_name = f'../data/{current_year}/employees_{location}.csv'
    check_dir(file_name)
    employees.to_csv(file_name, index=False)

exception for url: https://www.dhbw-stuttgart.de/dhbw-stuttgart/organisation/rektorat/


In [17]:
# Combine the per-location CSVs of the current year into one frame.
path = f'../data/{current_year}' # use your path
# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame the same from run to run.
all_files = sorted(glob.glob(path + "/employees_*.csv"))

li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

frame = pd.concat(li, axis=0, ignore_index=True)
frame.head()

Unnamed: 0,employee_name,first_matched_job_title,employee_name_clean,location
0,Prof. Dipl.-Ing. Wolf Burger,Prof.* für Lehraufgaben,Wolf Burger,campus-horb
1,"Prof. Dipl.-Ing. Alfred Geisel, M.Sc.",Studiengangsleitung,Alfred Geisel,campus-horb
2,Prof. Dr.-Ing. Joachim Grill,Studiengangsleitung,Joachim Grill,campus-horb
3,Prof. Dr.-Ing. Jürgen Gundrum,Studiengangsleitung,Jürgen Gundrum,campus-horb
4,Dr.-Ing. Jens Häcker,Vertretungsprofessor,Jens Häcker,campus-horb


### test purposes

In [179]:
# Scratch cell: fetch the Heilbronn team page and try the job-title matching
# on its first entry.
# NOTE(review): test_url is not used below, and base_url points at the
# Loerrach site while the request below goes to Heilbronn.
base_url = "https://www.dhbw-loerrach.de"
test_url = base_url + "/ansprechpersonen/"

r = await session.get("https://www.heilbronn.dhbw.de/ueber-uns/team.html")
await r.html.arender()
    
page_soup = BeautifulSoup(r.html.html, "html.parser")
# Same selector the main loop uses for location "heilbronn".
result_names = page_soup.find_all('div', attrs={"class": "box-0 content-textpic plugin-"})
#result_titles = page_soup.find_all('div', class_="card-extra")

# get_text() without a separator glues adjacent elements together
# (see "RektorinRaum" in the output below).
print(result_names[0].get_text().strip())

# With "\n" as separator every text node ends up on its own line.
people = [result.get_text("\n").strip() for result in result_names]

#results = [name.get_text() + "\n\n" + title.get_text("\n") for name, title in zip(result_names[3], result_titles[3])]

#print(results)

# First line is treated as the name, lines from index 2 on as job titles.
splitted_person = people[0].strip().split("\n")
print(splitted_person)
# NOTE(review): unlike add_others_to_employees, empty lines are not filtered
# here, so a hit on the name title prints the first line even if it is empty.
for job_title in splitted_person[2:]:
    job_title_formatted = job_title.strip().lower()
    # `|` on two bools gives the same result as `or`, but evaluates both sides.
    if any(x in job_title_formatted for x in job_title_match) | (any(x in splitted_person[0].lower() for x in name_title_match)):
        print(job_title.strip())
        break    
                

#.find('div').contents[0].strip())
#print( results[0].find(attrs={"class":"lastname-special"}).string)
#print(results[0].find(attrs={"class":"description"}).string.strip())
#print(results)


Prof. Dr. Nicole Graf
RektorinRaum: B 3.17
E-Mail Tel 07131 1237-120
['Prof. Dr. Nicole Graf', '', '', 'Rektorin', 'Raum: B 3.17', '', '', 'E-Mail', ' ', 'Tel 07131 1237-120']



In [52]:
r = await session.get("https://www.mosbach.dhbw.de/dhbw-mosbach/who-is-who/")
await r.html.arender()
    
page_soup = BeautifulSoup(r.html.html, "html.parser")
people = page_soup.find_all('a', attrs={"class": "accordion-toggle collapsed", "data-parent":"#accordion-dhbwcontacts-az"})
print(people)

[<a class="accordion-toggle collapsed" data-parent="#accordion-dhbwcontacts-az" data-toggle="collapse" href="/dhbw-ravensburg/ansprechpersonen/atheer-al-tameemi#panel-dhbwcontacts-az-2168">
                                    
    Al-Tameemi, Atheer, Dr.-Eng.
    
        <span class="accordion-subtitle">Akademischer Mitarbeiter</span>
</a>, <a class="accordion-toggle collapsed" data-parent="#accordion-dhbwcontacts-az" data-toggle="collapse" href="/dhbw-ravensburg/ansprechpersonen/franz-joseph-arnold#panel-dhbwcontacts-az-1949">
                                    
    Arnold, Franz-Joseph, M. Sc.
    
        <span class="accordion-subtitle">Laboringenieur<br/>
Elektrotechnik</span>
</a>, <a class="accordion-toggle collapsed" data-parent="#accordion-dhbwcontacts-az" data-toggle="collapse" href="/dhbw-ravensburg/ansprechpersonen/thomas-asche#panel-dhbwcontacts-az-348">
                                    
    Asche, Thomas, Prof. Dr.
    
        <span class="accordion-subtitle">Studie

In [None]:
# Legacy cell: visit each linked person page and keep people whose job title
# matches job_title_match.
# NOTE(review): relies on `results`, `base_url`, `location`, `job_title_match`
# and `current_year` left in the kernel by earlier cells, and on
# `get_page_of_url`, which is not defined in any cell shown in this notebook.
# It writes to the same CSV path as the main scraping loop.
employee_records = []
for result in results:
    s = result.find('a', href=True)
    if s is None:
        # Entry without a link: there is no person page to visit.
        continue

    # Links may be absolute or relative to the site root.
    if s['href'].startswith(base_url):
        person_url = s['href']
    else:
        person_url = base_url + s['href']
    print(person_url)

    soup = get_page_of_url(person_url)

    try:
        person_name = soup.find(attrs={"itemprop":"name"}).string
        person_job_title = soup.find(attrs={"itemprop":"jobTitle"}).string
    except AttributeError:
        # No page object, or the page has no name / jobTitle markup.
        print("exception")
        continue

    if person_name is None or person_job_title is None:
        # .string is None when the tag does not hold exactly one string.
        print("exception")
        continue

    if any(x in person_job_title.lower() for x in job_title_match):
        employee_records.append({"employee_name" : person_name, "first_matched_job_title" : person_job_title.strip()})

# Build the frame once from the collected rows (DataFrame.append was removed
# in pandas 2.0). The column is named "employee_name" to match the keys of
# the rows; the previous "employee" column was never filled.
employees = pd.DataFrame(employee_records, columns=["employee_name", "first_matched_job_title"])
employees.to_csv(f'../data/{current_year}/employees_{location}.csv', index=False)

In [None]:
# Preview the first rows of whatever `employees` frame is currently in the kernel.
employees.head()

In [None]:
# Legacy cell: match job titles directly on the text of each overview entry.
# NOTE(review): this rebinds the notebook-wide `job_title_match` to a shorter
# list and relies on `results` left in the kernel by an earlier cell.
# "wissenschaftl" was misspelled as "wisschenschaftl" and could never match.
job_title_match = ["akademisch", "professor", "studiengangsleiter", "wissenschaftl"]

employee_records = []

for result in results:
    # First line is treated as the name, lines from index 2 on as job titles.
    s = result.get_text().strip().split("\n")
    print(s)
    for item in s[2:]:
        item_formatted = item.strip().lower()
        if any(x in item_formatted for x in job_title_match):
            employee_records.append({"employee" : s[0], "first_matched_job_title" : item.strip()})
            break

# Build the frame once from the collected rows (DataFrame.append was removed
# in pandas 2.0).
employees = pd.DataFrame(employee_records, columns=["employee", "first_matched_job_title"])
employees.head()