Webscraping mit Pyhon: https://realpython.com/beautiful-soup-web-scraper-python/#scrape-the-fake-python-job-site

In [251]:
import requests
import re
from bs4 import BeautifulSoup
from dataclasses import dataclass

@dataclass
class Person:
    name: str
    title: str = ""
    university: str = ""
    department: str = ""

persons = []

UHH

In [252]:
uhh_url = "https://www2.informatik.uni-hamburg.de/fiona/pers.php"

page = requests.get(uhh_url)
soup = BeautifulSoup(page.content, "html.parser")

result = []
# get all <tr> elements (here: rows of the person table)
for person in soup.find_all(name="tr"):
    # only <tr> elements describing actual persons have an id attribute
    if not person.has_attr("id"):
        continue

    # all table field values are insde <a> tags
    fields = person.findAll("a")

    name = fields[1].text
    department = fields[2].text

    result.append(Person(
        name=name,
        university="Universität Hamburg",
        department=department
    ))

print(f"done: scraped {len(result)} UHH staff")
persons.extend(result)


done: scraped 325 UHH staff


HAW

In [253]:
haw_url = "https://www.haw-hamburg.de/hochschule/technik-und-informatik/departments/informatik/unser-department/beschaeftigte/"

page = requests.get(haw_url)
soup = BeautifulSoup(page.content, "html.parser")

result = []

for row in soup.findAll("div", class_="row person-tile filter_tile border-top pt-4"):
    person = row.find("div", class_="col-12")

    name = ""

    for tag in person.findAll("a", {"title" : "Zur Profilseite"}):
        for part in tag.findAll("b"):
            name += " " + re.sub(r"\s+", " ", part.text.strip())
        result.append(Person(
            name=name.strip(),
            university="HAW Hamburg"
        ))

print(f"done: scraped {len(result)} HAW staff")
persons.extend(result)

    

done: scraped 67 HAW staff


split name into title and name

for instance: 
`"Prof. Dr. Klaus-Peter Kossakowski"` becomes `"Prof. Dr.", "Klaus-Peter Kossakowski"`

In [254]:
for person in persons:
    # chatGPT kann super so dumme regexe schreiben 
    pattern = r"^(Prof\. Dr\.|Prof\.|Dr\.)(-Ing\.| rer\. nat\.)?(?=.)(?:\s|$)"
    
    title = re.search(pattern, person.name)
    
    if not title: continue

    person.title = title.group().strip()
    person.name = re.sub(pattern, "", person.name).strip()

insert persons into database

In [258]:
import sqlite3

class DatabaseHelper:
    def __init__(self) -> None:
        self.TABLE_NAME = "person"
        self.DB_NAME = "../database/main.sqlite"
        self.connection = sqlite3.connect(self.DB_NAME)
        self.cursor = self.connection.cursor()

    def commit(self):
        self.connection.commit()
     
    def close(self):
        self.connection.close()

    def createTable(self):
        self.cursor.execute(
            f'''
            CREATE TABLE IF NOT EXISTS {self.TABLE_NAME} (
                personId INTEGER PRIMARY KEY ASC, 
                name TEXT, 
                title TEXT, 
                university TEXT,
                department TEXT
            );
            '''
        )

    def insertPerson(self, person: Person):
        insert = self.cursor.execute(
            f''' 
            INSERT INTO {self.TABLE_NAME} (name, title, university, department)
            VALUES ("{person.name}", "{person.title}", "{person.university}", "{person.department}");
            '''
        )
        return insert
    
    def insertPersons(self, persons: [Person]):
        for person in persons:
            self.insertPerson(person)
        print(f"inserted {len(persons)} rows into table {self.TABLE_NAME}")

    def selectAll(self): 
        return self.cursor.execute(f'''SELECT * FROM {self.TABLE_NAME};''').fetchall()

    def deleteTable(self):
        return self.cursor.execute(f'DROP TABLE IF EXISTS {self.TABLE_NAME}')

dbHelper = DatabaseHelper()

dbHelper.deleteTable()
dbHelper.createTable()

dbHelper.insertPersons(persons)

dbHelper.commit()

inserted 392 rows into person


In [259]:
for row in dbHelper.selectAll():
    print(row)

(1, 'Fares Abawi', '', 'Universität Hamburg', 'WTM')
(2, 'Rana Abdullah', '', 'Universität Hamburg', 'SEMS')
(3, 'Daniel Ahlers', '', 'Universität Hamburg', 'TAMS')
(4, 'Kyra Ahrens', '', 'Universität Hamburg', 'WTM')
(5, 'Hassan Ali', '', 'Universität Hamburg', 'WTM')
(6, 'Philipp Allgeuer', 'Dr.', 'Universität Hamburg', 'WTM')
(7, 'Selenia Anastasi', '', 'Universität Hamburg', 'LT')
(8, 'Jakob Andersen', '', 'Universität Hamburg', 'MAST')
(9, 'Johanna Ansohn Mcdougall', '', 'Universität Hamburg', 'SVS')
(10, 'Saba Anwar', '', 'Universität Hamburg', 'LT')
(11, 'Oscar Javier Ariza Nunez', 'Dr.', 'Universität Hamburg', 'HCI')
(12, 'Anne Awizen', '', 'Universität Hamburg', 'DBIS')
(13, 'Abinew Ali Ayele', '', 'Universität Hamburg', 'LT')
(14, 'Christian Bähnisch', '', 'Universität Hamburg', 'HITeC')
(15, 'Debayan Banerjee', '', 'Universität Hamburg', 'LT')
(16, 'Jan Baumbach', 'Prof. Dr.', 'Universität Hamburg', 'CSB')
(17, 'Dennis Becker', '', 'Universität Hamburg', 'WTM')
(18, 'Andreas