In [37]:
import os
import re
import time
from getpass import getpass

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager

In [38]:
def login_stepstone(driver, email, password):
    """Log into the StepStone candidate area with the given credentials.

    Opens the login page, dismisses the cookie-consent banner if one shows
    up, fills in the e-mail and password fields and clicks the login button.
    Afterwards the function waits (up to 10 s) for the browser to leave the
    login URL and only then reports success.

    Args:
        driver: Selenium WebDriver instance used to drive the browser.
        email: Account e-mail typed into the login form.
        password: Account password typed into the login form.

    Returns:
        True when the URL changed after submitting the form, False when it
        did not change within the wait timeout.

    Raises:
        selenium TimeoutException / NoSuchElementException: if the login
        form elements cannot be located.
    """
    driver.get("https://www.stepstone.de/de-DE/candidate/login")
    wait = WebDriverWait(driver, 10)

    try:
        # Accept the cookie-consent banner; it overlays the login form.
        accept_button = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, ".privacy-prompt-button.primary-button.ccmgt_accept_button.rebrand")
            )
        )
        accept_button.click()
        print("Cookie-Consent akzeptiert.")
        time.sleep(1)
    except Exception:
        # Best effort: the banner may be absent or already accepted.
        print("Cookie-Consent Button nicht gefunden oder schon akzeptiert.")

    # Fill in e-mail and password (the class names are site-generated).
    email_input = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "gp-x82nov")))
    password_input = driver.find_element(By.CLASS_NAME, "gp-1yn6g5s")

    email_input.clear()
    email_input.send_keys(email)
    password_input.clear()
    password_input.send_keys(password)

    # Remember where we are so a navigation after the click can be detected.
    login_url = driver.current_url

    login_button = driver.find_element(By.CLASS_NAME, "gp-1ijlosx")
    login_button.click()

    # NOTE(review): a URL change is used as the success signal; this assumes
    # StepStone redirects away from the login page after a successful login.
    try:
        wait.until(EC.url_changes(login_url))
    except TimeoutException:
        print("Login konnte nicht bestätigt werden (URL unverändert).")
        return False

    print("Login erfolgreich!")
    return True

In [39]:
def parse_salary(clean_text: str, months_per_year: int = 14) -> str:
    """Extract an average yearly salary from a German salary snippet.

    Supported shapes (case-insensitive, optional leading "ab"):
      * single amount:              "45.000 €"
      * range, euro sign on both:   "50.000 € - 60.000 €"
      * range, euro sign at the end: "50.000 - 60.000 €"
    An optional trailing "monatlich" / "jährlich" selects the period; without
    it the amount is treated as yearly. Text in parentheses is ignored.

    Args:
        clean_text: Raw salary text taken from the page.
        months_per_year: Factor applied to monthly amounts to obtain a yearly
            figure (default 14 monthly salaries).

    Returns:
        The rounded yearly average as a string of digits, or "keine Angabe"
        when no amount can be recognised.
    """
    if not clean_text:
        return "keine Angabe"

    # Normalise non-breaking spaces and drop parenthesised remarks.
    text = clean_text.replace('\xa0', ' ').strip()
    text = re.sub(r"\(.*?\)", "", text).strip()

    # German number format: optional thousands dots, optional decimal comma.
    number = r"\d{1,3}(?:\.?\d{3})*(?:,\d{1,2})?"
    gap = r"[\s\u00a0]*"
    dash = r"[-–—]"
    pattern = (
        r"(?:ab\s*)?"
        rf"(?P<lower>{number}){gap}"
        r"(?:"
        # "<lower> €" optionally followed by "- <upper> €"
        rf"€{gap}(?:{dash}\s*(?P<upper_a>{number})\s*€\s*)?"
        r"|"
        # "<lower> - <upper> €": euro sign only after the upper bound
        rf"{dash}\s*(?P<upper_b>{number})\s*€\s*"
        r")"
        r"(?P<period>monatlich|jährlich)?"
    )

    match = re.search(pattern, text, re.IGNORECASE)
    if not match:
        return "keine Angabe"

    def parse_euro_number(num_str):
        """Convert '1.234,56' style text into a float."""
        return float(num_str.replace('.', '').replace(',', '.'))

    try:
        lower = parse_euro_number(match.group("lower"))
        upper_text = match.group("upper_a") or match.group("upper_b")
        upper = parse_euro_number(upper_text) if upper_text else lower
    except ValueError:
        return "keine Angabe"

    avg_salary = (lower + upper) / 2

    period = (match.group("period") or "jährlich").lower()
    if period == "monatlich":
        # Scale a monthly figure up to a yearly one.
        avg_salary *= months_per_year

    return str(round(avg_salary))

In [40]:
def slugify(text: str) -> str:
    """Turn a search phrase into a URL path segment by hyphenating whitespace.

    Leading/trailing whitespace is dropped and every run of whitespace
    becomes a single '-'. Letter case and non-ASCII characters (umlauts, ß)
    are passed through unchanged.

    Args:
        text: Free-text phrase, e.g. "Data Analyst".

    Returns:
        The hyphen-joined phrase, e.g. "Data-Analyst"; "" for blank input.
    """
    # split() without arguments collapses whitespace runs and trims the ends,
    # so no empty segments (and thus no doubled hyphens) can appear.
    return "-".join(text.split())

In [41]:
def scrape_jobs_stepstone(
        driver,
        query: str = "data analyst",
        location: str = "Berlin",
        radius: int = 30,
        max_pages: int = 1
):
    """Collect title, salary and city from StepStone search result cards.

    Result pages 1..max_pages are opened one after another. For every job
    card the title, the salary text (converted by ``parse_salary``) and the
    city are read. Paging stops early when a page shows no job cards within
    the wait timeout, so results gathered so far are still returned.

    Args:
        driver: Selenium WebDriver instance used to load the pages.
        query: Search phrase; converted to a URL segment with ``slugify``.
        location: Location name inserted into the URL as given.
        radius: Value of the ``radius`` URL parameter.
        max_pages: Highest result page number to request.

    Returns:
        List of dicts with the keys "title", "salary" and "city". Missing
        titles/cities are "Unbekannt", missing salaries "keine Angabe".
    """
    card_selector = "div.res-aa3b6p"
    # Only "element is not there (anymore)" is treated as a missing field;
    # other errors and KeyboardInterrupt propagate.
    missing_element = (NoSuchElementException, StaleElementReferenceException)

    def card_text(card, selector):
        """Return the text of the first match inside ``card`` or None."""
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text
        except missing_element:
            return None

    jobs = []
    wait = WebDriverWait(driver, 10)
    job_slug = slugify(query)
    loc_slug = location

    # URL template with placeholders for query slug, location, radius, page.
    base_url = (
        "https://www.stepstone.de/jobs/{job_slug}/"
        "in-{loc_slug}?radius={radius}"
        "&searchOrigin=Resultlist_top-search&page={page}"
    )

    for page in range(1, max_pages + 1):
        url = base_url.format(
            job_slug=job_slug,
            loc_slug=loc_slug,
            radius=radius,
            page=page
        )
        print(f"[Seite {page}] öffne: {url}")
        driver.get(url)

        # Wait for job cards; a timeout ends paging instead of discarding
        # everything collected on earlier pages.
        try:
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector)))
        except TimeoutException:
            print(f"-> Seite {page}: keine Job-Karten gefunden, Abbruch nach {len(jobs)} Jobs.")
            break
        time.sleep(1)

        for card in driver.find_elements(By.CSS_SELECTOR, card_selector):
            title = card_text(card, "div.res-ewgtgq")
            salary_text = card_text(card, "div.res-5zx6ot span.res-1bl90s9")
            city = card_text(card, "div.res-12jlzgf span.res-du9bhi")

            jobs.append({
                "title":  title.strip() if title is not None else "Unbekannt",
                "salary": parse_salary(salary_text) if salary_text is not None else "keine Angabe",
                "city":   city.strip() if city is not None else "Unbekannt"
            })

        print(f"-> Seite {page} fertig, insgesamt {len(jobs)} Jobs.")

    return jobs

In [42]:
# Credentials are read from the environment, with an interactive prompt as
# fallback, so no secrets are stored in the notebook or its outputs.
email = os.environ.get("STEPSTONE_EMAIL") or input("StepStone E-Mail: ")
password = os.environ.get("STEPSTONE_PASSWORD") or getpass("StepStone Passwort: ")

# A fresh Options() starts Firefox with a visible window; pass
# options.add_argument('--headless') to hide it.
options = Options()
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(service=service, options=options)

try:
    # 1) Login
    login_stepstone(driver, email, password)
    time.sleep(5)  # give the session time to settle before searching

    # 2) Scrape the same query for every city, in this order
    jobs_by_city = {
        city: scrape_jobs_stepstone(
            driver, query="Data Analyst", location=city, radius=30, max_pages=5
        )
        for city in ("Berlin", "Hamburg", "München", "Köln", "Frankfurt")
    }
finally:
    # Always close the browser, even when login or scraping fails.
    driver.quit()

# Per-city names kept for the cells that follow.
jobs_berlin = jobs_by_city["Berlin"]
jobs_hamburg = jobs_by_city["Hamburg"]
jobs_muenchen = jobs_by_city["München"]
jobs_koeln = jobs_by_city["Köln"]
jobs_frankfurt = jobs_by_city["Frankfurt"]

Cookie-Consent akzeptiert.
Login erfolgreich!
[Seite 1] öffne: https://www.stepstone.de/jobs/Data-Analyst/in-Berlin?radius=30&searchOrigin=Resultlist_top-search&page=1
-> Seite 1 fertig, insgesamt 25 Jobs.
[Seite 2] öffne: https://www.stepstone.de/jobs/Data-Analyst/in-Berlin?radius=30&searchOrigin=Resultlist_top-search&page=2
-> Seite 2 fertig, insgesamt 50 Jobs.
[Seite 3] öffne: https://www.stepstone.de/jobs/Data-Analyst/in-Berlin?radius=30&searchOrigin=Resultlist_top-search&page=3
-> Seite 3 fertig, insgesamt 75 Jobs.
[Seite 4] öffne: https://www.stepstone.de/jobs/Data-Analyst/in-Berlin?radius=30&searchOrigin=Resultlist_top-search&page=4
-> Seite 4 fertig, insgesamt 100 Jobs.
[Seite 5] öffne: https://www.stepstone.de/jobs/Data-Analyst/in-Berlin?radius=30&searchOrigin=Resultlist_top-search&page=5
-> Seite 5 fertig, insgesamt 125 Jobs.
[Seite 1] öffne: https://www.stepstone.de/jobs/Data-Analyst/in-Hamburg?radius=30&searchOrigin=Resultlist_top-search&page=1
-> Seite 1 fertig, insgesamt 

In [43]:
# Combine the per-city result lists into one table.
jobs = jobs_berlin + jobs_hamburg + jobs_muenchen + jobs_koeln + jobs_frankfurt
df = pd.DataFrame(jobs)

# Create the target folder if needed so the export does not fail on a
# fresh checkout.
output_path = "../output/data_analyst_germany_stepstone.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False, encoding="utf-8-sig")

# Last expression of the cell, so the notebook actually renders the preview.
df.head(20)

In [44]:
"""
# Disabled debug helper: log in, open the first result page and print its HTML.
# Credentials are taken from the environment instead of being written here.
options = Options()
email = os.environ["STEPSTONE_EMAIL"]
password = os.environ["STEPSTONE_PASSWORD"]
options.add_argument('--headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(service=service, options=options)
base_url = "https://www.stepstone.de/jobs/data-analyst?page={page}&searchOrigin=membersarea"
login_stepstone(driver, email, password)
time.sleep(3)
url = base_url.format(page=1)
print(f"Scraping URL: {url}")
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")
print(f"HTML Code: {soup.prettify()}")
"""

'\noptions = Options()\nemail = "<redacted>"\npassword = "<redacted>"\noptions.add_argument(\'--headless\')\nservice = Service(GeckoDriverManager().install())\ndriver = webdriver.Firefox(service=service, options=options)\nbase_url = "https://www.stepstone.de/jobs/data-analyst?page={page}&searchOrigin=membersarea"\nlogin_stepstone(driver, email, password)\ntime.sleep(3)\nurl = base_url.format(page=1)\nprint(f"Scraping URL: {url}")\ndriver.get(url)\nsoup = BeautifulSoup(driver.page_source, "html.parser")\nprint(f"HTML Code: {soup.prettify()}")\n'