In [70]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [71]:
def login_stepstone(driver, email, password):
    driver.get("https://www.stepstone.de/de-DE/candidate/login")
    wait = WebDriverWait(driver, 10)

    try:
        # Cookie-Consent akzeptieren
        accept_button = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, ".privacy-prompt-button.primary-button.ccmgt_accept_button.rebrand")
            )
        )
        accept_button.click()
        print("Cookie-Consent akzeptiert.")
        time.sleep(1)
    except Exception:
        print("Cookie-Consent Button nicht gefunden oder schon akzeptiert.")

    # E-Mail und Passwort eintragen
    email_input = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "gp-x82nov")))
    password_input = driver.find_element(By.CLASS_NAME, "gp-1yn6g5s")

    email_input.clear()
    email_input.send_keys(email)
    password_input.clear()
    password_input.send_keys(password)

    # Login-Button klicken
    login_button = driver.find_element(By.CLASS_NAME, "gp-1ijlosx")
    login_button.click()

    # Warten bis Profil-Link sichtbar (Login erfolgreich)
    print("Login erfolgreich!")

In [72]:
def parse_salary(clean_text):
    # Vorverarbeitung: z.B. Klammern entfernen
    clean_text = clean_text.replace('\xa0', ' ').strip()
    clean_text = re.sub(r"\(.*?\)", "", clean_text).strip()

    print(f"Verarbeite: '{clean_text}'")

    pattern = r"""
        (?:ab\s*)?
        (\d{1,3}(?:\.?\d{3})*(?:,\d{1,2})?)
        [\s\u00a0]*€[\s\u00a0]*
        (?:[-–—]\s*
        (\d{1,3}(?:\.?\d{3})*(?:,\d{1,2})?)\s*€\s*)?
        (monatlich|jährlich)?
    """

    match = re.search(pattern, clean_text, re.IGNORECASE | re.VERBOSE)
    if not match:
        return "keine Angabe"

    try:
        def parse_euro_number(num_str):
            num_str = num_str.replace('.', '').replace(',', '.')
            return float(num_str)

        lower = parse_euro_number(match.group(1))
        upper = parse_euro_number(match.group(2)) if match.group(2) else lower
        avg_salary = (lower + upper) / 2

        zeitraum = match.group(3).lower() if match.group(3) else "jährlich"

        if zeitraum == "monatlich":
            avg_salary *= 14  # Jahresgehalt mit 14 Monatsgehältern

        avg_salary = round(avg_salary)

        #if avg_salary < 15000:  # Plausibilitätscheck
         #   return "keine Angabe"

        return str(avg_salary)
    except:
        return "keine Angabe"

In [73]:
options = Options()
options.headless = False
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(service=service, options=options)
email = "wi23b056@technikum-wien.at"
password = "Basinga321!"

# 1) Login
login_stepstone(driver, email, password)
time.sleep(5)

def scrape_jobs_stepstone(driver, max_pages=1):
    jobs = []
    wait = WebDriverWait(driver, 10)
    base_url = "https://www.stepstone.de/jobs/data-analyst?page={page}"

    for page in range(1, max_pages + 1):
        driver.get(base_url.format(page=page))
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.res-aa3b6p")))
        time.sleep(1)
        cards = driver.find_elements(By.CSS_SELECTOR, "div.res-aa3b6p")
        driver.save_screenshot(f"page_{page}.png")

        for idx, card in enumerate(cards, 1):
            # Debug HTML
            print(f"\n--- Seite {page}, Karte {idx} ---")
            print(card.get_attribute("outerHTML")[:300], "…")

            # Titel
            try:
                title = card.find_element(By.CSS_SELECTOR, "div.res-ewgtgq").text
            except:
                title = "Unbekannt"

            # Gehalt
            try:
                span = card.find_element(By.CSS_SELECTOR, "div.res-5zx6ot span.res-1bl90s9")
                print("Gehalt-Raw:", span.text)
                salary = parse_salary(span.text)
            except:
                salary = "keine Angabe"

            try:
                city = card.find_element(
                    By.CSS_SELECTOR,
                    "div.res-12jlzgf span.res-du9bhi"
                ).text.strip()
            except:
                city = "Unbekannt"

            jobs.append({
                "title":  title,
                "city":   city,
                "salary": salary
            })

        print(f"-> Seite {page} fertig")
    return jobs

jobs = scrape_jobs_stepstone(driver)

print(jobs)
df = pd.DataFrame(jobs)
df.to_csv("../output/analyst_berlin_stepstone.csv", index=False, encoding="utf-8-sig")


Cookie-Consent akzeptiert.
Login erfolgreich!

--- Seite 1, Karte 1 ---
<div class="res-aa3b6p" data-genesis-element="CARD_CONTENT" data-testid="job-card-content"><div class="res-8xc9wa" data-genesis-element="BASE"><a class="res-k1rmma" data-genesis-element="COMPANY_LOGO_LINK" href="https://www.stepstone.de/cmp/de/niedersachsen-ports-gmbh-%26-co-kg-75104/jobs" data-at=" …
Gehalt-Raw: 55.000 - 78.000 €/Jahr (geschätzt für Vollzeit)
Verarbeite: '55.000 - 78.000 €/Jahr'

--- Seite 1, Karte 2 ---
<div class="res-aa3b6p" data-genesis-element="CARD_CONTENT" data-testid="job-card-content"><div class="res-8xc9wa" data-genesis-element="BASE"><a class="res-k1rmma" data-genesis-element="COMPANY_LOGO_LINK" href="https://www.stepstone.de/cmp/de/stepstone-gmbh-148733/jobs" data-at="company-logo" targe …
Gehalt-Raw: 48.800 - 73.200 €/Jahr (geschätzt für Vollzeit)
Verarbeite: '48.800 - 73.200 €/Jahr'

--- Seite 1, Karte 3 ---
<div class="res-aa3b6p" data-genesis-element="CARD_CONTENT" data-testid="job

In [74]:
"""
options = Options()
email = "wi23b056@technikum-wien.at"
password = "Basinga321!"
options.add_argument('--headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(service=service, options=options)
base_url = "https://www.stepstone.de/jobs/data-analyst?page={page}&searchOrigin=membersarea"
login_stepstone(driver, email, password)
time.sleep(3)
url = base_url.format(page=1)
print(f"Scraping URL: {url}")
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")
print(f"HTML Code: {soup.prettify()}")
"""

'\noptions = Options()\nemail = "wi23b056@technikum-wien.at"\npassword = "Basinga321!"\noptions.add_argument(\'--headless\')\nservice = Service(GeckoDriverManager().install())\ndriver = webdriver.Firefox(service=service, options=options)\nbase_url = "https://www.stepstone.de/jobs/data-analyst?page={page}&searchOrigin=membersarea"\nlogin_stepstone(driver, email, password)\ntime.sleep(3)\nurl = base_url.format(page=1)\nprint(f"Scraping URL: {url}")\ndriver.get(url)\nsoup = BeautifulSoup(driver.page_source, "html.parser")\nprint(f"HTML Code: {soup.prettify()}")\n'