In [2]:
!pip install selenium pandas webdriver-manager

Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.37.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.31.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.10.5 (from selenium)
  Downloading certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions<5.0,>=4.15.0 (from selenium)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadat

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-embeddings-instructor 0.4.1 requires sentence-transformers<3,>=2.2.2, but you have sentence-transformers 5.1.0 which is incompatible.
llama-index-readers-file 0.5.4 requires pandas<2.3.0, but you have pandas 2.3.3 which is incompatible.


In [1]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Eventbrite search-results URL (Berlin / tech events, per its path) that main() loads in the browser.
URL = "https://www.eventbrite.de/d/germany--berlin/tech--events/"

def _card_text(card, selector):
    """Return the text of the first element inside ``card`` matching ``selector``.

    Returns "" when the element is missing or the card has gone stale, so one
    incomplete card does not abort the whole scrape.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text
    except (NoSuchElementException, StaleElementReferenceException):
        return ""


def main():
    """Scrape event cards from ``URL`` with Chrome and save them as a CSV.

    Opens a visible Chrome window, pauses so a cookie banner can be accepted
    by hand, scrolls to trigger lazy loading, then collects the title, date
    and location of every event card that has a title. The rows are written
    to ``berlin_events.csv`` in the current working directory; if nothing was
    scraped, only a warning is printed.

    The browser is closed in a ``finally`` block, so it is shut down even if
    navigation or scraping raises.
    """
    options = webdriver.ChromeOptions()
    # run visible so you can manually click if needed
    # uncomment the next line if you want headless
    # options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1280,2000")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    events = []
    try:
        driver.get(URL)
        print("💡 If a cookie consent popup appears, please click 'Accept all cookies' manually.")
        time.sleep(8)  # give you time to accept cookies

        # scroll several times to force event cards to load
        for _ in range(8):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2.5)

        # wait up to 30 s for event cards; a timeout is reported but not fatal
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='search-event-card']"))
            )
        except TimeoutException:
            print("⚠️ No event cards detected yet, continuing anyway...")

        for card in driver.find_elements(By.CSS_SELECTOR, "[data-testid='search-event-card']"):
            name = _card_text(card, "[data-testid='search-event-card__event-title']")
            if not name:
                # cards without a title are skipped entirely
                continue
            events.append({
                "event_name": name,
                "date": _card_text(card, "[data-spec='event-card__date']"),
                "location": _card_text(card, "[data-testid='search-event-card__location']"),
            })
    finally:
        # always release the browser process, even when scraping fails midway
        driver.quit()

    if not events:
        print("⚠️ Still no events scraped. You may need to accept cookies, scroll manually, then rerun.")
    else:
        df = pd.DataFrame(events)
        out_path = os.path.join(os.getcwd(), "berlin_events.csv")
        df.to_csv(out_path, index=False, encoding="utf-8")
        print(f"✅ Saved {len(df)} events to {out_path}")
        print(df.head())

# Run the scraper only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()


💡 If a cookie consent popup appears, please click 'Accept all cookies' manually.
⚠️ No event cards detected yet, continuing anyway...
⚠️ Still no events scraped. You may need to accept cookies, scroll manually, then rerun.


In [3]:
!pip install pandas openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import os, io, re
import pandas as pd
import requests

# --- 1) Download the official Excel of Berlin coworking spaces ---
# Source page (has the "machine-readable Excel file" link):
# EN: https://projektzukunft.berlin.de/en/projekt-zukunft/services/overview-coworking-spaces
# DE: https://projektzukunft.berlin.de/projekt-zukunft/services/coworking-spaces
# (The resource on Berlin Open Data points to the same file.) 

# Candidate download URLs; fetch_excel() tries them in list order and returns the first success.
EXCEL_URLS = [
    # Primary (from the site). This is currently the only entry; add mirror URLs below it
    # if this one is rejected (e.g. HTTP 400) in your environment.
    "https://projektzukunft.berlin.de/fileadmin/user_upload/pdf/Services/Coworking_Spaces_Feb2020.xlsx",
]

def fetch_excel(urls, timeout=30):
    """Download the first reachable URL in ``urls`` and return its raw bytes.

    Args:
        urls: iterable of URL strings, tried in the order given.
        timeout: seconds passed to ``requests.get`` for each attempt.

    Returns:
        bytes: body of the first response that passes ``raise_for_status()``.

    Raises:
        RuntimeError: if ``urls`` is empty or every attempt failed. The last
            request error (if any) is attached as ``__cause__`` so the full
            traceback is preserved.
    """
    last_err = None
    for url in urls:
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            return r.content
        except requests.RequestException as e:
            # remember the failure and fall through to the next candidate URL
            last_err = e
    raise RuntimeError(
        f"Could not download Excel from Projekt Zukunft. Last error: {last_err}"
    ) from last_err

# Local copy of the workbook: written after a successful download, and read back
# as the fallback when the download fails but the file was saved manually.
excel_path = os.path.join(os.getcwd(), "Coworking_Spaces.xlsx")

try:
    content = fetch_excel(EXCEL_URLS)
except Exception:
    # If direct download is blocked by the site/CDN in your environment, you can manually download the file
    # from the page and save it as 'Coworking_Spaces.xlsx' in the working directory, then rerun.
    # This keeps you moving today without brittle scraping.
    if not os.path.isfile(excel_path):
        print("Direct download failed. If it does for you too, do this quick fallback:")
        print("1) Open the page (EN or DE link above).")
        print("2) Click the 'machine-readable Excel file' link.")
        print("3) Save as 'Coworking_Spaces.xlsx' next to this script and rerun.")
        raise
    # A manually saved copy exists: use it instead of failing again on rerun.
    with open(excel_path, "rb") as f:
        content = f.read()
    print(f"ℹ️ Direct download failed; using local copy: {excel_path}")
else:
    # Save the downloaded Excel in your current folder
    with open(excel_path, "wb") as f:
        f.write(content)
    print(f"✅ Downloaded Excel to: {excel_path}")

# --- 2) Load & normalize columns ---
df = pd.read_excel(io.BytesIO(content))  # read directly from the in-memory bytes
# If you manually saved, use: df = pd.read_excel("Coworking_Spaces.xlsx")

# Normalize column names (headers may be non-strings or carry stray whitespace)
df.columns = [str(c).strip() for c in df.columns]


def _find_column(pattern):
    """Return the first header of ``df`` matching ``pattern`` (case-insensitive), or None."""
    return next((c for c in df.columns if re.search(pattern, c, re.I)), None)


def _column_or_blank(col):
    """Return ``df[col]`` when a column was detected, otherwise "" (filled in for every row)."""
    return df[col] if col else ""


# Try to detect typical fields present in the file
# We'll map them to: name, address, website, description
name_col = _find_column(r"name|space|anbieter|cowork")
addr_col = _find_column(r"adresse|address|anschrift|straße|strasse")
web_col = _find_column(r"web|url|link|seite")
desc_col = _find_column(r"beschreibung|description")

for must, label in [(name_col, "name"), (addr_col, "address")]:
    if must is None:
        print("⚠️ Column detection: could not auto-detect a", label, "column. The file format may have changed.")
        print("Open the Excel to see exact headers and adjust the regex above if needed.")

# index=df.index lets pandas broadcast the "" placeholders; without it the
# constructor raises ValueError when *no* column was detected (all-scalar dict).
work = pd.DataFrame(
    {
        "event_name": _column_or_blank(name_col),
        "location": _column_or_blank(addr_col),
        "event_url": _column_or_blank(web_col),
        "description": _column_or_blank(desc_col),
    },
    index=df.index,
)

# --- 3) Guess district/Kiez from address (heuristic) ---
# Lower-case search term -> display name. Order matters: the first term found wins.
mapping = {
    "mitte": "Mitte",
    "kreuzberg": "Kreuzberg",
    "friedrichshain": "Friedrichshain",
    "neukölln": "Neukölln",
    "charlottenburg": "Charlottenburg",
    "prenzlauer berg": "Prenzlauer Berg",
    "schöneberg": "Schöneberg",
    "moabit": "Moabit",
    "wedding": "Wedding",
    "tempelhof": "Tempelhof",
    "lichtenberg": "Lichtenberg",
    "tiergarten": "Tiergarten",
    "steglitz": "Steglitz",
    "zehlendorf": "Zehlendorf",
    "treptow": "Treptow",
    "köpenick": "Köpenick",
    "marzahn": "Marzahn",
    "hellersdorf": "Hellersdorf",
    "spandau": "Spandau",
    "pankow": "Pankow",
    "dahlem": "Dahlem",
}
def kiez_guess(addr):
    """Guess the Berlin district named in an address string.

    Each key of ``mapping`` is searched for as a whole word in the
    lower-cased address, in dictionary order, and the display name of the
    first hit is returned. Whole-word matching keeps street names that are
    merely derived from a district ("Köpenicker Straße", "Spandauer Damm")
    from being mistaken for the district itself.

    Args:
        addr: address text; any non-string value (e.g. NaN, None) is accepted.

    Returns:
        str: the matched district name, or "" if ``addr`` is not a string or
        names no known district.
    """
    if not isinstance(addr, str):
        return ""
    low = addr.lower()
    for key, val in mapping.items():
        # \b...\b = whole-word match; the re module caches the compiled patterns
        if re.search(rf"\b{re.escape(key)}\b", low):
            return val
    return ""

# Attach the guessed district plus a constant record type
# (the 'type' field keeps the table in line with the prototype narrative).
work = work.assign(
    kiez=work["location"].apply(kiez_guess),
    type="coworking_space",
)

# --- 4) Save CSVs you can use immediately in your analysis & slides ---
events_csv, by_kiez_csv = (
    os.path.join(os.getcwd(), fname)
    for fname in ("berlin_events.csv", "events_by_kiez.csv")
)

# Rows with no guessed district are tallied under "Unbekannt".
counts = work["kiez"].replace({"": "Unbekannt"}).value_counts()

work.to_csv(events_csv, index=False, encoding="utf-8")
counts.to_csv(by_kiez_csv, header=["count"], encoding="utf-8")

print(f"✅ Saved {len(work)} rows to: {events_csv}")
print(f"✅ Saved counts by Kiez to: {by_kiez_csv}")
print(work.head(10).to_string(index=False))


✅ Downloaded Excel to: c:\Users\krupa\Desktop\Bootcamp\project_keiz_connect\Coworking_Spaces.xlsx
✅ Saved 116 rows to: c:\Users\krupa\Desktop\Bootcamp\project_keiz_connect\berlin_events.csv
✅ Saved counts by Kiez to: c:\Users\krupa\Desktop\Bootcamp\project_keiz_connect\events_by_kiez.csv
          event_name                              location                                            event_url description kiez            type
          AHA Berlin       Bergmannstraße 68, 10961 Berlin https://www.facebook.com/AHA-Berlin-427566990689415/                  coworking_space
         Ahoy Berlin            Wattstraße 1, 13355 Berlin                           http://www.ahoyberlin.com/                  coworking_space
Alte Kantine Wedding            Uferstr. 8-1, 13357 Berlin            http://alte-kantine-wedding.de/coworking/                  coworking_space
   AMAPOLA COWORKING    Scharnhorststraße 24, 10115 Berlin                             www.amapola-coworking.de                  co