# Libraries

In [195]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from datetime import datetime, timedelta
import pandas as pd
import time

# Functions (run this cell before the web scraper)

In [196]:
def extract_and_format_date(date_str):
    """Extract a ``DD.MM.YY`` date from a scraped string and return it as ISO text.

    The date is read from the comma-separated segment that follows the first
    comma, stripped of surrounding whitespace and cut at the first line break
    (e.g. "23.11.24").

    Args:
        date_str: Raw text containing the date after its first comma.

    Returns:
        The date formatted as "YYYY-MM-DD", or None when the text cannot be
        split or parsed (the error is printed in that case).
    """
    try:
        after_comma = date_str.split(",")[1]
        first_line = after_comma.strip().split("\n")[0]
        as_datetime = datetime.strptime(first_line, "%d.%m.%y")
        return as_datetime.strftime("%Y-%m-%d")
    except Exception as e:
        # Best effort: report the offending value and signal failure with None
        print(f"Error processing date: {date_str} -> {e}")
        return None
    
def format_time(time_str):
    """Strip the " Uhr" suffix from a scraped time string.

    Args:
        time_str: Time text such as "15:00 Uhr", or a missing value
            (None / NaN).

    Returns:
        The text with every " Uhr" removed (e.g. "15:00"), or None when the
        value is missing. Text that contains no " Uhr" is returned unchanged
        instead of being dropped.
    """
    if pd.isna(time_str):
        return None
    # str.replace is a no-op when " Uhr" is absent, so a plain "15:00" is
    # passed through rather than falling off the end of the function as None.
    return time_str.replace(" Uhr", "")

def format_location(location, city):
    """Return "<venue>, <city>" with the text "pin" removed from the venue.

    Args:
        location: Scraped venue text (must be a string).
        city: City name appended after a comma.

    NOTE(review): every occurrence of "pin" is removed, not only a leading or
    trailing marker, so a venue name that itself contains "pin" is altered
    too - confirm this is acceptable for the scraped data.
    """
    venue = location.replace("pin", "")
    return f"{venue.strip()}, {city}"

def update_description(row):
    """
    Build the 'Description' text for one event row.

    The existing description (if any) is followed by the category, the
    subtitle, the location link and the event details link, one per line.
    Missing values (None / NaN) are skipped.

    Args:
        row: Mapping or pandas Series providing the keys 'Description',
            'Category', 'Subtitle', 'Location Link' and 'Event Details Link'.

    Returns:
        The combined description as a string; "" when every field is missing.
    """
    parts = []

    # A missing description must not leak into the text as the literal "nan",
    # and an empty one must not produce a leading blank line.
    description = row['Description']
    if pd.notna(description) and str(description) != "":
        parts.append(str(description))

    if pd.notna(row['Category']):
        parts.append(f"{row['Category']}")
    if pd.notna(row['Subtitle']):
        parts.append(f"{row['Subtitle']}")
    if pd.notna(row['Location Link']):
        parts.append(f"Location Link: {row['Location Link']}")
    if pd.notna(row['Event Details Link']):
        parts.append(f"Event Details Link: {row['Event Details Link']}")

    # Joining (instead of prefixing each piece with "\n") avoids a stray
    # leading newline when the earlier fields are missing.
    return "\n".join(parts)

def preprocessing(df, city_name):
    """Reshape a scraped events frame into the output column layout.

    Adds 'Subject', 'Start_date', 'Start_time', 'End_time' and 'Description',
    rewrites 'Location' as "<venue>, <city_name>" and drops the raw scrape
    columns. The frame is modified in place and also returned.

    Args:
        df: DataFrame with the columns 'Date', 'City', 'Category', 'Title',
            'Subtitle', 'Time', 'Location', 'Location Link' and
            'Event Details Link'.
        city_name: City appended to every location.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Column changes
    df['Subject'] = df['Title']
    df['Start_date'] = df['Date']
    df['Start_time'] = df['Time'].apply(format_time)
    df['End_time'] = None
    # Use the city_name argument; the previous reference to a notebook-global
    # `city` only worked while the scraper loop variable happened to exist.
    df['Location'] = df['Location'].str.replace("pin", "").str.strip() + f", {city_name}"
    # Start from an empty description so update_description only appends the
    # category / subtitle / link lines.
    df['Description'] = ""
    df['Description'] = df.apply(update_description, axis=1)

    # Drop unnecessary columns (in place: the scraper cell relies on the
    # caller's frame being mutated)
    df.drop(columns=['Date','City','Category','Title','Subtitle','Time','Location Link','Event Details Link'],inplace=True)

    return df

# Web scraper
* Add the cities you want to scrape to the `cities` list in the cell below

In [199]:
# Scrape the events listed on wasgehtapp.de for each configured city over the
# next 10 days, preprocess them and write one CSV file per city.

# Initialize WebDriver
driver = webdriver.Chrome()

# List of cities to scrape
cities = ["Kiel"]

# Today's date
today = datetime.today()

# Preprocessed frame of every scraped city, keyed by city name
all_data = {}

# try/finally guarantees the browser is closed even when scraping fails midway
try:
    # Open the target website
    url = "https://www.wasgehtapp.de/index.php?geo_id=22995&ort=Rendsburg&x=9.66986&y=54.3038&select_ort=1&radius=20&region=10"
    driver.get(url)

    # Loop through each city
    for city in cities:
        city_data = []  # List to store data for this city

        # Select city input field and search for city
        try:
            city_input_button = driver.find_element(By.CSS_SELECTOR, "#select_ort")
            city_input_button.click()

            search_input = driver.find_element(By.CSS_SELECTOR, "#select_ort_input")
            search_input.send_keys(city)
            search_input.send_keys(Keys.RETURN)
            time.sleep(3)  # Wait for the page to reload
        except Exception as e:
            # No date has been chosen yet at this point, so only the city is
            # reported (referencing date_str here raised a NameError).
            print(f"Error setting city {city}: {e}")
            continue

        # Loop through the next 10 days
        for day_offset in range(10):
            # Generate the URL for the specific day
            target_date = today + timedelta(days=day_offset)
            date_str = target_date.strftime("%Y-%m-%d")
            url = f"https://www.wasgehtapp.de/index.php?date={date_str}"
            driver.get(url)

            # Wait for the page to load
            time.sleep(5)

            # Locate all "katcontainer" containers
            try:
                containers = driver.find_elements(By.CSS_SELECTOR, ".katcontainer")
                # Exclude containers with the class "vorschau" or kat="kino".
                # get_attribute may return None, hence the `or ""` fallback.
                filtered_containers = [
                    container for container in containers
                    if "vorschau" not in (container.get_attribute("class") or "")
                    and container.get_attribute("kat") != "kino"
                ]
            except Exception as e:
                print(f"Error fetching containers for city {city} on {date_str}: {e}")
                continue

            # Extract events data
            for container in filtered_containers:
                # `except Exception` (not a bare except) keeps Ctrl-C /
                # kernel interrupts working during the long scrape.
                try:
                    category = container.get_attribute("kat")  # Get the "kat" attribute directly
                except Exception:
                    category = None

                # Find all events (termin) within this container
                events = container.find_elements(By.CSS_SELECTOR, ".termin")
                for event in events:
                    try:
                        title_element = event.find_element(By.CSS_SELECTOR, "h3.titel > a")
                        title = title_element.text.strip()
                        event_details_link = title_element.get_attribute("href")
                    except Exception:
                        title = None
                        event_details_link = None

                    try:
                        subtitle = event.find_element(By.CSS_SELECTOR, ".subtitel").text.strip()
                    except Exception:
                        subtitle = None

                    try:
                        time_start = event.find_element(By.CSS_SELECTOR, ".zeitloc > span.zeit").text.strip()
                    except Exception:
                        time_start = None

                    try:
                        # First anchor: venue name; second anchor (if any): venue link
                        location_element = event.find_elements(By.CSS_SELECTOR, ".zeitloc > a")
                        location = location_element[0].text.strip() if len(location_element) > 0 else None
                        location_link = location_element[1].get_attribute("href") if len(location_element) > 1 else None
                    except Exception:
                        location = None
                        location_link = None

                    # Append event data to city_data; the event date is the
                    # date of the page it was scraped from.
                    city_data.append({
                        "Date": date_str,
                        "City": city,
                        "Category": category,
                        "Title": title,
                        "Subtitle": subtitle,
                        "Time": time_start,
                        "Location": location,
                        "Location Link": location_link,
                        "Event Details Link": event_details_link
                    })

        # An empty scrape yields a frame without columns, which preprocessing
        # cannot handle; skip the city instead of crashing.
        if not city_data:
            print(f"No events found for {city}; nothing saved")
            continue

        # Save data for this city to a CSV file
        city_df = pd.DataFrame(city_data)

        city_df = preprocessing(city_df, city)

        desired_columns = ['Subject', 'Start_date', 'Start_time', 'End_time', 'Location', 'Description']
        city_df = city_df.reindex(columns=desired_columns)

        # Keep every city's frame; city_df itself is overwritten on the next iteration
        all_data[city] = city_df

        city_df.to_csv(f"{city}_wasgeht.csv", index=False, encoding="utf-8")
        print(f"Data for {city} saved to {city}_wasgeht.csv")
finally:
    # Close the browser
    driver.quit()


Data for Kiel saved to Kiel_wasgeht.csv


In [200]:
# Display the preprocessed frame of the last city handled by the scraper loop
city_df


Unnamed: 0,Subject,Start_date,Start_time,End_time,Location,Description
0,brunch & brew,2024-11-23,15:00,,"COBL, Kiel",konzert\n8€\nLocation Link: http://www.cobl.op...
1,Oud Konzert mit Nedal Aldaiekh,2024-11-23,18:00,,"Café Jupiter, Kiel",konzert\nLocation Link: https://www.instagram....
2,JAM SESSION am Samstag - Genresessions,2024-11-23,18:00,,"Siebeneck & Triangel, Kiel","konzert\ntags pop, stoner, psychedelic, rock, ..."
3,LIVE! - BIG DADDY WILSON & THE GOOSEBUMPS BROS.,2024-11-23,19:00,,"Räucherei, Kiel","konzert\ntags folk, soul\nLocation Link: https..."
4,"Rampampam, Oistress, Herr Fugbaum",2024-11-23,19:00,,"Alte Meierei, Kiel",konzert\ntags punk\nLocation Link: https://www...
...,...,...,...,...,...,...
278,Kiel festhalten. Stadtansichten von Gretel Rie...,2024-12-02,11:30,,"Stadtmuseum Warleberger Hof, Kiel",vortrag\ntags (teilw.) eintritt frei\nLocation...
279,Pub Quiz,2024-12-02,20:00,,"Pogue Mahone, Kiel",sonstige\nLocation Link: https://poguemahone.d...
280,Kieler Weihnachtsmarkt,2024-12-02,,,"Innenstadt, Kiel",sonstige\nLocation Link: https://www.kiel.de/\...
281,Stadtwerke Eisfestival,2024-12-02,,,"Germania Hafen, Kiel",sonstige\nEvent Details Link: https://www.wasg...
