#### Web scraping Neumünster

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pytz         # to set the German time zone.
import time as tm   # be careful, I specifically gave an alias as 'tm', cause I have a variable called 'time' already in my code.

In [None]:
# pip install selenium
# pip install pytz
# or maybe this is needed: locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

In [32]:
# trying to use beautifulsoup instead of just selenium, improved version:
driver = webdriver.Chrome()
driver.get("https://www.neumuenster.de/kultur-freizeit/veranstaltungskalender") 

germany_tz = pytz.timezone('Europe/Berlin')
current_date = datetime.now(germany_tz).date() # Get the current date and time in the German time zone

ten_days_from_today = current_date + timedelta(days=10)

event_list = []  # to store all the events.

is_true = True
while is_true:  
    
    # Wait until the event containers are loaded
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.col-xs-10.col-sm-9.col-md-10')))
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'meta[itemprop="startDate"]')))
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span[itemprop="address"]')))  
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h5.dfx-titel-liste-dreizeilig'))) 
    # After loading, get the page source and parse it with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Now use BeautifulSoup to extract the events
    events = soup.select('div.col-xs-10.col-sm-9.col-md-10')  # Adjust selector to match event containers

    # Function to extract element's text or attribute using BeautifulSoup
    def get_element_or_none(soup, selector, attribute=None):
        try:
            element = soup.select_one(selector)
            if element:
                if attribute:
                    return element.get(attribute).strip()  
                else:
                    return element.text.strip() 
            return np.nan
        except:
            return np.nan

    # Loop through each event and extract details
    for event in events:

        # Check if the event date is within the next 10 days
        temp_date = event.select_one('meta[itemprop="startDate"]').get('content')
        check_date = datetime.fromisoformat(temp_date).date()

        if ten_days_from_today < check_date:
            is_true = False
            break  # Exit the loop if the event is more than 10 days away

        # Extracting event details
        title = get_element_or_none(event, 'h5.dfx-titel-liste-dreizeilig')
        date = get_element_or_none(event, 'meta[itemprop="startDate"]', attribute='content')
        time = get_element_or_none(event, 'span.dfx-zeit-liste-dreizeilig')
        address = get_element_or_none(event, 'span[itemprop="address"]')
        
        place_name = get_element_or_none(event, 'span[itemprop="name"]')
        full_address = f"{place_name} | {address}"
        source = get_element_or_none(event, 'h5.dfx-titel-liste-dreizeilig a', attribute='href')

        # Creating the event dictionary
        our_event = {
            'Subject': title,
            'Start_date': date,
            'time': time,
            'Location': full_address,
            'Description': source
        }

        event_list.append(our_event)

    if not is_true:
        break

    pagination_block = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'ul.pagination li')))
    page_link = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(pagination_block[-1].find_element(By.TAG_NAME, 'a')))
    page_link.click()
    tm.sleep(1)

driver.close()  # Close the Selenium driver

In [33]:
df = pd.DataFrame(event_list)
df

Unnamed: 0,Subject,Start_date,time,Location,Description
0,Nomaden unserer Zeit,2024-10-26T10:00,10:00 Uhr bis 17:00 Uhr,"Museum Tuch + Technik | 24534 Neumünster, Klei...",https://city-nms.de/events/veranstaltungskalen...
1,Kreatives aus Filz – Workshop Trockenfilzen,2024-10-26T10:15,10:15 Uhr bis 11:30 Uhr,"Stadtbücherei Neumünster | 24534 Neumünster, W...",https://city-nms.de/events/veranstaltungskalen...
2,Grundkurs Weben 1,2024-10-26T10:15,| 10:15 Uhr bis 15:15 Uhr,"Museum Tuch + Technik | 24534 Neumünster, Klei...",https://city-nms.de/events/veranstaltungskalen...
3,Gemälde von Gunter Geißler,2024-10-26T11:00,11:00 Uhr bis 17:00 Uhr,KulturWerk Galerie Behrendt | 24534 Neumünster...,https://city-nms.de/events/veranstaltungskalen...
4,Ausstellung der Künstlergruppe KUNST & BÜNDIG:...,2024-10-26T12:00,12:00 Uhr bis 21:00 Uhr,"Hotel Seeblick | 24582 Mühbrook, Dorfstr. 18",https://city-nms.de/events/veranstaltungskalen...
...,...,...,...,...,...
64,Ausstellung der Künstlergruppe KUNST & BÜNDIG:...,2024-11-04T16:30,16:30 Uhr bis 21:00 Uhr,"Hotel Seeblick | 24582 Mühbrook, Dorfstr. 18",https://city-nms.de/events/veranstaltungskalen...
65,Nomaden unserer Zeit,2024-11-05T09:00,09:00 Uhr bis 17:00 Uhr,"Museum Tuch + Technik | 24534 Neumünster, Klei...",https://city-nms.de/events/veranstaltungskalen...
66,Austellung der Künstlergruppe KUNST & BÜNDIG: ...,2024-11-05T12:00,12:00 Uhr bis 21:00 Uhr,"Hotel Seeblick | 24582 Mühbrook, Dorfstr. 18",https://city-nms.de/events/veranstaltungskalen...
67,"NDB Kiel ""Dröög""",2024-11-05T19:30,19:30 Uhr,"Savoy - Kino, Konzerte &Theater in Bordesholm ...",https://city-nms.de/events/veranstaltungskalen...


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Subject      69 non-null     object
 1   Start_date   69 non-null     object
 2   time         67 non-null     object
 3   Location     69 non-null     object
 4   Description  69 non-null     object
dtypes: object(5)
memory usage: 2.8+ KB


In [38]:
# df.to_csv("neumunster_events.csv", index=False)