#### Web scraping Neumünster

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
from datetime import datetime, timedelta
import pytz         # to set the German time zone.
import time as tm   # be careful, I specifically gave an alias as 'tm', cause I have a variable called 'time' already in my code.

In [None]:
# Requirements (install once into the kernel's environment):
#   %pip install selenium pytz
# Possibly also needed (unverified): locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

In [8]:
BASE_URL = "https://www.neumuenster.de/kultur-freizeit/veranstaltungskalender"
EVENT_SELECTOR = 'div.col-xs-10.col-sm-9.col-md-10'   # one container per event (original note: 10 events per page)
TITLE_SELECTOR = 'h5.dfx-titel-liste-dreizeilig'
START_DATE_SELECTOR = 'meta[itemprop="startDate"]'


def get_element_or_none(event, selector, attribute=None):
    """Return the text or an attribute of the first element matching `selector`.

    Args:
        event: Object exposing `find_elements` (an event container element or
            the driver itself); the search is limited to it.
        selector: CSS selector of the wanted element.
        attribute: Name of the attribute to read. When omitted, the element's
            text is returned instead.

    Returns:
        The attribute value or text, or None when nothing matches `selector`.
    """
    # find_elements returns an empty list instead of raising, so a missing
    # element costs neither a timeout nor an exception. No explicit wait is
    # used here because the event containers are already waited for below.
    matches = event.find_elements(By.CSS_SELECTOR, selector)
    if not matches:
        return None
    if attribute:
        return matches[0].get_attribute(attribute)
    return matches[0].text


germany_tz = pytz.timezone('Europe/Berlin')
current_date = datetime.now(germany_tz).date()  # today's date in the German time zone
ten_days_from_today = current_date + timedelta(days=10)

event_list = []  # one dict per scraped event

driver = webdriver.Chrome()
try:
    driver.get(BASE_URL)  # first page; the loop loads page i after scraping the current one

    should_break = False
    for i in range(2, 40):  # page numbers; the upper bound is only a safety limit
        # The loop normally ends earlier, as soon as an event more than 10 days
        # ahead is seen. NOTE(review): this relies on the listing being in
        # chronological order - confirm against the site.
        url = f'{BASE_URL}?dfxp={i}'
        try:
            events = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, EVENT_SELECTOR))
            )
        except Exception as e:
            print(f"An error occurred one: {e}")
            # Without this, `events` would be unbound on the first page or
            # still hold the previous page's elements.
            events = []

        for event in events:
            try:
                date = get_element_or_none(event, START_DATE_SELECTOR, attribute='content')
                if date is None:
                    print("Skipping an event without a start date.")
                    continue

                try:
                    check_date = datetime.fromisoformat(date).date()
                except ValueError as e:
                    # Unparseable date: report it and still record the event.
                    print(f"An error occurred two: {e}")
                else:
                    # Stop both loops once an event lies more than 10 days ahead.
                    if ten_days_from_today < check_date:
                        should_break = True
                        break

                title = get_element_or_none(event, TITLE_SELECTOR)
                if title is None:
                    print("Skipping an event without a title.")
                    continue

                time = get_element_or_none(event, 'span.dfx-zeit-liste-dreizeilig')
                address = get_element_or_none(event, 'span[itemprop="address"]')
                place_name = get_element_or_none(event, 'span[itemprop="name"]')  # just the name of the place
                # The link is searched inside this event's own title; searching
                # from `driver` would return the first title link on the page
                # for every event.
                source = get_element_or_none(event, f'{TITLE_SELECTOR} a', attribute='href')

                event_list.append({
                    'event': title,
                    'date': date,
                    # Keep the column string-typed even when no time is listed.
                    'time': time or '',
                    # Join only the parts that are present.
                    'address': ' | '.join(part for part in (place_name, address) if part),
                    'source': source,
                })

            except Exception as e:
                print(f"An error occurred three: {e}")

        if should_break:
            break
        driver.get(url)
        tm.sleep(1)  # pause for a second after requesting the next page
finally:
    # quit() ends the whole WebDriver session, and runs even if scraping fails.
    driver.quit()

In [9]:
# Build the table with explicit columns so that an empty `event_list` still
# yields the expected (empty) columns instead of a column-less frame, which
# would make the later `df['date']` / `df['time']` lookups raise KeyError.
df = pd.DataFrame(event_list, columns=['event', 'date', 'time', 'address', 'source'])
df

Unnamed: 0,event,date,time,address,source
0,"Woche der seelischen Gesundheit: Infostand ""In...",2024-09-27T10:30,10:30 Uhr bis 12:00 Uhr,"Großflecken | 24534 Neumünster, Großflecken",https://city-nms.de/events/veranstaltungskalen...
1,Die Spur des Bildhauers. Wiedersehen mit Heinz...,2024-09-27T12:00,12:00 Uhr bis 18:00 Uhr,"Herbert Gerisch-Stiftung | 24536 Neumünster, B...",https://city-nms.de/events/veranstaltungskalen...
2,Gaming Nachmittag auf der Nintendo Switch,2024-09-27T15:00,15:00 Uhr bis 17:00 Uhr,"Stadtbücherei Neumünster | 24534 Neumünster, W...",https://city-nms.de/events/veranstaltungskalen...
3,Gemälde von Lutz Bleidorn,2024-09-27T15:00,15:00 Uhr bis 18:00 Uhr,KulturWerk Galerie Behrendt | 24534 Neumünster...,https://city-nms.de/events/veranstaltungskalen...
4,Schlemmerköste,2024-09-27T16:00,16:00 Uhr bis 20:00 Uhr,"Großflecken | 24534 Neumünster, Großflecken",https://city-nms.de/events/veranstaltungskalen...
...,...,...,...,...,...
65,Kino: Was willder Lama mit dem Gewehr,2024-10-06T20:00,20:00 Uhr,"Savoy - Kino, Konzerte &Theater in Bordesholm ...",https://city-nms.de/events/veranstaltungskalen...
66,Woche der seelischen Gesundheit: Vortrag und S...,2024-10-07T09:00,09:00 Uhr bis 09:30 Uhr,"BiZ, Gruppenraum 111 | 24534 Neumünster, Brach...",https://city-nms.de/events/veranstaltungskalen...
67,"Woche der seelischen Gesundheit: Infostand ""Ru...",2024-10-07T09:00,| 09:00 Uhr bis 15:00 Uhr,"Neues Rathaus | 24534 Neumünster, Großflecken 59",https://city-nms.de/events/veranstaltungskalen...
68,Woche der seelischen Gesundheit: Vortrag und R...,2024-10-07T10:30,10:30 Uhr bis 12:00 Uhr,"Brücke Neumünster gGmbH | 24539 Neumünster, Wr...",https://city-nms.de/events/veranstaltungskalen...


<b> Processing steps </b>

In [10]:
import re

# Parse the scraped start-date strings and keep only the calendar date (the time of day is dropped).
df['date'] = pd.to_datetime(df['date']).dt.date  # convert it to a plain date format.

def func(x):
    """Reduce a scraped time string to digits, colons and hyphens.

    Every 'bis' is first replaced by a hyphen, then all characters other than
    digits, ':' and '-' are removed, e.g.
    '10:30 Uhr bis 12:00 Uhr' -> '10:30-12:00'.

    Args:
        x: The raw time text of one event.

    Returns:
        The cleaned string. Non-string values (e.g. None or NaN for a missing
        time) are returned unchanged instead of raising TypeError.
    """
    if not isinstance(x, str):
        return x
    # str.replace is a no-op when 'bis' is absent, so no membership test is needed.
    return re.sub(r'[^0-9:-]', '', x.replace('bis', '-'))

# Clean every entry of the 'time' column with func.
df['time'] = df['time'].apply(func)

In [11]:
# A city column could be added as well.
df.head()

Unnamed: 0,event,date,time,address,source
0,"Woche der seelischen Gesundheit: Infostand ""In...",2024-09-27,10:30-12:00,"Großflecken | 24534 Neumünster, Großflecken",https://city-nms.de/events/veranstaltungskalen...
1,Die Spur des Bildhauers. Wiedersehen mit Heinz...,2024-09-27,12:00-18:00,"Herbert Gerisch-Stiftung | 24536 Neumünster, B...",https://city-nms.de/events/veranstaltungskalen...
2,Gaming Nachmittag auf der Nintendo Switch,2024-09-27,15:00-17:00,"Stadtbücherei Neumünster | 24534 Neumünster, W...",https://city-nms.de/events/veranstaltungskalen...
3,Gemälde von Lutz Bleidorn,2024-09-27,15:00-18:00,KulturWerk Galerie Behrendt | 24534 Neumünster...,https://city-nms.de/events/veranstaltungskalen...
4,Schlemmerköste,2024-09-27,16:00-20:00,"Großflecken | 24534 Neumünster, Großflecken",https://city-nms.de/events/veranstaltungskalen...
