#### Web scraping Neumünster

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
from datetime import datetime, timedelta
import pytz         # to set the German time zone.
import time as tm   # be careful, I specifically gave an alias as 'tm', cause I have a variable called 'time' already in my code.

In [23]:
# pip install selenium
# pip install pytz
# or maybe this is needed: locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

In [15]:
# Scrape the Neumünster event calendar page by page and collect every event
# that starts within the next 10 days (German local date) into `event_list`.
# Each record is a dict with the keys: event, date, time, address, source.
germany_tz = pytz.timezone('Europe/Berlin')
current_date = datetime.now(germany_tz).date()  # today's date in the German time zone
ten_days_from_today = current_date + timedelta(days=10)

event_list = []  # to store all the events.

driver = webdriver.Chrome()
try:
    # The first page is loaded here; the loop below processes the page that is
    # currently open and then navigates to page `i`.
    driver.get("https://www.neumuenster.de/kultur-freizeit/veranstaltungskalender")

    should_break = False
    for i in range(2, 40):   # page numbers to visit after the first page.

        # The upper bound is only a safety cap: the loop stops as soon as an
        # event more than 10 days away is seen.
        url = f'https://www.neumuenster.de/kultur-freizeit/veranstaltungskalender?dfxp={i}'
        try:
            events = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.col-xs-10.col-sm-9.col-md-10'))
            )
        except Exception as e:
            print(f"An error occurred one: {e}")
            # Without this reset `events` would be undefined on the first page
            # (NameError) or still point at the previous page's stale elements.
            events = []

        for event in events:
            # We don't want to scrape events that are more than 10 days away from today.
            try:
                temp_date = event.find_element(By.CSS_SELECTOR, 'meta[itemprop="startDate"]').get_attribute('content')
                check_date = datetime.fromisoformat(temp_date).date()

                # Stop everything at the first event beyond the 10-day horizon.
                # NOTE(review): this assumes the listing is in chronological order - confirm.
                if ten_days_from_today < check_date:
                    should_break = True
                    break

            except Exception as e:
                # A missing or unparsable start date does not skip the event;
                # extraction below is still attempted.
                print(f"An error occurred two: {e}")

            try:
                wait = WebDriverWait(event, 10)  # waits scoped to this event's container

                title_element = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'h5.dfx-titel-liste-dreizeilig')))
                title = title_element.text
                date = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'meta[itemprop="startDate"]'))
                ).get_attribute('content')
                time = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'span.dfx-zeit-liste-dreizeilig'))
                ).text
                address = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'span[itemprop="address"]'))
                ).text
                place_name = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'span[itemprop="name"]'))
                ).text     # just the name of the place
                full_address = place_name + ' | ' + address

                # The link to the event's detail page sits inside the title heading.
                source = WebDriverWait(title_element, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'a'))
                ).get_attribute('href')

                event_list.append({
                    'event': title,
                    'date': date,
                    'time': time,
                    'address': full_address,
                    'source': source})

            except Exception as e:
                # Any field that cannot be located drops the whole event.
                print(f"An error occurred three: {e}")

        if should_break:   # propagate the inner break to the page loop.
            break

        driver.get(url)  # moving to the next page and the cycle repeats.
        tm.sleep(1)
finally:
    # Always end the browser session, even if scraping raised part-way through.
    driver.quit()

In [37]:
# Build the results table from the scraped records. The columns are listed
# explicitly so that an empty `event_list` still yields a frame with the
# expected columns, and the processing cells below do not fail with KeyError.
df = pd.DataFrame(event_list, columns=['event', 'date', 'time', 'address', 'source'])
df

Unnamed: 0,event,date,time,address,source
0,KUNSTFLECKEN - Aggregat,2024-09-23T19:00,19:00 Uhr,"Werkhalle | 24534 Neumünster, Klosterstraße 16",https://city-nms.de/events/veranstaltungskalen...
1,"Stark wie Leo, Selbstbehauptungs-und Resilienz...",2024-09-24T14:00,14:00 Uhr bis 15:00 Uhr,"Stark wie Leo | 24536 Neumünster, Roschdohler ...",https://city-nms.de/events/veranstaltungskalen...
2,Kino: Er kann's nicht lassen,2024-09-24T15:00,15:00 Uhr,"Savoy - Kino, Konzerte &Theater in Bordesholm ...",https://city-nms.de/events/veranstaltungskalen...
3,"Interkulturelle Woche: Lesung „Neue Horizonte,...",2024-09-24T17:00,17:00 Uhr bis 19:00 Uhr,Bildungszentrum Vicelinviertel | 24534 Neumüns...,https://city-nms.de/events/veranstaltungskalen...
4,Kino: Das Fenster zum Hof,2024-09-24T20:00,20:00 Uhr,"Savoy - Kino, Konzerte &Theater in Bordesholm ...",https://city-nms.de/events/veranstaltungskalen...
...,...,...,...,...,...
60,Buntes Herbstfest im Tierpark,2024-10-03T10:00,10:00 Uhr,"Tierpark Neumünster | 24537 Neumünster, Geerdt...",https://city-nms.de/events/veranstaltungskalen...
61,Drachenfest,2024-10-03T12:00,12:00 Uhr bis 17:00 Uhr,"Flugplatz Neumünster | 24537 Neumünster, Baums...",https://city-nms.de/events/veranstaltungskalen...
62,Die Spur des Bildhauers. Wiedersehen mit Heinz...,2024-10-03T12:00,12:00 Uhr bis 18:00 Uhr,"Herbert Gerisch-Stiftung | 24536 Neumünster, B...",https://city-nms.de/events/veranstaltungskalen...
63,Neumünster singt und klingt,2024-10-03T19:00,19:00 Uhr bis 20:30 Uhr,"Großflecken | 24534 Neumünster, Großflecken",https://city-nms.de/events/veranstaltungskalen...


<b> Processing steps </b>

In [38]:
# Parse the scraped start timestamps (e.g. '2024-09-23T19:00') and keep only
# the calendar date.
df['date'] = pd.to_datetime(df['date']).dt.date  # column now holds plain date values without the time part.

In [40]:
import re

def func(x):
    """Reduce a scraped time label to bare clock times.

    Every occurrence of 'bis' in the string is replaced by a hyphen, then
    every character that is not a digit, a colon or a hyphen is removed.

    Example: '14:00 Uhr bis 15:00 Uhr' -> '14:00-15:00'.

    Parameters
    ----------
    x : str
        Time text as shown on the event page.

    Returns
    -------
    str
        The cleaned string; empty if no digit, colon or hyphen remains.
    """
    hyphenated = x.replace('bis', '-') if 'bis' in x else x
    return re.sub(r'[^0-9:-]', '', hyphenated)

In [41]:
# Clean every scraped time label with func(),
# e.g. '14:00 Uhr bis 15:00 Uhr' -> '14:00-15:00'.
df['time'] = df['time'].apply(func)

In [42]:
# Show the table after the date and time clean-up.
df

Unnamed: 0,event,date,time,address,source
0,KUNSTFLECKEN - Aggregat,2024-09-23,19:00,"Werkhalle | 24534 Neumünster, Klosterstraße 16",https://city-nms.de/events/veranstaltungskalen...
1,"Stark wie Leo, Selbstbehauptungs-und Resilienz...",2024-09-24,14:00-15:00,"Stark wie Leo | 24536 Neumünster, Roschdohler ...",https://city-nms.de/events/veranstaltungskalen...
2,Kino: Er kann's nicht lassen,2024-09-24,15:00,"Savoy - Kino, Konzerte &Theater in Bordesholm ...",https://city-nms.de/events/veranstaltungskalen...
3,"Interkulturelle Woche: Lesung „Neue Horizonte,...",2024-09-24,17:00-19:00,Bildungszentrum Vicelinviertel | 24534 Neumüns...,https://city-nms.de/events/veranstaltungskalen...
4,Kino: Das Fenster zum Hof,2024-09-24,20:00,"Savoy - Kino, Konzerte &Theater in Bordesholm ...",https://city-nms.de/events/veranstaltungskalen...
...,...,...,...,...,...
60,Buntes Herbstfest im Tierpark,2024-10-03,10:00,"Tierpark Neumünster | 24537 Neumünster, Geerdt...",https://city-nms.de/events/veranstaltungskalen...
61,Drachenfest,2024-10-03,12:00-17:00,"Flugplatz Neumünster | 24537 Neumünster, Baums...",https://city-nms.de/events/veranstaltungskalen...
62,Die Spur des Bildhauers. Wiedersehen mit Heinz...,2024-10-03,12:00-18:00,"Herbert Gerisch-Stiftung | 24536 Neumünster, B...",https://city-nms.de/events/veranstaltungskalen...
63,Neumünster singt und klingt,2024-10-03,19:00-20:30,"Großflecken | 24534 Neumünster, Großflecken",https://city-nms.de/events/veranstaltungskalen...
