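#### Web scraping Nordfriesland

Collects the events listed on nordfrieslandkalender.de that start within the next ten days and loads their details into a pandas DataFrame.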

In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pytz 
import pandas as pd
import numpy as np

# --- Configuration ---------------------------------------------------------
germany_tz = pytz.timezone('Europe/Berlin')
current_date = datetime.now(germany_tz).date()  # "today" in the German time zone
ten_days_from_today = current_date + timedelta(days=10)  # last start date that is still collected

base_url = "https://www.nordfrieslandkalender.de"
# Built once, outside the loop, so it exists regardless of how the loop runs;
# the detail-page cell further down reuses `headers` for its own requests.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell forever

# --- Collect the links of all events starting within the window --------------
links_list = []

page = 1
is_true = True  # cleared as soon as an event starting after the window is seen
while is_true:
    url = f"{base_url}/Veranstaltungen?eps=24&az=all&page={page}"

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f'Scraping page {page}')

    blocks = soup.find_all('div', class_="item_wrapper")
    if not blocks:
        # No events on this page: we ran past the end of the listing.
        # Without this guard the loop would request pages forever.
        break

    for block in blocks:
        # Prefer the "from" date; fall back to the "to" date when it is absent.
        date_tag = block.select_one('span.event_date_from')
        if date_tag is None:
            date_tag = block.select_one('span.event_date_to')
        start_date = date_tag.text.strip() if date_tag is not None else ''

        if len(start_date) == 6:
            # Date without a year (six characters, e.g. "19.11."): append the
            # current year, taken from the same Berlin-time date used above.
            # NOTE(review): around New Year the assumed year may be wrong - confirm.
            start_date += str(current_date.year)

        # Reset for every event so a value from a previous event is never
        # reused (and the name is never unbound on the first event).
        check_date = None
        if len(start_date) == 10:  # only check the date if it is in the normal dd.mm.yyyy format
            try:
                check_date = datetime.strptime(start_date, "%d.%m.%Y").date()
            except ValueError:
                check_date = None  # ten characters but not a valid date: treat as unknown

        # Stop at the first event that starts after the window.
        # NOTE(review): presumably relies on the listing being sorted by date - confirm.
        if check_date is not None and ten_days_from_today < check_date:
            is_true = False
            break

        # Events with an unknown date are kept, matching the original intent of
        # only filtering on dates that could be parsed.
        anchor = block.select_one("a.event_teaser_link.event_teaser_title_link")
        if anchor is None or not anchor.get('href'):
            continue  # teaser without a usable link: nothing to scrape later
        links_list.append(base_url + anchor['href'])

    if is_true:
        page += 1

print('Done')

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Done


In [2]:
# Number of event-page links gathered by the listing scrape above.
len(links_list)

202

In [None]:
def get_text_or_none(selector, root=None):
    """Return the stripped text of the first element matching ``selector``.

    Parameters
    ----------
    selector : str
        CSS selector passed to ``select_one``.
    root : optional
        Parsed page or tag to search. When omitted, the notebook-level
        ``soup`` is searched, which keeps the one-argument call working.

    Returns
    -------
    str or float
        The element's stripped text, or ``np.nan`` when nothing matches.
    """
    scope = soup if root is None else root
    element = scope.select_one(selector)
    return element.text.strip() if element is not None else np.nan


event_list = []

for link in links_list:
    # One unreachable or broken event page must not abort the whole run.
    try:
        response = requests.get(link, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'Skipping {link}: {error}')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # '#' in a selector searches an element by its 'id'.
    title = get_text_or_none("div#event_bezeichnung_wrapper", soup)

    single_date = soup.select_one("span#event_single_date")
    if single_date is not None:
        # Only the last child of the span is used as the date; guard against
        # an empty span or a last child that is a tag rather than text.
        last_child = single_date.contents[-1] if single_date.contents else None
        start_date = last_child.strip() if isinstance(last_child, str) else np.nan
    else:
        start_date = get_text_or_none("span.nextdate_date", soup)

    start_time = get_text_or_none("span.event_time_from", soup)
    end_time = get_text_or_none("span.event_time_to", soup)

    # The category title is looked up inside the first rubric block only.
    rubric = soup.select_one("div.event_rubrik")
    category = get_text_or_none("span.keyword_title", rubric) if rubric is not None else np.nan

    # Location = venue name followed by the address rows. Missing or empty
    # parts are dropped so a missing venue name no longer shows up as the
    # literal text "nan" in the result.
    address1 = get_text_or_none("div.addresse_name", soup)
    address_rows = [tag.text.replace('\n', ' ').strip() for tag in soup.select("div.row_wrapper")]
    location_parts = [part for part in [address1, *address_rows] if isinstance(part, str) and part]

    our_event = {
        'Subject': title,
        'Start_date': start_date,
        'Start_time': start_time,
        'End_time': end_time,
        'Location': ", ".join(location_parts) if location_parts else np.nan,
        'Category': category
    }

    event_list.append(our_event)

In [8]:
# Build the events table: one row per scraped event dict, one column per dict key.
df = pd.DataFrame(event_list)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Subject,Start_date,Start_time,End_time,Location,Category
0,Die Fotografin,19.11.2024,20:00,22:00,"Kinocenter Husum, Neustadt 114, 25813 Husum, N...",Kino
1,Kreis Nordfriesland Wirtschaftsausschuss,19.11.2024,14:00,17:00,"Kreissitzungssaal Kreishaus, Marktstraße 6, 25...",Wissen
2,VHS: Lebenskompetenzen statt reiner Wissensver...,19.11.2024,16:30,18:00,"W.D.R.-Galerie (Eingang am Innenhafen), Am Fäh...",Wissen
3,Weltreise,19.11.2024,19:30,,"Kinocenter Husum, Neustadt 114, 25813 Husum, N...",Kino
4,"Weltreise zu Fairtrade-Produzenten, Naturwunde...",19.11.2024,19:30,,"Filmklub e.V. im Kino-Center Husum, Neustadt 1...",Kino
...,...,...,...,...,...,...
197,Grachtenweihnacht,29.11.2024,14:00,22:00,"Marktplatz Friedrichstadt, Am Markt, 25840 Fri...",Events
198,Jööltir ön Muasem,29.11.2024,14:00,18:00,"Muasem Hüs (Morsum), Bi Miiren 17, 25980 Sylt,...",Märkte
199,Lister Weihnachtsmarkt,29.11.2024,18:00,21:00,"Kurverwaltung List auf Sylt, Landwehrdeich 1, ...",Märkte
200,Sauna-Event in der Dünen-Therme,29.11.2024,12:00,23:00,"Dünen-Therme Freizeit- und Erlebnisbad, Maleen...",Gesundheit


In [15]:
# Convert Start_date from the site's dd.mm.yyyy text to ISO yyyy-mm-dd text.
# Only rows that are not already in ISO form are converted, so the cell can be
# re-run safely; parsing the already-converted column again with the
# '%d.%m.%Y' format would raise a ValueError.
needs_conversion = ~df['Start_date'].astype(str).str.fullmatch(r'\d{4}-\d{2}-\d{2}')
df.loc[needs_conversion, 'Start_date'] = (
    pd.to_datetime(df.loc[needs_conversion, 'Start_date'], format='%d.%m.%Y')
    .dt.strftime('%Y-%m-%d')
)
df.head(1)

Unnamed: 0,Subject,Start_date,Start_time,End_time,Location,Category
0,Die Fotografin,2024-11-19,20:00,22:00,"Kinocenter Husum, Neustadt 114, 25813 Husum, N...",Kino


In [16]:
# Add an End_date column that is a copy of Start_date.
# NOTE(review): presumably every event is treated as a single-day event - confirm.
df['End_date'] = df['Start_date']
df.head()

Unnamed: 0,Subject,Start_date,Start_time,End_time,Location,Category,End_date
0,Die Fotografin,2024-11-19,20:00,22:00,"Kinocenter Husum, Neustadt 114, 25813 Husum, N...",Kino,2024-11-19
1,Kreis Nordfriesland Wirtschaftsausschuss,2024-11-19,14:00,17:00,"Kreissitzungssaal Kreishaus, Marktstraße 6, 25...",Wissen,2024-11-19
2,VHS: Lebenskompetenzen statt reiner Wissensver...,2024-11-19,16:30,18:00,"W.D.R.-Galerie (Eingang am Innenhafen), Am Fäh...",Wissen,2024-11-19
3,Weltreise,2024-11-19,19:30,,"Kinocenter Husum, Neustadt 114, 25813 Husum, N...",Kino,2024-11-19
4,"Weltreise zu Fairtrade-Produzenten, Naturwunde...",2024-11-19,19:30,,"Filmklub e.V. im Kino-Center Husum, Neustadt 1...",Kino,2024-11-19
