# Scraping Siegessäule events

## Setup

In [1]:
import sys
from pathlib import Path

# Resolve the directory two levels above the current working directory and
# append it to the import search path.
# NOTE(review): presumably this is the repository root, added so project
# modules can be imported from the notebook — confirm against the repo layout.
root_dir = Path().absolute().parent.parent
sys.path.append(str(root_dir))

In [2]:
# Warning control
import json
import warnings

import controlflow as cf
from dotenv import load_dotenv

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# NOTE(review): presumably loads settings (e.g. API keys) from a local .env
# file into the environment — confirm which variables the notebook relies on.
load_dotenv()

True

## Scraping 1 page

In [10]:
import requests
from bs4 import BeautifulSoup
from controlflow import tool
from pydantic import BaseModel, Field


# Minimal event record: the three values collected for every event found on a
# listing page. All three fields are required strings.
class EventBasic(BaseModel):
    title: str = Field(
        default=...,
        description="The title of the event",
    )
    summary: str = Field(
        default=...,
        description="A short summary of the event",
    )
    detail_url: str = Field(
        default=...,
        description="The URL of the event detail page",
    )

@tool
def scrape_main_content(url: str) -> str:
    """
    Scrape the main content from the provided URL.
    Returns the HTML content as a string.
    """
    # NOTE(review): the docstring above is left untouched on purpose —
    # presumably `@tool` hands it to the agent as the tool description, so
    # rewording it would change what the agent sees. Confirm before editing.
    #
    # Raises:
    #   requests.HTTPError: the server answered with a 4xx/5xx status.
    #   requests.Timeout: the server did not respond within the timeout.

    # Seconds to wait for the connection and for each read. Without a timeout
    # `requests.get` can block forever on a stalled server.
    timeout_seconds = 30

    response = requests.get(url, timeout=timeout_seconds)
    response.raise_for_status()  # Raise an exception for bad status codes

    soup = BeautifulSoup(response.text, 'html.parser')

    # Try progressively less specific containers (adjust these selectors based
    # on the actual website). The last fallback is the whole document, so a
    # page without a <body> yields its markup instead of the string "None".
    main_content = (
        soup.find('main')
        or soup.find('div', class_='content')
        or soup.find('div', id='content')
        or soup.body
        or soup
    )

    return str(main_content)
    

# Flow that extracts the basic event records from one events listing page.
#
# Args:
#   url: Address of the listing page; passed to the task as context.
#
# Returns:
#   The result of running the task, requested as a list of EventBasic.
@cf.flow
def scrape_siegessaeule_flow(url: str) -> list[EventBasic]:

    scrape_events = cf.Task(
        objective="Scrape events from the provided URL",
        # Adjacent string literals are used instead of a backslash continuation
        # inside the string, which embedded the source indentation (a run of
        # spaces) into the instructions.
        instructions=(
            "Identify the events on the page of the provided URL and "
            "return the title, summary, and detail URL for each event"
        ),
        context={"url": url},
        tools=[scrape_main_content],
        result_type=list[EventBasic],
    )

    return scrape_events.run()


# url = "https://www.siegessaeule.de/en/events/?date=2025-01-21"
# events = scrape_siegessaeule_flow(url)

# print(json.dumps([event.model_dump() for event in events], indent=2))

## Scraping multiple pages

In [4]:
# constructing a list of urls
# url format: "https://www.siegessaeule.de/en/events/?date=YYYY-MM-DD"

from datetime import date, timedelta
from typing import List


def construct_siegessaeule_url(target_date: date) -> str:
    """
    Build the Siegessäule events listing URL for a single day.

    Args:
        target_date: The day whose events page is wanted

    Returns:
        The listing URL with the day passed as a `date=YYYY-MM-DD` query
        parameter
    """
    events_page = "https://www.siegessaeule.de/en/events/"
    day_param = format(target_date, "%Y-%m-%d")
    return f"{events_page}?date={day_param}"

def get_urls_for_date_range(start_date: date, end_date: date) -> List[str]:
    """
    Build one events URL per day from `start_date` through `end_date`.

    Args:
        start_date: First day of the range (inclusive)
        end_date: Last day of the range (inclusive)

    Returns:
        URLs in chronological order; an empty list when `start_date` lies
        after `end_date`
    """
    one_day = timedelta(days=1)

    def _each_day():
        # Walk forward a day at a time until the end of the range is passed.
        day = start_date
        while day <= end_date:
            yield day
            day += one_day

    return [construct_siegessaeule_url(day) for day in _each_day()]

def get_urls_next_days(days: int) -> List[str]:
    """
    Generate event URLs for `days` consecutive days, starting with today.

    Args:
        days: Number of days to cover, counting today as the first one

    Returns:
        List of `days` URLs, one per day (empty when `days` is less than 1)
    """
    first_day = date.today()
    # Today already counts as one day, hence the `- 1`.
    last_day = first_day + timedelta(days=days - 1)
    return get_urls_for_date_range(first_day, last_day)


In [5]:
# Preview the listing URLs for two days (today and tomorrow).
urls = get_urls_next_days(2)
print("Generated URLs:")
for url in urls:
    print(f"- {url}")

Generated URLs:
- https://www.siegessaeule.de/en/events/?date=2025-01-22
- https://www.siegessaeule.de/en/events/?date=2025-01-23


In [7]:
# Reduce logging
cf.settings.log_level = 'INFO'

# urls = get_urls_next_days(2)
# events = []
# for url in urls:
#     events.extend(scrape_siegessaeule_flow(url))

# print(json.dumps([event.model_dump() for event in events], indent=2))


[
  {
    "title": "\u00d6ffentliche Probe: Carmen",
    "summary": "Queer adaptation of the classic opera",
    "detail_url": "https://www.siegessaeule.de/en/events/kultur/offentliche-probe-carmen/2025-01-22/19:30/"
  },
  {
    "title": "Dynamische Meditation",
    "summary": "Ein 5 Phasen-Prozess, Aufladung, Katharsis, Erdung, Meditation, Feiern, Mit Armin & Team",
    "detail_url": "https://www.siegessaeule.de/en/events/mix/dynamische-meditation/2025-01-22/07:30/"
  },
  {
    "title": "Projektband f\u00fcr Frauen* und Queers",
    "summary": "Music and band workshop for women and queers, every Wednesday",
    "detail_url": "https://www.siegessaeule.de/en/events/mix/frizu-lounge---23/2025-01-22/16:30/"
  },
  {
    "title": "Tests auf HIV/STIs",
    "summary": "HIV/STI testing, no registration required, anonym",
    "detail_url": "https://www.siegessaeule.de/en/events/mix/hivsti-schnell-labortests-24/2025-01-22/17:00/"
  },
  {
    "title": "Psychologische Beratung",
    "summary":

## Enriching the events
So far we can scrape the events listed on the website for several days, but for each event we only have the title, summary, and detail URL.

We want to enrich each event by visiting its detail URL and extracting further information, such as:
- description; the description of the event
- location; the location of the event
- start_time; the start time of the event
- end_time; the end time of the event
- organizer; the organizer of the event
- source; the source of the event (e.g. "siegessaeule")
- source_url; where the event was found
- detail_url; the url of the event detail page
- event_url; the main event page which might be on another website
- image_url; the url of the main image of the event
- tags; the tags of the event
- attendees; the number of attendees
- price; the price of the event
- status; the status of the event

### 1) Get the main details we want from an event detail page

In [13]:
from datetime import datetime
from decimal import Decimal
from typing import Optional

from pydantic import Field, HttpUrl


class EventFull(EventBasic):
    """Complete event information including detail page data"""

    # Pydantic v2 configuration. This replaces the nested `class Config`,
    # which v2 deprecates, and keeps the same JSON encoders: datetimes as
    # ISO 8601 strings, decimals as plain strings.
    model_config = {
        "json_encoders": {
            datetime: lambda v: v.isoformat(),
            Decimal: lambda v: str(v),
        }
    }

    # Besides the inherited fields, only `start_time` is required; every other
    # field defaults to None when the detail page does not provide it.
    description: Optional[str] = Field(
        None,
        description="The full description of the event"
    )
    location: Optional[str] = Field(None, description="The location of the event")
    start_time: datetime = Field(..., description="When the event starts")
    end_time: Optional[datetime] = Field(None, description="When the event ends")
    organizer: Optional[str] = Field(None, description="Who is organizing the event")
    event_url: Optional[HttpUrl] = Field(
        None,
        description="External URL (e.g., Facebook event, venue website)"
    )
    image_url: Optional[HttpUrl] = Field(
        None,
        description="URL to the main event image found on the detail page"
    )
    attendees: Optional[int] = Field(
        None,
        description="Number of attendees if available"
    )
    price: Optional[Decimal] = Field(
        None,
        description="Event price in euros"
    )



# Let's test the scraping of the main content of an event detail page.
# A single ad-hoc run: it is given the scraping tool and the detail URL as
# context, and the result is requested as an EventFull.
detail_url = "https://www.siegessaeule.de/en/events/sex/naked-sex-party-lab/2025-01-23/21:00/"
event_detail = cf.run(
    "Get the main content of the event detail page of the provided URL",
    tools=[scrape_main_content],
    context={"url": detail_url},
    result_type=EventFull
)

# Show the extracted event as indented JSON.
print(event_detail.model_dump_json(indent=2))

{
  "title": "Naked Sex Party",
  "summary": "A Thursday night party at Berlin’s most famous cruising and sex club with a naked dress code.",
  "detail_url": "https://www.siegessaeule.de/en/events/sex/naked-sex-party-lab/2025-01-23/21:00/",
  "description": "The line to get into the Thursday night party in Berlin’s most famous cruising and sex club is almost as long as the weekend line in front of Berghain. The dress code: naked – so slip a little bottle of lube (or whatever) into your socks and off you go! Entry until 23:00. DC: fully naked, shoes only.",
  "location": "Lab.oratory, Am Wriezener Bahnhof, 10243 Berlin",
  "start_time": "2025-01-23T20:00:00",
  "end_time": null,
  "organizer": "Lab.oratory",
  "event_url": "http://www.lab-oratory.de/",
  "image_url": "https://cdn.siegessaeule.de/images/Berghain__guido_woller.9d0010a0.fill-720x360.jpg",
  "attendees": null,
  "price": null
}


In [None]:
@tool
def scrape_main_content(url: str) -> str:
    """
    Scrape the main content from the provided URL.
    Returns the HTML content as a string.
    """
    # NOTE(review): the docstring above is left untouched on purpose —
    # presumably `@tool` hands it to the agent as the tool description, so
    # rewording it would change what the agent sees. Confirm before editing.
    #
    # Raises:
    #   requests.HTTPError: the server answered with a 4xx/5xx status.
    #   requests.Timeout: the server did not respond within the timeout.

    # Seconds to wait for the connection and for each read. Without a timeout
    # `requests.get` can block forever on a stalled server.
    timeout_seconds = 30

    response = requests.get(url, timeout=timeout_seconds)
    response.raise_for_status()  # Raise an exception for bad status codes

    soup = BeautifulSoup(response.text, 'html.parser')

    # Try progressively less specific containers (adjust these selectors based
    # on the actual website). The last fallback is the whole document, so a
    # page without a <body> yields its markup instead of the string "None".
    main_content = (
        soup.find('main')
        or soup.find('div', class_='content')
        or soup.find('div', id='content')
        or soup.body
        or soup
    )

    return str(main_content)
    

# Flow that extracts the basic event records from one events listing page.
#
# Args:
#   url: Address of the listing page; passed to the task as context.
#
# Returns:
#   The result of running the task, requested as a list of EventBasic.
@cf.flow
def scrape_siegessaeule_flow(url: str) -> list[EventBasic]:

    scrape_events = cf.Task(
        objective="Scrape events from the provided URL",
        # Adjacent string literals are used instead of a backslash continuation
        # inside the string, which embedded the source indentation (a run of
        # spaces) into the instructions.
        instructions=(
            "Identify the events on the page of the provided URL and "
            "return the title, summary, and detail URL for each event"
        ),
        context={"url": url},
        tools=[scrape_main_content],
        # EventBasic carries exactly the three fields the instructions ask
        # for (title, summary, detail URL).
        result_type=list[EventBasic],
    )

    return scrape_events.run()


## Enrich the event data
We want to enrich the event data with additional fields, e.g. tags that help us filter the events.

In [None]:
# Full event record extended with tags.
class EventEnriched(EventFull):
    # Stored as one comma-separated string rather than a list; defaults to
    # None when no tags are supplied.
    tags: Optional[str] = Field(
        default=None,
        description="Comma-separated list of event categories/tags",
    )