# Scraping Siegessäule events

## Setup

In [16]:
import sys
from pathlib import Path

root_dir = Path().absolute().parent.parent
sys.path.append(str(root_dir))

In [17]:
# Warning control
import json
import warnings

import controlflow as cf
from dotenv import load_dotenv

warnings.filterwarnings('ignore')

load_dotenv()

True

In [18]:
import os
print(os.getenv('CONTROLFLOW_LLM_MODEL'))


openai/gpt-4o-mini


## Scraping 1 page

In [37]:
import requests
from bs4 import BeautifulSoup
from controlflow import tool
from pydantic import BaseModel, Field, HttpUrl, validator


class EventBasic(BaseModel):
    title: str = Field(..., description="The title of the event")
    summary: str = Field(..., description="A short summary of the event")
    detail_url: HttpUrl = Field(..., description="The URL of the event detail page")

    @classmethod
    def set_base_url(cls, base_url: str):
        cls._base_url = base_url
        
    @validator('detail_url', pre=True)
    def ensure_absolute_url(cls, v):
        if isinstance(v, str) and v.startswith('/'):
            base_url = getattr(cls, '_base_url', None)
            if not base_url:
                raise ValueError("Base URL not set. Call EventBasic.set_base_url() first")
            return f"{base_url.rstrip('/')}{v}"
        return v

    class Config:
        json_encoders = {
            HttpUrl: str
        }

@tool
def scrape_sex_clubs_events(url: str):
    """
    Scrape specific sections (Sex or Clubs) from the provided URL.
    Returns the HTML content as a string.
    """
    # Send GET request to the URL
    response = requests.get(url)
    response.raise_for_status()
    
    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all sections that have h3 headers
    relevant_sections = []
    sections = soup.find_all('section', class_='content listing')
    
    for section in sections:
        h3_tag = section.find('h3')
        if h3_tag and any(category in h3_tag.text for category in ['Sex', 'Clubs']):
            relevant_sections.append(str(section))
    
    # Combine all relevant sections
    if not relevant_sections:
        return ""  # Return empty string if no relevant sections found
    
    return "\n".join(relevant_sections)

# Test the scraping of the sex clubs events
url = "https://www.siegessaeule.de/en/events/?date=2025-01-21"
EventBasic.set_base_url("https://www.siegessaeule.de")
events = cf.run(
    "Scrape the sex and clubs events from the provided URL",
    tools=[scrape_sex_clubs_events],
    context={"url": url},
    result_type=list[EventBasic],
)

print(json.dumps([event.model_dump(mode='json') for event in events], indent=2))

[
  {
    "title": "OnStage Karaoke",
    "summary": "Queerfriendly karaoke hosted by various KJ",
    "detail_url": "https://www.siegessaeule.de/en/events/clubs/monster-ronsons-onstage-karaoke-25/2025-01-21/21:00/"
  },
  {
    "title": "Cum & Fuck",
    "summary": "\"XXL Fuck Factory\", Gay Cruising",
    "detail_url": "https://www.siegessaeule.de/en/events/sex/cum-fuck/2025-01-21/12:00/"
  },
  {
    "title": "Queerbeet",
    "summary": "FTM/MTF Play+Sexparty, All Genders and Bodytypes!",
    "detail_url": "https://www.siegessaeule.de/en/events/sex/queerbeet/2025-01-21/19:00/"
  },
  {
    "title": "SM am Dienstag",
    "summary": "BDSM night, men* only, DC: no, fetish welcome",
    "detail_url": "https://www.siegessaeule.de/en/events/sex/sm-am-dienstag-24/2025-01-21/19:00/"
  },
  {
    "title": "TriebSex",
    "summary": "Gay cruising, DC: underwear, naked",
    "detail_url": "https://www.siegessaeule.de/en/events/sex/triebsex-24/2025-01-21/20:00/"
  }
]


## Constructing URLs (for scraping multiple pages)

In [38]:
# constructing a list of urls
# url format: "https://www.siegessaeule.de/en/events/?date=YYYY-MM-DD"

from datetime import date, timedelta
from typing import List


def construct_siegessaeule_url(target_date: date) -> str:
    """
    Construct a Siegessäule events URL for a specific date.
    
    Args:
        target_date: The date to get events for
        
    Returns:
        The complete URL for that date
    """
    base_url = "https://www.siegessaeule.de/en/events/"
    return f"{base_url}?date={target_date.strftime('%Y-%m-%d')}"

def get_urls_for_date_range(start_date: date, end_date: date) -> List[str]:
    """
    Generate URLs for a range of dates.
    
    Args:
        start_date: First date to include
        end_date: Last date to include
        
    Returns:
        List of URLs, one for each date in the range
    """
    urls = []
    current_date = start_date
    
    while current_date <= end_date:
        urls.append(construct_siegessaeule_url(current_date))
        current_date += timedelta(days=1)
        
    return urls

def get_urls_next_days(days: int) -> List[str]:
    """
    Generate URLs for today and the next `days` days.
    
    Returns:
        List of `days` URLs, one for each day
    """
    today = date.today()
    end_date = today + timedelta(days=days-1)
    return get_urls_for_date_range(today, end_date)


In [39]:
urls = get_urls_next_days(2)
print("Generated URLs:")
for url in urls:
    print(f"- {url}")

Generated URLs:
- https://www.siegessaeule.de/en/events/?date=2025-01-23
- https://www.siegessaeule.de/en/events/?date=2025-01-24


## Enriching the events
Till now we can scrape the events from the website for several days. But we only have the title, summary and detail URL.

We want to enrich the events with more information by visiting the detail URL and extract information like:
- description; the description of the event
- location; the location of the event
- start_time; the start time of the event
- end_time; the end time of the event
- organizer; the organizer of the event
- source; the source of the event (e.g. "siegessaeule")
- source_url; where the event was found
- detail_url; the url of the event detail page
- event_url; the main event page which might be on another website
- image_url; the url of the main image of the event
- tags; the tags of the event
- attendees; the number of attendees
- price; the price of the event
- status; the status of the event

### 1) Get the main details we want from an event detail page

In [40]:
@tool
def scrape_event_details(url: str) -> str:
    """
    Scrape specific event details and return them in a clean, markdown-like format.
    """
    response = requests.get(url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Initialize details dictionary
    details = []
    
    # Get title
    if title_elem := soup.find('h3', class_='svelte-5l0ta8'):
        details.append(f"# {title_elem.text.strip()}")
    
    # Get date and location
    if info_list := soup.find('ul', class_='info-list'):
        for item in info_list.find_all('li'):
            details.append(item.text.strip())
    
    # Get main description
    if desc_elem := soup.find('div', class_='typography--first-p--emphasized'):
        # Extract text from paragraphs
        for p in desc_elem.find_all('p'):
            if p_text := p.text.strip():
                details.append(p_text)
    
    # Get tags
    tags = []
    for tag in soup.find_all('span', class_='svelte-1s4c2pi'):
        if tag_text := tag.text.strip():
            tags.append(tag_text)
    if tags:
        details.append("Tags: " + " ".join(tags))
    
    # Get venue details
    if venue_section := soup.find('section', recursive=False):
        if venue_name := venue_section.find('h3', class_='svelte-5l0ta8'):
            details.append(f"\nVenue: {venue_name.text.strip()}")
        
        # Get venue address and contact
        if venue_info := venue_section.find('ul', class_='info-list'):
            for item in venue_info.find_all('li'):
                if item_text := item.text.strip():
                    details.append(item_text)
    
    # Join all details with newlines
    return "\n".join(details)

In [42]:
# # test what gets scraped from the detail page
# detail_url = "https://www.siegessaeule.de/en/events/sex/naked-sex-party-lab/2025-01-23/21:00/"

# event_detail = cf.run(
#     "Get the main content of the event detail page of the provided URL in raw HTML",
#     tools=[scrape_event_details],
#     context={"url": detail_url},
# )

# print(event_detail)

In [43]:
# Define EventFull and test it for one event detail page
from datetime import datetime
from decimal import Decimal
from typing import Optional

from pydantic import Field, HttpUrl


class EventFull(EventBasic):
    """Complete event information including detail page data"""
    description: Optional[str] = Field(
        None,
        description="The full description of the event"
    )
    location: Optional[str] = Field(None, description="The location of the event")
    start_time: datetime = Field(..., description="When the event starts")
    end_time: Optional[datetime] = Field(None, description="When the event ends")
    organizer: Optional[str] = Field(None, description="Who is organizing the event")
    event_url: Optional[HttpUrl] = Field(
        None, 
        description="External URL (e.g., Facebook event, venue website)"
    )
    image_url: Optional[HttpUrl] = Field(
        None, 
        description="URL to the main event image found on the detail page"
    )
    attendees: Optional[int] = Field(
        None, 
        description="Number of attendees if available"
    )
    price: Optional[Decimal] = Field(
        None, 
        description="Event price in euros"
    )
    original_tags: list[str] = Field(
        [],
        description="The original tags of the event found on the detail page"
    )

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat(),
            Decimal: lambda v: str(v)
        }



# let's test the scraping of the main content of an event detail page
detail_url = "https://www.siegessaeule.de/en/events/sex/naked-sex-party-lab/2025-01-23/21:00/"
event_detail = cf.run(
    "Get the main content of the event detail page of the provided URL",
    tools=[scrape_event_details],
    context={"url": detail_url},
    result_type=EventFull
)

print(event_detail.model_dump_json(indent=2))

{
  "title": "Naked Sex Party",
  "summary": "The line to get into the Thursday night party in Berlin’s most famous cruising and sex club is almost as long as the weekend line in front of Berghain. The dresscode: naked – so slip a little bottle of lube (or whatever) into your socks and off you go! Entry until 23:00",
  "detail_url": "https://www.siegessaeule.de/en/events/sex/naked-sex-party-lab/2025-01-23/21:00/",
  "description": "Entry till 23:00. DC: fully naked, shoes only.",
  "location": "Lab.oratory",
  "start_time": "2025-01-23T20:00:00",
  "end_time": null,
  "organizer": null,
  "event_url": null,
  "image_url": null,
  "attendees": null,
  "price": null,
  "original_tags": [
    "#gay",
    "#Friedrichshain",
    "#sex",
    "#sexparty",
    "#Friedrichshain",
    "#cruising",
    "#schwul",
    "#darkroom",
    "#sex",
    "#gay"
  ]
}


In [27]:
# flow to scrape all events for a given url, and subsequently 
# complete them with the detail page data

@cf.flow
def scrape_and_complete_siegessaeule_events(url: str) -> list[EventFull]:

    basic_events = cf.run(
        objective="Scrape events from the provided URL",
        instructions="Identify the events on the page of the provided URL and \
            return the title, summary, and full detail URL for each event",
        context={"url": url},
        tools=[scrape_sex_clubs_events],
        result_type=list[EventBasic],
    )

    full_events = cf.run(
        objective="Enrich the events with data from their detail pages",
        instructions="Scrape the main content of each event detail page \
                      to enrich the event data",
        context={"events": basic_events},
        tools=[scrape_event_details],
        result_type=list[EventFull],
    )
    
    return full_events


In [28]:
urls = get_urls_next_days(2)
print(urls)

['https://www.siegessaeule.de/en/events/?date=2025-01-23', 'https://www.siegessaeule.de/en/events/?date=2025-01-24']


In [29]:
events = []
for url in urls:
    events.extend(scrape_siegessaeule_full_events(url))


ValueError: 1 task failed: - Task #2cb97788 ("Scrape events from the provided URL"): The detail_url values provided are relative URLs instead of absolute URLs. They need to be prefixed with the base URL 'https://www.siegessaeule.de'.

In [69]:
print(json.dumps([event.model_dump() for event in events], indent=2, default=str))

[
  {
    "title": "OnStage Karaoke",
    "summary": "Queerfriendly karaoke hosted by various KJ",
    "detail_url": "https://www.siegessaeule.de/en/events/clubs/monster-ronsons-onstage-karaoke-25/2025-01-22/21:00/",
    "description": "Almost everyday you can showcase your singing talents in Friedrichhains most well known karaoke club. More information: karaokemonster. Queerfriendly karaoke hosted by various KJ.",
    "location": "Monster Ronson's Ichiban Karaoke",
    "start_time": "2025-01-22 20:00:00",
    "end_time": null,
    "organizer": null,
    "event_url": null,
    "image_url": null,
    "attendees": null,
    "price": null,
    "original_tags": [
      "#karaoke",
      "#Kreuzberg",
      "#queerfriendly",
      "#gayfriendly",
      "#karaoke",
      "#schwul",
      "#queer",
      "#gay",
      "#LGBTI*",
      "#Friedrichshain",
      "#nightlife"
    ]
  },
  {
    "title": "Kinotag",
    "summary": "Men Only",
    "detail_url": "https://www.siegessaeule.de/en/events

## Enrich the event data
We want to enrich the event data with e.g. tags that help us filter the events.

In [70]:
class EventEnriched(EventFull):
    fomo_score: Optional[int] = Field(
        None, 
        description="The FOMO score of the event"
    )
    fomo_score_reason: Optional[str] = Field(
        None, 
        description="The reason for the FOMO score"
    )