# Download individual event pages

In [2]:
import requests
import os
import time

# Root of the festival schedule site; link paths are appended to it
base_url = "https://waterkantfestival2024.sched.com/"

# Text file that lists the link paths to download
file_path = "links.txt"

# Folder that receives the downloaded HTML files (created on first run)
folder_path = 'html_pages'
os.makedirs(folder_path, exist_ok=True)

# Names (extension removed) of the regular files already present in the folder
file_names_without_extension = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
]

# Request headers carrying a desktop-browser User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    )
}

# Function to fetch and save the webpage with retry logic
def fetch_and_save(url_extension, attempts=3, retry_delay=5, timeout=30):
    """Download one page below ``base_url`` and store it in ``html_pages``.

    Args:
        url_extension: Path to append to ``base_url``; surrounding whitespace
            (such as the newline left by reading a line from a file) is removed.
        attempts: Maximum number of requests made while the server keeps
            answering 503.
        retry_delay: Seconds to wait between two attempts after a 503.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        None. The outcome is reported with ``print``; on success the page is
        written to ``html_pages/<path with '/' replaced by '_'>.html``.
    """
    url_path = url_extension.strip()
    # '/' cannot appear in a filename, so it is replaced to build the file name
    sanitized_filename = url_path.replace('/', '_')
    full_url = base_url + url_path

    for attempt in range(attempts):
        try:
            # The timeout keeps a stalled connection from blocking the notebook
            response = requests.get(full_url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses
        except requests.exceptions.HTTPError as e:
            # Only a 503 is retried, and only while attempts remain
            if e.response.status_code == 503 and attempt < attempts - 1:
                print(f"Attempt {attempt + 1} failed with 503 error, retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue
            # Any other HTTP error (or the last 503) is final: stop instead of
            # immediately re-requesting the same failing URL
            print(f"Failed to retrieve {url_path}: {e}")
            return
        except requests.RequestException as e:
            print(f"Failed to retrieve {url_path}: {e}")
            return  # Non-HTTP request failures are not retried

        # Save the webpage to an HTML file
        with open(os.path.join('html_pages', sanitized_filename + '.html'), 'w', encoding='utf-8') as f:
            f.write(response.text)
        print(f"Saved {sanitized_filename}.html successfully.")
        return

# Read the .txt file and process each line
with open(file_path, 'r', encoding='utf-8') as file:
    # A set makes the "already downloaded?" lookup constant-time per link
    already_downloaded = set(file_names_without_extension)
    for num, line in enumerate(file):
        print(num)
        # The sanitized name is the filename (without extension) the page is saved under
        sanitized_name = line.strip().replace('/', '_')
        if not sanitized_name:
            # A blank line would otherwise download the base URL and save it as ".html"
            print("skipped (blank line)")
            continue
        if sanitized_name in already_downloaded:
            print("skipped")
            continue

        print(line)
        fetch_and_save(line)


0
event/1dM8F/funding-for-impact-opening-workshop-for-phds

Saved event_1dM8F_funding-for-impact-opening-workshop-for-phds.html successfully.
1
event/1dJcw/masterclass-souveranitat-bedeutung-tipps-fur-frauen

Saved event_1dJcw_masterclass-souveranitat-bedeutung-tipps-fur-frauen.html successfully.
2
event/1dJct/masterclass-engagement-marketing-wie-unternehmen-die-nachste-generation-kauferinnen-gezielt-ansprechen

Saved event_1dJct_masterclass-engagement-marketing-wie-unternehmen-die-nachste-generation-kauferinnen-gezielt-ansprechen.html successfully.
3
event/1dJck/masterclass-futures-snack-praktische-einfuhrung-in-futures-thinking

Saved event_1dJck_masterclass-futures-snack-praktische-einfuhrung-in-futures-thinking.html successfully.
4
event/1dJcq/masterclass-how-to-build-a-brand-with-purpose-in-the-age-of-ai

Saved event_1dJcq_masterclass-how-to-build-a-brand-with-purpose-in-the-age-of-ai.html successfully.
5
event/1dJcn/masterclass-verhandlungsfuhrung-fur-fuhrungskrafte-von-unternehm

Check that the number of HTML pages equals the number of links

In [3]:
import os

# Folder holding the downloaded pages
folder_path = 'html_pages'

# Names (extension removed) of every regular file found in that folder
file_names_without_extension = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
]

# Report how many files were found
file_total = len(file_names_without_extension)
print(f"Number of files in '{folder_path}': {file_total}")
# Uncomment to also list the names:
#print("Filenames without extension:", file_names_without_extension)


Number of files in 'html_pages': 106


In [4]:
# Text file whose lines are counted
file_path = 'links.txt'

# Stream through the file once, adding one per line read
with open(file_path, 'r') as file:
    line_count = sum(1 for _ in file)

# Report the total
print(f"The file has {line_count} lines.")


The file has 106 lines.


Let's make sure there are no duplicates in our links

In [5]:
# File to scan for repeated lines
file_path = 'links.txt'

# lines_seen: stripped line -> line number of its first occurrence
# duplicates: stripped line -> every line number it occurs on (first occurrence included)
lines_seen = {}
duplicates = {}

with open(file_path, 'r') as file:
    for position, raw_line in enumerate(file, 1):
        # Compare lines with leading/trailing whitespace removed
        link = raw_line.strip()

        if link not in lines_seen:
            # First sighting: remember where it was
            lines_seen[link] = position
            continue

        # Repeat: start the list with the first occurrence if needed, then add this one
        duplicates.setdefault(link, [lines_seen[link]]).append(position)

# Report every repeated line together with its line numbers
for line, line_nums in duplicates.items():
    print(f"Duplicate line: '{line}' found on line numbers: {line_nums}")


In [6]:
# Show the duplicates mapping as the cell result; an empty dict means no line was repeated
duplicates

{}

## Converting the HTML files to JSON

In [25]:
import os
from bs4 import BeautifulSoup

# Directory containing HTML files
directory_path = 'html_pages'

# Preview the extraction on one sample page before converting every file.
# NOTE(review): os.listdir() returns entries in arbitrary order, so the [2:]
# slice skips two unspecified entries — presumably it was used to land on a
# particular sample file; confirm it is still wanted.
for filename in os.listdir(directory_path)[2:]:
    # Only HTML files are parsed; anything else is passed over
    if not filename.endswith('.html'):
        continue

    # Construct full file path and read the page
    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look every div up once and fall back to a placeholder when it is missing
    description_div = soup.find("div", class_="tip-description")
    tip_description = description_div.get_text() if description_div else "No description found"

    # Speaker names are the <h2> texts inside the "sched-person-session" divs
    speakers = []
    for session in soup.find_all("div", class_="sched-person-session"):
        heading = session.find("h2")
        if heading:
            speakers.append(heading.get_text(strip=True))

    details_div = soup.find("div", class_="sched-event-details-timeandplace")
    event_details = details_div.get_text() if details_div else "No event details found"

    type_div = soup.find("div", class_="sched-event-type")
    event_type = type_div.get_text() if type_div else "No event type found"

    # The title is the link inside the first <span> whose class starts with "event"
    a_tag = soup.select_one('span[class^="event"] a')
    if a_tag:
        title = a_tag.get_text(strip=True)
    else:
        print("No matching element found.")
        # Without a fallback the print below raises NameError (or shows a stale title)
        title = "No title found"

    # First line of the details text is used as the date, second as the location;
    # guard the second lookup so a missing or one-line div cannot raise IndexError
    detail_lines = event_details.strip().split('\n')
    event_date = detail_lines[0]
    event_location = detail_lines[1] if len(detail_lines) > 1 else "No location found"

    # Print extracted data
    print(f"File: {filename}")
    print("Title:", title)
    print("Description:", tip_description)
    print("Speakers:", speakers)
    print("date:", event_date)
    print("location:", event_location)
    print("type:", event_type.strip())
    print("-" * 40)  # Separator for readability between files
    break  # One sample page is enough for the preview


File: event_1dJb9_verwaltung-der-zukunft.html
Title: Verwaltung der Zukunft
Description: No description found
Speakers: ['Thilak Mahendran']
date: Friday June 14, 2024 14:00 - 14:45 CEST
location: Tiny Rathaus
type: Zukunft der Verwaltung, talk
----------------------------------------


In [26]:
import os
import json
from bs4 import BeautifulSoup

# Directory containing HTML files
directory_path = 'html_pages'
output_directory = 'json'

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)


def _div_text(soup, css_class, fallback):
    """Return the text of the first <div> with class ``css_class``, or ``fallback`` if there is none."""
    div = soup.find("div", class_=css_class)
    return div.get_text() if div else fallback


# Convert every HTML page in the directory into one JSON file
for filename in os.listdir(directory_path):
    # Only HTML files are converted
    if not filename.endswith('.html'):
        continue

    # Construct full file path and read the page
    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract content of divs by their class, with placeholders for missing ones
    tip_description = _div_text(soup, "tip-description", "No description found")
    event_details = _div_text(soup, "sched-event-details-timeandplace", "No event details found")
    event_type = _div_text(soup, "sched-event-type", "No event type found")

    # Speaker names are the <h2> texts inside the "sched-person-session" divs
    speakers = []
    for session in soup.find_all("div", class_="sched-person-session"):
        heading = session.find("h2")
        if heading:
            speakers.append(heading.get_text(strip=True))

    # The title is the link inside the first <span> whose class starts with "event"
    a_tag = soup.select_one('span[class^="event"] a')
    title = a_tag.get_text(strip=True) if a_tag else "No title found"

    # First line of the details text is used as the date, second as the location;
    # guard the second lookup so one page with a missing or one-line div cannot
    # abort the whole conversion with IndexError
    detail_lines = event_details.strip().split('\n')
    event_date = detail_lines[0]
    event_location = detail_lines[1] if len(detail_lines) > 1 else "No location found"

    # Prepare data dictionary
    data = {
        "Title": title,
        "Description": tip_description,
        "Speakers": speakers,
        "Date": event_date,
        "Location": event_location,
        "Type": event_type.strip()
    }

    # Define the output JSON filename (same stem as the HTML file)
    output_filename = os.path.join(output_directory, os.path.splitext(filename)[0] + '.json')

    # Write data to a JSON file; ensure_ascii=False keeps umlauts and other
    # non-ASCII characters readable instead of \uXXXX escapes (the file is UTF-8)
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
