In [31]:
import requests
import os
import time

# Root address of the festival schedule; relative links are appended to it
base_url = "https://waterkantfestival2023.sched.com/"

# Text file that lists the relative links, one per line
file_path = "links.txt"

# Make sure the output directory for the downloaded HTML pages exists
os.makedirs('html_pages', exist_ok=True)

# Directory that is scanned for pages already present on disk
folder_path = 'html_pages'

# Gather the extension-less names of the regular files found in the folder
file_names_without_extension = []
for entry in os.listdir(folder_path):
    if os.path.isfile(os.path.join(folder_path, entry)):
        file_names_without_extension.append(os.path.splitext(entry)[0])

# Request headers carrying a desktop-browser User-Agent string
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# Function to fetch and save the webpage with retry logic
def fetch_and_save(url_extension, attempts=3, retry_delay=5, timeout=30):
    """Download one schedule page and store it as an HTML file in ``html_pages``.

    The page address is ``base_url`` plus the stripped ``url_extension``. The
    output file name is the stripped extension with every ``/`` replaced by
    ``_`` and ``.html`` appended.

    Args:
        url_extension: Relative link, e.g. a line read from the links file.
            Surrounding whitespace (including a trailing newline) is ignored.
        attempts: Maximum number of requests made while the server keeps
            answering with status 503.
        retry_delay: Seconds to wait before retrying after a 503 response.
        timeout: Seconds to wait for the server before a request is abandoned.

    Returns:
        None. Success and failure are reported with ``print``; HTTP and other
        ``requests`` errors are caught here rather than propagated.
    """
    relative_url = url_extension.strip()
    if not relative_url:
        # A blank link would otherwise fetch the site root and save it as ".html".
        print("Skipped empty link.")
        return

    # Slashes are replaced so the link can serve as a flat file name
    sanitized_filename = relative_url.replace('/', '_')
    full_url = base_url + relative_url

    for attempt in range(attempts):
        try:
            # Without a timeout a stalled connection would block forever
            response = requests.get(full_url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses
            with open(os.path.join('html_pages', sanitized_filename + '.html'), 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Saved {sanitized_filename}.html successfully.")
            break  # Done, no further attempts needed
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 503 and attempt < attempts - 1:
                print(f"Attempt {attempt + 1} failed with 503 error, retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"Failed to retrieve {relative_url}: {e}")
                # Only 503 is retried; stop so other HTTP errors are not
                # requested again and reported repeatedly.
                break
        except requests.RequestException as e:
            print(f"Failed to retrieve {relative_url}: {e}")
            break  # Connection problems, timeouts etc. are not retried

# Walk through the link list and download every page that is not on disk yet
with open(file_path, 'r', encoding='utf-8') as file:
    for num, line in enumerate(file):
        print(num)
        # A link counts as downloaded when its sanitized name matches an existing file
        sanitized_name = '_'.join(line.strip().split('/'))
        if sanitized_name not in file_names_without_extension:
            print(line)
            fetch_and_save(line)
        else:
            print("skipped")


0
event/1NpjP/netzwerktreffen-kielscn-invitation-only

Saved event_1NpjP_netzwerktreffen-kielscn-invitation-only.html successfully.
1
event/1NHsJ/exhibition-opening

Saved event_1NHsJ_exhibition-opening.html successfully.
2
event/1MvLo/opening-ceremony-dancing-sounds-performance-new-mobility-premiere

Saved event_1MvLo_opening-ceremony-dancing-sounds-performance-new-mobility-premiere.html successfully.
3
event/1Nmox/zine-release

Saved event_1Nmox_zine-release.html successfully.
4
event/1NZ5h/imagine-swings-reale-schaukel-trifft-auf-virtuelle-realitat-real-swing-meets-virtual-reality

Saved event_1NZ5h_imagine-swings-reale-schaukel-trifft-auf-virtuelle-realitat-real-swing-meets-virtual-reality.html successfully.
5
event/1Ni1E/interesse-an-ai-und-chat-gpt

Saved event_1Ni1E_interesse-an-ai-und-chat-gpt.html successfully.
6
event/1NNeI/starterkitchen-skx-accelerator-demo-day

Saved event_1NNeI_starterkitchen-skx-accelerator-demo-day.html successfully.
7
event/1MvMI/investment-vernissage-

In [32]:
import os

# Folder whose contents are counted
folder_path = 'html_pages'

# Gather the extension-less names of all regular files in the folder
file_names_without_extension = []
for entry in os.listdir(folder_path):
    if os.path.isfile(os.path.join(folder_path, entry)):
        file_names_without_extension.append(os.path.splitext(entry)[0])

# Report how many files were found
file_count = len(file_names_without_extension)
print(f"Number of files in '{folder_path}': {file_count}")


Number of files in 'html_pages': 112


In [25]:
# Path of the link list whose lines are counted
file_path = 'links.txt'

# Count the lines. The file is decoded as UTF-8 explicitly so the read does not
# depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    line_count = sum(1 for _ in file)

# Print the total number of lines
print(f"The file has {line_count} lines.")


The file has 112 lines.


In [27]:
# Path to the file
file_path = 'links.txt'

# Stripped line -> line number of its first occurrence
lines_seen = {}
# Stripped line -> all line numbers it occurs on (only lines seen more than once)
duplicates = {}

# Read the file as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, 1):
        # Compare lines without leading/trailing whitespace
        stripped_line = line.strip()

        # setdefault records the first occurrence and returns it on later hits
        first_occurrence = lines_seen.setdefault(stripped_line, line_number)
        if first_occurrence != line_number:
            # The list starts with the first occurrence, then collects repeats
            duplicates.setdefault(stripped_line, [first_occurrence]).append(line_number)

# Output the results
for line, line_nums in duplicates.items():
    print(f"Duplicate line: '{line}' found on line numbers: {line_nums}")


In [28]:
# Display the mapping of duplicated lines to the line numbers they occur on
duplicates

{}

In [11]:
# Build the full address of one event page
full_url = base_url + 'event/1NpjP/netzwerktreffen-kielscn-invitation-only'

# Fetch the page; the timeout keeps the cell from hanging if the server never answers
response = requests.get(full_url, timeout=30)
response.raise_for_status()  # Raises an HTTPError for bad responses

In [12]:
# Display the body of the fetched response as text
response.text

'\n\n\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\r\n  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<!--\r\n           _              _\r\n  ___  ___| |__   ___  __| |                 @@@@@@@@@@@@@@\r\n / __|/ __| \'_ \\ / _ \\/ _` |             @@@@@@@@@@@@@@@@@@@@@@\r\n \\__ \\ (__| | | |  __/ (_| |          @@@@@@@@@@@@@@@@@@@@@@@@@@@@\r\n |___/\\___|_| |_|\\___|\\__,_|        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r\n                                  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r\n                _               @@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@\r\n   ___ ___   __| | ___         @@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@\r\n  / __/ _ \\ / _` |/ _ \\       @@@@@@@@@@@@@@@@@@@@@@@@@@@   @@@@@@@@@@\r\n | (_| (_) | (_| |  __/      @@@@@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@\r\n  \\___\\___/ \\__,_|\\___|      @@@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@@@@\r\n                             @@@@@@@@@@@@@@@@@@@@@@@   @@@@@@@@@@@@@@@@@\r\n                             

In [152]:
import os
from bs4 import BeautifulSoup

# Directory containing HTML files
directory_path = 'html_pages'

# Preview the extraction on the first HTML file only. The listing is sorted so
# the chosen file is deterministic; no fixed start offset is used because a
# slice beyond the folder size would silently select nothing.
for filename in sorted(os.listdir(directory_path)):
    # Ignore anything that is not an HTML page
    if not filename.endswith('.html'):
        continue

    # A dedicated name avoids rebinding `file_path`, which other cells use for
    # the links file
    html_file_path = os.path.join(directory_path, filename)

    # Open and read the HTML file
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look each div up once and fall back to a placeholder when it is absent
    description_div = soup.find("div", class_="tip-description")
    tip_description = description_div.get_text() if description_div else "No description found"

    roles_div = soup.find("div", class_="tip-roles")
    tip_roles = roles_div.get_text() if roles_div else "No roles found"

    details_div = soup.find("div", class_="sched-event-details-timeandplace")
    event_details = details_div.get_text() if details_div else "No event details found"

    type_div = soup.find("div", class_="sched-event-type")
    event_type = type_div.get_text() if type_div else "No event type found"

    # The title is the text of the first link inside a span whose class
    # attribute starts with "event". `title` is always bound, so the print
    # below never hits an undefined or stale value.
    a_tag = soup.select_one('span[class^="event"] a')
    title = a_tag.get_text(strip=True) if a_tag else "No matching element found"

    # Print extracted data
    print(f"File: {filename}")
    print("Title:", title)
    print("Tip Description:", tip_description)
    print("Tip Roles:", tip_roles)
    print("Event Details:", event_details)
    print("Event Type:", event_type)
    print("-" * 40)  # Separator for readability between files
    break  # One file is enough for a preview


In [172]:
import os
import json
from bs4 import BeautifulSoup

# Directory containing the HTML files; the matching JSON files are looked up in 'json'
directory_path = 'html_pages'

# Process every HTML file in the directory, in name order
for filename in sorted(os.listdir(directory_path)):
    # Ignore anything that is not an HTML page
    if not filename.endswith('.html'):
        continue

    # Construct full file path for HTML
    html_file_path = os.path.join(directory_path, filename)

    # Open and read the HTML file
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look each div up once and fall back to a placeholder when it is absent
    description_div = soup.find("div", class_="tip-description")
    tip_description = description_div.get_text(strip=True) if description_div else "No description found"

    roles_div = soup.find("div", class_="tip-roles")
    tip_roles = roles_div.get_text(strip=True) if roles_div else "No roles found"

    details_div = soup.find("div", class_="sched-event-details-timeandplace")
    event_details = details_div.get_text(strip=True) if details_div else "No event details found"

    type_div = soup.find("div", class_="sched-event-type")
    event_type = type_div.get_text(strip=True) if type_div else "No event type found"

    a_tag = soup.select_one('span[class^="event"] a')
    title = a_tag.get_text(strip=True) if a_tag else "No matching element found"

    # JSON file with the same base name inside the 'json' folder
    json_file_path = os.path.join('json', filename.replace('.html', '.json'))

    # Leave the JSON untouched when the page has no description div; writing
    # the placeholder text would overwrite whatever description is stored.
    if description_div is None or not os.path.exists(json_file_path):
        continue

    # Load the existing JSON data
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Update 'description' field with the extracted description
    data['description'] = tip_description

    # Save the updated JSON data back to the original JSON file
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)


In [170]:
# Display the most recently constructed JSON file path
json_file_path

'json\\event_1N7ep_future-of-food-pitch-session.json'

In [171]:
# Print each extracted field next to its label, one per line
extracted_fields = (
    ("Title:", title),
    ("Tip Description:", tip_description),
    ("Tip Roles:", tip_roles),
    ("Event Details:", event_details),
    ("Event Type:", event_type),
)
for label, value in extracted_fields:
    print(label, value)

Title: Future of Food Pitch Session
Tip Description: english version below 👇Welches Team nutzt seine Chance bei Rewe gelistet zu werden?Which team is taking advantage of its opportunity to be listed with Rewe?Wir freuen uns riesig,Selda Morinafür diese Session willkommen zu heißen, sie ist Innovationsmanagerin und leitet seit fünf Jahren die REWE Start-up-Lounge – sie bietet Food-Startups damit das Rundum-Sorglospaket für den Markteinstieg. Sie genießt die Arbeit mit den Start-ups, kennt die Logik großer Unternehmen und verfügt daher über die ideale Schnittstellenkompetenz für eine Vertrauensbasis auf Augenhöhe.🤝 Auf unserem Festival wird sie in der Jury der Pitchsession für Food-Startups sitzen. Seid live dabei, wenn Food Startups ihre Chance auf eine Rewe Listung bekommen! Das Gewinnerteam bekommt ein exklusives Coaching zu dem Thema: wie komme ich in den LEH und was brache ich für eine Listung bei REWE.//We are thrilled to welcome Selda Morina for this session, she is an innovation 