In [9]:
import os
import re
from bs4 import BeautifulSoup

def remove_text_after_regex(input_html_file, output_html_file, regex_patterns):
    """Copy an HTML file, dropping everything from the first pattern match on.

    The patterns are tried in the order given. The first pattern that matches
    anywhere in the file decides the cut point, even if a later pattern would
    have matched earlier in the text. The match itself and everything after
    it are discarded. If no pattern matches, the content is written out
    unchanged.

    The cut is made on the raw file text, not on a parsed document, so the
    output may end with unclosed tags.

    Args:
        input_html_file: Path of the UTF-8 encoded file to read.
        output_html_file: Path of the UTF-8 encoded file to write
            (overwritten if it already exists).
        regex_patterns: Iterable of regular-expression strings, in priority
            order.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        re.error: If one of the patterns is not a valid regular expression.
    """
    with open(input_html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    for pattern in regex_patterns:
        compiled_pattern = re.compile(pattern, re.UNICODE)
        match = compiled_pattern.search(html_content)
        if match:
            # Keep only the text that precedes the match, then stop: lower
            # priority patterns are not consulted once one has matched.
            html_content = html_content[:match.start()]
            break

    with open(output_html_file, 'w', encoding='utf-8') as file:
        file.write(html_content)

def process_html_files(input_folder, output_folder, regex_patterns):
    """Run ``remove_text_after_regex`` on every ``.html`` file in a folder.

    Only the top level of ``input_folder`` is scanned. Each regular file
    whose name ends in ``.html`` is processed and written to
    ``output_folder`` under the same file name.

    Args:
        input_folder: Directory containing the source HTML files.
        output_folder: Directory that receives the processed files; created
            (including missing parents) if it does not exist.
        regex_patterns: Regular-expression strings passed through to
            ``remove_text_after_regex``.

    Raises:
        OSError: If ``input_folder`` cannot be listed or ``output_folder``
            cannot be created.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith('.html'):
            continue

        input_file_path = os.path.join(input_folder, file_name)
        # A sub-directory may also be named "*.html"; opening it would fail.
        if not os.path.isfile(input_file_path):
            continue

        output_file_path = os.path.join(output_folder, file_name)
        remove_text_after_regex(input_file_path, output_file_path, regex_patterns)

# Example run: read every HTML file from `input_folder` and write the
# truncated copy to `output_folder`.
input_folder = 'all_together'
output_folder = 'final_dataset'

# Greek marker phrases at which each document is cut; they are tried in the
# order listed. Replace them with the patterns that fit your own documents.
regex_patterns = [
    r'ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ',
    r'Δ ι ά τ α ύ τ α',
]

process_html_files(
    input_folder=input_folder,
    output_folder=output_folder,
    regex_patterns=regex_patterns,
)

print(
    "Text after the regex patterns have been removed from HTML files in the input folder and saved to new files in the output folder:",
    output_folder,
)


Text after the regex patterns have been removed from HTML files in the input folder and saved to new files in the output folder: final_dataset
