# In [4]:
import os
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The text of all pages in document order, with one newline between
        consecutive pages. A page that yields no text contributes an empty
        string.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may return None for a page without extractable
        # characters (older pdfplumber releases do this); fall back to ''
        # so one empty page cannot abort the whole document.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with
    # the first word of the next.
    return '\n'.join(page_texts)

def preprocess_text(text):
    """Reduce raw text to lower-case, unaccented, non-ASCII word characters.

    The text is lower-cased, NFKD-decomposed with all combining marks
    dropped (which removes accents), and then stripped of punctuation,
    digits, ASCII letters and underscores. Whitespace is left untouched.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; what survives is whitespace plus word
        characters that are neither digits nor ASCII letters (e.g. Greek
        letters).
    """
    decomposed = unicodedata.normalize('NFKD', text.lower())
    cleaned = ''.join(
        symbol for symbol in decomposed if not unicodedata.combining(symbol)
    )
    # Punctuation first, then digits, then ASCII (English) letters.
    for pattern in (r'[^\w\s]', r'\d+', r'[a-zA-Z]'):
        cleaned = re.sub(pattern, '', cleaned)
    # '_' counts as a word character for the regexes above, so drop it here.
    return cleaned.replace('_', '')

def find_words_in_text(text, target_words):
    """Label a text by whether it contains any of the target phrases.

    Matching is a substring test performed after collapsing every run of
    whitespace (spaces, tabs, line breaks) to a single space in both the
    text and each target, so a phrase broken across lines or separated by
    doubled spaces is still found. Targets are lower-cased before the
    comparison; ``text`` is compared as given, so pass lower-cased text.

    Args:
        text: The text to search.
        target_words: Iterable of words or phrases to look for.

    Returns:
        0 if any target occurs in the text (acceptance), otherwise 1
        (rejection).
    """
    haystack = ' '.join(text.split())
    for word in target_words:
        needle = ' '.join(word.lower().split())
        # A blank target is a substring of every text and would label
        # everything 0, so it is ignored.
        if needle and needle in haystack:
            return 0
    return 1

def _read_decision_text(file_path):
    """Return the raw text of an HTML or PDF file, or None for other types."""
    lowered_path = file_path.lower()  # accept .HTML / .PDF as well
    if lowered_path.endswith('.html'):
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        return BeautifulSoup(html_content, 'html.parser').get_text()
    if lowered_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    return None


def label_files_with_specific_words(folder_path, target_words, output_folder):
    """Preprocess every HTML/PDF file in a folder and label each one.

    For each supported file the text is extracted, cleaned with
    ``preprocess_text`` and written to ``output_folder`` under the same
    filename (the content is plain text even though the name keeps its
    ``.html``/``.pdf`` suffix). The cleaned text is then labelled with
    ``find_words_in_text``. The labels are also saved as
    ``annotated_efeteio.csv`` inside ``output_folder``.

    Args:
        folder_path: Folder containing the input files.
        target_words: Phrases passed to ``find_words_in_text``.
        output_folder: Folder for the cleaned texts and the CSV; created
            if it does not exist.

    Returns:
        A DataFrame with columns ``Decision`` (filename) and ``Label``.
    """
    os.makedirs(output_folder, exist_ok=True)

    labeled_files = {}

    # os.listdir returns names in arbitrary order; sort so the CSV rows
    # come out in the same order on every run.
    for filename in sorted(os.listdir(folder_path)):
        text = _read_decision_text(os.path.join(folder_path, filename))
        if text is None:
            continue  # Skip files that are neither HTML nor PDF

        preprocessed_text = preprocess_text(text)
        # Save preprocessed text to a new file in the output folder
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(preprocessed_text)

        labeled_files[filename] = find_words_in_text(preprocessed_text, target_words)

    df = pd.DataFrame(list(labeled_files.items()), columns=['Decision', 'Label'])

    # Save DataFrame to CSV file
    csv_file_path = os.path.join(output_folder, 'annotated_efeteio.csv')
    df.to_csv(csv_file_path, index=False, mode='w')

    return df

# Folder holding the decisions to label, and where the cleaned copies go.
input_folder = './efeteiopeiraia'
output_folder = './preprocessed_efeteio'
# Phrases to annotate. They are written lower-case and without accents or
# punctuation so they can match text that has been through preprocess_text.
target_words = [
    'δεχεται τυπικα και κατ ουσιαν',
    'δεχεται τυπικα και ουσιαστικα',
    'δεχεται τυπικα και κατουσιαν',
]
labeled_dataset = label_files_with_specific_words(
    input_folder, target_words, output_folder
)
