In [7]:
import os
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata

def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove accents
    text = ''.join(char for char in unicodedata.normalize('NFKD', text) if not unicodedata.combining(char))
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    return text

def find_words_in_text(text, target_words):
    for word in target_words:
        if word.lower() in text:
            return 1  # If any target word is found, label as 1
    return 0  # If none of the target words are found, label as 0

def label_files_with_specific_words(folder_path, target_words, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    labeled_files = {}

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            text = BeautifulSoup(html_content, 'html.parser').get_text()
        elif filename.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip files that are neither HTML nor PDF
        
        preprocessed_text = preprocess_text(text)
        # Save preprocessed text to a new file in the output folder
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(preprocessed_text)
        
        label = find_words_in_text(preprocessed_text, target_words)
        labeled_files[filename] = label

    df = pd.DataFrame(labeled_files.items(), columns=['Decision', 'Label'])
    
    # Save DataFrame to CSV file
    csv_file_path = os.path.join(output_folder, 'annotated_dataset.csv')
    df.to_csv(csv_file_path, index=False, mode='w')
    
    return df

# My folder of decisions:
input_folder = './all_together'
output_folder = './preprocessed'
target_words = ['απορριπτει', 'επιβαλλει', 'καταδικαζει', 'υποχρεωνει']  # Λέξεις προς επισημείωση
labeled_dataset = label_files_with_specific_words(input_folder, target_words, output_folder)

print(labeled_dataset)


                                 Decision  Label
0   ECLI_EL_COS_2010_0104N14.04E7108.html      1
1                          2498_2023.html      1
2                               95658.pdf      0
3                          2585_2023.html      0
4                                570.html      1
..                                    ...    ...
69                              93144.pdf      0
70                               590.html      1
71                               601.html      1
72                               586.html      1
73                              5460.html      1

[74 rows x 2 columns]
