In [2]:
import os
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd
import re

# # Preprocessing
# def preprocess_text(ll: list) -> list:
#     ll = [word for word in ll if word != '']
#     ll = [word.lower() for word in ll]
#     ll = [word.replace('\xa0', ' ') for word in ll]
#     ll = [re.sub(r'[^\w\s]', '', word) for word in ll]
#     ll = [re.sub(r'\d+', '', word) for word in ll]
#     ll = [word for word in ll if word != '']
#     ll = [w for w in ll if not w.isascii()]  # Remove english words (I hope)
#     ll = [ud.normalize('NFD',l).translate(d) for l in ll] # remove accents , maybe use in annotate script 
#     return ll

def extract_text_from_pdf(file_path): #συνάρτηση για να χειριστώ pdf αρχεία
    with pdfplumber.open(file_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

def find_words_in_text(text, target_words):
    for word in target_words:
        if word.lower() in text.lower():
            return 1  # If any target word is found, label as 1
    return 0  # If none of the target words are found, label as 0

def label_files_with_specific_words(folder_path, target_words):
    labeled_files = {}

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            text = BeautifulSoup(html_content, 'html.parser').get_text()
        elif filename.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip files that are neither HTML nor PDF
        
        label = find_words_in_text(text, target_words)
        labeled_files[filename] = label

    df = pd.DataFrame(labeled_files.items(), columns=['Decision', 'Label'])
    return df

# My folder of decisions :
folder_path = './all_together'
target_words = ['απορριπτει', 'επιβαλλει', 'καταδικαζει', 'υποχρεωνει']  # Λέξεις προς επισημείωση
labeled_dataset = label_files_with_specific_words(folder_path, target_words)

print(labeled_dataset)


                                  Decision  Label
0    ECLI_EL_COS_2010_0104N14.04E7108.html      1
1                                95658.pdf      0
2                                 570.html      1
3                                2797.html      1
4                                55127.pdf      0
5                                2715.html      1
6                                 789.html      0
7                                 588.html      0
8                                5696.html      0
9     ECLI_EL_COS_2010_0104N8.09ED144.html      1
10                               45864.pdf      1
11                               2237.html      1
12                                584.html      0
13                               82269.pdf      0
14                                592.html      1
15                                576.html      0
16                               1056.html      1
17                                599.html      1
18  ECLI_EL_COS_2010_0104N18.06ED1825.html      1


In [2]:
import os
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata

def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove accents
    text = ''.join(char for char in unicodedata.normalize('NFKD', text) if not unicodedata.combining(char))
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    return text

def find_words_in_text(text, target_words):
    for word in target_words:
        if word.lower() in text:
            return 1  # If any target word is found, label as 1
    return 0  # If none of the target words are found, label as 0

def label_files_with_specific_words(folder_path, target_words, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    labeled_files = {}

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            text = BeautifulSoup(html_content, 'html.parser').get_text()
        elif filename.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip files that are neither HTML nor PDF
        
        preprocessed_text = preprocess_text(text)
        # Save preprocessed text to a new file in the output folder
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(preprocessed_text)
        
        label = find_words_in_text(preprocessed_text, target_words)
        labeled_files[filename] = label

    df = pd.DataFrame(labeled_files.items(), columns=['Decision', 'Label'])
    return df

# My folder of decisions:
input_folder = './all_together'
output_folder = './preprocessed'
target_words = ['απορριπτει', 'επιβαλλει', 'καταδικαζει', 'υποχρεωνει']  # Λέξεις προς επισημείωση
labeled_dataset = label_files_with_specific_words(input_folder, target_words, output_folder)

print(labeled_dataset)


                                  Decision  Label
0    ECLI_EL_COS_2010_0104N14.04E7108.html      1
1                                95658.pdf      0
2                                 570.html      1
3                                2797.html      1
4                                55127.pdf      0
5                                2715.html      1
6                                 789.html      1
7                                 588.html      1
8                                5696.html      1
9     ECLI_EL_COS_2010_0104N8.09ED144.html      1
10                               45864.pdf      1
11                               2237.html      1
12                                584.html      1
13                               82269.pdf      0
14                                592.html      1
15                                576.html      1
16                               1056.html      1
17                                599.html      1
18  ECLI_EL_COS_2010_0104N18.06ED1825.html      1
