# In [4]:  (Jupyter notebook cell marker)
import os
import pandas as pd
import re
import unicodedata
from bs4 import BeautifulSoup

# Preprocessing functions (same as in your code)
def preprocess_text(text):
    """Return ``text`` lowercased and stripped of everything but bare letters and whitespace.

    The steps run in this order:

    1. lowercase the input;
    2. NFKD-decompose it and drop combining marks (removes accents);
    3. delete punctuation (anything that is neither ``\\w`` nor whitespace);
    4. delete runs of digits;
    5. delete ASCII letters;
    6. delete underscores (they count as ``\\w`` and so survive step 3).

    Removed characters are not replaced by a space, so neighbouring
    tokens may fuse or leave doubled spaces behind.
    """
    decomposed = unicodedata.normalize('NFKD', text.lower())
    cleaned = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Order matters only for readability here; each pattern deletes its matches.
    for pattern in (r'[^\w\s]', r'\d+', r'[a-zA-Z]'):
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.replace('_', '')

# Labeling function
def find_words_in_text(text, target_words):
    """Return 0 when any target phrase occurs in ``text``, otherwise 1.

    Each entry of ``target_words`` is lowercased before the substring
    test; ``text`` itself is used as given, so callers are expected to
    pass already-lowercased text.

    Label meaning: 0 = αποδοχή (acceptance), 1 = απόρριψη (rejection).
    """
    matched = any(phrase.lower() in text for phrase in target_words)
    return 0 if matched else 1

# Main function to process CSV files
def _read_csv_text(file_path):
    """Return every cell of the CSV at ``file_path`` joined into one string.

    The file is read without a header row and with all values kept as
    strings, so no line of text is consumed as column names and no cell
    is coerced to a number or NaN. An empty file yields ``''``.
    """
    try:
        df = pd.read_csv(file_path, header=None, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no text at all.
        return ''

    # Flatten row by row; skip any padding NaN left for short rows.
    cells = df.to_numpy().ravel()
    return ' '.join(str(cell) for cell in cells if pd.notna(cell))


def label_csv_files_with_specific_words(folder_path, target_words, output_folder):
    """Label every ``.csv`` file in ``folder_path`` and save the labels.

    For each CSV file, all of its cells are concatenated into one text,
    passed through ``preprocess_text`` and then ``find_words_in_text``.
    The resulting label is stored under the file's name.

    NOTE(review): the CSV schema is not visible here, so the whole file
    content is treated as the decision text, and the file name is used
    as the decision identifier. Confirm both against the data.

    Args:
        folder_path: Directory containing the input ``.csv`` files.
        target_words: Phrases forwarded to ``find_words_in_text``.
        output_folder: Directory that receives
            ``areiospageos_annotated.csv``; created if missing.

    Returns:
        A DataFrame with columns ``Decision`` (file name) and ``Label``
        (the value returned by ``find_words_in_text``), one row per CSV
        file, ordered by file name.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    labeled_files = {}

    # Sorted so the output row order is reproducible across runs/platforms.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Only regular files with a .csv suffix are decisions to label.
        if not filename.endswith('.csv') or not os.path.isfile(file_path):
            continue

        text = _read_csv_text(file_path)
        preprocessed_text = preprocess_text(text)

        labeled_files[filename] = find_words_in_text(preprocessed_text, target_words)

    # Materialise the items so an empty result still yields both columns.
    df_labels = pd.DataFrame(list(labeled_files.items()), columns=['Decision', 'Label'])

    csv_file_path = os.path.join(output_folder, 'areiospageos_annotated.csv')
    df_labels.to_csv(csv_file_path, index=False)

    return df_labels

# Input/output locations for the labelling run.
# NOTE(review): the input folder name ends in '.cvs' — confirm it matches the directory on disk.
input_folder = './areiospagos_apofaseis.cvs'
output_folder = './annotated_pagos'

# Phrases (already lowercased and unaccented) that are searched for in each decision.
target_words = [
    'δεχεται τυπικα και κατ ουσιαν',
    'δεχεται κατα ενα μερος',
    'αναιρει την',
    'αναιρει τις',
    'δεχεται τυπικα και ουσιαστικα',
    'δεχεται τυπικα και κατουσιαν',
]

# Label every CSV in the input folder and keep the resulting table.
labeled_dataset = label_csv_files_with_specific_words(
    folder_path=input_folder,
    target_words=target_words,
    output_folder=output_folder,
)
