In [1]:
import os
import pandas as pd
import re
import unicodedata

# Preprocessing function
def preprocess_text(text):
    """Normalise a piece of text so that only bare non-Latin letters remain.

    The steps, in order: lowercase, strip accents/diacritics, drop
    punctuation, drop digits, drop ASCII letters (a-z), drop underscores.
    Whitespace is left untouched, so removed tokens can leave consecutive
    spaces behind.

    Parameters
    ----------
    text : str or any
        The raw text. Non-string values (e.g. ``None`` or the float NaN
        pandas produces for an empty CSV cell) are treated as empty text.

    Returns
    -------
    str
        The normalised text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them
    # would raise AttributeError and abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()
    # Remove accents: NFKD splits each accented letter into its base letter
    # plus combining marks, and the combining marks are then discarded.
    text = ''.join(
        char for char in unicodedata.normalize('NFKD', text)
        if not unicodedata.combining(char)
    )
    # Remove punctuation (anything that is neither a word char nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove English characters
    text = re.sub(r'[a-zA-Z]', '', text)
    # Underscore counts as a word character, so the punctuation pattern
    # above keeps it; remove it explicitly.
    text = text.replace('_', '')
    return text

# Labeling function
def find_words_in_text(text, target_words):
    """Return the label for ``text`` based on the presence of target phrases.

    Each entry of ``target_words`` is lowercased and searched for as a
    substring of ``text`` (``text`` itself is used as given).

    Returns
    -------
    int
        0 (acceptance) if at least one target phrase occurs in ``text``,
        1 (rejection) otherwise, including when ``target_words`` is empty.
    """
    has_match = any(phrase.lower() in text for phrase in target_words)
    return 0 if has_match else 1

# Main function to process CSV files
def label_csv_files_with_specific_words(csv_file_path, target_words, output_folder,
                                        output_filename='areiospagos_annotated.csv'):
    """Label every row of a CSV by phrase matching and save the labels.

    The ``text`` column of the input CSV is normalised with
    ``preprocess_text`` and each row is labelled with
    ``find_words_in_text``. The result is written to
    ``output_folder/output_filename`` (overwriting any existing file) and
    also returned.

    Parameters
    ----------
    csv_file_path : str
        Path of the input CSV; it must contain a ``text`` column.
    target_words : iterable of str
        Phrases passed to ``find_words_in_text`` for every row.
    output_folder : str
        Directory for the output file; created if missing.
    output_filename : str, optional
        Name of the output CSV inside ``output_folder``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Decision`` (``"Decision_<row index>"``) and ``Label``.

    Raises
    ------
    KeyError
        If the input CSV has no ``text`` column.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    os.makedirs(output_folder, exist_ok=True)

    # Load the CSV file
    df = pd.read_csv(csv_file_path)
    if 'text' not in df.columns:
        raise KeyError(f"'text' column not found in {csv_file_path}")

    # Normalise the text once, then label each row from the normalised form.
    preprocessed_text = df['text'].apply(preprocess_text)

    # Build both columns directly instead of looping with iterrows() and
    # collecting into an intermediate dict.
    df_labels = pd.DataFrame({
        'Decision': [f"Decision_{index}" for index in df.index],
        'Label': [find_words_in_text(text, target_words) for text in preprocessed_text],
    })

    # Use a dedicated name for the output path rather than rebinding the
    # input-path parameter.
    output_path = os.path.join(output_folder, output_filename)
    df_labels.to_csv(output_path, index=False)

    return df_labels

# Input CSV and the folder that receives the annotated output
input_file = './areiospagos_apofaseis.csv'
output_folder = './annotated_pagos'

# Phrases (already lowercased and accent-free) whose presence in a
# preprocessed decision text produces label 0
target_words = [
    'δεχεται τυπικα και κατ ουσιαν',
    'δεχεται κατα ενα μερος',
    'αναιρει την',
    'αναιρει τις',
    'δεχεται τυπικα και ουσιαστικα',
    'δεχεται τυπικα και κατουσιαν',
]

# Label every decision in the input CSV and keep the resulting frame
labeled_dataset = label_csv_files_with_specific_words(
    csv_file_path=input_file,
    target_words=target_words,
    output_folder=output_folder,
)


In [2]:
# Reload the annotation CSV from disk; this is the path the labelling step
# writes to (output folder './annotated_pagos', file 'areiospagos_annotated.csv').
df_ann = pd.read_csv('./annotated_pagos/areiospagos_annotated.csv')

In [3]:
# Display the annotated frame: one row per decision with its 0/1 label.
df_ann

Unnamed: 0,Decision,Label
0,Decision_0,1
1,Decision_1,0
2,Decision_2,1
3,Decision_3,1
4,Decision_4,1
...,...,...
2995,Decision_2995,1
2996,Decision_2996,1
2997,Decision_2997,1
2998,Decision_2998,1


In [4]:
# Count rows per label (0 = acceptance, 1 = rejection) to inspect class balance.
df_ann['Label'].value_counts()

Label
1    2450
0     550
Name: count, dtype: int64

Creating a new, approximately balanced dataset for Areios Pagos: 500 randomly sampled decisions with label 0 (acceptance) and 550 with label 1 (rejection).


In [6]:
# Fixed seed so the sampled rows and their order are identical on every
# Restart & Run All; without it each run produces a different dataset.
SAMPLE_SEED = 42
# Number of rows drawn from each class (near-balanced, not exactly equal).
N_LABEL_0 = 500
N_LABEL_1 = 550

# Split the annotations by label
df_0 = df_ann[df_ann['Label'] == 0]
df_1 = df_ann[df_ann['Label'] == 1]

# Draw the per-class samples; sample() raises ValueError if a class holds
# fewer rows than requested.
df_0_sample = df_0.sample(n=N_LABEL_0, random_state=SAMPLE_SEED)
df_1_sample = df_1.sample(n=N_LABEL_1, random_state=SAMPLE_SEED)

# Combine both classes, shuffle the rows and renumber the index from 0
df_resampled_ap = (
    pd.concat([df_0_sample, df_1_sample])
    .sample(frac=1, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# Check the distribution of labels
df_resampled_ap['Label'].value_counts()


Label
1    550
0    500
Name: count, dtype: int64

In [7]:
# Display the shuffled, resampled frame.
df_resampled_ap

Unnamed: 0,Decision,Label
0,Decision_506,0
1,Decision_994,0
2,Decision_2657,1
3,Decision_1756,1
4,Decision_1580,1
...,...,...
1045,Decision_1603,1
1046,Decision_2811,1
1047,Decision_707,1
1048,Decision_2852,1
