In [1]:
import pandas as pd
import numpy as np
import os
import sys
import re

# Add ../lib to the import path so the project's helper modules can be imported
sys.path.append(os.path.join("..", "lib"))
import manual_labeler_functions as man_lab_fun, general_functions as gf, files_paths as fp

In [2]:
# Paths of every file matching fp.VD_LABELED_L0 under the local and YT dataset roots.
MAIN_LIST_LABELER = gf.find_files_in_all_subdirectories(
    [fp.DATASET_LOCAL, fp.DATASET_YT],
    fp.VD_LABELED_L0,
)

# Number of '*.CSV' files currently found under ./result_sequences.
num_file_sequences = len(
    gf.find_files_in_all_subdirectories(
        [os.path.join('.', 'result_sequences')],
        '*.CSV',
    )
)

In [3]:
# Display the '*.CSV' files currently found under ./result_sequences.
gf.find_files_in_all_subdirectories(
    [os.path.join('.', 'result_sequences')],
    '*.CSV',
)

['.\\result_sequences\\DATASET_SEQUENCES_2.CSV',
 '.\\result_sequences\\DATASET_SEQUENCES_3.CSV',
 '.\\result_sequences\\DATASET_SEQUENCES_6.CSV']

In [4]:
def extract_decimal_value(text):
    """Return the first decimal number found in ``text`` as a float.

    A decimal number must contain a decimal point followed by at least one
    digit (``0.25``, ``-.5``, ``+3.0``); plain integers such as ``7`` are not
    matched. An exponent that directly follows the fractional digits
    (``1.5e-05``, ``2.5E+3``) is treated as part of the number, so values that
    Python prints in scientific notation are not truncated to their mantissa.

    Args:
        text: String to search.

    Returns:
        The first matched number as a ``float``, or ``None`` when ``text``
        contains no decimal number.
    """
    # The optional trailing group consumes an exponent only when it is
    # complete ("e" followed by digits), so text such as "0.5em" still
    # yields 0.5.
    match = re.search(r'[-+]?\d*\.\d+(?:[eE][-+]?\d+)?', text)
    if match is None:
        return None
    return float(match.group())

In [5]:
def check_decimal_values(group):
    """Report whether every row of ``group`` agrees with the first row's decimal value.

    The reference value is the first decimal number extracted from the first
    entry of ``group['label_measures']``. A later row counts as a mismatch only
    when its own first decimal number differs from the reference AND the
    reference's string form does not occur anywhere in that row's text.

    Args:
        group: Object indexable by ``'label_measures'`` that yields a
            positionally indexable (``.iloc``) sequence of strings, e.g. a
            DataFrame or a groupby group.

    Returns:
        ``True`` when no row mismatches (including an empty group), ``False``
        otherwise.
    """
    label_measures = group['label_measures']

    # An empty group has nothing that can disagree; also avoids iloc[0] failing.
    if len(label_measures) == 0:
        return True

    first_value = extract_decimal_value(label_measures.iloc[0])
    first_value_text = str(first_value)

    for i in range(1, len(label_measures)):
        row_text = label_measures.iloc[i]
        if extract_decimal_value(row_text) != first_value and first_value_text not in str(row_text):
            # One mismatch settles the answer; no need to scan further rows.
            return False

    return True

In [6]:
# Build the sequence dataset.
# For each labeled CSV, the rows whose 'label_measures' text mentions a label
# are split into runs of consecutive original row positions. A run is kept
# when the last row of the run selects that label (see _first_better_key);
# kept runs receive a running 'sample_id', the 'label', and a 'seed_name'
# taken from the run's first row. All kept runs are written to one CSV.


def _first_better_key(label_dict):
    """Return the key treated as the winning label of one 'label_measures' dict.

    The first key is the starting candidate. The first later key whose value
    at position 1 is strictly smaller than the candidate's replaces it, and
    the search stops there.
    """
    keys = list(label_dict)
    candidate = keys[0]
    for key in keys[1:]:
        if label_dict[key][1] < label_dict[candidate][1]:
            # NOTE(review): this stops at the first improvement rather than
            # scanning for the overall minimum; the two only differ when a
            # dict has three or more keys. Confirm which one is intended.
            return key
    return candidate


# Choose an output name that cannot overwrite an existing export. Numbering by
# file count alone collides as soon as the existing numbers have gaps (for
# example files _2, _3, _6 plus two new exports would make the next name _6).
os.makedirs('result_sequences', exist_ok=True)
sequence_number = num_file_sequences + 1
OUTPUT_NAME = os.path.join('result_sequences', 'DATASET_SEQUENCES_' + str(sequence_number) + '.CSV')
while os.path.exists(OUTPUT_NAME):
    sequence_number += 1
    OUTPUT_NAME = os.path.join('result_sequences', 'DATASET_SEQUENCES_' + str(sequence_number) + '.CSV')

# Kept runs are collected here and concatenated once at the end, instead of
# growing DataFrames with pd.concat inside the loops.
sequence_frames = []
idx_sample = 0
for current_path_location in MAIN_LIST_LABELER:
    VD_LABELED_DT = pd.read_csv(current_path_location).drop(columns=['Unnamed: 0'])
    labels = man_lab_fun.GET_ALL_CLASSES(VD_LABELED_DT)
    print(f"Labels: {labels}, Current file path: {current_path_location}")

    # NOTE(review): only the first label returned by GET_ALL_CLASSES is
    # processed for each file; confirm the [:1] restriction is intended.
    for label in labels[:1]:
        # Escape the label so regex metacharacters in it are matched literally,
        # and treat missing 'label_measures' cells as non-matching.
        pattern = fr"'{re.escape(str(label))}'"
        matches_label = VD_LABELED_DT['label_measures'].str.contains(pattern, regex=True, na=False)
        filtered_df = VD_LABELED_DT[matches_label].reset_index()
        if filtered_df.empty:
            # Nothing to split into runs for this label.
            continue

        # A jump larger than 1 in the original row position starts a new run.
        filtered_df['index_diff'] = filtered_df['index'].diff()
        break_points = [0] + filtered_df.index[filtered_df['index_diff'] > 1].tolist() + [len(filtered_df)]

        filtered_df = filtered_df.drop(columns=['index', 'time_seconds'])
        # Placeholder column so 'sample_id' sits at position 1 in the output.
        filtered_df.insert(loc=1, column='sample_id', value=np.nan)

        for start, stop in zip(break_points[:-1], break_points[1:]):
            group = filtered_df.iloc[start:stop].copy()

            # SECURITY NOTE: eval() executes whatever text is stored in the
            # CSV. If 'label_measures' only ever holds plain Python literals,
            # ast.literal_eval would be the safer parser - TODO confirm.
            # Only the run's last row decides whether the run is kept; this is
            # the value the selection effectively depended on.
            # NOTE(review): confirm that judging by the last row is intended.
            best_key = _first_better_key(eval(group['label_measures'].iloc[-1]))
            if best_key != label:
                continue

            first_row_measures = eval(group['label_measures'].iloc[0])
            group['sample_id'] = idx_sample
            group['label'] = label
            group['seed_name'] = int(first_row_measures[label][2])
            sequence_frames.append(group.drop(columns=['index_diff']))
            idx_sample += 1

# pd.concat raises on an empty list, so fall back to an empty frame.
DATASET_SEQUENCES = (
    pd.concat(sequence_frames, ignore_index=True) if sequence_frames else pd.DataFrame()
)

print(f'\nSaving file {OUTPUT_NAME}...')
DATASET_SEQUENCES.to_csv(OUTPUT_NAME)

Labels: ['happy', 'neutral'], Current file path: ..\Dataset\DD-Local\VD_D_0000000001\VD_LABELED_L0.CSV
Labels: ['neutral', 'happy'], Current file path: ..\Dataset\DD-Local\VD_D_0000000002\VD_LABELED_L0.CSV
Labels: ['neutral', 'happy'], Current file path: ..\Dataset\DD-Local\VD_D_0000000003\VD_LABELED_L0.CSV
Labels: ['neutral', 'happy'], Current file path: ..\Dataset\DD-Local\VD_D_0000000004\VD_LABELED_L0.CSV
Labels: ['neutral'], Current file path: ..\Dataset\DD-Local\VD_D_0000000005\VD_LABELED_L0.CSV
Labels: ['neutral', 'happy'], Current file path: ..\Dataset\DD-Local\VD_D_0000000006\VD_LABELED_L0.CSV
Labels: ['happy'], Current file path: ..\Dataset\DD-Local\VD_D_0000000007\VD_LABELED_L0.CSV
Labels: ['happy', 'neutral'], Current file path: ..\Dataset\DD-Local\VD_D_0000000008\VD_LABELED_L0.CSV
Labels: ['happy', 'neutral'], Current file path: ..\Dataset\DD-Local\VD_D_0000000009\VD_LABELED_L0.CSV
Labels: ['neutral'], Current file path: ..\Dataset\DD-Local\VD_D_0000000010\VD_LABELED_L0.CSV