In [1]:
import os
import copy
import glob
import pandas as pd

from IPython.display import display

In [2]:
# Root of the project's data tree. The default is the location this notebook
# was originally written against; set the EMOREACT_DATA_PATH environment
# variable to run it on another machine without editing this cell.
data_path = os.environ.get('EMOREACT_DATA_PATH', 'D:/Projects/masters_diploma/data')

raw_data_path = os.path.join(data_path, 'raw')
processed_data_path = os.path.join(data_path, 'processed')

# EmoReact-specific sub-trees under the raw and processed roots.
emo_react_raw_path = os.path.join(raw_data_path, 'EmoReact')
emo_react_processed_path = os.path.join(processed_data_path, 'EmoReact')

# Sub-directories of the raw EmoReact tree.
labels_path = os.path.join(emo_react_raw_path, 'labels')
videos_path = os.path.join(emo_react_raw_path, 'videos')
visual_features_path = os.path.join(emo_react_raw_path, 'visual_features')

# Output directory for the generated dataset CSV files.
datasets_path = os.path.join(emo_react_processed_path, 'datasets')

# os.makedirs creates every missing intermediate directory, so this single
# call also creates processed_data_path and emo_react_processed_path.
os.makedirs(datasets_path, exist_ok=True)

In [3]:
def clean_string(input_string: str) -> str:
    """Strip single quotes from a string, un-escaping doubled ones.

    Every pair of consecutive single quotes (``''``) is collapsed into one
    literal ``'``; every remaining lone single quote is removed.

    Args:
        input_string: Text to clean, e.g. ``"'it''s.mp4'"``.

    Returns:
        The cleaned text, e.g. ``"it's.mp4"``.
    """
    # Splitting on the doubled quote and re-joining with a single quote
    # avoids a sentinel string, so text that happens to contain (or to form,
    # once quotes are stripped) the sentinel is never corrupted.
    segments = input_string.split("''")
    return "'".join(segment.replace("'", "") for segment in segments)

def combine_path_names(_name: str, _path: str) -> str:
    """Join ``_name`` onto the directory ``_path`` and return the absolute path.

    The argument order is name first, directory second, so the function can
    be applied to a column of names with the directory passed as a keyword.
    """
    joined = os.path.join(_path, _name)
    return os.path.abspath(joined)

In [4]:
# Column names assigned to the label files, which are read without a header
# row; the order here is the order the columns are taken in.
headers = ['Curiosity', 'Uncertainty', 'Excitement', 'Happiness',
           'Surprise', 'Disgust', 'Fear', 'Frustration', 'Valence']

# Root directory that file names are resolved against, per kind of data.
data_type_roots = {
    'videos': videos_path,
    'visual_features': visual_features_path,
}
types_of_data_to_process = list(data_type_roots)

# One '<split>_names.txt' file per split; each yields one CSV per data type
# at '<datasets_path>/<data_type>/<split>.csv'.
for file_path in glob.glob(os.path.join(emo_react_raw_path, '*_names.txt')):
    # '.../train_names.txt' -> 'train'. rsplit removes only the trailing
    # '_names.txt', so a split name containing '_' is kept whole.
    split_name = os.path.basename(file_path).rsplit("_", 1)[0]

    df = pd.read_csv(file_path, header=None, names=['filename'])
    df['filename'] = df['filename'].apply(clean_string)

    labels_df = pd.read_csv(os.path.join(labels_path, split_name + '_labels.txt'),
                            header=None, names=headers)

    # pd.concat(axis=1) aligns on the index; with unequal row counts it would
    # silently pad the shorter frame with NaN, so fail loudly instead.
    if len(df) != len(labels_df):
        raise ValueError(
            f"Row count mismatch for split '{split_name}': "
            f"{len(df)} file names vs {len(labels_df)} label rows"
        )

    concat_df = pd.concat([df, labels_df], axis=1)
    concat_df = concat_df.rename(columns={'filename': 'file_path'})

    for data_type in types_of_data_to_process:
        path_to_save = os.path.join(datasets_path, data_type)
        os.makedirs(path_to_save, exist_ok=True)

        # Work on a copy so each data type starts from the bare file names.
        df_type = concat_df.copy()
        df_type["file_path"] = df_type["file_path"].apply(
            combine_path_names,
            _path=os.path.join(data_type_roots[data_type], split_name),
        )

        df_type.to_csv(os.path.join(path_to_save, split_name + ".csv"),
                       index=False)
