In [1]:
import os
import pandas as pd
import collections
import json
import numpy as np
import csv
import re

In [2]:
def column_extraction(filepath, seed=None):
    """Load one annotation CSV and return its usable rows plus a shuffled index order.

    Parameters
    ----------
    filepath : str
        Path to a CSV file that has 'Segment ID' and 'Total Evaluation' columns.
    seed : int, optional
        When given, the index order is shuffled with a dedicated
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When omitted, the global NumPy random state is used.

    Returns
    -------
    eval_df : pandas.DataFrame
        The two columns above, re-indexed from 0, without:
        the row labelled 0, rows with a missing value in either column,
        rows whose 'Segment ID' contains the character taken from the file
        name, and rows whose 'Total Evaluation' contains ';'.
    indices : numpy.ndarray
        A random permutation of ``0 .. len(eval_df) - 1``.
    """
    df = pd.read_csv(filepath)

    # Single character at a fixed offset from the end of the path (9th from last).
    # NOTE(review): presumably the speaker letter in names like "..._F_res.csv";
    # confirm against the annotation file naming before relying on it.
    segment_id = filepath[-9:-8]

    # The row labelled 0 is discarded -- presumably a secondary header row (TODO confirm).
    eval_df = df[['Segment ID', 'Total Evaluation']].drop([0], axis=0)

    # Missing values would turn the string masks below into NaN and break `~`,
    # and such rows cannot be labelled anyway, so remove them up front.
    eval_df = eval_df.dropna(subset=['Segment ID', 'Total Evaluation'])

    # Literal (non-regex) matching: the needle is data, not a pattern.
    own_segment = eval_df['Segment ID'].astype(str).str.contains(segment_id, regex=False)
    eval_df = eval_df[~own_segment]

    # A ';' in the evaluation presumably marks several labels at once; skip those rows.
    multi_label = eval_df['Total Evaluation'].astype(str).str.contains(';', regex=False)
    eval_df = eval_df[~multi_label]

    eval_df = eval_df.reset_index(drop=True)

    indices = np.arange(len(eval_df))
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)

    return eval_df, indices

In [3]:
def clean_file_content(file_content):
    """Remove transcript annotation marks and collapse runs of spaces.

    Two kinds of marks are removed:

    * letter tags ``c/ n/ N/ u/ l/ b/ s/ o/`` (presumably noise / non-speech
      annotations -- confirm against the corpus guidelines);
    * the single characters ``* + / ( ) "``.

    Ordinary Latin letters that are not part of a tag are kept, so words such
    as "school" are no longer mangled.

    Parameters
    ----------
    file_content : str
        Raw transcript text.

    Returns
    -------
    str
        The text without the marks above and with every run of spaces
        reduced to a single space. Newlines are left untouched.
    """
    # A tag is one marker letter directly followed by '/'. The look-behind keeps
    # the last letter of a regular Latin word (e.g. "music/") from being eaten.
    file_content = re.sub(r'(?<![A-Za-z])[cnNulbso]/', '', file_content)

    # Remaining stand-alone marks, including any '/' that was not part of a tag.
    file_content = re.sub(r'[*+/()"]', '', file_content)

    file_content = re.sub(' +', ' ', file_content)

    return file_content

In [4]:
def process_emotion(dataframe, dictionary, idx_array, wav_root="./KEMDy19/wav"):
    """Add one entry per selected row to ``dictionary`` (modified in place).

    For every position in ``idx_array`` the segment id (column 0) and its
    evaluation (column 1) are read from ``dataframe``. Rows whose evaluation
    is not one of the seven known emotions, or whose transcript file does not
    exist, are skipped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Column 0 holds the segment id, column 1 the evaluation label.
    dictionary : dict
        Output mapping ``segment_id -> {Emotion, Label, WavPath, Text}``.
    idx_array : iterable of int
        Positional row indices of ``dataframe`` to process.
    wav_root : str, optional
        Directory that contains the ``Session<NN>`` folders. Files are looked
        up at ``<wav_root>/Session<id[4:6]>/<id[:-5]>/<id>.txt`` (and ``.wav``).

    Returns
    -------
    None
    """
    # Position in this list is the integer class label written to the output.
    emotions = ['fear', 'surprise', 'angry', 'sad', 'neutral', 'happy', 'disgust']

    for i in idx_array:
        segment_id = dataframe.iloc[i, 0]
        total_eval = dataframe.iloc[i, 1]

        # Decide on the label first so that no file is opened for discarded rows.
        if total_eval not in emotions:
            continue

        ## Path setting is required.
        base_path = f"{wav_root}/Session{segment_id[4:6]}/{segment_id[:-5]}/{segment_id}"
        text_path = base_path + ".txt"
        wav_path = base_path + ".wav"

        if not os.path.exists(text_path):
            continue

        # NOTE(review): no encoding is passed, so the platform default is used --
        # confirm it matches the encoding of the transcript files.
        with open(text_path, 'r') as file:
            file_content = clean_file_content(file.read())

        dictionary[segment_id] = dict(
            Emotion=total_eval,
            Label=emotions.index(total_eval),
            WavPath=wav_path,
            # Strip only trailing newlines; a blind [:-1] would cut a real
            # character when the file does not end with a newline.
            Text=file_content.rstrip('\n'),
        )

In [5]:
def save_json(dictionary, file_name):
    """Serialize ``dictionary`` to JSON and write it to ``file_name`` as UTF-8.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    The object is serialized before the file is opened, so a serialization
    error leaves any existing file untouched.
    """
    serialized = json.dumps(dictionary, ensure_ascii=False)

    with open(file_name, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(serialized)
        
def json_to_csv(json_file, csv_file):
    """Convert a JSON mapping of records into a CSV table.

    The JSON file must hold an object whose values are objects with the keys
    'Emotion', 'Label', 'WavPath' and 'Text'. Each top-level key becomes the
    'ID' column of one CSV row; rows follow the order of the JSON object.
    Both files are handled as UTF-8.
    """
    header = ['ID', 'Emotion', 'Label', 'WavPath', 'Text']

    with open(json_file, 'r', encoding='utf-8') as src:
        records = json.load(src)

    with open(csv_file, 'w', newline='', encoding='utf-8') as dst:
        table = csv.writer(dst)
        table.writerow(header)
        # One row per record: the key first, then the fields in header order.
        table.writerows(
            [record_id] + [entry[field] for field in header[1:]]
            for record_id, entry in records.items()
        )

In [6]:
def main(annotation_root="./KEMDy19/annotation", train_ratio=0.8, seed=42):
    """Build the train/test JSON and CSV files from every annotation CSV.

    Each ``.csv`` file in ``annotation_root`` is loaded, its rows are split
    into a train and a test part according to the shuffled index order, and
    the resulting entries are collected into two dictionaries. These are
    written to ``./train.json`` / ``./test.json`` and then converted to
    ``./train.csv`` / ``./test.csv``.

    Parameters
    ----------
    annotation_root : str, optional
        Directory holding the annotation CSV files.
    train_ratio : float, optional
        Fraction of each file's rows that goes to the train set.
    seed : int or None, optional
        Seed for NumPy's global random state, set once before any file is
        processed so the split is reproducible. Pass ``None`` to leave the
        global state untouched (non-reproducible split).
    """
    train_final = dict()
    test_final = dict()

    if seed is not None:
        # The shuffle in column_extraction draws from NumPy's global state.
        np.random.seed(seed)

    ## Path setting is required.
    # Keep only CSV files (the folder may also hold checkpoints or OS metadata)
    # and sort them: os.listdir order is arbitrary, which would otherwise make
    # the seeded shuffle sequence differ between machines.
    csv_files = sorted(
        name for name in os.listdir(annotation_root)
        if name.lower().endswith('.csv')
    )

    for csv_file in csv_files:
        csv_path = annotation_root + '/' + csv_file
        dataframe, idx_array = column_extraction(csv_path)

        split_at = int(len(dataframe) * train_ratio)

        process_emotion(dataframe, train_final, idx_array[:split_at])
        process_emotion(dataframe, test_final, idx_array[split_at:])

    save_json(train_final, './train.json')
    save_json(test_final, './test.json')

    json_to_csv('./train.json', './train.csv')
    json_to_csv('./test.json', './test.csv')

# Run the whole pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()