# ARFF to HDF5 Conversion


This notebook reads the ARFF feature files that openSMILE extracted from the Berlin Database of Emotional Speech (EmoDB), converts them to HDF5 format for training, and splits them into training and testing sets.

## Imports and Setup

In [2]:
import arff
import numpy as np
import scipy
import os
import pandas as pd
import chardet
import shutil
import subprocess
import shlex
import random
import h5py

# Path settings; every one of them is left as an empty string here.
# NOTE(review): presumably these are filled in locally before running — confirm.
arff_folder_path = ""
silb_data_path = ""
audio_data_path = ""
save_folder_path = ""

config_path = ""
# Echo the configured path so the value in use is visible in the cell output.
print(config_path)

/Users/henry/Desktop/Programming/Thesis/emotion_speech_classification.nosync/models/Open_Smile_Features/config/compare16/ComParE_2016.conf


## Dataset Creation Function

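- Iterates through the ARFF files in the specified folder.
- Derives each class label from the sixth character of the filename (EmoDB naming convention).
- Randomly splits the files into training (90%) and testing (10%) sets.
- Reads the first data row of each ARFF file, replaces missing or non-numeric values with 0, and reshapes the row to `(1, 90)`.
- Saves each feature row as a dataset in `train_*.h5` or `test_*.h5`, with its label stored in the dataset's `class_label` attribute.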

In [None]:
# Emotion code (character at index 5 of an EmoDB utterance name) -> integer class.
# NOTE(review): presumably W=anger, L=boredom, E=disgust, A=fear, F=happiness,
# T=sadness, N=neutral, per the EmoDB naming scheme — confirm against the corpus docs.
_EMODB_LABELS = {"W": 0, "L": 1, "E": 2, "A": 3, "F": 4, "T": 5, "N": 6}

_ARFF_SUFFIX = ".arff"


def _emodb_class_label(stem):
    """Return the integer class encoded by the character at index 5 of ``stem``.

    Raises:
        ValueError: if ``stem`` is too short or the character is not a known code.
    """
    # Slicing (instead of indexing) yields "" for short names, so both failure
    # modes end up in the same explicit error below.
    code = stem[5:6]
    if code not in _EMODB_LABELS:
        raise ValueError(
            f"Cannot derive a class label from filename {stem!r}: expected one of "
            f"{sorted(_EMODB_LABELS)} at index 5, found {code!r}"
        )
    return _EMODB_LABELS[code]


def _load_feature_row(arff_path, n_features):
    """Read the first data row of an ARFF file as a numeric array of shape (1, n).

    Values that cannot be parsed as numbers (strings, missing entries) become 0.

    Raises:
        ValueError: if the file has no data rows, or if ``n_features`` is not
            None and the row does not contain exactly that many values.
    """
    with open(arff_path, "r") as handle:
        rows = arff.load(handle)["data"]
    if len(rows) == 0:
        raise ValueError(f"ARFF file {arff_path!r} contains no data rows")

    # Only the first row is used; non-numeric entries are coerced to NaN and
    # then zero-filled.
    numeric = pd.to_numeric(pd.Series(rows[0], dtype=object), errors="coerce").fillna(0)
    values = numeric.to_numpy()
    if n_features is not None and values.size != n_features:
        raise ValueError(
            f"ARFF file {arff_path!r} has {values.size} values in its first row, "
            f"expected {n_features}"
        )
    return values.reshape(1, -1)


def _write_split(h5_path, folder_path, file_names, labels, n_features):
    """Write one HDF5 file holding a dataset (plus ``class_label`` attr) per ARFF file."""
    # The context manager guarantees the HDF5 file is closed even if a read fails.
    with h5py.File(h5_path, mode="w", libver="latest") as h5_file:
        for name in file_names:
            features = _load_feature_row(os.path.join(folder_path, name), n_features)
            stem = name[: -len(_ARFF_SUFFIX)]
            h5spec = h5_file.create_dataset(stem, data=features)
            h5spec.attrs["class_label"] = labels[name]


def create_dataset_with_arff(dataset_name, folder_path, train_fraction=0.9,
                             n_features=90, seed=None):
    """Convert a folder of ARFF files into a train and a test HDF5 file.

    The ``.arff`` files in ``folder_path`` are split at random; the first data
    row of each file is stored as a ``(1, n_features)`` dataset named after the
    file (without extension), with its class stored in the ``class_label``
    attribute. Output goes to ``"train_" + dataset_name`` and
    ``"test_" + dataset_name``, resolved relative to the current working
    directory; existing files with those names are overwritten.

    Args:
        dataset_name: Base name of the output files, e.g. ``"EmoDB_eGeMaps.h5"``.
        folder_path: Directory containing the ``.arff`` files.
        train_fraction: Share of the ARFF files assigned to the training set
            (between 0 and 1).
        n_features: Expected number of values in each file's first data row.
            Pass ``None`` to accept whatever length the file provides.
        seed: Optional seed for a reproducible split. With ``None`` the global
            ``random`` state is used.

    Raises:
        ValueError: if ``train_fraction`` is outside [0, 1], a filename does not
            encode a known class, or a file's first row has an unexpected size.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # Keep only real .arff files so stray entries (other files, subfolders) do
    # not distort the split ratio; sort so a seeded split is reproducible
    # regardless of the order os.listdir happens to return.
    arff_files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith(_ARFF_SUFFIX) and os.path.isfile(os.path.join(folder_path, name))
    )

    # Resolve every label up front so a badly named file fails before any
    # output file is created or truncated.
    labels = {name: _emodb_class_label(name[: -len(_ARFF_SUFFIX)]) for name in arff_files}

    num_train = int(len(arff_files) * train_fraction)
    sampler = random if seed is None else random.Random(seed)
    training_names = set(sampler.sample(arff_files, num_train))
    training_list = [name for name in arff_files if name in training_names]
    testing_list = [name for name in arff_files if name not in training_names]

    _write_split("train_" + dataset_name, folder_path, training_list, labels, n_features)
    _write_split("test_" + dataset_name, folder_path, testing_list, labels, n_features)
    print("Train and test dataset created")

## Execution

Sets up the file paths and calls the create_dataset_with_arff function to generate the datasets. It also handles the removal of existing HDF5 files to prevent conflicts.

In [None]:
# Input folder with the ARFF files and the base name of the HDF5 outputs.
folder_path = "../data/EmoDB/arff_files/"

dataset = "EmoDB_eGeMaps.h5"
trainset = "train_" + dataset
testset = "test_" + dataset

# Absolute locations of the two output files in the current working directory.
file_path1 = os.path.join(os.getcwd(), trainset)
file_path2 = os.path.join(os.getcwd(), testset)

# Delete output left over from a previous run before regenerating it.
for stale_path in (file_path1, file_path2):
    if os.path.isfile(stale_path):
        os.remove(stale_path)

create_dataset_with_arff(dataset, folder_path)