# Preprocessing of the training data
Reads the training data from an ARFF file and writes it to NumPy binary files (`.npy`) that contain the normalized feature arrays and their labels.

In [1]:
import arff
import random
import numpy as np
import tensorflow as tf

In [2]:
# --- Configuration ---
FEATURE_COUNT = 13  # number of feature values stored per frame (row)
CORPUS_PATH = 'IEMOCAP_mfccFeatures.arff'  # ARFF corpus that is parsed by the cells below

# --- Parsing state shared by the preprocessing cells ---
prev_wav = current_wav = ""  # file name seen in the previous / current row
count_row = count_wav = count_frame = parsing_step = 0  # running counters, all start at zero

# --- Accumulators filled by append_frame() and append_zero() ---
labels, features = [], []

In [3]:
def split_dataset(dataset):
    """Split ``dataset`` into consecutive training, validation and test parts.

    The first half of the elements (rounded down) forms the training set, the
    following block of half the training size (rounded down) forms the
    validation set, and everything that remains forms the test set. The
    original order of the elements is preserved.

    Args:
        dataset: Indexable sequence that supports ``len()``.

    Returns:
        Tuple ``(set_train, set_valid, set_test)`` of Python lists.
    """
    total = len(dataset)
    train_end = total // 2
    valid_end = train_end + train_end // 2

    set_train = [dataset[k] for k in range(train_end)]
    set_valid = [dataset[k] for k in range(train_end, valid_end)]
    set_test = [dataset[k] for k in range(valid_end, total)]
    return set_train, set_valid, set_test

In [4]:
def shuffle_dataset(set_features, set_labels):
    """Shuffle features and labels with one shared random permutation.

    Both inputs are converted to NumPy arrays and indexed with the same
    permutation, so entry ``i`` of the returned features still belongs to
    entry ``i`` of the returned labels.

    Args:
        set_features: Sequence of feature samples.
        set_labels: Sequence of labels; indexed with a permutation of
            ``len(set_features)``.

    Returns:
        Tuple ``(features, labels)`` of shuffled NumPy arrays.
    """
    order = np.random.permutation(len(set_features))
    shuffled_features = np.array(set_features)[order]
    shuffled_labels = np.array(set_labels)[order]
    return shuffled_features, shuffled_labels

In [5]:
def append_frame (row): #appends features and label of a frame to the corresponding lists.
    """Append the normalized features and the one-hot label of one frame.

    The feature values are read from ``row[1]`` .. ``row[FEATURE_COUNT]``.
    ``row[0]`` is used as the file name by the parsing loops and
    ``row[FEATURE_COUNT + 1]`` is read as the label by ``get_label``, so the
    columns in between are presumably the features (verify against the ARFF
    header).

    Args:
        row: One data row of the ARFF corpus.

    Side effects:
        Appends one entry to the module-level lists ``features`` and ``labels``.
    """
    current_features = np.empty(FEATURE_COUNT)
    # Fill every slot 0 .. FEATURE_COUNT-1. The previous loop started at index 1,
    # which left slot 0 uninitialised (np.empty does not zero the buffer) and
    # never read the last feature column.
    for slot in range(FEATURE_COUNT):
        current_features[slot] = row[slot + 1]
    # normalize() hands back the normalized vector wrapped in an outer dimension
    # ([normalized_current_features]), so index 0 extracts the actual feature array.
    features.append(tf.keras.utils.normalize(current_features)[0])
    one_hot_label = get_label(row)
    labels.append(one_hot_label)

In [6]:
def get_label(row): #encode label as a one hot vector
    """Return the label of ``row`` as a one-hot list of length four.

    The label is read from ``row[FEATURE_COUNT + 1]`` and compared against the
    strings "0", "1", "2" and "3". Any other value yields the all-zero
    vector.

    NOTE(review): only string labels match; a numeric label such as ``0`` would
    fall through to the all-zero vector - confirm the ARFF attribute type.

    Args:
        row: One data row of the ARFF corpus.

    Returns:
        A new list of four ints with at most one entry set to 1.
    """
    known_classes = ("0", "1", "2", "3")
    label = row[FEATURE_COUNT + 1]
    one_hot = [0, 0, 0, 0]
    for position, class_name in enumerate(known_classes):
        if label == class_name:
            one_hot[position] = 1
            break
    return one_hot

In [7]:
def append_zero (): #append a 'zero frame'
    """Append a padding frame: an all-zero feature vector plus the previous label.

    Used to fill up files that have fewer frames than the fixed sample length.

    Side effects:
        Appends one entry to the module-level lists ``features`` and ``labels``.

    Raises:
        IndexError: If ``labels`` is empty, i.e. no real frame was appended yet.
    """
    # Size the zero vector from FEATURE_COUNT instead of a hard-coded 13 entries.
    features.append([0.] * FEATURE_COUNT)
    # Repeat the previous label as-is (copied to avoid aliasing). Wrapping it in
    # another list, as before, nested the label one level deeper with every
    # padded frame and made `labels` ragged.
    labels.append(list(labels[-1]))

-- Begin preprocessing --

In [8]:
# First pass over the corpus: count all rows (frames) and the number of files.
# A new file is detected whenever the name in column 0 differs from the name
# of the previous row.
# Reset the counters first so that re-running this cell does not add a second
# pass on top of the previous totals (which would corrupt avg_frames).
count_row = 0
count_wav = 0
prev_wav = ""
for current_row in arff.load(CORPUS_PATH): #iterate over data and count number of frames and files
    count_row += 1
    current_wav = current_row[0]
    if current_wav != prev_wav:
        count_wav += 1
        prev_wav = current_wav

In [9]:
# Average number of frames per file, truncated to an integer. The second
# parsing pass uses it as the fixed number of frames kept per file.
avg_frames = int(count_row / count_wav)
summary = "avg_frames: " + str(avg_frames) + ", the other ones: " + str(count_row) + ", " + str(count_wav)
print(summary)

avg_frames: 444, the other ones: 4457391, 10039


In [10]:
# Second pass over the corpus: bring every file to exactly `avg_frames` frames.
# Files with more frames are truncated, files with fewer frames are padded
# with zero frames (see append_zero).

# Reset all parsing state before looping over the rows again, so that this
# cell produces the same result when it is re-run: the file markers, the
# counters and the accumulated features/labels.
current_wav = ""
prev_wav = ""
count_frame = 0
parsing_step = 0
del features[:]
del labels[:]

for current_row in arff.load(CORPUS_PATH):
    count_frame += 1
    parsing_step += 1
    #simply to observe the progress
    if parsing_step % 1000 == 0:
        print(parsing_step)

    current_wav = current_row[0] #get name of current file
    if prev_wav == "": #sets initial prev_wav to current_wav
        prev_wav = current_wav

    if current_wav == prev_wav: #check if we still consider the same file
        if count_frame <= avg_frames: #append frame if below 'average frame length'
            append_frame(current_row)
        else: #ignore frame if we already appended >= 'average frame length' many frames of this file
            continue
    else: #new file
        # count_frame was already incremented for the first row of the new file,
        # so this loop adds exactly the number of frames the previous file was missing.
        while count_frame <= avg_frames: #if we added less than 'average frame length' many frames for the previous file: fill with zeros
            append_zero()
            count_frame += 1
        append_frame(current_row) #append first frame of the new file
        prev_wav = current_wav
        count_frame = 1

    if parsing_step == count_row: #last file with less frames than average
        # No following file triggers the padding above, so pad the last file here.
        while count_frame < avg_frames: #if we added less than 'average frame length' many frames for the previous file: fill with zeros
            append_zero()
            count_frame += 1

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

1177000
1178000
1179000
1180000
1181000
1182000
1183000
1184000
1185000
1186000
1187000
1188000
1189000
1190000
1191000
1192000
1193000
1194000
1195000
1196000
1197000
1198000
1199000
1200000
1201000
1202000
1203000
1204000
1205000
1206000
1207000
1208000
1209000
1210000
1211000
1212000
1213000
1214000
1215000
1216000
1217000
1218000
1219000
1220000
1221000
1222000
1223000
1224000
1225000
1226000
1227000
1228000
1229000
1230000
1231000
1232000
1233000
1234000
1235000
1236000
1237000
1238000
1239000
1240000
1241000
1242000
1243000
1244000
1245000
1246000
1247000
1248000
1249000
1250000
1251000
1252000
1253000
1254000
1255000
1256000
1257000
1258000
1259000
1260000
1261000
1262000
1263000
1264000
1265000
1266000
1267000
1268000
1269000
1270000
1271000
1272000
1273000
1274000
1275000
1276000
1277000
1278000
1279000
1280000
1281000
1282000
1283000
1284000
1285000
1286000
1287000
1288000
1289000
1290000
1291000
1292000
1293000
1294000
1295000
1296000
1297000
1298000
1299000
1300000
1301000


2218000
2219000
2220000
2221000
2222000
2223000
2224000
2225000
2226000
2227000
2228000
2229000
2230000
2231000
2232000
2233000
2234000
2235000
2236000
2237000
2238000
2239000
2240000
2241000
2242000
2243000
2244000
2245000
2246000
2247000
2248000
2249000
2250000
2251000
2252000
2253000
2254000
2255000
2256000
2257000
2258000
2259000
2260000
2261000
2262000
2263000
2264000
2265000
2266000
2267000
2268000
2269000
2270000
2271000
2272000
2273000
2274000
2275000
2276000
2277000
2278000
2279000
2280000
2281000
2282000
2283000
2284000
2285000
2286000
2287000
2288000
2289000
2290000
2291000
2292000
2293000
2294000
2295000
2296000
2297000
2298000
2299000
2300000
2301000
2302000
2303000
2304000
2305000
2306000
2307000
2308000
2309000
2310000
2311000
2312000
2313000
2314000
2315000
2316000
2317000
2318000
2319000
2320000
2321000
2322000
2323000
2324000
2325000
2326000
2327000
2328000
2329000
2330000
2331000
2332000
2333000
2334000
2335000
2336000
2337000
2338000
2339000
2340000
2341000
2342000


3250000
3251000
3252000
3253000
3254000
3255000
3256000
3257000
3258000
3259000
3260000
3261000
3262000
3263000
3264000
3265000
3266000
3267000
3268000
3269000
3270000
3271000
3272000
3273000
3274000
3275000
3276000
3277000
3278000
3279000
3280000
3281000
3282000
3283000
3284000
3285000
3286000
3287000
3288000
3289000
3290000
3291000
3292000
3293000
3294000
3295000
3296000
3297000
3298000
3299000
3300000
3301000
3302000
3303000
3304000
3305000
3306000
3307000
3308000
3309000
3310000
3311000
3312000
3313000
3314000
3315000
3316000
3317000
3318000
3319000
3320000
3321000
3322000
3323000
3324000
3325000
3326000
3327000
3328000
3329000
3330000
3331000
3332000
3333000
3334000
3335000
3336000
3337000
3338000
3339000
3340000
3341000
3342000
3343000
3344000
3345000
3346000
3347000
3348000
3349000
3350000
3351000
3352000
3353000
3354000
3355000
3356000
3357000
3358000
3359000
3360000
3361000
3362000
3363000
3364000
3365000
3366000
3367000
3368000
3369000
3370000
3371000
3372000
3373000
3374000


4276000
4277000
4278000
4279000
4280000
4281000
4282000
4283000
4284000
4285000
4286000
4287000
4288000
4289000
4290000
4291000
4292000
4293000
4294000
4295000
4296000
4297000
4298000
4299000
4300000
4301000
4302000
4303000
4304000
4305000
4306000
4307000
4308000
4309000
4310000
4311000
4312000
4313000
4314000
4315000
4316000
4317000
4318000
4319000
4320000
4321000
4322000
4323000
4324000
4325000
4326000
4327000
4328000
4329000
4330000
4331000
4332000
4333000
4334000
4335000
4336000
4337000
4338000
4339000
4340000
4341000
4342000
4343000
4344000
4345000
4346000
4347000
4348000
4349000
4350000
4351000
4352000
4353000
4354000
4355000
4356000
4357000
4358000
4359000
4360000
4361000
4362000
4363000
4364000
4365000
4366000
4367000
4368000
4369000
4370000
4371000
4372000
4373000
4374000
4375000
4376000
4377000
4378000
4379000
4380000
4381000
4382000
4383000
4384000
4385000
4386000
4387000
4388000
4389000
4390000
4391000
4392000
4393000
4394000
4395000
4396000
4397000
4398000
4399000
4400000


In [11]:
# Keep one label per file: every file occupies `avg_frames` consecutive entries
# in `labels`, so the label of file i is the one stored at index i * avg_frames.
labels_reduced = [labels[0]]
labels_reduced.extend(labels[file_index * avg_frames] for file_index in range(1, count_wav))

In [12]:
#generate suitable feature array for shuffling:
# group the flat list of frames into one block of `avg_frames` frames per file
features_clustered = np.array(features)
features_clustered = np.reshape(features_clustered, (-1, avg_frames, len(features[0])))

# Every block of frames needs exactly one label, otherwise the shared
# permutation below would pair samples with the wrong labels.
assert len(features_clustered) == len(labels_reduced), "number of samples and labels differ"

#shuffle arrays
# Seed NumPy's global generator (used by np.random.permutation inside
# shuffle_dataset) so that the train/valid/test split written to disk is
# reproducible across runs.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)
features_clustered, labels_reduced = shuffle_dataset(features_clustered, labels_reduced)

In [13]:
#split into training, validation and test set
# split_dataset() cuts purely by position (first half / next quarter / rest).
# Calling it separately on features and labels therefore keeps each sample
# paired with its label, provided both have the same number of entries.
features_train, features_valid, features_test = split_dataset(features_clustered)
labels_train, labels_valid, labels_test = split_dataset(labels_reduced)

In [14]:
# Flatten each subset back to one row per frame: the per-file grouping is
# dropped and only the feature dimension (length of one frame) is kept.
features_train, features_valid, features_test = (
    np.asarray(subset).reshape(-1, len(features[0]))
    for subset in (features_train, features_valid, features_test)
)

In [15]:
print("Frames per sample: " + str(avg_frames))
# All output files share a prefix taken from the corpus file name.
# NOTE(review): only the first five characters of CORPUS_PATH are used as the
# prefix - confirm that this truncation is intended.
output_prefix = CORPUS_PATH[:5]
for suffix, array in (
    ('_feature_train', features_train),
    ('_feature_valid', features_valid),
    ('_feature_test', features_test),
    ('_labels_train', labels_train),
    ('_labels_valid', labels_valid),
    ('_labels_test', labels_test),
):
    np.save(output_prefix + suffix, array)
print("Preprocessing complete")

Frames per sample: 444
Preprocessing complete
