In [1]:
cd /content/drive/My Drive/GSoC2020/audio_visual_emotion/GSoC2020/

/content/drive/My Drive/GSoC2020/audio_visual_emotion/GSoC2020


In [14]:
import numpy as np
import os
import sys

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation
from keras.layers import LSTM, Input, Flatten, Embedding, Convolution1D,Dropout, Concatenate
from keras.optimizers import SGD, Adam, RMSprop

from scipy import signal
from sklearn.preprocessing import label_binarize

import pickle
from features import *
from helper import *


## Loading data 

In [3]:
# Audio sampling rate in Hz.
framerate = 16000

# Emotion labels kept for this experiment.
emotions_used = np.array(
    ['ang', 'exc', 'neu', 'sad']
)

# Parent directory of the directory the notebook is currently running from.
code_path = os.path.dirname(os.path.realpath(os.getcwd()))

# Location of the IEMOCAP data, relative to the working directory.
# (An alternative layout would be code_path + "/../data/sessions/".)
data_path = "../../Data/IEMOCAP/"

In [4]:
# Load the pre-collected IEMOCAP records. Later cells index the result by
# position and read the 'signal' and 'emotion' entries of each record.
# NOTE(review): pickle.load can execute arbitrary code -- only open pickle
# files that come from a trusted source.
pickle_path = data_path + '/../'+'data_collected.pickle'
with open(pickle_path, 'rb') as pickle_file:
    data2 = pickle.load(pickle_file)

In [7]:
# Sanity check on one utterance: compute its spectrogram and bring it to the
# fixed width used for the whole data set.
# TODO: understand the processing
fs = 16e3  # Framerate is 16000
sample_signal = data2[900]['signal']
f, t, Sxx = signal.spectrogram(sample_signal, fs, nperseg=400)

print(sample_signal.shape)  # raw waveform length
print(Sxx.shape)            # spectrogram shape before fixing the width

# Fix the size of the last axis to 300 columns.
Sxx, _ = pad_sequence_into_array(Sxx, maxlen=300)
Sxx.shape

(150240,)
(201, 429)


(201, 300)

In [8]:
# Build the spectrogram tensor: one fixed-size spectrogram per record.
fs = 16e3
x_train_speech = []

counter = 0
for counter, ses_mod in enumerate(data2, start=1):
    x_speech = ses_mod['signal']
    f, t, Sxx = signal.spectrogram(x_speech, fs, nperseg=400)
    # Same fixed width (300 columns) for every utterance so they can be stacked.
    Sxx, _ = pad_sequence_into_array(Sxx, maxlen=300)
    x_train_speech.append(Sxx)

    # Progress report every 100 records.
    if counter % 100 == 0:
        print(counter)

x_train_speech = np.array(x_train_speech)
x_train_speech.shape # (numberOfSample, )

100
200
300
400
500
600
700
800
900


(950, 201, 300)

In [11]:
# Collect the emotion label of every record, in data set order.
Y = [ses_mod['emotion'] for ses_mod in data2]

# One-hot encode against the selected emotions. `classes` is passed by keyword
# because current scikit-learn releases no longer accept it positionally.
Y = label_binarize(Y, classes=emotions_used)

Y.shape

(950, 4)

## Training Audio-only models

In [15]:
# Since here we only use Session1, a small network is used.
# Two Conv1D blocks followed by a small dense classifier over the 4 emotions.
#
# NOTE(review): with input_shape=(201, 300) Conv1D slides along the 201 axis and
# treats the 300 axis as channels. The spectrograms are stored as
# (frequency bins, time steps), so the kernel moves along frequency rather than
# time -- confirm this is intended, or transpose the input.
#
# `padding='same'` is the Keras 2 spelling of the Keras 1 `border_mode='same'`.

model = Sequential()
model.add(Convolution1D(32, 3, padding='same', input_shape=(201, 300)))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(4))  # one output per entry of emotions_used
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

  after removing the cwd from sys.path.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 201, 32)           28832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 201, 32)           0         
_________________________________________________________________
activation_1 (Activation)    (None, 201, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 201, 32)           3104      
_________________________________________________________________
dropout_2 (Dropout)          (None, 201, 32)           0         
_________________________________________________________________
activation_2 (Activation)    (None, 201, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6432)             

  import sys


In [33]:
# Train the model currently bound to `model` on the spectrograms, holding out
# 20% of the samples for validation.
# `model` is rebound by a later cell, so run this right after building the
# Conv1D network to make sure the intended model is the one being trained.
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(x_train_speech, Y,
                 batch_size=100, epochs=50, verbose=1, shuffle=True,
                 validation_split=0.2)


  This is separate from the ipykernel package so we can avoid doing imports until


Train on 760 samples, validate on 190 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [16]:
def lstm_model(optimizer='Adadelta'):
    """Build and compile a stacked-LSTM classifier for (201, 300) inputs.

    Two LSTM layers (128 units returning the full sequence, then 256 units
    returning only the final state) feed a 512-unit ReLU layer and a 4-way
    softmax output.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Forwarded unchanged to ``compile``.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss,
    accuracy metric).
    """
    layer_stack = [
        LSTM(128, return_sequences=True, input_shape=(201, 300)),
        LSTM(256, return_sequences=False),
        Dense(512),
        Activation('relu'),
        Dense(4),
        Activation('softmax'),
    ]
    network = Sequential(layer_stack)

    network.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return network

# Rebind `model` to a fresh LSTM network (default Adadelta optimizer) and
# print its layer summary.
model = lstm_model(optimizer='Adadelta')
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 201, 128)          219648    
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense_3 (Dense)              (None, 512)               131584    
_________________________________________________________________
activation_5 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 2052      
_________________________________________________________________
activation_6 (Activation)    (None, 4)                 0         
Total params: 747,524
Trainable params: 747,524
Non-trainable params: 0
________________________________________________

In [17]:
# Train the model currently bound to `model` on the spectrograms, holding out
# 20% of the samples for validation.
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(x_train_speech, Y,
                 batch_size=100, epochs=40, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 760 samples, validate on 190 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [18]:
def calculate_features(frames, freq, options):
    """Extract short-term features from a signal and drop the edge columns.

    The signal is analysed by ``stFeatureExtraction`` with a 0.2 s window and
    a hop of half a window. The result is indexed as a 2-D array whose second
    axis holds one column per analysis step.

    Parameters
    ----------
    frames : array-like
        Raw audio samples, passed unchanged to ``stFeatureExtraction``.
    freq : int or float
        Sampling rate in Hz; used to convert the window length to samples.
    options : any
        Unused. Kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Float array with the same number of rows as the extracted feature
        matrix:

        * more than two columns extracted: every column except the first and
          the last;
        * one or two columns extracted: the first column only, shape
          ``(rows, 1)``.

    Raises
    ------
    IndexError
        If the extractor returns no columns at all.
    """
    window_sec = 0.2
    window_n = int(freq * window_sec)
    # Integer hop size: `window_n / 2` would be a float under Python 3, and
    # sample counts must be integers.
    step_n = window_n // 2

    st_f = stFeatureExtraction(frames, freq, window_n, step_n)

    if st_f.shape[1] > 2:
        # Keep the interior columns only (first and last are discarded).
        return np.array(st_f[:, 1:-1], dtype=float)

    # Too short to trim: fall back to the first column as a (rows, 1) array.
    return np.array(st_f[:, [0]], dtype=float)

In [22]:
# Build the hand-crafted feature tensor: one fixed-length feature matrix per
# record, transposed before stacking.
x_train_feat = []

counter = 0
for counter, ses_mod in enumerate(data2, start=1):
    x_head = ses_mod['signal']
    st_features = calculate_features(x_head, framerate, None)
    # Same fixed length (100) for every utterance so they can be stacked.
    st_features, _ = pad_sequence_into_array(st_features, maxlen=100)
    x_train_feat.append(st_features.T)

    # Progress report every 100 records.
    if counter % 100 == 0:
        print(counter)

x_train_feat = np.array(x_train_feat)
x_train_feat.shape

100
200
300
400
500
600
700
800
900


(950, 100, 34)

In [None]:
def linear_model_combined(optimizer='Adam'):
    """Build and compile a two-branch classifier over features and spectrograms.

    Branch A flattens a ``(100, 34)`` feature matrix and passes it through two
    dense layers (1024 with ReLU, then 256). Branch B runs a ``(201, 300)``
    spectrogram through four Conv1D blocks (256, 128, 64 and 32 filters, each
    followed by dropout and ReLU), flattens the result and projects it to 256
    units. The two 256-unit outputs are concatenated and fed through a 4-unit
    ReLU layer and a 4-way softmax.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Forwarded unchanged to ``compile``.

    Returns
    -------
    The compiled functional ``Model``. It takes two inputs, in the order
    ``[features, spectrograms]``, and is compiled with categorical
    cross-entropy loss and the accuracy metric.
    """
    # Branch A: dense network over the flattened feature matrix.
    modela = Sequential()
    modela.add(Flatten(input_shape=(100, 34)))
    modela.add(Dense(1024))
    modela.add(Activation('relu'))
    modela.add(Dense(256))

    # Branch B: Conv1D stack over the spectrogram.
    # `padding='same'` is the Keras 2 spelling of the Keras 1 `border_mode='same'`.
    modelb = Sequential()
    modelb.add(Convolution1D(256, 3, padding='same', input_shape=(201, 300)))
    modelb.add(Dropout(0.2))
    modelb.add(Activation('relu'))
    modelb.add(Convolution1D(128, 3, padding='same'))
    modelb.add(Dropout(0.2))
    modelb.add(Activation('relu'))
    modelb.add(Convolution1D(64, 3, padding='same'))
    modelb.add(Dropout(0.2))
    modelb.add(Activation('relu'))
    modelb.add(Convolution1D(32, 3, padding='same'))
    modelb.add(Dropout(0.2))
    modelb.add(Activation('relu'))
    modelb.add(Flatten())
    modelb.add(Dense(256))

    # Merge with the `Concatenate` layer, which is the name this notebook
    # imports; the lowercase `concatenate` function is not imported here.
    model_combined = Concatenate(axis=-1)([modela.output, modelb.output])
    model_combined = Dense(4, activation='relu')(model_combined)
    output = Dense(4, activation="softmax")(model_combined)

    model = Model(inputs=[modela.input, modelb.input], outputs=output)

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model