In [3]:
cd /content/drive/My Drive/GSoC2020/audio_visual_emotion/GSoC2020/

/content/drive/My Drive/GSoC2020/audio_visual_emotion/GSoC2020


In [22]:
import numpy as np
import os
import sys

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation
from keras.layers.convolutional import Conv2D
from keras.layers import LSTM, Input, Flatten, Embedding, Convolution1D,Dropout, concatenate
from keras.optimizers import SGD, Adam, RMSprop

from scipy import signal
from sklearn.preprocessing import label_binarize

import pickle
from features import *
from helper import *


## Loading data 

In [9]:
# Sampling rate handed to calculate_features() as `freq`.
framerate = 16000
# Emotion labels kept as classes; they are passed to label_binarize() below.
emotions_used = np.array(['ang', 'exc', 'neu', 'sad'])

# Parent of the symlink-resolved current working directory
# (only referenced by the legacy data location kept below for reference).
code_path = os.path.dirname(os.path.realpath(os.getcwd()))
#data_path = code_path + "/../data/sessions/"
# Data directory, relative to the current working directory.
data_path = "../../Data/IEMOCAP/"

In [10]:
# Load the pre-collected IEMOCAP samples stored one level above `data_path`.
# The cells below iterate over `data2` and read the 'signal', 'mocap_rot' and
# 'emotion' entries of every sample.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open(os.path.join(data_path, os.pardir, 'data_collected.pickle'), 'rb') as handle:
    data2 = pickle.load(handle)

In [13]:
def calculate_features(frames, freq, options):
    """Extract short-term features from a signal and trim the edge frames.

    The signal is analysed with ``stFeatureExtraction`` using a 0.2 s window
    and a hop of half a window. Note that, despite what earlier variable names
    suggested, no derivative is computed: the returned values are the
    extracted features themselves.

    Args:
        frames: Signal samples, forwarded unchanged to ``stFeatureExtraction``.
        freq: Sampling rate; also used to convert the window length to samples.
        options: Unused; kept so existing callers keep working.

    Returns:
        A 2-D float array laid out like the extractor output (the column index
        is the frame index):
          * more than two frames: every frame except the first and the last;
          * one or two frames: the first frame only, as a single column;
          * no frames: a single all-zero column, so callers always receive at
            least one column.
    """
    window_sec = 0.2
    window_n = int(freq * window_sec)

    # Integer hop of half a window; `//` avoids handing a float step size to
    # the extractor.
    st_f = stFeatureExtraction(frames, freq, window_n, window_n // 2)

    n_frames = st_f.shape[1]
    if n_frames > 2:
        # Drop the first and the last frame; np.array(...) returns a float copy.
        return np.array(st_f[:, 1:-1], dtype=float)
    if n_frames == 0:
        # Nothing was extracted: fall back to one zero-filled frame.
        return np.zeros((st_f.shape[0], 1), dtype=float)
    # One or two frames: keep only the first one.
    return np.array(st_f[:, :1], dtype=float)

In [14]:
# Build the speech input: one padded, transposed feature matrix per sample.
x_train_speech = []

counter = 0  # stays 0 when data2 is empty
for counter, ses_mod in enumerate(data2, start=1):
    raw_features = calculate_features(ses_mod['signal'], framerate, None)
    # The second return value of pad_sequence_into_array is not needed here.
    padded_features, _ = pad_sequence_into_array(raw_features, maxlen=100)
    x_train_speech.append(padded_features.T)
    # Lightweight progress indicator: report every 100th sample.
    if counter % 100 == 0:
        print(counter)

# Stack the per-sample matrices into a single array and display its shape.
x_train_speech = np.array(x_train_speech)
x_train_speech.shape

100
200
300
400
500
600
700
800
900


(950, 100, 34)

In [16]:
# Build the mocap input: one fixed-size rotation matrix per sample plus a
# trailing channel axis for the Conv2D branch.
MOCAP_SHAPE = (200, 165)

x_train_mocap = []
for ses_mod in data2:
    x_rot = ses_mod['mocap_rot']
    if x_rot.shape != MOCAP_SHAPE:
        # Samples with an unexpected shape are replaced by an all-zero matrix.
        x_rot = np.zeros(MOCAP_SHAPE)
    else:
        # Work on a copy so the NaN clean-up does not modify the arrays held
        # in `data2`.
        x_rot = x_rot.copy()
        x_rot[np.isnan(x_rot)] = 0
    x_train_mocap.append(x_rot)

x_train_mocap = np.array(x_train_mocap)
x_train_mocap = x_train_mocap.reshape((-1,) + MOCAP_SHAPE + (1,))
x_train_mocap.shape

(950, 200, 165, 1)

In [17]:
# One-hot encode the emotion label of every sample. `classes` is passed by
# keyword because newer scikit-learn releases no longer accept it positionally.
Y = label_binarize([ses_mod['emotion'] for ses_mod in data2], classes=emotions_used)

Y.shape

(950, 4)

## Models

In [31]:
# Speech branch: flatten the (100, 34) feature matrix and project it to a
# 128-dimensional embedding with a small dense network.
model_speech = Sequential()
model_speech.add(Flatten(input_shape=(100, 34)))
model_speech.add(Dense(64))
model_speech.add(Activation('relu'))
model_speech.add(Dropout(0.3))
model_speech.add(Dense(128))

# Mocap branch: three stride-2 convolutions over the (200, 165, 1) input,
# then a 128-dimensional embedding. `padding` is the Keras 2 name of the
# former `border_mode` argument.
model_mocap = Sequential()
model_mocap.add(Conv2D(32, 3, strides=(2, 2), padding='same', input_shape=(200, 165, 1)))
model_mocap.add(Dropout(0.3))
model_mocap.add(Activation('relu'))
model_mocap.add(Conv2D(64, 3, strides=(2, 2), padding='same'))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
model_mocap.add(Conv2D(128, 3, strides=(2, 2), padding='same'))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
model_mocap.add(Flatten())
model_mocap.add(Dense(128))

# Fusion head: concatenate both embeddings, apply ReLU, then classify into
# the four emotion classes.
model_combined = concatenate([model_speech.output, model_mocap.output], axis=-1)
model_combined = Activation('relu')(model_combined)
model_combined = Dense(256, activation='relu')(model_combined)
output = Dense(4, activation="softmax")(model_combined)

model = Model(inputs=[model_speech.input, model_mocap.input], outputs=output)

model.compile(loss='categorical_crossentropy',optimizer='Adam' ,metrics=['acc'])

model_speech.summary()
model_mocap.summary()
model.summary()

print("Model1 Built")

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_12 (Flatten)         (None, 3400)              0         
_________________________________________________________________
dense_24 (Dense)             (None, 64)                217664    
_________________________________________________________________
activation_33 (Activation)   (None, 64)                0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 128)               8320      
Total params: 225,984
Trainable params: 225,984
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_15"
_________________________________________________________________
Layer (type)            

  if __name__ == '__main__':
  if sys.path[0] == '':
  from ipykernel import kernelapp as app


In [32]:
# Train the fused model. `epochs` is the Keras 2 name of the deprecated
# `nb_epoch` argument. With `validation_split`, Keras holds out the last 20 %
# of the samples (taken before shuffling) for validation.
hist = model.fit([x_train_speech, x_train_mocap], Y,
                 batch_size=64, epochs=50, verbose=1,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 760 samples, validate on 190 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [33]:
# This model is basically the combination of the audio-only and visual-only models.

# Speech branch: two 1-D convolutions over the (100, 34) feature sequence,
# then a 128-dimensional embedding. `padding` is the Keras 2 name of the
# former `border_mode` argument.
model_speech = Sequential()
model_speech.add(Convolution1D(32, 3, padding='same', input_shape=(100, 34)))
model_speech.add(Dropout(0.2))
model_speech.add(Activation('relu'))
model_speech.add(Convolution1D(32, 3, padding='same'))
model_speech.add(Dropout(0.2))
model_speech.add(Activation('relu'))
model_speech.add(Flatten())
model_speech.add(Dropout(0.2))
model_speech.add(Dense(128))


# Mocap branch: three stride-2 convolutions over the (200, 165, 1) input,
# then a 128-dimensional embedding.
model_mocap = Sequential()
model_mocap.add(Conv2D(32, 3, strides=(2, 2), padding='same', input_shape=(200, 165, 1)))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
model_mocap.add(Conv2D(64, 3, strides=(2, 2), padding='same'))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
model_mocap.add(Conv2D(64, 3, strides=(2, 2), padding='same'))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
model_mocap.add(Flatten())
model_mocap.add(Dense(128))

# Fusion head: concatenate both embeddings, apply ReLU, then classify into
# the four emotion classes.
model_combined = concatenate([model_speech.output, model_mocap.output], axis=-1)
model_combined = Activation('relu')(model_combined)
model_combined = Dense(256, activation='relu')(model_combined)
output = Dense(4, activation="softmax")(model_combined)

model = Model(inputs=[model_speech.input, model_mocap.input], outputs=output)

model.compile(loss='categorical_crossentropy',optimizer='Adam' ,metrics=['acc'])

model_speech.summary()
model_mocap.summary()
model.summary()

print("Model1 Built")


  This is separate from the ipykernel package so we can avoid doing imports until
  
  from ipykernel import kernelapp as app


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 100, 32)           3296      
_________________________________________________________________
dropout_34 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
activation_38 (Activation)   (None, 100, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 32)           3104      
_________________________________________________________________
dropout_35 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
activation_39 (Activation)   (None, 100, 32)           0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 3200)            

In [34]:
# Train the convolutional fusion model. `epochs` is the Keras 2 name of the
# deprecated `nb_epoch` argument. With `validation_split`, Keras holds out the
# last 20 % of the samples (taken before shuffling) for validation.
hist = model.fit([x_train_speech, x_train_mocap], Y,
                 batch_size=64, epochs=50, verbose=1,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 760 samples, validate on 190 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Shell escape: show the git working-tree status of the current directory.
!git status