In [2]:
import matplotlib.pyplot as plt
import librosa.display as disp
import numpy as np

from data_utils.data_loader import Data_loader, EMO_DICT

import tensorflow as tf
from tensorflow.keras import layers


from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load data

In [3]:
dl= Data_loader()
data = dl.load_data()
len(data)

1440

In [4]:
X_samples = []
X_mfcc = []
X_chroma = []
X_mel = []
Y = []
EMO_DICT= {0:'neutral', 1:'calm', 2:'happy', 3:'sad', 4:'angry', 5:'fearful', 6:'disgust', 7:'surprised'}

# Full features
observed_emotions='neutral','calm','happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'}
N_observed = len(observed_emotions)
NR_TO_NR = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7}

# 4 features
# observed_emotions=['calm', 'happy', 'fearful', 'disgust']
# N_observed = len(observed_emotions)
# NR_TO_NR = {1:0, 2:1, 5:2, 6:3}

for d in data:
    emot_nr = np.argmax(d['emotion'])
    if EMO_DICT[emot_nr] in observed_emotions:

        X_samples.append(d['samples'])
        X_mfcc.append(d['mfcc'])
        X_chroma.append(d['chroma'])
        X_mel.append(d['mel'])
       
        Y.append(np.eye(N_observed, dtype=np.int32)[NR_TO_NR[emot_nr]-1])


In [5]:
for mfcc, mel, chroma in zip(X_mfcc,X_mel,X_chroma):
    print(mfcc.shape, mel.shape, chroma.shape)
    break


(117, 40) (117, 128) (117, 12)
(110, 40) (110, 128) (110, 12)
(115, 40) (115, 128) (115, 12)
(107, 40) (107, 128) (107, 12)
(106, 40) (106, 128) (106, 12)
(115, 40) (115, 128) (115, 12)
(107, 40) (107, 128) (107, 12)
(110, 40) (110, 128) (110, 12)
(101, 40) (101, 128) (101, 12)
(108, 40) (108, 128) (108, 12)
(99, 40) (99, 128) (99, 12)
(99, 40) (99, 128) (99, 12)
(104, 40) (104, 128) (104, 12)
(92, 40) (92, 128) (92, 12)
(113, 40) (113, 128) (113, 12)
(100, 40) (100, 128) (100, 12)
(116, 40) (116, 128) (116, 12)
(100, 40) (100, 128) (100, 12)
(94, 40) (94, 128) (94, 12)
(101, 40) (101, 128) (101, 12)
(101, 40) (101, 128) (101, 12)
(102, 40) (102, 128) (102, 12)
(103, 40) (103, 128) (103, 12)
(95, 40) (95, 128) (95, 12)
(108, 40) (108, 128) (108, 12)
(100, 40) (100, 128) (100, 12)
(99, 40) (99, 128) (99, 12)
(101, 40) (101, 128) (101, 12)
(107, 40) (107, 128) (107, 12)
(99, 40) (99, 128) (99, 12)
(102, 40) (102, 128) (102, 12)
(107, 40) (107, 128) (107, 12)
(102, 40) (102, 128) (102, 1

# 1DCNN + LSTM direct on AUDIO(samples) 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_samples, Y, test_size=0.2)

# transform lists to numpy arrays
X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

# pad
X_train = np.expand_dims( tf.keras.preprocessing.sequence.pad_sequences(X_train) ,axis = 2)
X_test = np.expand_dims(  tf.keras.preprocessing.sequence.pad_sequences(X_test, X_train.shape[1])         , axis = 2)

In [7]:
print(X_train.shape, X_test.shape)

(614, 84351, 1) (154, 84351, 1)


In [8]:
def LFLB1D(input_shape=None):
    lflb = tf.keras.Sequential()

    if input_shape is None:
        lflb.add(layers.Conv1D(64,kernel_size=4))
    else:
        lflb.add(layers.Conv1D(64,kernel_size=4, input_shape=input_shape))

    lflb.add(layers.BatchNormalization())
    lflb.add(layers.Activation(activation = tf.keras.activations.elu))
    lflb.add(layers.MaxPool1D(pool_size=3, strides=3))

    return lflb

def LSTM():
    return layers.LSTM(256)

In [9]:
model = tf.keras.Sequential([LFLB1D(input_shape=(X_train.shape[1],1)), LFLB1D(),LFLB1D()])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential (Sequential)      (None, 28116, 64)         576       
_________________________________________________________________
sequential_1 (Sequential)    (None, 9371, 64)          16704     
_________________________________________________________________
sequential_2 (Sequential)    (None, 3122, 64)          16704     
Total params: 33,984
Trainable params: 33,600
Non-trainable params: 384
_________________________________________________________________


In [10]:
example = np.expand_dims(np.expand_dims(X_train[1],axis=0),axis=2)
print("Input shape:", example.shape)

print("Output shape:", model(example).shape)

Input shape: (1, 84351, 1, 1)
Output shape: (1, 3122, 64)


In [11]:
audio_model = tf.keras.Sequential()

audio_model.add(LFLB1D(input_shape=(X_train.shape[1],1)))

for i in range(3):
    audio_model.add(LFLB1D())

audio_model.add(LSTM())
audio_model.add(layers.Dense(len(observed_emotions)))


audio_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_5 (Sequential)    (None, 28116, 64)         576       
_________________________________________________________________
sequential_6 (Sequential)    (None, 9371, 64)          16704     
_________________________________________________________________
sequential_7 (Sequential)    (None, 3122, 64)          16704     
_________________________________________________________________
sequential_8 (Sequential)    (None, 1039, 64)          16704     
_________________________________________________________________
lstm (LSTM)                  (None, 256)               328704    
_________________________________________________________________
dense (Dense)                (None, 4)                 1028      
Total params: 380,420
Trainable params: 379,908
Non-trainable params: 512
______________________________________________

In [14]:
opt = tf.keras.optimizers.Adam(lr=0.003, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.1)
#opt = tf.keras.optimizers.SGD(lr=0.0001, momentum=0.0, decay=0.0, nesterov=False)
audio_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              optimizer=opt,
              metrics=['accuracy'])

In [13]:
_=audio_model.fit(X_train, y_train, batch_size=1, epochs=1, validation_data=(X_test, y_test), shuffle=True)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

# 2DCNN + LSTM on Mel 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_mel, Y, test_size=0.2)

# transform lists to numpy arrays
X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

# pad
X_train = np.expand_dims( tf.keras.preprocessing.sequence.pad_sequences(X_train) ,axis = 3)
X_test = np.expand_dims(  tf.keras.preprocessing.sequence.pad_sequences(X_test, X_train.shape[1])         , axis = 3)

In [10]:
X_train.shape

(614, 165, 128, 1)

In [12]:
def LFLB2D(nr_filters, pool, input_shape=None):
    lflb = tf.keras.Sequential()

    if input_shape is None:
        lflb.add(layers.Conv2D(nr_filters,kernel_size=(3,3)))
    else:
        lflb.add(layers.Conv2D(nr_filters,kernel_size=(3,3), input_shape=input_shape))

    lflb.add(layers.BatchNormalization())
    lflb.add(layers.Activation(activation = tf.keras.activations.elu))
    lflb.add(layers.MaxPool2D(pool_size=(pool,pool), strides=(pool,pool)))

    return lflb

In [29]:
filters = [64, 64, 128, 128]
pools = [2, 4, 4, 4]

model = tf.keras.Sequential()
model.add(LFLB2D(nr_filters=filters[0], pool=pools[0] , input_shape=(X_train.shape[1],X_train.shape[2],1)))

for nr_filters, pool in zip(filters[2:], pools[2:]):
    model.add(LFLB2D(nr_filters=nr_filters, pool=pool))

model.add( layers.Reshape((-1,128)) )
model.add(LSTM())
model.add(layers.Dense(len(observed_emotions)))

model.summary()

Model: "sequential_45"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_46 (Sequential)   (None, 81, 63, 64)        896       
_________________________________________________________________
sequential_47 (Sequential)   (None, 19, 15, 128)       74368     
_________________________________________________________________
sequential_48 (Sequential)   (None, 4, 3, 128)         148096    
_________________________________________________________________
reshape_2 (Reshape)          (None, None, 128)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 4)                 1028      
Total params: 618,628
Trainable params: 617,988
Non-trainable params: 640
_____________________________________________

In [121]:
example = np.expand_dims(np.expand_dims( X_train[1] ,axis=0),axis=2)
print("Input shape:", example.shape)
print("Output shape:", model(example).shape)



Input shape: (1, 84351, 1)
Output shape: (1, 3122, 64)


In [114]:
opt = tf.keras.optimizers.Adam(lr=0.003, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.1)
#opt = tf.keras.optimizers.SGD(lr=0.0001, momentum=0.0, decay=0.0, nesterov=False)
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              optimizer=opt,
              metrics=['accuracy'])

TensorShape([1, 84348, 64])

In [1]:
_=model.fit(X_train, y_train, batch_size=1, epochs=1, validation_data=(X_test, y_test), shuffle=True)

NameError: name 'model' is not defined