In [None]:
!pip install librosa
!pip install seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras

In [None]:
import os

In [None]:
# Directory that receives the model checkpoints. Creating it from Python with
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs('saved_models', exist_ok=True)

In [None]:
# Set the directory where the spectrogram data resides.
# Keep the trailing '/': file paths are built by plain string concatenation
# with this prefix, and the speaker/emotion character offsets used later are
# derived from the position of a '/' in those paths.
data_dir = '/mel2dot6/'

In [None]:
from sklearn.utils import shuffle

# Keep every directory entry whose name does not contain '.pkl'.
# os.listdir() returns entries in arbitrary order, so sort first and then
# shuffle with a fixed seed: the file order (and therefore every split derived
# from it) is the same on each run.
dir_listing = sorted(f for f in os.listdir(data_dir) if '.pkl' not in f)
dir_listing = shuffle(dir_listing, random_state=43)

In [None]:
# Prefix every file name with the data directory to obtain its full path.
specgrams = ['{}{}'.format(data_dir, file_name) for file_name in dir_listing]

In [None]:
# Show the first path (as a one-element list) to check how the paths were built.
specgrams[0:1]

In [None]:
from keras.preprocessing import image
import PIL
from sklearn.utils import shuffle

In [None]:
def img_path_to_tensor(img_path):
    """Load an image file and return it as a uint8 array with a leading batch axis.

    The image is cropped before conversion: 3 pixels are removed from the
    left, top and right edges and 4 pixels from the bottom edge.

    Args:
        img_path: path of the image file to load.

    Returns:
        A numpy uint8 array whose first axis has length 1, followed by the
        axes produced by `image.img_to_array`.
    """
    picture = image.load_img(img_path)

    # Crop box order is (left, upper, right, lower).
    crop_box = (3, 3, picture.width - 3, picture.height - 4)
    pixels = image.img_to_array(picture.crop(crop_box))

    # Prepend a length-one axis so the per-image arrays can be stacked later.
    batch_of_one = pixels[np.newaxis, ...]

    return np.asarray(batch_of_one, np.uint8)

In [None]:
# Extract the speaker IDs from the file names for use in k-fold and LOSO evaluation.
# str_index is the position of the last '/' in a path, i.e. the separator
# directly in front of the file name. Using rindex keeps this correct when
# data_dir is nested more than one level deep or has no leading slash.
str_index = specgrams[0].rindex('/')
print(str_index)
# The two characters right after the separator are taken as the speaker ID.
kfold_groups = [x[str_index+1:str_index+3] for x in specgrams]
np.unique(kfold_groups)

In [None]:
# Load every spectrogram image; each entry is a uint8 array with a leading axis of length 1.
list_of_tensors = list(map(img_path_to_tensor, specgrams))

In [None]:
# Shape of the first loaded array (its leading axis is the length-one axis added by img_path_to_tensor).
list_of_tensors[0].shape

In [None]:
# Plot a spectrogram to ensure the file has been read in correctly
fig, ax = plt.subplots()
# Index [0][0] selects the first image and drops its length-one leading axis.
first_image = np.asarray(list_of_tensors[0][0], dtype=np.uint8)
ax.imshow(first_image)
fig.tight_layout()
plt.show()

In [None]:
# Close the current figure now that it has been displayed.
plt.close()

In [None]:
# Join the per-image arrays along their leading axis into a single array.
list_of_tensors = np.concatenate(list_of_tensors, axis=0)

In [None]:
# Shape of the stacked array; the first axis now counts the loaded images.
list_of_tensors.shape

In [None]:
# Index of the emotion label character within each path: the sixth character
# after the '/' located at str_index (the speaker ID occupies the first two).
label_start_index = str_index + 6

In [None]:
# Display the emotion label character of the first path (a slice of length one)
# to ensure it's correctly found.
specgrams[0][label_start_index:label_start_index+1]

In [None]:
# One-hot encoding of the emotion labels; mapping from German to English emotion labels

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Single-letter emotion code found in each path -> English emotion name.
eng_dict = dict(W='Anger', L='Boredom', E='Disgust', A='Fear', F='Happy', T='Sad', N='Neutral')
# A code that is missing from eng_dict raises KeyError here.
labels = [eng_dict[path[label_start_index:label_start_index+1]] for path in specgrams]

# Integer-encode the emotion names, shaped as a column vector.
encoder = LabelEncoder()
Y = encoder.fit_transform(labels)[:, np.newaxis]
print(Y.shape)

# Expand the integer codes into one-hot rows, one column per distinct class.
num_classes = np.unique(Y).size
Y = keras.utils.to_categorical(Y, num_classes)
print(Y.shape)

In [None]:
# Convert the image data to float32 and divide by 255 to rescale the pixel values.
X = list_of_tensors.astype('float32')/255
# Drop the reference to the unscaled array so its memory can be reclaimed.
list_of_tensors = None

In [None]:
# Shape of the scaled input array that is fed to the model.
X.shape

In [None]:
import tensorflow as tf
# Ask TensorFlow for the name of an available GPU device and display it.
tf.test.gpu_device_name()

In [None]:
def compile_model(model):
  """Compile `model` for multi-class classification and return it.

  Uses the Adam optimizer with a learning rate of 0.00075, categorical
  cross-entropy as the loss and accuracy as the reported metric.

  Args:
    model: an uncompiled Keras model.

  Returns:
    The same model object, after `compile` has been called on it.
  """
  from keras.optimizers import Adam

  # Reference the optimizer by its class name: the lowercase `adam` alias is
  # not available in every Keras release. The `lr` keyword is kept because the
  # rest of this notebook targets the older Keras API.
  opt = Adam(lr=0.00075)
  model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

  return model


In [None]:
def create_callbacks(checkpoint_verbosity=1):
  """Build the Keras callbacks used during training.

  Args:
    checkpoint_verbosity: value passed as `verbose` to the ModelCheckpoint.

  Returns:
    A two-element list: [ModelCheckpoint, EarlyStopping]. The checkpoint
    writes to 'saved_models/weights.best.from_scratch.hdf5' with
    save_best_only=True; early stopping monitors 'val_loss' with patience 5.
  """
  from keras.callbacks import EarlyStopping, ModelCheckpoint

  weights_path = 'saved_models/weights.best.from_scratch.hdf5'
  return [
      ModelCheckpoint(filepath=weights_path, verbose=checkpoint_verbosity,
                      save_best_only=True),
      EarlyStopping(monitor='val_loss', patience=5),
  ]


In [None]:
def summarise_train_val_split():
  """Print the sample counts of the current train/test split and the class count.

  Reads the notebook-level variables x_train, x_test, y_train, y_test and
  num_classes; they must be defined before this function is called.
  """
  def _report(count, caption):
    # One output line per quantity: "<count> <caption>".
    print(count, caption)

  _report(x_train.shape[0], 'X train samples')
  _report(x_test.shape[0], 'X test samples')
  _report(y_train.shape[0], 'Y train samples')
  _report(y_test.shape[0], 'Y test samples')

  _report(num_classes, 'classes')

In [None]:
from sklearn.model_selection import train_test_split, GroupKFold, LeaveOneGroupOut, KFold

In [None]:
from sklearn.metrics import precision_recall_fscore_support, recall_score, precision_score, confusion_matrix, accuracy_score, classification_report

In [None]:
# Uncomment the lines as required to choose the groupings for LOSO/k-fold cross-val
# Default is XBOW

# XBOW-style split (test set is all speakers with ID <= 10):
# speakers with ID <= 10 get group label 0, every other speaker gets label 1.
xbow_labels = [int(int(speaker) > 10) for speaker in kfold_groups]
groupings = np.unique(xbow_labels, return_index=True, return_inverse=True)

# Full speaker-independent split (10 speakers)
# groupings = np.unique(kfold_groups, return_index=True, return_inverse=True)
groupings

In [None]:
# Speaker-independent
# groupings[2] is the inverse-index array returned by np.unique: one group label per sample.
cv_groups = groupings[2]
# Each fold holds out all samples of exactly one group.
cv = LeaveOneGroupOut()

# Speaker-dependent
# cv_groups=None
# cv = KFold(n_splits=5, shuffle=True)

In [None]:
def display_learning_curves(history):
  """Plot the training and validation curves for accuracy and for loss.

  Args:
    history: object returned by `model.fit`; its `history` attribute maps
      metric names to per-epoch values. It must contain 'loss' and 'val_loss',
      plus the accuracy curves under either 'acc'/'val_acc' or
      'accuracy'/'val_accuracy'.

  Raises:
    KeyError: if one of the required curves is missing.
  """
  metrics = history.history
  # Older Keras releases record the accuracy metric as 'acc', newer ones as
  # 'accuracy'; accept whichever spelling is present.
  acc_key = 'acc' if 'acc' in metrics else 'accuracy'

  _plot_train_val_curve(metrics, acc_key, 'accuracy')
  # summarize history for loss
  _plot_train_val_curve(metrics, 'loss', 'loss')


def _plot_train_val_curve(metrics, key, label):
  """Draw one figure with the training (`key`) and validation ('val_' + key) curves."""
  plt.plot(metrics[key])
  plt.plot(metrics['val_' + key])
  plt.title('model ' + label)
  plt.ylabel(label)
  plt.xlabel('epoch')
  plt.legend(['train', 'validation'], loc='upper left')
  plt.show()

In [None]:
def create_model(show_model=True, input_shape=None, n_classes=None):
  """Build the (uncompiled) CNN used to classify the spectrogram images.

  Layer stack: two [Conv2D -> BatchNormalization -> ReLU -> MaxPooling2D]
  stages, Flatten, two [Dense(1024, relu) -> BatchNormalization -> ReLU ->
  Dropout(0.2)] stages and a softmax output layer.

  Args:
    show_model: when true, print `model.summary()`.
    input_shape: shape of a single input sample. Defaults to `X.shape[1:]`,
      read from the notebook-level array `X`.
    n_classes: number of units in the softmax output layer. Defaults to the
      notebook-level `num_classes`.

  Returns:
    The uncompiled `Sequential` model.
  """
  from keras.models import Sequential
  from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
  from keras.layers import Activation, BatchNormalization

  # Fall back to the notebook globals only when the caller gave no value.
  if input_shape is None:
    input_shape = X.shape[1:]
  if n_classes is None:
    n_classes = num_classes

  model = Sequential()

  model.add(Conv2D(filters=64, kernel_size=5, strides=3, padding='same', input_shape=input_shape))
  model.add(BatchNormalization())
  model.add(Activation('relu'))

  model.add(MaxPooling2D(pool_size=2))

  model.add(Conv2D(filters=128, kernel_size=3, strides=2, padding='same'))
  model.add(BatchNormalization())
  model.add(Activation('relu'))

  model.add(MaxPooling2D(pool_size=2))

  model.add(Flatten())

  # NOTE(review): ReLU is applied inside Dense and again after
  # BatchNormalization, unlike the conv stages (Conv -> BN -> ReLU). Left
  # unchanged so the architecture stays identical — confirm it is intended.
  for _ in range(2):
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

  model.add(Dense(n_classes, activation='softmax'))

  if show_model:
    model.summary()

  return model

### If you want to do k-fold cross-validation on your model, run the next two cells.

In [None]:
# Run this cell if you want to perform a k-fold cross-validation on your data.
# Warning! Running this could take some time!
# Ensure that you have selected the appropriate cross-validator above, and the LOSO groupings if required.
#
# Each entry of `scores` is [fold index, accuracy, loss, WA, UAR] for one fold.
scores = []
for index, (train_idx, val_idx) in enumerate(cv.split(X, Y, groups=cv_groups)):
  print("Training on fold: %s" % index)
  print(len(train_idx), len(val_idx))
  x_train, x_val = X[train_idx], X[val_idx]
  y_train, y_val = Y[train_idx], Y[val_idx]

  # Build a fresh, untrained model for every fold.
  model = None
  model = create_model(show_model=False)
  model = compile_model(model)
  callbacks = create_callbacks(checkpoint_verbosity=0)

  # The held-out fold (x_val, y_val) is not passed to fit(); validation_split
  # takes the validation data out of the training fold instead. Only the
  # checkpoint callback (callbacks[0]) is used, so early stopping is inactive.
  history = model.fit(x_train, y_train, batch_size=16, epochs=25,
                      validation_split=0.33, callbacks=[callbacks[0]], verbose=0, shuffle=True)

  display_learning_curves(history)

  print(x_val.shape, y_val.shape)
  # Restore the checkpointed weights before scoring the held-out fold.
  model.load_weights('saved_models/weights.best.from_scratch.hdf5')
  score = model.evaluate(x_val, y_val, verbose=1)
  print('\n', 'Test (KFold Validation Set) accuracy:', score[1])

  # One-hot encode the predicted classes. The width comes from num_classes
  # (not a hard-coded 7) so it always matches the encoding of Y.
  preds = model.predict(x_val)
  preds = np_utils.to_categorical(np.argmax(preds, axis=1), num_classes)

  # Compute each metric once and reuse it for both the printout and the record.
  fold_wa = accuracy_score(y_val, preds)
  fold_uar = np.mean(recall_score(y_val, preds, average=None))
  print('Score: {}; Acc(WA): {}; UAR: {}'.format(score[1], fold_wa, fold_uar))

  scores.append([index, score[1], score[0], fold_wa, fold_uar])

In [None]:
# Run this cell if you want to see the mean result values for a k-fold cross-val experiment
import pandas as pd

# One row per fold, in the order the folds were appended to `scores`.
result_columns = ['KFold', 'Test Acc', 'Test Loss', 'WA', 'UAR']
df = pd.DataFrame(scores, columns=result_columns)

mean_wa = np.mean(df['WA'])
mean_uar = np.mean(df['UAR'])
print('Mean WA: ', mean_wa)
print('Mean UAR: ', mean_uar)
df

### If you want to perform an XBOW-style LOSO validation or just train a model, use the cells below to do so.

In [None]:
# Uncomment the lines below depending on whether you want a 'vanilla' train/test split
# or an XBOW-style data split (with the validation dataset taken from the XBOW-grouped training data -- more robust!)

# 1. "OLD SCHOOL" (non-KFold i.e. speaker-dependent)
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=43, stratify=Y)

# 2. This will double-check that the cross-validator is working as expected,
# it should produce results similar to full CV
## XBOW-style LOSO data split: only the first fold yielded by the cross-validator is used
train_idx, test_idx = next(iter(cv.split(X, Y, groups=cv_groups)))
x_train_tmp, y_train_tmp = X[train_idx], Y[train_idx]
x_test, y_test = X[test_idx], Y[test_idx]

# Carve a stratified validation set (33%) out of the training portion.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_tmp, y_train_tmp, test_size=0.33, random_state=43, stratify=y_train_tmp)

summarise_train_val_split()

print(x_val.shape[0], 'X validation samples')
print(y_val.shape[0], 'Y validation samples')

In [None]:
# Drop the references to the intermediate split arrays and to any earlier training history.
x_train_tmp = None
y_train_tmp = None
history = None

In [None]:
# Discard any previous model, then build and compile a fresh one
# (create_model prints the model summary by default).
model = None
model = create_model()
model = compile_model(model)
# create_callbacks returns [ModelCheckpoint, EarlyStopping].
callbacks = create_callbacks()

In [None]:
# Uncomment one or the other validation lines below, depending on your needs.
# For example, if you chose a 'vanilla' train/test split above, just choose 'validation_split'.
# However if you used the XBOW data split, choose the 'validation_data' line.
#
# NOTE: only callbacks[0] (the ModelCheckpoint) is passed to fit(); the
# EarlyStopping callback returned by create_callbacks() is not used here.
history = model.fit(x_train, y_train, batch_size=16, epochs=25, 
          validation_data=(x_val, y_val),
#           validation_split=0.15,
          callbacks=[callbacks[0]], verbose=1, shuffle=True)

# Plot the accuracy and loss curves recorded during training.
display_learning_curves(history)

### Run the cells below to see the testing results and metrics, as well as classification reports and heatmap confusion matrices.

In [None]:
# Shapes of the held-out test inputs and targets.
print(x_test.shape, y_test.shape)
# Reload the weights written by the ModelCheckpoint callback before evaluating.
model.load_weights('saved_models/weights.best.from_scratch.hdf5')
# score[1] is reported as the accuracy (the only metric set in compile_model).
score = model.evaluate(x_test, y_test, verbose=1)
print('\n', 'Test accuracy:', score[1])

In [None]:
# Turn the predicted class probabilities into one-hot rows so they can be
# compared with the one-hot y_test. The width comes from num_classes rather
# than a hard-coded 7, so it always matches the encoding of Y.
preds = model.predict(x_test)
preds = np_utils.to_categorical(np.argmax(preds, axis=1), num_classes)

In [None]:
# Per-class report; the rows are named after the sorted unique label names.
class_names = np.unique(labels)
print(classification_report(y_test, preds, target_names=class_names))
print()

# WA is the overall accuracy; UAR is the unweighted mean of the per-class recalls.
test_wa = accuracy_score(y_test, preds)
test_uar = np.mean(recall_score(y_test, preds, average=None))
print('Score: {}; Acc(WA): {}; UAR: {}'.format(score[1], test_wa, test_uar))


# Compare the accuracy on the training data with the test accuracy.
train_score = model.evaluate(x_train, y_train, verbose=0)
train_acc = train_score[1]
print('Train Score (measure of bias/variance): {}'.format(train_acc))
print('Bias: {}'.format(1-train_acc))
print('Variance: {}'.format(train_acc - score[1]))

In [None]:
# preds is rebound here to integer class indices (argmax over the model outputs).
preds = np.argmax(model.predict(x_test), axis=1)
# Confusion matrix of the true class indices against the predicted class indices.
cm = confusion_matrix(np.argmax(y_test, axis=1), preds)

In [None]:
import seaborn as sb

In [None]:
# Class indices that actually occur in the test targets.
np.unique(np.argmax(y_test, axis=1))

In [None]:
# Sorted unique emotion names, used as axis labels for the confusion-matrix heatmaps.
# NOTE(review): presumably this order matches the integer codes assigned by the LabelEncoder — verify.
classes = np.unique(labels)

In [None]:
%matplotlib inline

In [None]:
# Heatmap of the current contents of cm, with class names on both axes and the cell values annotated.
sb.heatmap(pd.DataFrame(cm, index=classes, columns=classes), annot=True)

In [None]:
# Convert counts to row-wise percentages: each row is divided by its total and scaled by 100.
row_totals = cm.sum(axis=1, keepdims=True)
cm = cm.astype('float') / row_totals * 100.0

In [None]:
# Heatmap of the current contents of cm, with the annotated cell values formatted as '2.2f'.
sb.heatmap(pd.DataFrame(cm, index=classes, columns=classes), annot=True, fmt='2.2f')

In [None]:
# Run this for a simple visualisation of the CNN model.
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Render the model graph to SVG with the 'dot' program and display it inline;
# layer shapes are shown, layer names are hidden.
SVG(model_to_dot(model, show_layer_names=False, show_shapes=True).create(prog='dot', format='svg'))