In [1]:
import tensorflow
import tensorflow.keras as keras
from keras.models import Sequential, load_model
from keras.layers import Reshape, Permute, GRU
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, BatchNormalization, Flatten, Dropout, LSTM
from keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau
import matplotlib.pyplot as plt
import pandas as pd
import zipfile
import numpy as np

In [2]:
# multi label predictions and accuracy measures
from sklearn.metrics import multilabel_confusion_matrix, classification_report, f1_score

In [3]:
def unzip(zip_file_path, extract_path):
    """Extract every member of the zip archive at zip_file_path into extract_path."""
    archive = zipfile.ZipFile(zip_file_path, mode='r')
    try:
        archive.extractall(path=extract_path)
    finally:
        # always release the file handle, even if extraction fails
        archive.close()

In [4]:
def compiler(cnn, learning_rate=0.001, num_labels=10):
  """
  Compile `cnn` in place for multi-label classification.

  By default, the Adam optimizer from keras is used. In cases where this does
  not work, the optimizer can be chosen differently, e.g. legacy.Adam.
  The loss is binary cross-entropy and the only metric is a multi-label
  ROC AUC named "auc".

  Parameters
  ----------
  cnn : model exposing a keras-style ``compile`` method
  learning_rate : step size passed to the Adam optimizer (default 0.001)
  num_labels : number of labels handed to the AUC metric (default 10)

  Returns
  -------
  None; the model is modified in place.
  """
  auc_metric = keras.metrics.AUC(name="auc",
                                 curve="ROC",
                                 num_labels=num_labels,
                                 multi_label=True)
  cnn.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
              loss=keras.losses.binary_crossentropy,
              # keras documents `metrics` as a list; a bare metric object is
              # not accepted by every keras version
              metrics=[auc_metric])

In [5]:
# optimize per-class decision thresholds on the validation set; maximizing the
# F1 score of every class independently also maximizes the macro F1 score
def optimize_thresholds(y_val, y_pred_val, candidates=None, fallback=0):
    """
    Pick, for every label column, the candidate threshold with the highest F1 score.

    Parameters
    ----------
    y_val : 2-D array of true binary labels, one column per class
    y_pred_val : 2-D array of predicted scores with the same shape as y_val
    candidates : optional iterable of thresholds to try, in order.
        Defaults to np.arange(0.1, 1.0, 0.05), i.e. 0.1, 0.15, ..., 0.95.
    fallback : threshold recorded for a class when no candidate reaches an
        F1 score above 0 (default 0). NOTE: assuming the scores are
        non-negative probabilities, a threshold of 0 marks every sample as
        positive for that class.

    Returns
    -------
    list of float, one threshold per class. On ties the first (lowest-index)
    candidate wins because only a strictly higher F1 replaces the best so far.

    Raises
    ------
    ValueError if y_val and y_pred_val do not have the same shape.
    """
    y_val = np.asarray(y_val)
    y_pred_val = np.asarray(y_pred_val)
    if y_val.shape != y_pred_val.shape:
        raise ValueError(
            f"y_val and y_pred_val must have the same shape, got {y_val.shape} and {y_pred_val.shape}"
        )

    # the grid is identical for every class, so build it once
    if candidates is None:
        candidates = np.arange(0.1, 1.0, 0.05)
    else:
        candidates = np.asarray(candidates, dtype=float)

    thresholds = []

    for i in range(y_val.shape[1]):
        best_threshold = fallback
        best_f1 = 0

        for threshold in candidates:
            y_pred_thresholded = (y_pred_val[:, i] >= threshold).astype(int)
            # zero_division=0 keeps the score at 0 (without a warning) when a
            # threshold yields no positive predictions
            f1 = f1_score(y_val[:, i], y_pred_thresholded, zero_division=0)

            # Update best threshold only if the F1-score is strictly higher
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        thresholds.append(float(best_threshold))

    return thresholds

# print performance scores

# class names handed to classification_report as target_names
# (assumed to match the column order of the label arrays)
_MOOD_LABELS = ['inspiring', 'energetic', 'happy', 'sad', 'motivational',
                'relaxing', 'uplifting', 'calm', 'love', 'hopeful']


def _print_multilabel_report(y_true, y_pred_labels):
    """Print the per-class classification report followed by the multi-label accuracy."""
    report = classification_report(
        y_true,
        y_pred_labels,
        output_dict=False,
        target_names=_MOOD_LABELS,
        zero_division=0,
    )

    # summing the per-class 2x2 confusion matrices pools the counts over all
    # labels; the pooled matrix is laid out as [[TN, FP], [FN, TP]]
    pooled = multilabel_confusion_matrix(y_true, y_pred_labels).sum(axis=0)
    multi_label_accuracy = (pooled[1, 1] + pooled[0, 0]) / pooled.sum()

    print(report)
    print()
    print("Multi-label accuracy:", multi_label_accuracy)


def get_classification_report_val(cnn, val_data=None):
    """
    Print the classification report and multi-label accuracy for the validation set.

    The probability output of the model is converted to labels using the
    per-class thresholds found by optimize_thresholds on the same validation data.

    Parameters
    ----------
    cnn : model exposing ``predict``
    val_data : optional ``(X, y)`` tuple. When omitted, the notebook-level
        globals ``X_val`` and ``y_val`` are used, so the desired split must
        be loaded beforehand.
    """
    X, y = (X_val, y_val) if val_data is None else val_data

    y_pred_val = cnn.predict(X)
    thresholds = optimize_thresholds(y, y_pred_val)
    y_pred_val_labels = np.where(y_pred_val >= thresholds, 1, 0)

    _print_multilabel_report(y, y_pred_val_labels)


def get_classification_report_test(cnn, val_data=None, test_data=None):
    """
    Print the classification report and multi-label accuracy for the test set.

    The per-class thresholds are optimized on the validation set only and
    then applied to the model's test-set probabilities, so the test labels
    are never used for threshold selection.

    Parameters
    ----------
    cnn : model exposing ``predict``
    val_data : optional ``(X, y)`` tuple used to tune the thresholds.
        Defaults to the notebook-level globals ``X_val`` and ``y_val``.
    test_data : optional ``(X, y)`` tuple to report on.
        Defaults to the notebook-level globals ``X_test`` and ``y_test``.
    """
    X_tune, y_tune = (X_val, y_val) if val_data is None else val_data
    X_eval, y_eval = (X_test, y_test) if test_data is None else test_data

    y_pred_val = cnn.predict(X_tune)
    thresholds = optimize_thresholds(y_tune, y_pred_val)
    y_pred_test = cnn.predict(X_eval)
    y_pred_test_labels = np.where(y_pred_test >= thresholds, 1, 0)

    _print_multilabel_report(y_eval, y_pred_test_labels)

In [6]:
# NOTE: the report functions read the global X_val, y_val, X_test and y_test arrays, so load the desired split (start, middle or end) before calculating scores

### CNN1 performance

In [7]:
# load cnn1: extract the zipped saved model into a directory of the same name
unzip('cnn1_middle_L2_Dropout0.5_padding.zip', 'cnn1_middle_L2_Dropout0.5_padding')

In [8]:
# load the saved cnn1 model from the directory extracted in the previous cell
cnn1 = tensorflow.keras.models.load_model('cnn1_middle_L2_Dropout0.5_padding')

In [9]:
# re-compile with the optimizer, loss and multi-label AUC metric set in compiler()
compiler(cnn1)

In [10]:
# show the layers, output shapes and parameter counts of cnn1
cnn1.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_16 (Conv2D)          (None, 1292, 20, 32)      320       
                                                                 
 max_pooling2d_16 (MaxPoolin  (None, 646, 10, 32)      0         
 g2D)                                                            
                                                                 
 batch_normalization_16 (Bat  (None, 646, 10, 32)      128       
 chNormalization)                                                
                                                                 
 dropout_18 (Dropout)        (None, 646, 10, 32)       0         
                                                                 
 conv2d_17 (Conv2D)          (None, 646, 10, 64)       18496     
                                                                 
 max_pooling2d_17 (MaxPoolin  (None, 216, 5, 64)      

**Performance with middle MFCCs**

In [11]:
# load test data (middle MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_arrays/X_test.npy')
y_test = np.load('mfcc30s20_arrays/y_test.npy')
# load validation data (middle MFCCs)
X_val = np.load('mfcc30s20_arrays/X_val.npy')
y_val = np.load('mfcc30s20_arrays/y_val.npy')

In [12]:
# evaluate cnn1 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn1_evaluation_val = cnn1.evaluate(x = X_val, y= y_val)



In [13]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn1)

              precision    recall  f1-score   support

   inspiring       1.00      0.01      0.02        83
   energetic       0.38      0.64      0.48       286
       happy       0.29      0.78      0.42       334
         sad       0.03      1.00      0.06        40
motivational       0.15      0.47      0.23       161
    relaxing       0.15      0.55      0.23       186
   uplifting       0.22      0.02      0.04       100
        calm       0.39      0.57      0.46       254
        love       0.20      0.47      0.28       141
     hopeful       0.20      0.01      0.01       142

   micro avg       0.19      0.51      0.27      1727
   macro avg       0.30      0.45      0.22      1727
weighted avg       0.30      0.51      0.30      1727
 samples avg       0.19      0.52      0.27      1727


Multi-label accuracy: 0.667728237791932


In [14]:
# evaluate cnn1 on the currently loaded test set (loss and AUC as configured in compiler)
cnn1_evaluation_test = cnn1.evaluate(x = X_test, y= y_test)



In [15]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn1)

              precision    recall  f1-score   support

   inspiring       0.00      0.00      0.00        96
   energetic       0.36      0.72      0.48       259
       happy       0.31      0.80      0.45       334
         sad       0.03      1.00      0.06        40
motivational       0.17      0.49      0.25       179
    relaxing       0.15      0.52      0.23       193
   uplifting       0.53      0.07      0.12       132
        calm       0.39      0.54      0.45       268
        love       0.20      0.44      0.27       150
     hopeful       0.00      0.00      0.00       132

   micro avg       0.19      0.51      0.28      1783
   macro avg       0.21      0.46      0.23      1783
weighted avg       0.26      0.51      0.30      1783
 samples avg       0.19      0.52      0.27      1783


Multi-label accuracy: 0.6679405520169851


**Performance with start MFCCs**

In [16]:
# load test and validation data (start MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_start/X_test.npy')
y_test = np.load('mfcc30s20_start/y_test.npy')

X_val = np.load('mfcc30s20_start/X_val.npy')
y_val = np.load('mfcc30s20_start/y_val.npy')

In [17]:
# evaluate cnn1 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn1_evaluation_val = cnn1.evaluate(x = X_val, y= y_val)



In [18]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn1)

              precision    recall  f1-score   support

   inspiring       0.07      1.00      0.13        97
   energetic       0.40      0.66      0.50       276
       happy       0.33      0.79      0.47       328
         sad       0.03      1.00      0.06        42
motivational       0.15      0.52      0.24       172
    relaxing       0.14      0.58      0.23       178
   uplifting       0.27      0.03      0.06       129
        calm       0.35      0.73      0.47       273
        love       0.16      0.90      0.27       144
     hopeful       0.14      0.01      0.02       121

   micro avg       0.16      0.63      0.26      1760
   macro avg       0.20      0.62      0.24      1760
weighted avg       0.26      0.63      0.32      1760
 samples avg       0.17      0.65      0.26      1760


Multi-label accuracy: 0.5547249647390691


In [19]:
# evaluate cnn1 on the currently loaded test set (loss and AUC as configured in compiler)
cnn1_evaluation_test = cnn1.evaluate(x = X_test, y= y_test)



In [20]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn1)

              precision    recall  f1-score   support

   inspiring       0.07      1.00      0.13        99
   energetic       0.37      0.62      0.46       266
       happy       0.34      0.76      0.47       334
         sad       0.03      1.00      0.06        46
motivational       0.19      0.53      0.28       193
    relaxing       0.15      0.50      0.23       202
   uplifting       0.60      0.03      0.05       114
        calm       0.32      0.77      0.45       252
        love       0.17      0.85      0.28       160
     hopeful       0.50      0.02      0.04       152

   micro avg       0.17      0.61      0.26      1818
   macro avg       0.27      0.61      0.25      1818
weighted avg       0.30      0.61      0.31      1818
 samples avg       0.17      0.63      0.26      1818


Multi-label accuracy: 0.5571932299012694


**Performance with end MFCCs**

In [21]:
# load test and validation data (end MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_end/X_test.npy')
y_test = np.load('mfcc30s20_end/y_test.npy')

X_val = np.load('mfcc30s20_end/X_val.npy')
y_val = np.load('mfcc30s20_end/y_val.npy')

In [22]:
# evaluate cnn1 on the currently loaded validation set
# reports the loss and the validation macro ROC-AUC configured in compiler
cnn1_evaluation_val = cnn1.evaluate(x = X_val, y= y_val)



In [23]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn1)

              precision    recall  f1-score   support

   inspiring       0.07      1.00      0.13        97
   energetic       0.41      0.66      0.50       276
       happy       0.32      0.75      0.45       328
         sad       0.03      1.00      0.06        42
motivational       0.18      0.59      0.27       172
    relaxing       0.13      0.53      0.21       178
   uplifting       0.12      0.01      0.01       129
        calm       0.37      0.78      0.51       273
        love       0.18      0.82      0.29       144
     hopeful       0.09      1.00      0.16       121

   micro avg       0.15      0.69      0.25      1760
   macro avg       0.19      0.71      0.26      1760
weighted avg       0.25      0.69      0.33      1760
 samples avg       0.15      0.70      0.24      1760


Multi-label accuracy: 0.4814527503526093


In [24]:
# evaluate cnn1 on the currently loaded test set; reports the loss and the test macro ROC-AUC
cnn1_evaluation_test = cnn1.evaluate(x = X_test, y= y_test)



In [25]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn1)

              precision    recall  f1-score   support

   inspiring       0.07      1.00      0.13        99
   energetic       0.43      0.66      0.52       266
       happy       0.33      0.71      0.45       334
         sad       0.03      1.00      0.06        46
motivational       0.19      0.55      0.28       193
    relaxing       0.17      0.63      0.27       202
   uplifting       1.00      0.03      0.05       114
        calm       0.31      0.77      0.45       252
        love       0.17      0.79      0.29       160
     hopeful       0.11      1.00      0.19       152

   micro avg       0.16      0.70      0.26      1818
   macro avg       0.28      0.71      0.27      1818
weighted avg       0.30      0.70      0.33      1818
 samples avg       0.16      0.71      0.25      1818


Multi-label accuracy: 0.4825811001410437


### CNN2 performance

In [26]:
# load cnn2: extract the zipped saved model into a directory of the same name
unzip('CNN2_middle_noL2_Dropuout_0.5.zip', 'CNN2_middle_noL2_Dropuout_0.5')

In [27]:
# load the saved cnn2 model from the directory extracted in the previous cell
cnn2 = tensorflow.keras.models.load_model('CNN2_middle_noL2_Dropuout_0.5')

In [28]:
# re-compile with the optimizer, loss and multi-label AUC metric set in compiler()
compiler(cnn2)

In [29]:
# show the layers, output shapes and parameter counts of cnn2
cnn2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 1290, 18, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 645, 17, 32)      0         
 )                                                               
                                                                 
 batch_normalization (BatchN  (None, 645, 17, 32)      128       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 645, 17, 32)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 643, 15, 64)       18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 321, 14, 64)      0

**Performance with middle MFCCs**

In [30]:
# load test data (middle MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_arrays/X_test.npy')
y_test = np.load('mfcc30s20_arrays/y_test.npy')
# load validation data (middle MFCCs)
X_val = np.load('mfcc30s20_arrays/X_val.npy')
y_val = np.load('mfcc30s20_arrays/y_val.npy')

In [31]:
# evaluate cnn2 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn2_evaluation_val = cnn2.evaluate(x = X_val, y= y_val)



In [32]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn2)

              precision    recall  f1-score   support

   inspiring       0.18      0.02      0.04        83
   energetic       0.36      0.76      0.49       286
       happy       0.32      0.67      0.43       334
         sad       0.03      1.00      0.06        40
motivational       0.13      0.76      0.22       161
    relaxing       0.16      0.43      0.23       186
   uplifting       0.11      0.44      0.18       100
        calm       0.34      0.65      0.44       254
        love       0.16      0.57      0.26       141
     hopeful       0.13      0.50      0.20       142

   micro avg       0.17      0.61      0.27      1727
   macro avg       0.19      0.58      0.25      1727
weighted avg       0.24      0.61      0.33      1727
 samples avg       0.17      0.60      0.26      1727


Multi-label accuracy: 0.5928520877565464


In [33]:
# evaluate cnn2 on the currently loaded test set (loss and AUC as configured in compiler)
cnn2_evaluation_test = cnn2.evaluate(x = X_test, y= y_test)



In [34]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn2)

              precision    recall  f1-score   support

   inspiring       0.13      0.02      0.04        96
   energetic       0.33      0.79      0.47       259
       happy       0.31      0.64      0.42       334
         sad       0.03      1.00      0.06        40
motivational       0.15      0.77      0.25       179
    relaxing       0.18      0.48      0.26       193
   uplifting       0.17      0.51      0.26       132
        calm       0.33      0.63      0.43       268
        love       0.16      0.56      0.25       150
     hopeful       0.12      0.52      0.20       132

   micro avg       0.18      0.60      0.27      1783
   macro avg       0.19      0.59      0.26      1783
weighted avg       0.23      0.60      0.32      1783
 samples avg       0.18      0.60      0.26      1783


Multi-label accuracy: 0.5916489738145789


**Performance with start MFCCs**

In [35]:
# load test and validation data (start MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_start/X_test.npy')
y_test = np.load('mfcc30s20_start/y_test.npy')

X_val = np.load('mfcc30s20_start/X_val.npy')
y_val = np.load('mfcc30s20_start/y_val.npy')

In [36]:
# evaluate cnn2 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn2_evaluation_val = cnn2.evaluate(x = X_val, y= y_val)



In [37]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn2)

              precision    recall  f1-score   support

   inspiring       0.08      0.02      0.03        97
   energetic       0.38      0.15      0.22       276
       happy       0.34      0.19      0.24       328
         sad       0.03      1.00      0.06        42
motivational       0.11      0.59      0.19       172
    relaxing       0.09      0.22      0.12       178
   uplifting       0.09      1.00      0.17       129
        calm       0.23      0.74      0.35       273
        love       0.09      0.41      0.15       144
     hopeful       0.08      0.01      0.01       121

   micro avg       0.11      0.39      0.17      1760
   macro avg       0.15      0.43      0.15      1760
weighted avg       0.20      0.39      0.19      1760
 samples avg       0.11      0.37      0.17      1760


Multi-label accuracy: 0.5464739069111425


In [38]:
# evaluate cnn2 on the currently loaded test set (loss and AUC as configured in compiler)
cnn2_evaluation_test = cnn2.evaluate(x = X_test, y= y_test)



In [39]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn2)

              precision    recall  f1-score   support

   inspiring       0.08      0.02      0.03        99
   energetic       0.42      0.17      0.24       266
       happy       0.33      0.19      0.24       334
         sad       0.03      1.00      0.06        46
motivational       0.14      0.63      0.23       193
    relaxing       0.12      0.26      0.16       202
   uplifting       0.08      1.00      0.15       114
        calm       0.22      0.78      0.35       252
        love       0.09      0.34      0.14       160
     hopeful       0.07      0.01      0.01       152

   micro avg       0.12      0.38      0.18      1818
   macro avg       0.16      0.44      0.16      1818
weighted avg       0.20      0.38      0.20      1818
 samples avg       0.11      0.38      0.17      1818


Multi-label accuracy: 0.5454866008462623


**Performance with end MFCCs**

In [40]:
# load test and validation data (end MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_end/X_test.npy')
y_test = np.load('mfcc30s20_end/y_test.npy')

X_val = np.load('mfcc30s20_end/X_val.npy')
y_val = np.load('mfcc30s20_end/y_val.npy')

In [41]:
# evaluate cnn2 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn2_evaluation_val = cnn2.evaluate(x = X_val, y= y_val)



In [42]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn2)

              precision    recall  f1-score   support

   inspiring       0.07      1.00      0.13        97
   energetic       0.28      0.06      0.10       276
       happy       0.33      0.08      0.13       328
         sad       0.03      1.00      0.06        42
motivational       0.13      0.52      0.20       172
    relaxing       0.10      0.14      0.12       178
   uplifting       0.25      0.02      0.03       129
        calm       0.22      0.84      0.35       273
        love       0.10      0.30      0.15       144
     hopeful       0.17      0.01      0.02       121

   micro avg       0.11      0.32      0.16      1760
   macro avg       0.17      0.40      0.13      1760
weighted avg       0.21      0.32      0.15      1760
 samples avg       0.11      0.33      0.16      1760


Multi-label accuracy: 0.5772214386459803


In [43]:
# evaluate cnn2 on the currently loaded test set (loss and AUC as configured in compiler)
cnn2_evaluation_test = cnn2.evaluate(x = X_test, y= y_test)



In [44]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn2)

              precision    recall  f1-score   support

   inspiring       0.07      1.00      0.13        99
   energetic       0.40      0.10      0.16       266
       happy       0.33      0.08      0.13       334
         sad       0.03      1.00      0.06        46
motivational       0.17      0.57      0.26       193
    relaxing       0.13      0.15      0.14       202
   uplifting       0.00      0.00      0.00       114
        calm       0.20      0.83      0.32       252
        love       0.10      0.26      0.15       160
     hopeful       0.09      0.01      0.01       152

   micro avg       0.11      0.33      0.17      1818
   macro avg       0.15      0.40      0.14      1818
weighted avg       0.20      0.33      0.16      1818
 samples avg       0.11      0.34      0.16      1818


Multi-label accuracy: 0.5777150916784203


### CNN3 performance

In [45]:
# load cnn3: extract the zipped saved model
# (the target directory name omits the "_padding" suffix of the archive name)
unzip('CNN3_Middle_noL2_Dropout_0.2_padding.zip', 'CNN3_Middle_noL2_Dropout_0.2')

In [46]:
# load the saved cnn3 model from the directory extracted in the previous cell
cnn3 = tensorflow.keras.models.load_model('CNN3_Middle_noL2_Dropout_0.2')

In [47]:
# re-compile with the optimizer, loss and multi-label AUC metric set in compiler()
compiler(cnn3)

In [48]:
# show the layers, output shapes and parameter counts of cnn3
cnn3.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 431, 10, 128)      1280      
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 216, 5, 128)      0         
 2D)                                                             
                                                                 
 batch_normalization_5 (Batc  (None, 216, 5, 128)      512       
 hNormalization)                                                 
                                                                 
 dropout_4 (Dropout)         (None, 216, 5, 128)       0         
                                                                 
 conv2d_5 (Conv2D)           (None, 72, 5, 384)        442752    
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 36, 3, 384)      

**Performance with middle MFCCs**

In [49]:
# load test and validation data (middle MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_arrays/X_test.npy')
y_test = np.load('mfcc30s20_arrays/y_test.npy')

X_val = np.load('mfcc30s20_arrays/X_val.npy')
y_val = np.load('mfcc30s20_arrays/y_val.npy')

In [50]:
# evaluate cnn3 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn3_evaluation_val = cnn3.evaluate(x = X_val, y= y_val)



In [51]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn3)

              precision    recall  f1-score   support

   inspiring       0.13      0.22      0.17        83
   energetic       0.41      0.71      0.52       286
       happy       0.45      0.58      0.51       334
         sad       0.08      0.03      0.04        40
motivational       0.32      0.30      0.31       161
    relaxing       0.36      0.44      0.40       186
   uplifting       0.30      0.27      0.28       100
        calm       0.34      0.68      0.45       254
        love       0.24      0.31      0.27       141
     hopeful       0.22      0.11      0.14       142

   micro avg       0.35      0.47      0.40      1727
   macro avg       0.29      0.36      0.31      1727
weighted avg       0.34      0.47      0.38      1727
 samples avg       0.38      0.48      0.40      1727


Multi-label accuracy: 0.8300070771408351


In [52]:
# evaluate cnn3 on the currently loaded test set (loss and AUC as configured in compiler)
cnn3_evaluation_test = cnn3.evaluate(x = X_test, y= y_test)



In [53]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn3)

              precision    recall  f1-score   support

   inspiring       0.15      0.22      0.18        96
   energetic       0.40      0.73      0.52       259
       happy       0.43      0.55      0.48       334
         sad       0.10      0.03      0.04        40
motivational       0.31      0.23      0.27       179
    relaxing       0.35      0.40      0.38       193
   uplifting       0.30      0.22      0.25       132
        calm       0.33      0.62      0.43       268
        love       0.26      0.32      0.29       150
     hopeful       0.17      0.09      0.12       132

   micro avg       0.34      0.43      0.38      1783
   macro avg       0.28      0.34      0.29      1783
weighted avg       0.32      0.43      0.36      1783
 samples avg       0.38      0.46      0.39      1783


Multi-label accuracy: 0.8225053078556264


**Performance with start MFCCs**

In [54]:
# load test and validation data (start MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_start/X_test.npy')
y_test = np.load('mfcc30s20_start/y_test.npy')

X_val = np.load('mfcc30s20_start/X_val.npy')
y_val = np.load('mfcc30s20_start/y_val.npy')

In [55]:
# evaluate cnn3 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn3_evaluation_val = cnn3.evaluate(x = X_val, y= y_val)



In [56]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn3)

              precision    recall  f1-score   support

   inspiring       0.10      0.13      0.11        97
   energetic       0.53      0.06      0.10       276
       happy       0.33      0.61      0.42       328
         sad       0.03      1.00      0.06        42
motivational       0.31      0.16      0.21       172
    relaxing       0.23      0.38      0.29       178
   uplifting       0.22      0.18      0.20       129
        calm       0.29      0.45      0.35       273
        love       0.21      0.41      0.27       144
     hopeful       0.17      0.10      0.12       121

   micro avg       0.17      0.33      0.22      1760
   macro avg       0.24      0.35      0.21      1760
weighted avg       0.29      0.33      0.25      1760
 samples avg       0.17      0.34      0.22      1760


Multi-label accuracy: 0.713540197461213


In [57]:
# evaluate cnn3 on the currently loaded test set (loss and AUC as configured in compiler)
cnn3_evaluation_test = cnn3.evaluate(x = X_test, y= y_test)



In [58]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn3)

              precision    recall  f1-score   support

   inspiring       0.09      0.13      0.10        99
   energetic       0.53      0.08      0.14       266
       happy       0.32      0.61      0.42       334
         sad       0.03      1.00      0.06        46
motivational       0.19      0.07      0.10       193
    relaxing       0.25      0.34      0.29       202
   uplifting       0.12      0.10      0.11       114
        calm       0.22      0.36      0.27       252
        love       0.21      0.36      0.27       160
     hopeful       0.12      0.06      0.08       152

   micro avg       0.15      0.29      0.20      1818
   macro avg       0.21      0.31      0.18      1818
weighted avg       0.25      0.29      0.22      1818
 samples avg       0.15      0.30      0.20      1818


Multi-label accuracy: 0.7031734837799718


**Performance with end MFCCs**

In [59]:
# load test and validation data (end MFCCs); these globals are read by the report functions
X_test = np.load('mfcc30s20_end/X_test.npy')
y_test = np.load('mfcc30s20_end/y_test.npy')

X_val = np.load('mfcc30s20_end/X_val.npy')
y_val = np.load('mfcc30s20_end/y_val.npy')

In [60]:
# evaluate cnn3 on the currently loaded validation set (loss and AUC as configured in compiler)
cnn3_evaluation_val = cnn3.evaluate(x = X_val, y= y_val)



In [61]:
# print the validation classification report and multi-label accuracy,
# using per-class thresholds optimized on the validation set
get_classification_report_val(cnn3)

              precision    recall  f1-score   support

   inspiring       0.13      0.22      0.16        97
   energetic       0.55      0.35      0.43       276
       happy       0.37      0.51      0.43       328
         sad       0.03      1.00      0.06        42
motivational       0.19      0.12      0.15       172
    relaxing       0.27      0.42      0.33       178
   uplifting       0.28      0.36      0.32       129
        calm       0.33      0.59      0.42       273
        love       0.18      0.33      0.23       144
     hopeful       1.00      0.01      0.02       121

   micro avg       0.19      0.39      0.26      1760
   macro avg       0.33      0.39      0.25      1760
weighted avg       0.36      0.39      0.31      1760
 samples avg       0.19      0.40      0.25      1760


Multi-label accuracy: 0.7225669957686883


In [62]:
# evaluate cnn3 on the currently loaded test set (loss and AUC as configured in compiler)
cnn3_evaluation_test = cnn3.evaluate(x = X_test, y= y_test)



In [63]:
# print the test classification report and multi-label accuracy;
# the per-class thresholds are optimized on the validation set
get_classification_report_test(cnn3)

              precision    recall  f1-score   support

   inspiring       0.12      0.21      0.15        99
   energetic       0.53      0.31      0.39       266
       happy       0.34      0.45      0.39       334
         sad       0.03      1.00      0.06        46
motivational       0.24      0.14      0.18       193
    relaxing       0.26      0.35      0.30       202
   uplifting       0.13      0.17      0.15       114
        calm       0.25      0.48      0.33       252
        love       0.21      0.42      0.28       160
     hopeful       0.00      0.00      0.00       152

   micro avg       0.17      0.33      0.23      1818
   macro avg       0.21      0.35      0.22      1818
weighted avg       0.26      0.33      0.27      1818
 samples avg       0.17      0.34      0.22      1818


Multi-label accuracy: 0.7094499294781382
