In [None]:

import gc
import os
import ast
import sys
import configparser

import numpy as np
import matplotlib.pyplot as plt
from keras.callbacks import History
from keras.layers import Reshape
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import keras
from keras import backend as K

from audiomanip.audiostruct import AudioStruct
from audiomanip.audiomodels import ModelZoo
from audiomanip.audioutils import AudioUtils
from audiomanip.audioutils import MusicDataGenerator

# Disable TF warnings about speed up
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'


def main():
  # Parse config file
  config = configparser.ConfigParser()
  config.read('params.ini')

  #Configuration
  GTZAN_FOLDER = config['FILE_READ']['GTZAN_FOLDER']
  MODEL_PATH = config['FILE_READ']['SAVE_MODEL']
  SAVE_NPY = ast.literal_eval(config['FILE_READ']['SAVE_NPY'])
  TENSORBOARD_LOG_DIR = config['FILE_READ']['TENSORBOARD_LOG_DIR']
  EXEC_TIMES = int(config['PARAMETERS_MODEL']['EXEC_TIMES'])
  CNN_TYPE = config['PARAMETERS_MODEL']['CNN_TYPE']
  OPTIMIZER = config['PARAMETERS_MODEL']['OPTIMIZER']

  ## CNN hyperparameters
  batch_size = int(config['PARAMETERS_MODEL']['BATCH_SIZE'])
  epochs = int(config['PARAMETERS_MODEL']['EPOCHS'])

  if not ((CNN_TYPE == '1D') or (CNN_TYPE == '2D') or (CNN_TYPE == 'RNN')):
    raise ValueError('Argument Invalid: The options are 1D or 2D or RNN for CNN_TYPE')

  # Read data
  data_type = config['FILE_READ']['TYPE']
  input_shape = (1280, 128)
  print("data_type: %s" % data_type)

  ## Read the .au files
  if data_type == 'AUDIO_FILES':
    song_rep = AudioStruct(GTZAN_FOLDER, config)
    songs, genres = song_rep.getdata()

    # Save the audio files as npy files to read faster next time
    if SAVE_NPY:
      np.save(GTZAN_FOLDER + 'songs.npy', songs)
      np.save(GTZAN_FOLDER + 'genres.npy', genres)

  ## Read from npy file
  elif data_type == 'NPY':
    songs = np.load(GTZAN_FOLDER + 'songs.npy')
    genres = np.load(GTZAN_FOLDER + 'genres.npy')

  ## Not valid datatype
  else:
    raise ValueError('Argument Invalid: The options are AUDIO_FILES or NPY for data_type')

  print("Original songs array shape: {0}".format(songs.shape))
  print("Original genre array shape: {0}".format(genres.shape))

  # Train multiple times and get mean score
  val_acc = []
  test_history = []
  test_acc = []
  test_acc_mvs = []

  best_acc = 0
  best_cnn = None


  # Tensorboard Callback Definition
  K.set_learning_phase(1) #set learning phase

  for x in range(EXEC_TIMES):
    keras.backend.clear_session()
    tbCallBack = keras.callbacks.TensorBoard(log_dir=TENSORBOARD_LOG_DIR,
     histogram_freq=3,
     write_grads=True,
     write_graph=True,
     write_images=True)

    # Split the dataset into training and test
    X_train, X_test, y_train, y_test = train_test_split(
      songs, genres, test_size=0.1, stratify=genres)

    # Split training set into training and validation
    X_train, X_Val, y_train, y_val = train_test_split(
      X_train, y_train, test_size=1/6, stratify=y_train)

    # split the train, test and validation data in size 128x128
    X_Val, y_val = AudioUtils().splitsongs_melspect(X_Val, y_val, CNN_TYPE)
    X_test, y_test = AudioUtils().splitsongs_melspect(X_test, y_test, CNN_TYPE)
    X_train, y_train = AudioUtils().splitsongs_melspect(X_train, y_train, CNN_TYPE)

    # Construct the model
    if CNN_TYPE == '1D':
      cnn = ModelZoo.cnn_melspect_1D(input_shape)
    elif CNN_TYPE == '2D':
      cnn = ModelZoo.cnn_melspect_2D((*input_shape, 1))
    elif CNN_TYPE == 'RNN':
      cnn = ModelZoo.crnn_melspect_2D((*input_shape, 1))

    print("\nTrain shape: {0}".format(X_train.shape))
    print("Validation shape: {0}".format(X_Val.shape))
    print("Test shape: {0}\n".format(X_test.shape))
    print("Size of the CNN: %s\n" % cnn.count_params())

    # Optimizers
    if OPTIMIZER == 'sgd':
      opt = keras.optimizers.SGD(lr=0.001, momentum=0.9, decay=1e-5, nesterov=True)
    elif OPTIMIZER == 'adam':
      opt = keras.optimizers.Adam(lr=5e-3) # lr=0.001 #, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=1e-5)

    # Compiler for the model
    cnn.compile(loss=keras.losses.categorical_crossentropy, #loss=keras.losses.categorical_crossentropy,
      optimizer=opt,
      metrics=['accuracy'])

    # Early stop
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss',
      min_delta=0,
      patience=2,
      verbose=0,
      mode='auto')

    # Fit the model
    history = cnn.fit(X_train, y_train,
      batch_size=batch_size,
      epochs=epochs,
      verbose=1,
      validation_data=(X_Val, y_val),
      callbacks = [earlystop])
    print('history: ', history.history['acc'])

    score = cnn.evaluate(X_test, y_test, verbose=0)
    score_val = cnn.evaluate(X_Val, y_val, verbose=0)

    # Majority Voting System
    pred_org_values = cnn.predict(X_test)
    pred_label_values = np.argmax(pred_org_values, axis = 1)
    mvs_truth, mvs_res = AudioUtils().voting(np.argmax(y_test, axis = 1), pred_label_values)
    acc_mvs = accuracy_score(mvs_truth, mvs_res)
    mvs_roc_auc = roc_auc_score(y_test, pred_org_values)


    # Save metrics
    val_acc.append(score_val[1])
    test_acc.append(score[1])
    test_history.append(history)
    test_acc_mvs.append(acc_mvs)

    # Print metrics
    print('Test accuracy:', score[1])
    print('Test accuracy for Majority Voting System:', acc_mvs)
    print('Test auc_roc_score for Majority Voting System:', mvs_roc_auc)

    # Print the confusion matrix for Voting System
    cm = confusion_matrix(mvs_truth, mvs_res)
    print(cm)

    # Records Best Model
    if (best_acc < acc_mvs):
        best_acc = acc_mvs
        best_cnn = cnn
        best_history = history
        print('best_history:', best_history.history['acc'])
        print('best_acc changed:', best_acc)

  # Print the statistics
  print("Validation accuracy - mean: %s, std: %s" % (np.mean(val_acc), np.std(val_acc)))
  print("Test accuracy - mean: %s, std: %s" % (np.mean(test_acc), np.std(test_acc)))
  print("Test accuracy MVS - mean: %s, std: %s" % (np.mean(test_acc_mvs), np.std(test_acc_mvs)))

  # summarize history for accuracy
  print('best_acc:', best_acc)
  plt.plot(best_history.history['acc'])
  plt.plot(best_history.history['val_acc'])
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

  # summarize history for loss
  plt.plot(best_history.history['loss'])
  plt.plot(best_history.history['val_loss'])
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

  # Save the model
  best_cnn.save(MODEL_PATH)

  # Free memory
  del songs
  del genres
  gc.collect()


In [None]:
if __name__ == '__main__':
  main()

data_type: NPY
Original songs array shape: (1000, 1280, 128)
Original genre array shape: (1000, 10)
original X shape:  (150, 1280, 128)
2D X shape:  (150, 1280, 128, 1)
original X shape:  (100, 1280, 128)
2D X shape:  (100, 1280, 128, 1)
original X shape:  (750, 1280, 128)
2D X shape:  (750, 1280, 128, 1)
input_shape:  (1280, 128, 1)


  bn1 = BatchNormalization(axis=channel_axis, mode=0, name='bn1')(conv1)
  bn2 = BatchNormalization(axis=channel_axis, mode=0, name='bn2')(conv2)
  bn3 = BatchNormalization(axis=channel_axis, mode=0, name='bn3')(conv3)
  bn4 = BatchNormalization(axis=channel_axis, mode=0, name='bn4')(conv4)


dr4shape: (None, 25, 1, 128)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1280, 128, 1)      0         
_________________________________________________________________
conv1 (Conv2D)               (None, 1278, 126, 64)     640       
_________________________________________________________________
bn1 (BatchNormalization)     (None, 1278, 126, 64)     5112      
_________________________________________________________________
elu_1 (ELU)                  (None, 1278, 126, 64)     0         
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 639, 63, 64)       0         
_________________________________________________________________
dropout1 (Dropout)           (None, 639, 63, 64)       0         
_________________________________________________________________
conv2 (Conv2D)               (None, 637, 61, 12

In [None]:
config = configparser.ConfigParser()
config.read('params.ini')

#Configuration
GTZAN_FOLDER = config['FILE_READ']['GTZAN_FOLDER']
MODEL_PATH = config['FILE_READ']['SAVE_MODEL']
SAVE_NPY = ast.literal_eval(config['FILE_READ']['SAVE_NPY'])
TENSORBOARD_LOG_DIR = config['FILE_READ']['TENSORBOARD_LOG_DIR']
EXEC_TIMES = int(config['PARAMETERS_MODEL']['EXEC_TIMES'])
CNN_TYPE = config['PARAMETERS_MODEL']['CNN_TYPE']
OPTIMIZER = config['PARAMETERS_MODEL']['OPTIMIZER']

## CNN hyperparameters
batch_size = int(config['PARAMETERS_MODEL']['BATCH_SIZE'])
epochs = int(config['PARAMETERS_MODEL']['EPOCHS'])

if not ((CNN_TYPE == '1D') or (CNN_TYPE == '2D') or (CNN_TYPE == 'RNN')):
  raise ValueError('Argument Invalid: The options are 1D or 2D or RNN for CNN_TYPE')

# Read data
data_type = config['FILE_READ']['TYPE']
input_shape = (128, 128)
print("data_type: %s" % data_type)

## Read the .au files
if data_type == 'AUDIO_FILES':
  song_rep = AudioStruct(GTZAN_FOLDER)
  songs, genres = song_rep.getdata()

  # Save the audio files as npy files to read faster next time
  if SAVE_NPY:
    np.save(GTZAN_FOLDER + 'songs.npy', songs)
    np.save(GTZAN_FOLDER + 'genres.npy', genres)

## Read from npy file
elif data_type == 'NPY':
  songs = np.load(GTZAN_FOLDER + 'songs.npy')
  genres = np.load(GTZAN_FOLDER + 'genres.npy')

## Not valid datatype
else:
  raise ValueError('Argument Invalid: The options are AUDIO_FILES or NPY for data_type')

print("Original songs array shape: {0}".format(songs.shape))
print("Original genre array shape: {0}".format(genres.shape))

# Train multiple times and get mean score
val_acc = []
test_history = []
test_acc = []
test_acc_mvs = []

best_acc = 0
best_cnn = None
best_history = None

# Tensorboard Callback Definition
K.set_learning_phase(1) #set learning phase

In [25]:
keras.backend.clear_session()
tbCallBack = keras.callbacks.TensorBoard(log_dir=TENSORBOARD_LOG_DIR,
 histogram_freq=3,
 write_grads=True,
 write_graph=True,
 write_images=True)

# Split the dataset into training and test
X_train, X_test, y_train, y_test = train_test_split(
  songs, genres, test_size=0.1, stratify=genres)

# Split training set into training and validation
X_train, X_Val, y_train, y_val = train_test_split(
  X_train, y_train, test_size=1/6, stratify=y_train)

# split the train, test and validation data in size 128x128
X_Val, y_val = AudioUtils().splitsongs_melspect(X_Val, y_val, CNN_TYPE)
X_test, y_test = AudioUtils().splitsongs_melspect(X_test, y_test, CNN_TYPE)
X_train, y_train = AudioUtils().splitsongs_melspect(X_train, y_train, CNN_TYPE)

# Construct the model
if CNN_TYPE == '1D':
  cnn = ModelZoo.cnn_melspect_1D(input_shape)
elif CNN_TYPE == '2D':
  cnn = ModelZoo.cnn_melspect_2D((*input_shape, 1))
elif CNN_TYPE == 'RNN':
  cnn = ModelZoo.crnn_melspect_2D((*input_shape, 1))

print("\nTrain shape: {0}".format(X_train.shape))
print("Validation shape: {0}".format(X_Val.shape))
print("Test shape: {0}\n".format(X_test.shape))
print("Size of the CNN: %s\n" % cnn.count_params())

# Optimizers
if OPTIMIZER == 'sgd':
  sgd = keras.optimizers.SGD(lr=0.001, momentum=0.9, decay=1e-5, nesterov=True)
elif OPTIMIZER == 'adam':
  adam = keras.optimizers.Adam(lr=5e-3) # lr=0.001 #, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=1e-5)

# Compiler for the model
cnn.compile(loss='binary_crossentropy', #loss=keras.losses.categorical_crossentropy,
  optimizer=adam,
  metrics=['accuracy'])

# Early stop
earlystop = keras.callbacks.EarlyStopping(monitor='val_loss',
  min_delta=0,
  patience=2,
  verbose=0,
  mode='auto')

# Fit the model
history = cnn.fit(X_train, y_train,
  batch_size=batch_size,
  epochs=epochs,
  verbose=1,
  validation_data=(X_Val, y_val),
  callbacks = [earlystop])

score = cnn.evaluate(X_test, y_test, verbose=0)
score_val = cnn.evaluate(X_Val, y_val, verbose=0)

# Majority Voting System
pred_values = np.argmax(cnn.predict(X_test), axis = 1)
mvs_truth, mvs_res = AudioUtils().voting(np.argmax(y_test, axis = 1), pred_values)
acc_mvs = accuracy_score(mvs_truth, mvs_res)
mvs_roc_auc = roc_auc_score(mvs_truth, mvs_res)

# Save metrics
val_acc.append(score_val[1])
test_acc.append(score[1])
test_history.append(history)
test_acc_mvs.append(acc_mvs)

# Print metrics
print('Test accuracy:', score[1])
print('Test accuracy for Majority Voting System:', acc_mvs)
print('Test auc_roc_score for Majority Voting System:', mvs_roc_auc)

# Print the confusion matrix for Voting System
cm = confusion_matrix(mvs_truth, mvs_res)
print(cm)

# Records Best Model
if (best_acc < acc_mvs):
    best_acc = acc_mvs
    best_cnn = cnn
    best_history = history
    print('best_acc changed:', best_acc)

original X shape:  (1500, 128, 128)
2D X shape:  (1500, 128, 128, 1)
original X shape:  (1000, 128, 128)
2D X shape:  (1000, 128, 128, 1)
original X shape:  (7500, 128, 128)
2D X shape:  (7500, 128, 128, 1)
input_shape:  (128, 128, 1)


  bn1 = BatchNormalization(axis=channel_axis, mode=0, name='bn1')(conv1)
  bn2 = BatchNormalization(axis=channel_axis, mode=0, name='bn2')(conv2)
  bn3 = BatchNormalization(axis=channel_axis, mode=0, name='bn3')(conv3)
  bn4 = BatchNormalization(axis=channel_axis, mode=0, name='bn4')(conv4)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128, 128, 1)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 126, 126, 64)      640       
_________________________________________________________________
bn1 (BatchNormalization)     (None, 126, 126, 64)      504       
_________________________________________________________________
elu_1 (ELU)                  (None, 126, 126, 64)      0         
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 63, 63, 64)        0         
_________________________________________________________________
dropout1 (Dropout)           (None, 63, 63, 64)        0         
_________________________________________________________________
conv2 (Conv2D)               (None, 61, 61, 128)       73856     
__________

ValueError: multiclass format is not supported

In [26]:
mvs_truth, mvs_res

(array([2, 3, 2, 4, 8, 6, 1, 8, 9, 0, 4, 8, 6, 7, 8, 2, 7, 6, 9, 9, 3, 3,
        2, 9, 6, 5, 0, 3, 9, 1, 7, 5, 7, 1, 6, 2, 6, 4, 0, 0, 8, 4, 0, 0,
        0, 3, 5, 9, 5, 9, 6, 3, 4, 5, 9, 1, 6, 7, 9, 1, 5, 3, 3, 3, 7, 8,
        7, 4, 7, 9, 1, 2, 4, 6, 8, 2, 8, 1, 7, 1, 5, 2, 8, 0, 2, 4, 3, 0,
        1, 1, 8, 2, 4, 5, 5, 6, 4, 0, 5, 7]),
 array([9, 3, 2, 5, 1, 4, 9, 9, 2, 9, 2, 9, 3, 5, 2, 2, 5, 3, 0, 9, 3, 3,
        2, 9, 2, 5, 9, 0, 9, 9, 0, 4, 9, 1, 3, 2, 3, 2, 9, 9, 4, 4, 9, 9,
        9, 1, 9, 9, 9, 5, 5, 3, 5, 5, 4, 3, 3, 0, 9, 1, 4, 8, 1, 1, 5, 8,
        5, 2, 9, 2, 1, 2, 9, 3, 9, 2, 9, 9, 4, 3, 9, 2, 2, 0, 2, 4, 3, 9,
        9, 5, 1, 2, 4, 9, 4, 3, 2, 9, 9, 9]))

In [47]:
#np.argmax(y_test, axis = 1)
pred_values = np.argmax(cnn.predict(X_test), axis = 1)
pred_values

array([5, 5, 5, 9, 9, 9, 9, 9, 5, 9, 3, 3, 3, 1, 1, 1, 0, 0, 0, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 4, 5, 4, 5, 4, 5, 4, 5, 1, 9, 0, 1,
       9, 9, 9, 1, 0, 1, 9, 5, 2, 4, 9, 5, 4, 2, 4, 4, 9, 5, 5, 5, 9, 9,
       5, 9, 0, 9, 9, 8, 9, 8, 9, 8, 9, 9, 9, 8, 2, 4, 2, 4, 2, 2, 2, 2,
       2, 4, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 2, 4, 4, 4, 2, 2, 2, 2, 4, 2,
       0, 5, 9, 9, 5, 1, 9, 1, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 0, 5, 9,
       4, 5, 5, 5, 4, 5, 4, 4, 2, 2, 2, 2, 5, 5, 4, 4, 4, 9, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 9, 4, 4, 5, 5, 5, 9, 5, 9, 2, 3, 3, 3, 3, 3, 3,
       3, 1, 3, 3, 0, 9, 0, 3, 0, 9, 1, 0, 3, 9, 5, 5, 9, 9, 9, 5, 4, 9,
       2, 9, 1, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 9, 3, 0, 3, 3, 0, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 5, 9, 9, 5, 9, 9, 5, 9, 9, 4, 2,
       4, 2, 2, 2, 4, 2, 2, 2, 5, 9, 5, 5, 5, 5, 9, 9, 9, 5, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 1, 1, 1, 9, 0, 1, 5, 9, 9, 9, 9, 5,
       9, 5, 9, 9, 1, 1, 9, 9, 9, 0, 9, 0, 0, 9, 0,

In [70]:
pred_values =cnn.predict(X_test)

In [71]:
y_test, pred_values

(array([[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]]),
 array([[0.1236752 , 0.04578517, 0.05659752, ..., 0.19004093, 0.07126129,
         0.21616907],
        [0.10009772, 0.04310775, 0.0605137 , ..., 0.16417207, 0.07075483,
         0.17629012],
        [0.10974169, 0.04142744, 0.0705883 , ..., 0.1844229 , 0.06655623,
         0.200849  ],
        ...,
        [0.16567132, 0.07175727, 0.0189496 , ..., 0.1634024 , 0.09061007,
         0.21569157],
        [0.14766963, 0.05910565, 0.03124412, ..., 0.1842281 , 0.08690333,
         0.22420873],
        [0.19126137, 0.13312256, 0.00873672, ..., 0.12118168, 0.11524045,
         0.19143613]], dtype=float32))

In [73]:
roc_auc_score(y_test, pred_values)

0.7527016666666666