In [1]:
"""
Notebook for neural network models creation and metrics calculation.

Created: 26/02/2019
Author: Silvester Kosmel
"""

from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import librosa, librosa.display
import import_ipynb
import sklearn
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from time import time

import tensorflow as tf
from tensorflow.keras.layers import (Conv2D,
                                     Conv1D,
                                     MaxPooling2D,
                                     Flatten,
                                     Dense,
                                     Dropout,
                                     Lambda,
                                     Reshape,
                                     Permute,
                                     LSTM,
                                     Bidirectional)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
# Set-up for GPU memory sharing
# https://kobkrit.com/using-allow-growth-memory-option-in-tensorflow-and-keras-dc8c8081bc96
#
# `allow_growth` asks TensorFlow to claim GPU memory incrementally as it is
# needed instead of reserving it all up front, so the GPU can be shared.
# NOTE(review): `tf.ConfigProto` / `tf.Session` are TF 1.x spellings of this API.

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# The line below is commented out, so this session is never registered with Keras.
# NOTE(review): confirm whether Keras training actually runs under `config`.
# tf.keras.backend.set_session(sess)

In [3]:
import preprocessing as prep
import constants as c

importing Jupyter notebook from preprocessing.ipynb
importing Jupyter notebook from audio_prep.ipynb
importing Jupyter notebook from constants.ipynb
0.032 31.25
625 168
importing Jupyter notebook from midi_prep.ipynb


In [1]:
"""Model creation utils"""

def conv2d_reshape(cqt_matrices, midi_matrices):
    """
    Turn CQT spectrogram chunks and their MIDI labels into Keras Conv2D inputs.

    Every spectrogram chunk is transposed from [frequency_bins, frames] to
    [frames, frequency_bins] and given a trailing channel axis, producing the
    (batch, rows, cols, channels) layout expected by Conv2D. Every MIDI matrix
    is transposed the same way so that its rows are frames.

    Args:
        cqt_matrices: sequence of per-recording collections of CQT chunks; each
            collection must convert to an array of shape
            (n_chunks, c.BINS_NUMBER, 2 * c.CHUNK_PADDING)
        midi_matrices: sequence of per-recording label matrices (one-hot
            vectors per frame), each shaped [one-hot, frames]

    Return:
        cqt_reshaped: array of shape
            (total_chunks, 2 * c.CHUNK_PADDING, c.BINS_NUMBER, 1)
        midi_reshaped: all label matrices transposed to [frames, one-hot] and
            concatenated along the frame axis

    Raises:
        ValueError: if either input is empty or a chunk does not contain
            exactly 2 * c.CHUNK_PADDING * c.BINS_NUMBER values.
    """

    # Join the chunks of all recordings along the first (chunk) axis.
    cqt_stacked = np.concatenate([np.asarray(cqt_chunk) for cqt_chunk in cqt_matrices])

    # Swap the two trailing axes of every chunk at once instead of transposing
    # and reshaping chunk by chunk in Python. The explicit chunk count (rather
    # than -1) keeps the size check of the per-chunk reshape.
    target_shape = (len(cqt_stacked), 2 * c.CHUNK_PADDING, c.BINS_NUMBER, 1)
    cqt_reshaped = np.ascontiguousarray(
        cqt_stacked.transpose(0, 2, 1).reshape(target_shape)
    )

    midi_reshaped = np.concatenate(
        [np.asarray(midi_chunk).T for midi_chunk in midi_matrices]
    )

    return cqt_reshaped, midi_reshaped

def create_simple_conv_model(input_heigth, input_width, num_classes):
    """
    Build and compile the simple convolutional model for one-frame transcription.

    The network is two 32-filter 3x3 convolutions, a (1, 2) max-pooling and
    dropout, one 64-filter 3x3 convolution followed by the same pooling and
    dropout, and finally a flattened sigmoid Dense layer.

    Args:
        input_heigth/input_width: size of a single input sample; the model
            input shape is (input_heigth, input_width, 1)
        num_classes: number of units of the sigmoid output layer

    Return:
        model: Sequential model compiled with binary cross-entropy loss and an
               Adam optimizer (lr=0.0001, decay=1e-5); no metrics are attached
    """

    # Settings shared by every convolution / pooling layer of this model.
    conv_kwargs = dict(kernel_size=(3, 3), activation='relu', padding='valid')
    pool_kwargs = dict(pool_size=(1, 2), strides=(1, 2), padding='same')

    layers = [
        Conv2D(filters=32, input_shape=(input_heigth, input_width, 1), **conv_kwargs),
        Conv2D(filters=32, **conv_kwargs),
        MaxPooling2D(**pool_kwargs),
        Dropout(0.25),
        Conv2D(filters=64, **conv_kwargs),
        MaxPooling2D(**pool_kwargs),
        Dropout(0.25),
        Flatten(),
        Dense(num_classes, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.0001, decay=1e-5))
    return model

def create_conv_model(input_heigth, input_width, num_classes=88):
    """
    Create convolutional acoustic model for sequence predictions.

    Inspired by Kelz 2016 architecture:
        https://github.com/rainerkelz/framewise_2016/blob/master/models.py

    Args:
        input_heigth/input_width: size of one input sample; the model input
            shape is (input_heigth, input_width, 1)
        num_classes: number of sigmoid outputs produced per row
            (filters of the final 1x1 Conv1D); defaults to 88

    Return:
        tf.keras.Model: uncompiled model with
                        input (batch, input_heigth, input_width, 1) and
                        output (batch, input_heigth - 4, num_classes) shapes;
                        the two 'valid' 3x3 convolutions each trim two rows.
    """

    from tensorflow.keras.layers import Input

    inputs = Input(shape=(input_heigth, input_width, 1), name='init_input')

    model = Conv2D(filters=32,
                   kernel_size=(3, 3),
                   activation='relu',
                   padding='valid',
                   name='conv1')(inputs)
    model = Conv2D(filters=64,
                   kernel_size=(3, 3),
                   activation='relu',
                   padding='same', name='conv2')(model)

    # Pooling acts on the second spatial axis only, so the row count is kept.
    model = MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='valid', name='max_pool1')(model)
    model = Dropout(0.25, name='drop1')(model)

    model = Conv2D(filters=64,
                   kernel_size=(3, 3),
                   activation='relu',
                   padding='valid',
                   name='conv3')(model)

    model = MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='valid', name='max_pool2')(model)
    model = Dropout(0.25, name='drop2')(model)

    # int_shape returns plain Python ints, so this works whether the tensor's
    # shape entries are Dimension objects (which need `.value`) or already ints.
    _, rows, cols, channels = tf.keras.backend.int_shape(model)
    # Merge the pooled axis with the channels: one feature vector per row.
    model = Reshape((rows, cols * channels))(model)

    # A kernel of size 1 applies the same sigmoid classifier to every row.
    outputs = Conv1D(filters=num_classes,
                     kernel_size=1,
                     activation='sigmoid',
                     padding='valid',
                     name='conv4')(model)

    return tf.keras.Model(inputs, outputs)

def create_acoustic_model(input_heigth, input_width, num_classes=88):
    """
    Create acoustic model with combination of convolution and recurrent layers.

    The convolutional front end has the same layers as create_conv_model but
    without dropout; its output is fed row by row into a single LSTM whose
    sigmoid activations are the model output.

    Args:
        input_heigth/input_width: size of one input sample; the model input
            shape is (input_heigth, input_width, 1)
        num_classes: number of LSTM units, i.e. outputs per row; defaults to 88

    Return:
        tf.keras.Model: uncompiled model with
                        input (batch, input_heigth, input_width, 1) and
                        output (batch, input_heigth - 4, num_classes) shapes;
                        the two 'valid' 3x3 convolutions each trim two rows.
    """

    from tensorflow.keras.layers import Input

    inputs = Input(shape=(input_heigth, input_width, 1), name='init_input')

    model = Conv2D(filters=32,
                   kernel_size=(3, 3),
                   activation='relu',
                   padding='valid',
                   name='conv1')(inputs)
    model = Conv2D(filters=64,
                   kernel_size=(3, 3),
                   activation='relu',
                   padding='same', name='conv2')(model)

    # Pooling acts on the second spatial axis only, so the row count is kept.
    model = MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='valid', name='max_pool1')(model)

    model = Conv2D(filters=64,
                   kernel_size=(3, 3),
                   activation='relu',
                   padding='valid',
                   name='conv3')(model)

    model = MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='valid', name='max_pool2')(model)

    # int_shape returns plain Python ints, so this works whether the tensor's
    # shape entries are Dimension objects (which need `.value`) or already ints.
    _, rows, cols, channels = tf.keras.backend.int_shape(model)
    # Merge the pooled axis with the channels: one feature vector per row,
    # which is the (timesteps, features) layout the LSTM consumes.
    model = Reshape((rows, cols * channels))(model)

    # Variants that were tried and are intentionally disabled: dropout after
    # each pooling step, a 768-filter 1x1 Conv1D + Dropout(0.5) before the
    # recurrent layer, and a Bidirectional LSTM(128, tanh) followed by a
    # sigmoid 1x1 Conv1D output layer.
    outputs = LSTM(units=num_classes, activation='sigmoid', return_sequences=True)(model)

    return tf.keras.Model(inputs, outputs)

def train_model(train_x,
                validation_x,
                input_heigth,
                input_width,
                epoch_num=30,
                valid_steps=10,
                epoch_steps=140,
                use_tensorboard=False,
                model_type='conv'):
    """
    Train specific model with given train and validation dataset.

    The model is compiled with binary cross-entropy loss, the 'accuracy'
    metric and Keras' default 'adam' optimizer.

    Args:
        train_x: tf.data.Dataset object which yields train inputs as generator
        validation_x: tf.data.Dataset object which yields validation inputs as generator
        input_heigth/input_width: input parameters defining model shape
        epoch_num: number of epochs
        valid_steps: number of validation steps after each epoch
        epoch_steps: steps per epoch
        use_tensorboard: if truthy, log the run with a TensorBoard callback
                         into "logs/<current timestamp>"
        model_type: type of model to train (conv or lstm)

    Return:
        model: trained model, which is ready to be evaluated
        history: callback containing model history (loss and accuracy)

    Raises:
        ValueError: if model_type is neither "conv" nor "lstm".
    """

    # Resolve the architecture first so a wrong model_type fails immediately.
    if model_type == 'conv':
        model = create_conv_model(input_heigth, input_width)
    elif model_type == 'lstm':
        model = create_acoustic_model(input_heigth, input_width)
    else:
        raise ValueError('Wrong model type. Expecting values "conv" or "lstm"')

    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # `None` means "no callbacks", which is also what fit() defaults to.
    callbacks = None
    if use_tensorboard:
        callbacks = [TensorBoard(log_dir="logs/{}".format(time()),
                                 write_grads=True,
                                 histogram_freq=1,
                                 write_images=True)]

    history = model.fit(train_x.make_one_shot_iterator(),
                        validation_data=validation_x.make_one_shot_iterator(),
                        validation_steps=valid_steps,
                        epochs=epoch_num,
                        steps_per_epoch=epoch_steps,
                        verbose=1,
                        callbacks=callbacks)

    return model, history

def plot_history(history):
    """
    Util for plotting model training history.

    Draws two figures one after another: train/validation accuracy per epoch
    and train/validation loss per epoch.

    Args:
        history: tf.keras.callbacks.History object; its `history` dict must
                 hold 'loss'/'val_loss' and either 'acc'/'val_acc' or
                 'accuracy'/'val_accuracy'

    Raises:
        KeyError: if one of the required series is missing.
    """

    recorded = history.history
    # The accuracy series is logged as 'acc' or 'accuracy' depending on the
    # Keras version; use whichever one this history contains.
    accuracy_key = 'acc' if 'acc' in recorded else 'accuracy'

    for key, label in ((accuracy_key, 'accuracy'), ('loss', 'loss')):
        plt.plot(recorded[key])
        plt.plot(recorded['val_' + key])
        plt.title('Model ' + label)
        plt.tick_params(labelsize=17)
        plt.ylabel(label, fontsize=19)
        plt.xlabel('epoch', fontsize=19)
        plt.legend(['train', 'valid'], loc='upper left')
        # To export a figure, call plt.savefig(...) here, before plt.show().
        plt.show()

In [5]:
"""Metrics calculation for model predictions"""

def get_test_data(dataset, num_batches=50):
    """
    Materialise batches of a test dataset for model evaluation.

    Args:
        dataset: object exposing make_initializable_iterator(), e.g. a
                 TF 1.x tf.data.Dataset
        num_batches: maximum number of batches to pull from the dataset

    Return:
        list with the values of the pulled batches, in iteration order; it is
        shorter than num_batches when the dataset runs out earlier
    """

    iterator = dataset.make_initializable_iterator()
    next_batch = iterator.get_next()

    test_data = []
    # NOTE(review): this opens a fresh default session, not the module-level
    # `sess` configured with allow_growth — confirm that is intended.
    with tf.Session() as sess:
        sess.run(iterator.initializer)

        for _ in range(num_batches):
            try:
                test_data.append(sess.run(next_batch))
            except tf.errors.OutOfRangeError:
                # Dataset exhausted: return what was collected instead of failing.
                break

    return test_data

def get_loss_acc(dataset, model):
    """Get loss and accurancy from model"""
    
    metrics = []
    for cqt, midi in dataset:
        metrics.append(model.test_on_batch(cqt, midi))
    
    return metrics

def get_metrics(dataset, model, threshold=0.4):
    """
    Generate metrics - precision, recall, F1 - for model using test dataset

    Every batch is scored separately: predictions and labels are binarised,
    flattened and compared element-wise.

    Args:
        dataset: generated dataset with tuples in format (cqt, midi)
        model: trained model for predicting
        threshold: a prediction counts as active when it is strictly greater
                   than this value (labels are always binarised at 0.5)

    Return:
        precision, recall and F1 score metrics, each as a list with one value
        per batch
    """

    precision_metrics = []
    recall_metrics = []
    F1_metrics = []
    for batch in dataset:
        cqt, midi = batch[0], batch[1]

        # Only the spectrograms are model input; the labels must not be
        # handed to predict().
        probabilities = np.concatenate(model.predict(cqt), axis=0)
        predicted = (probabilities > threshold).astype(int).ravel()

        # np.concatenate merges the samples of the batch along their first axis.
        expected = (np.concatenate(midi, axis=0) > 0.5).astype(int).ravel()

        precision_metrics.append(sklearn.metrics.precision_score(expected, predicted))
        recall_metrics.append(sklearn.metrics.recall_score(expected, predicted))
        F1_metrics.append(sklearn.metrics.f1_score(expected, predicted))

    return precision_metrics, recall_metrics, F1_metrics

def show_metrics(test_dataset, model):
    """
    Print the mean precision, recall and F1 score of the model.

    Batches are collected with get_test_data, scored per batch with
    get_metrics, and the three per-batch lists are averaged and printed on
    one line in that order.
    """

    batches = get_test_data(test_dataset)
    per_batch_scores = get_metrics(batches, model)
    mean_precision, mean_recall, mean_f1 = (np.mean(scores) for scores in per_batch_scores)
    print(mean_precision, mean_recall, mean_f1)

In [6]:
"""
Saved models,
only load if needed.
"""
# model = tf.keras.models.load_model('frames_conv_model1.h5')
# model = tf.keras.models.load_model('onsets_conv_model1.h5')
# model = tf.keras.models.load_model('onsets_lstm_model1.h5')
# model = tf.keras.models.load_model('frames_lstm_model1.h5')

'\nSaved models,\nonly load if needed.\n'

In [18]:
# train_dataset, valid_dataset, test_dataset = prep.get_dataset()

In [1]:
# height, width = c.SEQUENCE_CHUNK_LENGTH + 2*c.CHUNK_PADDING, c.BINS_NUMBER
# model, history = train_model(train_dataset, valid_dataset, height, width, use_tensorboard=True)

In [2]:
# saved_model = tf.keras.models.load_model('weights.best.no3.hdf5')

In [3]:
# new_history = saved_model.fit(train_dataset.make_one_shot_iterator(),
#                   validation_data=valid_dataset.make_one_shot_iterator(),
#                   validation_steps=10,
#                   epochs=34,
#                   steps_per_epoch=140,
#                   verbose=1, initial_epoch=24,
#                   callbacks=[tensorboard, checkpoint]
#              )

In [None]:
# train_dataset, valid_dataset, test_dataset = prep.get_dataset_test()

In [4]:
# show_metrics(test_dataset, saved_model)

In [5]:
# it = test_dataset.make_initializable_iterator()
# cqt_batch = []
# midi_batch = []
# el = it.get_next()
# with tf.Session() as sess:
#     sess.run(it.initializer)

#     batch = sess.run(el)

# saved_model.test_on_batch(batch[0],batch[1])
# model.evaluate_generator(generator=test_dataset.make_one_shot_iterator(), steps=5)

In [6]:
# onset_predict = saved_model.predict(batch[0])

In [143]:
# model.evaluate_generator(test_dataset.make_initializable_iterator(), steps=20)
# train_pairs, valid_pairs, test_pairs = prep.get_datasets_pairs()
# test_dataset_generator = test_data_generator(test_dataset)

In [18]:
# x = np.array([[1 if row>0.4 else 0 for row in col] for col in np.concatenate(onset_predict)])

In [7]:
# prep.mp.plot_piano_roll(x.T)
# plt.xlabel('Time', fontsize=17)
# plt.ylabel('Note', fontsize=17)

In [8]:
# prep.mp.plot_piano_roll(np.concatenate(batch[1]).T)
# plt.xlabel('Time', fontsize=17)
# plt.ylabel('Note', fontsize=17)
# plt.tick_params(labelsize=12)
# plt.savefig("BP.png", format='png')

In [9]:
# x = np.array([[1 if row>0.5 else 0 for row in col] for col in x])
# prep.mp.plot_piano_roll(x.T)
# plt.xlabel('Time', fontsize=17)
# plt.ylabel('Note', fontsize=17)
# plt.tick_params(labelsize=12)
# plt.savefig("BP.png", format='png')

In [8]:
# ws2 = np.array(prep.wav_chunks[0])
# wavs_reshaped2 = np.array([w.T for w in ws2])
# wsr2 = np.array([w.reshape(4, 264, 1) for w in wavs_reshaped2])
# msr2 = prep.midi_chunks[0].T

In [10]:
# prep.mp.plot_piano_roll(midi_predicted[:,:])

In [11]:
# midi_m = prep.pretty_midi.PrettyMIDI('D:\School\Bc\model\MAPS\AkPnBcht\MUS\MAPS_MUS-scn16_2_AkPnBcht.mid')
# midi_m = midi_m.get_piano_roll(fs=c.FRAME_LENGTH)[c.MIDI_MIN:c.MIDI_MAX+1, :]
# prep.mp.plot_piano_roll(midi_m)

In [12]:
# cqt_m = prep.ap.cqt_matrix('D:\School\Bc\model\MAPS\AkPnBsdf\MUS\MAPS_MUS-alb_se3_AkPnBsdf.wav')
# cqt_m = prep.ap.cqt_matrix('\data\shared\MAPS\AkPnBcht\MUS\MAPS_MUS-chpn-p8_AkPnBcht.wav')
# cqt_m = prep.log_normalization(cqt_m)