# Preprocessing of audio and MIDI files

In [1]:
import os
from os import listdir

import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pretty_midi
import tensorflow as tf
from scipy import stats

In [2]:
import import_ipynb
import audio_prep as ap, midi_prep as mp, constants

importing Jupyter notebook from audio_prep.ipynb
importing Jupyter notebook from constants.ipynb
importing Jupyter notebook from midi_prep.ipynb


In [4]:
def create_midi_wav_pairs(path):
    """Pair every WAV file in a directory with the MIDI file of the same name.

    Args:
        path: Directory that is scanned (non-recursively) for files ending in
            ``.wav`` and ``.mid``.

    Returns:
        List of ``(wav_filename, midi_filename)`` tuples, sorted by file name
        so the result does not depend on the order ``listdir`` happens to
        return. WAV files without a matching MIDI file are reported on stdout
        and left out of the result.
    """
    files = listdir(path)
    wav_stems = sorted(name[:-4] for name in files if name.endswith('.wav'))
    # A set makes each membership test O(1) instead of a scan over a list.
    midi_stems = {name[:-4] for name in files if name.endswith('.mid')}

    pairs = []
    for stem in wav_stems:
        if stem in midi_stems:
            pairs.append((stem + '.wav', stem + '.mid'))
        else:
            # Inform about file without pair and continue
            print('No matching pair for file: ', stem)
    return pairs

def load_midi_wav_pairs(path, pairs):
    """Load every (WAV, MIDI) pair as a CQT matrix and a piano-roll matrix.

    Args:
        path: Directory that contains the files named in ``pairs``.
        pairs: Sequence of ``(wav_filename, midi_filename)`` entries, e.g. the
            result of ``create_midi_wav_pairs``.

    Returns:
        Tuple ``(cqt_matrices, midi_matrices, raw_MIDIs)`` of three lists with
        one entry per pair:
        * the result of ``ap.cqt_matrix`` for the WAV file, as a numpy array,
        * the piano roll sampled at ``constants.FRAMES_PER_SEC`` and cut to the
          pitch rows ``constants.MIDI_MIN`` .. ``constants.MIDI_MAX`` inclusive,
        * the ``pretty_midi.PrettyMIDI`` object itself.
    """
    cqt_matrices = []
    midi_matrices = []
    raw_MIDIs = []

    print("Loading ", len(pairs), "files.")
    for i, file in enumerate(pairs):
        # Load WAV file (os.path.join keeps the path valid on every platform)
        cqt_matrix = ap.cqt_matrix(os.path.join(path, file[0]))
        cqt_matrices.append(np.array(cqt_matrix))

        # Load MIDI file
        midi = pretty_midi.PrettyMIDI(os.path.join(path, file[1]))
        piano_roll = midi.get_piano_roll(fs=constants.FRAMES_PER_SEC)
        midi_matrix = piano_roll[constants.MIDI_MIN:constants.MIDI_MAX+1, :]
        midi_matrices.append(np.array(midi_matrix))
        raw_MIDIs.append(midi)

        # Progress report only. The loop must keep running here: stopping at
        # this point would silently drop every file after the second one.
        if i % 3 == 1:
            print("Successfully loaded ", i+1 , " file(s)")

    print("Loading successfull!")
    return cqt_matrices, midi_matrices, raw_MIDIs

def align_midi_wav_pairs(cqt_matrices,
                         midi_matrices,
                         matrices_type="single_pair"):
    """Trim CQT and MIDI matrices so both have the same number of columns.

    The longer matrix of each pair is cut along its second axis to the length
    of the shorter one; the shorter one is returned unchanged (as a copy).

    Args:
        cqt_matrices: One 2-D array (``"single_pair"``) or an iterable of
            arrays (``"array"``).
        midi_matrices: Counterpart of ``cqt_matrices`` in the same layout.
        matrices_type: ``"single_pair"`` or ``"array"``.

    Returns:
        ``(cqt, midi)`` as numpy arrays for ``"single_pair"``, or two lists of
        numpy arrays for ``"array"``.

    Raises:
        ValueError: If ``matrices_type`` is neither of the two options.
    """
    if matrices_type == "single_pair":
        print("Aligning single pair of CQT spectrogram and MIDI matrix")
        cqt_frames = len(cqt_matrices[0])
        midi_frames = len(midi_matrices[0])

        if cqt_frames == midi_frames:
            print("Same length of matrices on input.")
        elif cqt_frames > midi_frames:
            print("Both matrices aligned to", midi_frames, "frames.")
            cqt_matrices = cqt_matrices[:, :midi_frames]
        else:
            print("Both matrices aligned to ", cqt_frames, "frames.")
            midi_matrices = midi_matrices[:, :cqt_frames]
        return np.array(cqt_matrices), np.array(midi_matrices)

    if matrices_type != "array":
        raise ValueError("Wrong matrices_type option. Only 'array' and 'single_pair' types allowed.")

    aligned_cqts, aligned_midis = [], []
    for cqt, midi in zip(cqt_matrices, midi_matrices):
        cqt_frames, midi_frames = cqt[0].size, midi[0].size

        # Only the longer matrix of the pair is cut; equal lengths pass through.
        if cqt_frames > midi_frames:
            cqt = cqt[:, :midi_frames]
        elif cqt_frames < midi_frames:
            midi = midi[:, :cqt_frames]

        aligned_cqts.append(np.array(cqt))
        aligned_midis.append(np.array(midi))

    return aligned_cqts, aligned_midis

def crop_midi_cqt_pairs(cqt_matrices,
                        midi_matrices,
                        operation_type="sequence",
                        matrices_type='single_pair'):
    """Split CQT and MIDI matrices into chunks using the ``ap``/``mp`` helpers.

    Args:
        cqt_matrices: A single CQT matrix (``'single_pair'``) or an iterable
            of them (any other ``matrices_type``).
        midi_matrices: Counterpart of ``cqt_matrices`` in the same layout.
        operation_type: ``"sequence"`` uses ``ap.cqt_split_to_sequence`` and
            ``mp.midi_split_to_sequence``; ``"simple"`` uses ``ap.split_wav``
            and ``mp.split_midi`` and wraps each result in a numpy array.
            Ignored when ``matrices_type`` is ``'single_pair'``.
        matrices_type: ``'single_pair'`` splits the two matrices directly;
            every other value is treated as a collection of pairs.

    Returns:
        ``(cqt_chunks, midi_chunks)``: the two helper results for a single
        pair, or two lists with one entry per input pair otherwise.

    Raises:
        ValueError: For an unknown ``operation_type`` (collection mode only).
    """
    if matrices_type == 'single_pair':
        cqt_chunks = ap.cqt_split_to_sequence(cqt_matrices)
        midi_chunks = mp.midi_split_to_sequence(midi_matrices)
        return cqt_chunks, midi_chunks

    # Pick the per-pair splitter once, then apply it to every pair.
    if operation_type == "sequence":
        def split_pair(cqt, midi):
            return ap.cqt_split_to_sequence(cqt), mp.midi_split_to_sequence(midi)
    elif operation_type == "simple":
        def split_pair(cqt, midi):
            return np.array(ap.split_wav(cqt)), np.array(mp.split_midi(midi))
    else:
        raise ValueError("Wrong operation type.")

    crop_cqt, crop_midis = [], []
    for cqt, midi in zip(cqt_matrices, midi_matrices):
        cqt_part, midi_part = split_pair(cqt, midi)
        crop_cqt.append(cqt_part)
        crop_midis.append(midi_part)

    return crop_cqt, crop_midis

def print_shapes(cqt_matrices, midi_matrices):
    """Print the shapes of corresponding CQT and MIDI matrices, one pair per line."""

    for pair in zip(cqt_matrices, midi_matrices):
        print(*(matrix.shape for matrix in pair))
        
def log_normalization(cqt_matrix):
    """Log-transform a CQT matrix and rescale it to the interval [-1, 1].

    A small constant is added before taking the logarithm so that zero
    entries do not produce ``-inf``. The rescaling uses the global minimum and
    maximum of the log values.

    Args:
        cqt_matrix: Array-like of non-negative values.

    Returns:
        numpy array of the same shape with values in [-1, 1]. When every
        entry of the input is equal (zero value range) an all-zero array is
        returned instead of dividing by zero.
    """
    c = 10e-7  # shift for zero values; note that 10e-7 == 1e-6
    log_spec = np.log(np.asarray(cqt_matrix) + c)

    # Shift to interval [-1, 1]
    lowest = np.min(log_spec)
    value_range = np.max(log_spec) - lowest
    if value_range == 0:
        # Constant input: 0/0 would turn the whole output into NaN.
        return np.zeros_like(log_spec)

    return 2*((log_spec - lowest)/value_range) - 1

def std_mean_normalization(cqt_matrix):
    """Standardize each column, then rescale the result to [-1, 1].

    Normalization based on paper: An End-to-End Neural Network for Polyphonic
    Piano Music Transcription. Mean and standard deviation are computed along
    axis 0 (one value per column); the final rescaling uses the global minimum
    and maximum of the standardized values.

    Args:
        cqt_matrix: Array-like of numeric values.

    Returns:
        numpy array of the same shape with values in [-1, 1]. Columns with
        zero standard deviation are mapped to their standardized value 0
        instead of NaN, and an input with no value range at all yields an
        all-zero array.
    """
    spec = np.asarray(cqt_matrix)
    std = np.std(spec, axis=0)
    mean = np.mean(spec, axis=0)

    # A constant column has zero deviation. Dividing by 1 there gives 0 rather
    # than 0/0 = NaN, which would otherwise poison the global min/max below.
    safe_std = np.where(std == 0, 1, std)
    standardized = (spec - mean) / safe_std

    # Shift to interval [-1, 1]
    lowest = np.min(standardized)
    value_range = np.max(standardized) - lowest
    if value_range == 0:
        return np.zeros_like(standardized)

    return 2*((standardized - lowest)/value_range) - 1

def process_data(path, file_pairs):
    """Generate training examples chunk by chunk from (WAV, MIDI) file pairs.

    For every pair the WAV file is turned into a CQT spectrogram and the MIDI
    file into a frame matrix; both are aligned in length, the spectrogram is
    log-normalized, and the pair is split into chunks by
    ``crop_midi_cqt_pairs``. The last chunk of each file is dropped
    (presumably because it may be shorter than the rest; TODO confirm).

    Args:
        path: Directory that contains the files named in ``file_pairs``.
        file_pairs: Iterable of ``(wav_filename, midi_filename)`` entries.

    Yields:
        ``(cqt, one_hot)`` per chunk, where ``cqt`` is the transposed
        spectrogram chunk reshaped to
        ``(SEQUENCE_CHUNK_LENGTH + 2*CHUNK_PADDING, BINS_NUMBER, 1)`` and
        ``one_hot`` is the transposed MIDI frame chunk.
    """
    # Loop-invariant target shape of one spectrogram chunk.
    chunk_shape = (constants.SEQUENCE_CHUNK_LENGTH + 2*constants.CHUNK_PADDING,
                   constants.BINS_NUMBER,
                   1)

    for pair in file_pairs:
        # os.path.join keeps the paths valid on every platform.
        cqt_spectrogram = ap.cqt_matrix(os.path.join(path, pair[0]))
        midi_data = mp.load_midi_file(os.path.join(path, pair[1]))
        frame_one_hot = mp.pretty_midi_to_frame_matrix(midi_data)

        cqt_spectrogram, frame_one_hot = align_midi_wav_pairs(cqt_spectrogram,
                                                              frame_one_hot,
                                                              matrices_type="single_pair")
        normalized_cqt_spec = log_normalization(cqt_spectrogram)
        cqt_chunks, one_hot_chunks = crop_midi_cqt_pairs(normalized_cqt_spec, frame_one_hot)

        # Drop the final chunk of both sequences so they stay paired.
        cqt_chunks = cqt_chunks[:-1]
        one_hot_chunks = one_hot_chunks[:-1]

        for cqt, one_hot in zip(cqt_chunks, one_hot_chunks):
            # Transpose both chunks, then add a trailing axis of size 1 to the CQT.
            yield cqt.T.reshape(chunk_shape), one_hot.T

def get_dataset(path=r'D:\School\Bc\model\MAPS\AkPnBcht\MUS', batch_size=8):
    """Build a batched ``tf.data.Dataset`` from the (WAV, MIDI) pairs in a directory.

    Args:
        path: Directory with matching ``.wav``/``.mid`` files. Defaults to the
            location used originally, so existing ``get_dataset()`` calls keep
            working.
        batch_size: Number of examples per batch (default 8).

    Returns:
        Dataset created by ``tf.data.Dataset.from_generator`` over
        ``process_data(path, pairs)``, with both elements declared as
        ``tf.float32``, batched by ``batch_size``.
    """
    pairs = create_midi_wav_pairs(path)

    def generator():
        # from_generator needs a zero-argument callable returning a fresh iterator.
        return process_data(path, pairs)

    dataset = tf.data.Dataset.from_generator(generator, (tf.float32, tf.float32))
    return dataset.batch(batch_size)

In [35]:
# path = r'D:\School\Bc\model\MAPS\AkPnBcht\MUS'
# specs = r'D:\School\Bc\model\spectrograms'

# pairs = create_midi_wav_pairs(path)

In [1]:
# cqt_matrices, midis, raw_midis = load_midi_wav_pairs(path, pairs)
# onset_midi = [mp.pretty_midi_to_onset_matrix(midi) for midi in raw_midis]
# frame_midi = [mp.pretty_midi_to_frame_matrix(midi) for midi in midis]
# cqt_matrices, frame_midi = align_midi_wav_pairs(cqt_matrices, frame_midi, matrices_type='array')
# cqt_norm = [log_normalization(wav) for wav in cqt_matrices]
# cqt_chunks, midi_chunks = crop_midi_cqt_pairs(cqt_norm, frame_midi, matrices_type='array')

In [5]:
# dataset1 = get_dataset()

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


In [6]:
# it = dataset1.make_initializable_iterator()

# el = it.get_next()
# with tf.Session() as sess:
#     sess.run(it.initializer)
#     output_sess1, output_sess2 = sess.run(el)
#     print(output_sess1.shape, output_sess2.shape)

Instructions for updating:
Colocations handled automatically by placer.
Aligning single pair of CQT spectrogram and MIDI matrix
Both matrices aligned to 7752 frames.
(627, 264, 1) (625, 88)
