In [1]:
# tests with spectrograms with different configuration parameters
# the spectrograms are created from the MusicNet dataset

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

from utils import compute_spectrogram

from datetime import datetime

In [3]:
# general parameter initialisation
# obs: the time in secs has been chosen to get a 500 frames in every spectrogram
sr = 44100                                              # sampling rate
fft_size = 1024                                         # size of FFT
hop_size = 512                                          # hop size (overlap)
duration = 18                                           # segment duration (sec)
no_mels = 128                                           # no. of mel bins
no_fft = 1600                                            # frames in spectrogram
chunk_len = np.round(sr * duration).astype(int)         # segment duration (samples)
print('Audio chunk is', chunk_len, 'frames long')

# load the dataset structure
path = 'musicnet.npz'
data = np.load(path, 'rb')
no_spectrograms = len(data.files)                       # total number of spectrograms
ids = data.files                                        # get the id for each recording (list)
print(len(ids), ' recordings in the dataset')

# variables initialisation
dataset = np.empty([no_mels, no_fft, 0])                # empty 3-d array
X = np.empty(chunk_len)                                 # store audio recording

# select files randomly
no_files = 50
idx = np.round(np.random.rand(no_files,1)*len(data.files)).astype(int)
print('Selected indexes: ',' '.join(str(id) for id in idx))

tmp = ' '.join(str(ids[idx.item(n)]) for n in range(no_files)) 
ids = tmp.split(' ')
print('Corresponding to files:',ids)

Audio chunk is 396900 frames long
330  recordings in the dataset
Selected indexes:  [225] [43] [32] [107] [85] [314] [36] [41] [317] [219] [175] [231] [112] [103] [51] [113] [271] [107] [170] [298]
Corresponding to files: ['2366', '1792', '1752', '2383', '2147', '2230', '2242', '2463', '2234', '2588', '2483', '2211', '2178', '2307', '1859', '2179', '2343', '2383', '2293', '2308']


In [None]:
# spectrograms computations 
print('Computation started at ', datetime.now().time())
for id in ids:

    # extract the corresponding audio recording
    X = data[id][0]
    print('File extracted')

    # how many chunks?
    chunks = np.round(len(X)/chunk_len)
    print('Chunks found:', chunks)

    for index in range(chunks):
        input = X[index*chunk_len:(index+1)*chunk_len]

        [S, dims] = compute_spectrogram(
            audiofile=input,
            sr=sr,
            no_mels=no_mels,
            fft_size=fft_size,
            hop_size=hop_size,
        )
        
        # zero-padding if spectrogram is smaller
        S = np.concatenate((S, np.zeros([128,1600-S.shape[1]])),axis=1)
        # reshape spectrogram
        S = np.reshape(S, newshape=[dims[0], 1600, 1])
        #print('S dimensions:',S.shape)
        dataset = np.concatenate((dataset, S), axis=2)


# save the dataset
np.savez_compressed(file='dataset9sec', dataset=dataset)
print('Saved and successfully finished at ', datetime.now().time())