/
extractor.py
238 lines (192 loc) · 10.3 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os
import numpy as np
import librosa
import tensorflow as tf
# disable eager mode for tf.v1 compatibility with tf.v2
tf.compat.v1.disable_eager_execution()
from musicnn import models
from musicnn import configuration as config
def batch_data(audio_file, n_frames, overlap):
    '''Compute the log-mel spectrogram of a music file and split it into
    (possibly overlapping) patches for efficient batched inference.

    INPUT

    - audio_file: path to the music file to tag.
    Data format: string.
    Example: './audio/TRWJAZW128F42760DD_test.mp3'

    - n_frames: length (in frames) of the input spectrogram patches.
    Data format: integer.
    Example: 187

    - overlap: amount of overlap (in frames) of the input spectrogram patches.
    Note: Set it considering n_frames.
    Data format: integer.
    Example: 10

    OUTPUT

    - batch: batched audio representation. It returns spectrograms split in
    patches of length n_frames, one patch every `overlap` frames.
    Data format: 3D np.array (batch, time, frequency)

    - audio_rep: raw audio representation (spectrogram).
    Data format: 2D np.array (time, frequency)

    RAISES

    - ValueError: if the audio yields fewer than n_frames spectrogram frames.
    '''
    # compute the log-mel spectrogram with librosa
    audio, sr = librosa.load(audio_file, sr=config.SR)
    audio_rep = librosa.feature.melspectrogram(y=audio,
                                               sr=sr,
                                               hop_length=config.FFT_HOP,
                                               n_fft=config.FFT_SIZE,
                                               n_mels=config.N_MELS).T

    # float16 halves the memory footprint of potentially long spectrograms
    audio_rep = audio_rep.astype(np.float16)
    audio_rep = np.log10(10000 * audio_rep + 1)

    # +1 is to include the last valid start frame that range would not include
    last_frame = audio_rep.shape[0] - n_frames + 1

    # guard: clips shorter than one patch used to crash later with an opaque
    # UnboundLocalError — fail with an explicit message instead
    if last_frame <= 0:
        raise ValueError(
            'Audio is too short: {} spectrogram frames available, '
            '{} required.'.format(audio_rep.shape[0], n_frames))

    # batch it for an efficient computing: collect the patches in a list and
    # concatenate once (repeated np.concatenate in a loop is O(n^2))
    patches = [np.expand_dims(audio_rep[time_stamp: time_stamp + n_frames, :], axis=0)
               for time_stamp in range(0, last_frame, overlap)]
    batch = np.concatenate(patches, axis=0)
    return batch, audio_rep
def extractor(file_name, model='MTT_musicnn', input_length=3, input_overlap=False, extract_features=True):
    '''Extract the taggram (the temporal evolution of tags) and features
    (intermediate representations of the model) of the music-clip in
    file_name with the selected model.

    INPUT

    - file_name: path to the music file to tag.
    Data format: string.
    Example: './audio/TRWJAZW128F42760DD_test.mp3'

    - model: select a music audio tagging model.
    Data format: string.
    Options: 'MTT_musicnn', 'MTT_vgg', 'MSD_musicnn', 'MSD_musicnn_big' or 'MSD_vgg'.
    MTT models are trained with the MagnaTagATune dataset.
    MSD models are trained with the Million Song Dataset.
    Important! 'MSD_musicnn_big' is only available if you install from source: python setup.py install.

    - input_length: length (in seconds) of the input spectrogram patches.
    Note: this defines the temporal resolution of the taggram.
    Recommended value: 3, because the models were trained with 3 second inputs.
    Observation: the vgg models do not allow for different input lengths, so
    their input_length needs to be set to 3. The musicnn models allow for
    different input lengths.
    Data format: floating point number.
    Example: 3.1

    - input_overlap: amount of overlap (in seconds) of the input spectrogram patches.
    Note: Set it considering the input_length. If falsy (default), patches do not overlap.
    Data format: floating point number.
    Example: 1.0

    - extract_features: set it True for extracting the intermediate
    representations of the model.
    Data format: boolean.

    OUTPUT

    - taggram: expresses the temporal evolution of the tags likelihood.
    Data format: 2D np.ndarray (time, tags).

    - tags: list of tags corresponding to the tag-indices of the taggram.
    Data format: list.

    - features: if extract_features = True, it outputs a dictionary containing
    the activations of the different layers the selected model has.
    Data format: dictionary.
    Keys (musicnn models): ['timbral', 'temporal', 'cnn1', 'cnn2', 'cnn3', 'mean_pool', 'max_pool', 'penultimate']
    Keys (vgg models): ['pool1', 'pool2', 'pool3', 'pool4', 'pool5']

    RAISES

    - ValueError: unknown model name, vgg model with input_length != 3, or
    model weights that are not available.
    '''
    # select model
    if 'MTT' in model:
        labels = config.MTT_LABELS
    elif 'MSD' in model:
        labels = config.MSD_LABELS
    else:
        # fail early: falling through used to crash later with an opaque
        # UnboundLocalError on `labels`
        raise ValueError("Unknown model '{}'. Options: 'MTT_musicnn', 'MTT_vgg', "
                         "'MSD_musicnn', 'MSD_musicnn_big', 'MSD_vgg'.".format(model))
    num_classes = len(labels)

    if 'vgg' in model and input_length != 3:
        raise ValueError('Set input_length=3, the VGG models cannot handle different input lengths.')

    # convert seconds to frames (+1 to include the boundary frame)
    n_frames = librosa.time_to_frames(input_length, sr=config.SR, n_fft=config.FFT_SIZE, hop_length=config.FFT_HOP) + 1
    if not input_overlap:
        overlap = n_frames  # no overlap: hop a full patch at a time
    else:
        overlap = librosa.time_to_frames(input_overlap, sr=config.SR, n_fft=config.FFT_SIZE, hop_length=config.FFT_HOP)

    # tensorflow: define the model
    tf.compat.v1.reset_default_graph()
    with tf.name_scope('model'):
        x = tf.compat.v1.placeholder(tf.float32, [None, n_frames, config.N_MELS])
        is_training = tf.compat.v1.placeholder(tf.bool)
        if 'vgg' in model:
            y, pool1, pool2, pool3, pool4, pool5 = models.define_model(x, is_training, model, num_classes)
        else:
            y, timbral, temporal, cnn1, cnn2, cnn3, mean_pool, max_pool, penultimate = models.define_model(x, is_training, model, num_classes)
        normalized_y = tf.nn.sigmoid(y)

    # one (name, tensor, squeeze?) row per feature layer; mean_pool, max_pool
    # and penultimate are returned as-is (not squeezed), matching the feature
    # keys documented above
    if 'vgg' in model:
        feature_layers = [('pool1', pool1, True), ('pool2', pool2, True),
                          ('pool3', pool3, True), ('pool4', pool4, True),
                          ('pool5', pool5, True)]
    else:
        feature_layers = [('timbral', timbral, True), ('temporal', temporal, True),
                          ('cnn1', cnn1, True), ('cnn2', cnn2, True), ('cnn3', cnn3, True),
                          ('mean_pool', mean_pool, False), ('max_pool', max_pool, False),
                          ('penultimate', penultimate, False)]

    if extract_features:
        extract_vector = [normalized_y] + [tensor for _, tensor, _ in feature_layers]
    else:
        extract_vector = [normalized_y]

    # tensorflow: loading model; close the session even if something raises
    sess = tf.compat.v1.Session()
    try:
        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        try:
            saver.restore(sess, os.path.dirname(__file__)+'/'+model+'/')
        except Exception:
            if model == 'MSD_musicnn_big':
                raise ValueError('MSD_musicnn_big model is only available if you install from source: python setup.py install')
            elif model == 'MSD_vgg':
                raise ValueError('MSD_vgg model is still training... will be available soon! :)')
            # any other restore failure is fatal too: the original code
            # swallowed it and silently ran with randomly initialized weights
            raise

        # batching data
        print('Computing spectrogram (w/ librosa) and tags (w/ tensorflow)..', end =" ")
        batch, spectrogram = batch_data(file_name, n_frames, overlap)

        # tensorflow: extract features and tags. Accumulate per-batch outputs
        # and concatenate once at the end. The original duplicated this logic
        # for the first batch and, due to a variable-name slip (midend1_ vs
        # cnn1_), re-appended the first batch's cnn features for every later
        # batch; a single loop removes that bug class entirely.
        taggram_chunks = []
        feature_chunks = {name: [] for name, _, _ in feature_layers}
        for id_pointer in range(0, batch.shape[0], config.BATCH_SIZE):
            tf_out = sess.run(extract_vector,
                              feed_dict={x: batch[id_pointer:id_pointer + config.BATCH_SIZE],
                                         is_training: False})
            taggram_chunks.append(np.array(tf_out[0]))
            if extract_features:
                for (name, _, do_squeeze), value in zip(feature_layers, tf_out[1:]):
                    feature_chunks[name].append(np.squeeze(value) if do_squeeze else value)
    finally:
        sess.close()

    # a single chunk is returned as-is to preserve the original single-batch
    # shapes (np.squeeze may have dropped the batch axis for 1-patch inputs)
    taggram = taggram_chunks[0] if len(taggram_chunks) == 1 else np.concatenate(taggram_chunks, axis=0)

    print('done!')

    if extract_features:
        features = {name: (chunks[0] if len(chunks) == 1 else np.concatenate(chunks, axis=0))
                    for name, chunks in feature_chunks.items()}
        return taggram, labels, features
    else:
        return taggram, labels