-
Notifications
You must be signed in to change notification settings - Fork 125
/
segmenter.py
386 lines (309 loc) · 13.6 KB
/
segmenter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
#!/usr/bin/env python
# encoding: utf-8
# The MIT License
# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import os
import sys
import numpy as np
from tensorflow import keras
from tensorflow.keras.utils import get_file
from .thread_returning import ThreadReturning
import shutil
import time
import random
import gc
from skimage.util import view_as_windows as vaw
from .pyannote_viterbi import viterbi_decoding
from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
from .remote_utils import get_remote
from .io import media2sig16kmono
from .sidekit_mfcc import mfcc
import warnings
from .export_funcs import seg2csv, seg2textgrid
def _media2feats(medianame, tmpdir, start_sec, stop_sec, ffmpeg):
    """
    Decode a media file and extract the acoustic features used by the NNs.

    Parameters:
        medianame: path or URL of the media to decode (any format ffmpeg supports)
        tmpdir: directory for temporary wav storage (None for default)
        start_sec, stop_sec: optional trimming boundaries in seconds (None = full)
        ffmpeg: name/path of the ffmpeg binary

    Returns:
        (mspec, loge, difflen) where
        * mspec: 24-band mel spectrogram (one row per frame)
        * loge: per-frame log energy
        * difflen: number of padding frames appended when the signal is
          shorter than 68 frames, 0 otherwise
    """
    # decode to a 16 kHz mono float32 signal
    sig = media2sig16kmono(medianame, tmpdir, start_sec, stop_sec, ffmpeg, 'float32')
    with warnings.catch_warnings():
        # ignore warnings resulting from empty signals parts
        warnings.filterwarnings('ignore', message='divide by zero encountered in log', category=RuntimeWarning)
        _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True)

    # Management of short duration segments: the DNNs consume 68-frame
    # patches, so pad the mel spectrogram up to 68 frames with its minimum
    # value and remember how much padding was added.
    difflen = 0
    if len(loge) < 68:
        difflen = 68 - len(loge)
        warnings.warn("media %s duration is short. Robust results require length of at least 720 milliseconds" % medianame)
        mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))

    return mspec, loge, difflen
def _energy_activity(loge, ratio):
    """
    Energy-based activity detection.

    Frames whose log energy exceeds mean(finite log energies) + log(ratio)
    are marked active; the raw binary decision is then smoothed with a
    Viterbi decoding to avoid spurious label flips.
    Returns the per-frame smoothed activity sequence.
    """
    # threshold is relative to the mean energy of the finite frames only
    finite_frames = loge[np.isfinite(loge)]
    threshold = np.mean(finite_frames) + np.log(ratio)
    is_active = loge > threshold
    # smooth the frame-wise decision with viterbi post-processing
    emissions = pred2logemission(is_active)
    transitions = log_trans_exp(150, cost0=-5)
    return viterbi_decoding(emissions, transitions)
def _get_patches(mspec, w, step):
    """
    Slice a mel spectrogram into overlapping, normalized patches.

    Parameters:
        mspec: 2D array (frames x mel bands)
        w: patch width in frames (number of consecutive frames per patch)
        step: hop between consecutive patches, in frames

    Returns:
        (data, finite) where
        * data: 3D array (npatches, w, nbands) of per-patch mean/std
          normalized patches, padded at both ends by repeating the first
          and last patch so patch i stays aligned with input frame i*step
        * finite: boolean array flagging patches containing only finite
          values (non-finite values arise e.g. from log of silent frames)
    """
    h = mspec.shape[1]
    # sliding windows of w frames covering all h bands, one window every
    # `step` frames; flatten each window to a single row
    data = vaw(mspec, (w,h), step=step)
    data.shape = (len(data), w*h)
    # per-patch mean/variance normalization
    data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
    # replicate first/last patches to compensate for the frames lost at the
    # borders by the sliding window (the exact counts keep patch/frame
    # alignment for step=2 — see the assertion in DnnSegmenter.__call__)
    lfill = [data[0,:].reshape(1, h*w)] * (w // (2 * step))
    rfill = [data[-1,:].reshape(1, h*w)] * (w // (2* step) - 1 + len(mspec) % 2)
    data = np.vstack(lfill + [data] + rfill )
    # flag patches free of NaN/inf before reshaping back to 3D
    finite = np.all(np.isfinite(data), axis=1)
    data.shape = (len(data), w, h)
    return data, finite
def _binidx2seglist(binidx):
"""
ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
#TODO: is there a pandas alternative??
"""
curlabel = None
bseg = -1
ret = []
for i, e in enumerate(binidx):
if e != curlabel:
if curlabel is not None:
ret.append((curlabel, bseg, i))
curlabel = e
bseg = i
ret.append((curlabel, bseg, i + 1))
return ret
class DnnSegmenter:
    """
    DnnSegmenter is an abstract class allowing to perform Dnn-based
    segmentation using Keras serialized models using 24 mel spectrogram
    features obtained with SIDEKIT framework.

    Child classes MUST define the following class attributes:
    * nmel: the number of mel bands to used (max: 24)
    * viterbi_arg: the argument to be used with viterbi post-processing
    * model_fname: the filename of the serialized keras model to be used
        the model should be stored in the current directory
    * inlabel: only segments with label name inlabel will be analyzed.
        other labels will stay unchanged
    * outlabels: the labels associated the output of neural network models
    """
    def __init__(self, batch_size):
        # load the DNN model (downloaded/cached by get_remote)
        model_path = get_remote(self.model_fname)
        self.nn = keras.models.load_model(model_path, compile=False)
        self.nn.run_eagerly = False
        self.batch_size = batch_size

    def __call__(self, mspec, lseg, difflen = 0):
        """
        *** input
        * mspec: mel spectrogram
        * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
        * difflen: 0 if the original length of the mel spectrogram is >= 68
            otherwise it is set to 68 - length(mspec)
        *** output
        a list of adjacent tuples (label, start, stop)
        """
        # keep only the mel bands this model was trained on
        if self.nmel < 24:
            mspec = mspec[:, :self.nmel].copy()

        patches, finite = _get_patches(mspec, 68, 2)
        if difflen > 0:
            # drop the patches corresponding to the padding frames added
            # by _media2feats (2 frames per patch step)
            patches = patches[:-int(difflen / 2), :, :]
            finite = finite[:-int(difflen / 2)]

        assert len(finite) == len(patches), (len(patches), len(finite))

        # run a single NN prediction over the concatenation of all segments
        # carrying the input label (more efficient than per-segment calls)
        batch = []
        for lab, start, stop in lseg:
            if lab == self.inlabel:
                batch.append(patches[start:stop, :])
        if len(batch) > 0:
            batch = np.expand_dims(np.concatenate(batch), 3)
            rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=2)
        gc.collect()

        ret = []
        for lab, start, stop in lseg:
            # segments with other labels are passed through unchanged
            if lab != self.inlabel:
                ret.append((lab, start, stop))
                continue

            # consume this segment's predictions from the front of rawpred
            # (same order as they were appended to the batch above)
            l = stop - start
            r = rawpred[:l]
            rawpred = rawpred[l:]
            # neutralize predictions on non-finite patches before viterbi
            r[finite[start:stop] == False, :] = 0.5

            # smooth the frame-wise posteriors and split into segments
            pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
            for lab2, start2, stop2 in _binidx2seglist(pred):
                ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
        return ret
class SpeechMusic(DnnSegmenter):
    # Speech/music voice activity detection applied to 'energy' segments:
    # requires energetic activity detection to have been run first
    outlabels = ('speech', 'music')
    model_fname = 'keras_speech_music_cnn.hdf5'
    inlabel = 'energy'
    nmel = 21           # mel bands fed to the model
    viterbi_arg = 150   # viterbi smoothing strength
class SpeechMusicNoise(DnnSegmenter):
    # Speech/music/noise voice activity detection applied to 'energy'
    # segments: requires energetic activity detection to have been run first
    outlabels = ('speech', 'music', 'noise')
    model_fname = 'keras_speech_music_noise_cnn.hdf5'
    inlabel = 'energy'
    nmel = 21          # mel bands fed to the model
    viterbi_arg = 80   # viterbi smoothing strength
class Gender(DnnSegmenter):
    # Gender segmentation applied to 'speech' segments:
    # requires voice activity detection to have been run first
    outlabels = ('female', 'male')
    model_fname = 'keras_male_female_cnn.hdf5'
    inlabel = 'speech'
    nmel = 24          # mel bands fed to the model (full resolution)
    viterbi_arg = 80   # viterbi smoothing strength
class Segmenter:
    """
    Top-level speech segmentation pipeline: energy-based activity
    detection, then DNN voice activity detection, then (optionally)
    DNN gender segmentation of the speech excerpts.
    """
    def __init__(self, vad_engine='smn', detect_gender=True, ffmpeg='ffmpeg', batch_size=32, energy_ratio=0.03):
        """
        Load neural network models

        Input:

        'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
                'sm' was used in the results presented in ICASSP 2017 paper
                        and in MIREX 2018 challenge submission
                'smn' has been implemented more recently and has not been evaluated in papers

        'detect_gender': if False, speech excerpts are returned labelled as 'speech'
                if True, speech excerpts are split into 'male' and 'female' segments

        'ffmpeg': name or path of the ffmpeg binary used for media decoding

        'batch_size' : large values of batch_size (ex: 1024) allow faster processing times.
                They also require more memory on the GPU.
                default value (32) is slow, but works on any hardware

        'energy_ratio': ratio applied to the mean log energy to set the
                threshold of the first, energy-based, activity detection
        """
        # test ffmpeg installation
        if shutil.which(ffmpeg) is None:
            raise(Exception("""ffmpeg program not found"""))
        self.ffmpeg = ffmpeg

        # set energic ratio for 1st VAD
        self.energy_ratio = energy_ratio

        # select speech/music or speech/music/noise voice activity detection engine
        assert vad_engine in ['sm', 'smn']
        if vad_engine == 'sm':
            self.vad = SpeechMusic(batch_size)
        elif vad_engine == 'smn':
            self.vad = SpeechMusicNoise(batch_size)

        # load gender detection NN if required
        assert detect_gender in [True, False]
        self.detect_gender = detect_gender
        if detect_gender:
            self.gender = Gender(batch_size)

    def segment_feats(self, mspec, loge, difflen, start_sec):
        """
        do segmentation
        require input corresponding to wav file sampled at 16000Hz
        with a single channel

        * mspec, loge, difflen: features as returned by _media2feats
        * start_sec: offset (seconds) added to the returned timestamps
        Returns a list of (label, start_sec, stop_sec) tuples.
        """
        # perform energy-based activity detection
        # [::2] downsamples the frame-wise decision to the patch rate used
        # by the DNN segmenters (one patch every 2 frames)
        lseg = []
        for lab, start, stop in _binidx2seglist(_energy_activity(loge, self.energy_ratio)[::2]):
            if lab == 0:
                lab = 'noEnergy'
            else:
                lab = 'energy'
            lseg.append((lab, start, stop))

        # perform voice activity detection
        lseg = self.vad(mspec, lseg, difflen)

        # perform gender segmentation on speech segments
        if self.detect_gender:
            lseg = self.gender(mspec, lseg, difflen)

        # one unit = 2 frames = 20 ms; convert indices to seconds
        return [(lab, start_sec + start * .02, start_sec + stop * .02) for lab, start, stop in lseg]

    def __call__(self, medianame, tmpdir=None, start_sec=None, stop_sec=None):
        """
        Return segmentation of a given file
                * convert file to wav 16k mono with ffmpeg
                * call NN segmentation procedures
        * media_name: path to the media to be processed (including remote url)
                may include any format supported by ffmpeg
        * tmpdir: allow to define a custom path for storing temporary files
                fast read/write HD are a good choice
        * start_sec (seconds): sound stream before start_sec won't be processed
        * stop_sec (seconds): sound stream after stop_sec won't be processed
        """
        mspec, loge, difflen = _media2feats(medianame, tmpdir, start_sec, stop_sec, self.ffmpeg)
        if start_sec is None:
            start_sec = 0
        # do segmentation
        return self.segment_feats(mspec, loge, difflen, start_sec)

    def batch_process(self, linput, loutput, tmpdir=None, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
        """
        Process a list of media files and export their segmentations.

        * linput: list of media paths/urls to process
        * loutput: list of destination files (same length/order as linput)
        * output_format: 'csv' or 'textgrid'
        * skipifexist, nbtry, trydelay: forwarded to the feature generator
          (skip existing outputs, retry count and delay for remote files)
        Returns (total_duration, nb_processed, avg_duration_per_file, lmsg)
        where lmsg is a list of (dst, status_code, message) tuples.
        """
        if verbose:
            print('batch_processing %d files' % len(linput))

        if output_format == 'csv':
            fexport = seg2csv
        elif output_format == 'textgrid':
            fexport = seg2textgrid
        else:
            raise NotImplementedError()

        t_batch_start = time.time()

        lmsg = []
        # featGenerator extracts features in a background thread while the
        # current file is being segmented
        fg = featGenerator(linput.copy(), loutput.copy(), tmpdir, self.ffmpeg, skipifexist, nbtry, trydelay)
        i = 0
        for feats, msg in fg:
            lmsg += msg
            i += len(msg)
            if verbose:
                print('%d/%d' % (i, len(linput)), msg)
            if feats is None:
                break
            mspec, loge, difflen = feats
            #if verbose == True:
            #    print(i, linput[i], loutput[i])
            b = time.time()
            lseg = self.segment_feats(mspec, loge, difflen, 0)
            # lmsg and loutput advance in lockstep, so the last message
            # corresponds to the destination of the file just processed
            fexport(lseg, loutput[len(lmsg) -1])
            lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))

        t_batch_dur = time.time() - t_batch_start
        # status code 0 marks successfully processed files
        nb_processed = len([e for e in lmsg if e[1] == 0])
        if nb_processed > 0:
            avg = t_batch_dur / nb_processed
        else:
            avg = -1
        return t_batch_dur, nb_processed, avg, lmsg
def medialist2feats(lin, lout, tmpdir, ffmpeg, skipifexist, nbtry, trydelay):
    """
    To be used when processing batches.

    Pops (source, destination) pairs from lin/lout (consuming them in
    place) until one source is successfully decoded, and returns its
    features together with a status message per consumed pair.

    * if the resulting file exists and skipifexist is set, it is skipped
    * in case of remote files, access is tried nbtry times, sleeping a
      random fraction of trydelay seconds between attempts

    Returns (ret, msg):
    * ret: _media2feats output for the first successful media, or None
    * msg: list of (dst, status, text) with status 0=ok, 1=skipped, 2=error
    """
    ret = None
    msg = []
    while ret is None and len(lin) > 0:
        src = lin.pop(0)
        dst = lout.pop(0)

        # if destination file exists: skip
        if skipifexist and os.path.exists(dst):
            msg.append((dst, 1, 'already exists'))
            continue

        # create storing directory if required
        # (dst may be a bare filename, in which case dirname is empty and
        # no directory must be created; exist_ok avoids a race with other
        # workers creating the same directory)
        dname = os.path.dirname(dst)
        if dname:
            os.makedirs(dname, exist_ok=True)

        itry = 0
        # default message in case the retry loop never runs (nbtry <= 0)
        errmsg = 'feature extraction not attempted'
        while ret is None and itry < nbtry:
            try:
                ret = _media2feats(src, tmpdir, None, None, ffmpeg)
            # catch Exception (not bare except) so KeyboardInterrupt and
            # SystemExit still propagate and can abort the batch
            except Exception:
                itry += 1
                errmsg = sys.exc_info()[0]
                if itry != nbtry:
                    time.sleep(random.random() * trydelay)
        if ret is None:
            msg.append((dst, 2, 'error: ' + str(errmsg)))
        else:
            msg.append((dst, 0, 'ok'))

    return ret, msg
def featGenerator(ilist, olist, tmpdir=None, ffmpeg='ffmpeg', skipifexist=False, nbtry=1, trydelay=2.):
    """
    Generator yielding (features, messages) tuples for batch processing.

    Features for the next media are extracted in a background thread while
    the caller segments the current one (a simple two-stage pipeline).
    ilist/olist are consumed in place by medialist2feats.
    Yields (ret, msg) as returned by medialist2feats; the final yield may
    carry ret=None when the remaining inputs were all skipped or in error.
    """
    # start extracting the first media in the background
    thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, tmpdir, ffmpeg, skipifexist, nbtry, trydelay])
    thread.start()
    while True:
        ret, msg = thread.join()
        if len(ilist) == 0:
            break
        # launch extraction of the next media before yielding the current
        # result, so decoding overlaps with the caller's processing
        thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, tmpdir, ffmpeg, skipifexist, nbtry, trydelay])
        thread.start()
        yield ret, msg
    # input list exhausted: yield the last pending result
    yield ret, msg