-
Notifications
You must be signed in to change notification settings - Fork 1
/
tool.py
313 lines (268 loc) · 12.2 KB
/
tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
import tensorflow as tf
import glob
import os
import numpy as np
import scipy.io.wavfile as wav
import numpy as np
from scipy.fftpack import fft
from scipy.fftpack import ifft
from scipy import signal
import scipy
import librosa
import pydub
# Mean and Standard deviation of the spectrogram across our dataset
mean = 0.320085779840294
std_dev = 0.17540888436162386
def normalize(specs, tf = True):
specs = specs**0.3
if (not tf):
return (specs-np.mean(specs))/np.std(specs)
else:
return (specs-tf.reduce_mean(specs))/tf.reduce_std(specs)
def unnormalize(specs):
specs = specs * std_dev
specs = specs + mean
return specs**(1/0.3)
def load_data(data_dir='./val_19680_easy'):
tfrecord_files = glob.glob(os.path.join(data_dir, '*.tfrecord'))
tfrecord_file = tfrecord_files[0]
spec_h = 257 # 80
# set some variables that are relavant to the network
# network_example_length = 19840
# hop_length = 160
examples = []
specs = []
spec_labs = []
limit = 100
# for tfrecord_file in tfrecord_files_val:
for i, example in enumerate(tf.python_io.tf_record_iterator(tfrecord_file)):
if i < 20:
continue
eg_np = tf.train.Example.FromString(example)
audio_segment = pydub.AudioSegment(
eg_np.features.feature["audio"].bytes_list.value[0],
frame_rate=16000,
sample_width=2,
channels=1
)
y = audio_segment.get_array_of_samples()
examples.append(y)
spec = eg_np.features.feature["spectrogram"].float_list.value
spec = np.array(spec).reshape(spec_h,-1)
spec = spec**(1/0.3)
specs.append(spec)
spec_labs.append(eg_np.features.feature["spectrogram_label"].int64_list.value)
if i > limit:
break
examples = np.array(examples)
specs = np.array(specs, dtype=np.float32)
spec_labs = np.array(spec_labs)
# slice it into quarters (hop size of 31)
# this results in skipping labels
batched_input = []
labels = []
for i in range (specs.shape[0]):
for j in range(specs.shape[-1]//31):
start = 31 * j
end = 31 * (j+1)
batched_input.append(specs[i,:,start:end])
labels.append(spec_labs[i,start+20])
batched_input = np.array(batched_input)
labels = np.array(labels)
labels = labels + 1
# compute the masking threshold
th_batch = []
psd_max_batch = []
for i in range(batched_input.shape[0]):
th, psd_max = generate_th(batched_input[i], 16000)
th_batch.append(th.T)
psd_max_batch.append(psd_max)
th_batch = np.array(th_batch)
psd_max_batch = np.array(psd_max_batch)
return batched_input, labels, th_batch, psd_max_batch
"""
Perceptual Masking, copied from https://github.com/tensorflow/cleverhans/blob/master/examples/adversarial_asr,
implementation of the paper "Imperceptible, Robust, and Targeted Adversarial Examples for Automatic Speech
Recognition" https://arxiv.org/pdf/1903.10346.pdf
"""
def compute_PSD_matrix(specs, window_size):
"""
First, perform STFT.
Then, compute the PSD.
Last, normalize PSD.
"""
win = np.sqrt(8.0/3.) * specs
z = abs(win / window_size)
psd_max = np.max(z*z)
psd = 10 * np.log10(z * z + 0.0000000000000000001)
PSD = 96 - np.max(psd) + psd
return PSD, psd_max
def Bark(f):
"""returns the bark-scale value for input frequency f (in Hz)"""
return 13*np.arctan(0.00076*f) + 3.5*np.arctan(pow(f/7500.0, 2))
def quiet(f):
"""returns threshold in quiet measured in SPL at frequency f with an offset 12(in Hz)"""
thresh = 3.64*pow(f*0.001,-0.8) - 6.5*np.exp(-0.6*pow(0.001*f-3.3,2)) + 0.001*pow(0.001*f,4) - 12
return thresh
def two_slops(bark_psd, delta_TM, bark_maskee):
"""
returns the masking threshold for each masker using two slopes as the spread function
"""
Ts = []
for tone_mask in range(bark_psd.shape[0]):
bark_masker = bark_psd[tone_mask, 0]
dz = bark_maskee - bark_masker
zero_index = np.argmax(dz > 0)
sf = np.zeros(len(dz))
sf[:zero_index] = 27 * dz[:zero_index]
sf[zero_index:] = (-27 + 0.37 * max(bark_psd[tone_mask, 1] - 40, 0)) * dz[zero_index:]
T = bark_psd[tone_mask, 1] + delta_TM[tone_mask] + sf
Ts.append(T)
return Ts
def compute_th(PSD, barks, ATH, freqs):
""" returns the global masking threshold
"""
# Identification of tonal maskers
# find the index of maskers that are the local maxima
length = len(PSD)
masker_index = signal.argrelextrema(PSD, np.greater)[0]
# delete the boundary of maskers for smoothing
if 0 in masker_index:
masker_index = np.delete(0)
if length - 1 in masker_index:
masker_index = np.delete(length - 1)
num_local_max = len(masker_index)
# treat all the maskers as tonal (conservative way)
# smooth the PSD
p_k = pow(10, PSD[masker_index]/10.)
p_k_prev = pow(10, PSD[masker_index - 1]/10.)
p_k_post = pow(10, PSD[masker_index + 1]/10.)
P_TM = 10 * np.log10(p_k_prev + p_k + p_k_post)
# bark_psd: the first column bark, the second column: P_TM, the third column: the index of points
_BARK = 0
_PSD = 1
_INDEX = 2
bark_psd = np.zeros([num_local_max, 3])
bark_psd[:, _BARK] = barks[masker_index]
bark_psd[:, _PSD] = P_TM
bark_psd[:, _INDEX] = masker_index
# delete the masker that doesn't have the highest PSD within 0.5 Bark around its frequency
for i in range(num_local_max):
next = i + 1
if next >= bark_psd.shape[0]:
break
while bark_psd[next, _BARK] - bark_psd[i, _BARK] < 0.5:
# masker must be higher than quiet threshold
if quiet(freqs[int(bark_psd[i, _INDEX])]) > bark_psd[i, _PSD]:
bark_psd = np.delete(bark_psd, (i), axis=0)
if next == bark_psd.shape[0]:
break
if bark_psd[i, _PSD] < bark_psd[next, _PSD]:
bark_psd = np.delete(bark_psd, (i), axis=0)
else:
bark_psd = np.delete(bark_psd, (next), axis=0)
if next == bark_psd.shape[0]:
break
# compute the individual masking threshold
delta_TM = 1 * (-6.025 -0.275 * bark_psd[:, 0])
Ts = two_slops(bark_psd, delta_TM, barks)
Ts = np.array(Ts)
# compute the global masking threshold
theta_x = np.sum(pow(10, Ts/10.), axis=0) + pow(10, ATH/10.)
return theta_x
def generate_th(specs, fs, window_size=400):
"""
returns the masking threshold theta_xs and the max psd
"""
PSD, psd_max= compute_PSD_matrix(specs, window_size)
freqs = librosa.core.fft_frequencies(fs, n_fft=512)
barks = Bark(freqs)
# compute the quiet threshold
ATH = np.zeros(len(barks)) - np.inf
bark_ind = np.argmax(barks > 1)
ATH[bark_ind:] = quiet(freqs[bark_ind:])
# compute the global masking threshold theta_xs
theta_xs = []
# compute the global masking threshold in each window
for i in range(PSD.shape[1]):
theta_xs.append(compute_th(PSD[:,i], barks, ATH, freqs))
theta_xs = np.array(theta_xs)
return theta_xs, psd_max
class Transform(object):
'''
Return: PSD
'''
def __init__(self, window_size):
self.scale = 8. / 3.
self.frame_length = int(window_size)
self.frame_step = int(window_size//4)
self.window_size = window_size
def __call__(self, x, psd_max_ori):
z = self.scale *tf.abs(x / self.window_size)
psd = tf.square(z)
PSD = tf.pow(10., 9.6) / tf.reshape(psd_max_ori, [-1, 1, 1]) * psd
return PSD
### Tensorflow implementation of stft ###
def tf_log10(x):
numerator = tf.math.log(x)
denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
return numerator / denominator
def power_to_db(S, ref = 1.0, amin=1e-16, top_db=80.0):
"""Convert a power-spectrogram (magnitude squared) to decibel (dB) units.
based on http://man.hubwiz.com/docset/LibROSA.docset/Contents/Resources/Documents/generated/librosa.core.power_to_db.html
"""
log_spec = 10.0 * tf_log10(tf.maximum(amin, S))
log_spec -= 10.0 * tf_log10(tf.maximum(amin, ref))
log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)
return log_spec
def db_to_power(S_db):
"""Convert a dB-scale spectrogram to a power spectrogram.
based on http://man.hubwiz.com/docset/LibROSA.docset/Contents/Resources/Documents/_modules/librosa/core/spectrum.html#db_to_power
"""
return tf.math.pow(10.0, 0.1 * S_db)
def tf_audio2spec(audio, win_length=400, hop_length = 160, n_fft = 512):
"""
Input: tensor that contains input audio, which has shape (batch_size, audio_length)
Ouput: spectrogram representation of the input tensor, which is of shape (batch_size, height, width)
"""
#audio = tf.pad(audio, ((n_fft//2, n_fft//2),(n_fft//2, n_fft//2)), "REFLECT")
audio = tf.pad(audio, ((0, 0),(n_fft//2, n_fft//2)), "REFLECT")
spec = tf.math.abs(tf.signal.stft(signals = audio, frame_length=win_length, frame_step=hop_length, fft_length=n_fft))
spec_db = power_to_db(spec**2)
spec_db = tf.clip_by_value(spec_db, -55, 65)
spec = db_to_power(spec_db)**0.5
# Stevens's power law for loudness
spec = spec**0.3
spec = tf.transpose(spec)
return spec
def warm_start(warm_start_from = "./1594432670"):
warm_starts = sorted(glob.glob(warm_start_from))
warm_start_from = warm_starts[-1] if (warm_starts is not None and len(warm_starts)) else None
warm_start_from = os.path.join(warm_start_from, 'variables/variables')
id_assignment_map = {'CBHBH/prenet/dense/': 'CBHBH/prenet/dense/',
'CBHBH/prenet/dense_1/': 'CBHBH/prenet/dense_1/',
'CBHBH/highwaynet_featextractor_0/dense1/': 'CBHBH/highwaynet_featextractor_0/dense1/',
'CBHBH/highwaynet_featextractor_0/dense2/': 'CBHBH/highwaynet_featextractor_0/dense2/',
'CBHBH/highwaynet_featextractor_1/dense1/': 'CBHBH/highwaynet_featextractor_1/dense1/',
'CBHBH/highwaynet_featextractor_1/dense2/': 'CBHBH/highwaynet_featextractor_1/dense2/',
'CBHBH/highwaynet_featextractor_2/dense1/': 'CBHBH/highwaynet_featextractor_2/dense1/',
'CBHBH/highwaynet_featextractor_2/dense2/': 'CBHBH/highwaynet_featextractor_2/dense2/',
'CBHBH/highwaynet_featextractor_3/dense1/': 'CBHBH/highwaynet_featextractor_3/dense1/',
'CBHBH/highwaynet_featextractor_3/dense2/': 'CBHBH/highwaynet_featextractor_3/dense2/',
'CBHBH/bottleneck/': 'CBHBH/bottleneck/',
'CBHBH/highwaynet_classifier_0/dense1/': 'CBHBH/highwaynet_classifier_0/dense1/',
'CBHBH/highwaynet_classifier_0/dense2/': 'CBHBH/highwaynet_classifier_0/dense2/',
'CBHBH/highwaynet_classifier_1/dense1/': 'CBHBH/highwaynet_classifier_1/dense1/',
'CBHBH/highwaynet_classifier_1/dense2/': 'CBHBH/highwaynet_classifier_1/dense2/',
'CBHBH/highwaynet_classifier_2/dense1/': 'CBHBH/highwaynet_classifier_2/dense1/',
'CBHBH/highwaynet_classifier_2/dense2/': 'CBHBH/highwaynet_classifier_2/dense2/',
'CBHBH/highwaynet_classifier_3/dense1/': 'CBHBH/highwaynet_classifier_3/dense1/',
'CBHBH/highwaynet_classifier_3/dense2/': 'CBHBH/highwaynet_classifier_3/dense2/',
'CBHBH/highwaynet_classifier_4/dense1/': 'CBHBH/highwaynet_classifier_4/dense1/',
'CBHBH/highwaynet_classifier_4/dense2/': 'CBHBH/highwaynet_classifier_4/dense2/',
'CBHBH/highwaynet_classifier_5/dense1/': 'CBHBH/highwaynet_classifier_5/dense1/',
'CBHBH/highwaynet_classifier_5/dense2/': 'CBHBH/highwaynet_classifier_5/dense2/',
'CBHBH/dense/': 'CBHBH/dense/'}
return warm_start_from, id_assignment_map
def accuracy(logits, labels):
return np.sum(np.argmax(logits, axis=-1) == np.argmax(labels, axis=-1))/logits.shape[0]