In [5]:
import librosa
import math
import numpy as np
import decimal
from scipy.signal import lfilter, butter

In [6]:
window   = [0, 1]
fs       = 16000
Tw       = 25
Ts       = 10         
alpha    = 0.97
R        = []
M        = 40
C        = []        
L        = []    

buckets = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

In [7]:
def load_wav(filename, sample_rate):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()
    return audio

In [60]:
def load_wav1(filename, start, end, sample_rate):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()
    return audio[start:end]

In [49]:
audio = load_wav("E:/condaDev/matconvnet/contrib/VGGVox/testfiles/verif/8jEAjG6SegY_0000008.wav", 16000)

In [50]:
audio.shape

(108801,)

In [52]:
librosa.get_duration(audio, sr=16000)#  get_duration s

6.8000625

In [10]:
def nextpow2(x):  
    return 1 if x == 0 else 2**(x - 1).bit_length()

In [11]:
def hz2mel(hz):
    """Convert a value in Hertz to Mels
    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    https://github.com/jameslyons/python_speech_features/blob/master/python_speech_features/base.py
    """
    #return 2595 * numpy.log10(1+hz/700.)
    return 1127 * numpy.log(1+hz/700.)

In [12]:
def mel2hz(mel):
    """Convert a value in Mels to Hertz
    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    #return 700*(10**(mel/2595.0)-1)
    return 700*(exp**(mel/1127.0)-1)

In [13]:
def lifter(cepstra, L=22):
    """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
    magnitude of the high frequency DCT coeffs.
    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
    """
    if L > 0:
        nframes,ncoeff = numpy.shape(cepstra)
        n = numpy.arange(ncoeff)
        lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
        return lift*cepstra
    else:
        # values of L <= 0, do nothing
        return cepstra

In [14]:
def remove_dc_and_dither(sin, sample_rate):
    if sample_rate == 16e3:
        alpha = 0.9900
    elif sample_rate == 8e3:
        alpha = 0.999
    else:
        print("Sample rate must be 16kHz or 8kHz only")
        exit(1)
    sin = lfilter([1, -1], [1, -alpha], sin, axis=0)
    dither = np.random.random_sample(len(sin)) + np.random.random_sample(len(sin)) - 1
    #dither = np.random.random_sample(sin.shape) + np.random.random_sample(sin.shape) - 1
    #spow = np.std(dither)
    spow = np.std(sin)  #626.9666984589156
    sout = sin + 1e-6 * spow * dither
    #print(sin) the same to matlab
    #print(dither)
    return sout

In [15]:
def round_half_up(number):
    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))

In [16]:
def rolling_window(a, window, step=1):
    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]

In [17]:
def framesig(sig, frame_len, frame_step, winfunc=lambda x: np.ones((x,)), stride_trick=True):
    """Frame a signal into overlapping frames.
    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: an array of frames. Size is NUMFRAMES by frame_len.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) # LV

    padlen = int((numframes - 1) * frame_step + frame_len)

    zeros = np.zeros((padlen - slen,))
    padsignal = np.concatenate((sig, zeros))
    if stride_trick:
        win = winfunc(frame_len)
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = np.tile(np.arange(0, frame_len), (numframes, 1)) + np.tile(
            np.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = np.array(indices, dtype=np.int32)
        frames = padsignal[indices]
        win = np.tile(winfunc(frame_len), (numframes, 1))

    return frames * win

In [18]:
def runSpec( speech, window, fs,Tw, Ts, alpha, R, M, C, L ):
    N = C
    hamming = np.hamming
    SPEC = mfccspec(speech, fs, Tw, Ts, alpha, hamming, R, M, N, L )
    return SPEC

In [19]:
def mfccspec( speech, fs, Tw, Ts, alpha, window, R, M, N, L ):
    if max(abs(speech))<=1:
        speech = speech * 2**15
    
    Nw = round( 1E-3*Tw*fs ) #400
    Ns = round( 1E-3*Ts*fs ) #160
    
    nfft = 2**nextpow2(Nw)
    K = nfft/2+1
    speech = remove_dc_and_dither(speech, fs)
    #print(speech) #same to matlab
    speech = lfilter( [1, -alpha], 1, speech, axis=0 )
    #print(speech) #same to matlab
    frames = framesig(speech, frame_len=Nw, frame_step=Ns, winfunc=np.hamming)
    #print(frames.shape) # 679*400, however, matlab has 400*678
    #modify frames
    frames = frames[:-1,:]
    #print(frames.shape)
    #print(frames)
    fft = abs(np.fft.fft(frames, n=512))
    #print(fft.shape) #(678, 512)
    #print(fft) same to matlab, only dim anginst (512, 678)
    return fft

In [20]:
def normalize_frames(m, epsilon=1e-12):
    return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in m])

In [28]:
def getinput(image, window, fs,Tw, Ts, alpha, R, M, C, L, buckets):
    z = load_wav(image, fs)
    #print(z.shape) (108801,)
    #rand crop  
    SPEC = runSpec(z, window, fs,Tw, Ts, alpha, R, M, C, L)
    #print(SPEC.shape) #(678, 512)
    fft_norm = normalize_frames(SPEC.T)
    #print(fft_norm.shape) #(512, 10879), however, should be (512, 679)
    #print(fft_norm) #same, only a litter diff of numic
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1]-rsize)/2)
    #print(rsize, rstart) #same
    out = fft_norm[:, rstart-1:rstart+rsize-1] #numpy index from 0, matlab from 1
    
    return out

In [202]:
def getinput1(image):
    #z = load_wav(image, fs)
    
    tmp         = load_wav(image, fs)
    secs        = round(len(tmp) / fs)
    if secs>3:
        start       = np.random.randint(1, secs - 2)
        edn         = start + 4*fs
        z           = load_wav1(image, start, edn, fs)
    else:
        z = tmp
    
    #print(z.shape) (108801,)
    SPEC = runSpec(z, window, fs,Tw, Ts, alpha, R, M, C, L)
    #print(SPEC.shape) (10879, 512), however, should be (512, 679)
    fft_norm = normalize_frames(SPEC.T)
    #print(fft_norm.shape) (512, 10879), however, should be (512, 679)
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1]-rsize)/2)
    out = fft_norm[:, rstart-1:rstart+rsize-1]
    
    return out

In [29]:
dpath = "E:/condaDev/matconvnet/contrib/VGGVox/testfiles/verif/8jEAjG6SegY_0000008.wav" #6s
out = getinput(dpath, window, fs,Tw, Ts, alpha, R, M, C, L, buckets) #(512, 600)

  out = out_full[ind]


In [30]:
out

array([[ 0.44108236, -0.8016216 , -1.02536772, ..., -0.94148977,
         1.26221851,  0.70205146],
       [ 0.55967609,  0.18363559, -0.04929088, ..., -0.07682288,
        -0.04769165,  0.90833958],
       [ 1.18025216, -1.6052382 , -0.51399049, ...,  0.53235141,
         0.72272994,  1.68329833],
       ...,
       [ 0.3717399 , -1.34228945, -0.16782582, ...,  0.007528  ,
        -0.40360683,  1.0304803 ],
       [ 1.18025216, -1.6052382 , -0.51399049, ...,  0.53235141,
         0.72272994,  1.68329833],
       [ 0.55967609,  0.18363559, -0.04929088, ..., -0.07682288,
        -0.04769165,  0.90833958]])

In [31]:
out.shape

(512, 600)

In [101]:
dpath = "00017.m4a" #9s
out = getinput(dpath, window, fs,Tw, Ts, alpha, R, M, C, L, buckets) #(512, 900)

  out = out_full[ind]


In [33]:
out

array([[-0.783403  , -0.51954836, -0.58191498, ...,  0.79765069,
        -0.39024087, -0.64948959],
       [-0.75805685, -0.18497139, -0.84595793, ..., -0.22836417,
        -1.02695199, -0.59022806],
       [-0.9739787 , -1.11904487, -0.49457537, ..., -1.35856473,
        -1.13780135, -1.20666339],
       ...,
       [-1.13067814, -0.92721171, -0.55173385, ..., -1.33499884,
        -1.1755232 , -0.895696  ],
       [-0.9739787 , -1.11904487, -0.49457537, ..., -1.35856473,
        -1.13780135, -1.20666339],
       [-0.75805685, -0.18497139, -0.84595793, ..., -0.22836417,
        -1.02695199, -0.59022806]])

In [34]:
out.shape

(512, 900)

In [58]:
import keras
y = [2,3,4,6,7]
l = keras.utils.to_categorical(y, num_classes=8)

In [59]:
l

array([[0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [61]:
dpath

'00017.m4a'

In [64]:
out = getinput1(dpath)

  out = out_full[ind]


In [66]:
out.shape

(512, 300)

In [200]:
def inputt(image):
    
    tmp         = load_wav(image, fs)
    secs        = round(librosa.get_duration(tmp, fs))
    if secs>3:
        start       = np.random.randint(1, secs - 2)
        edn         = start + 3*fs + 300
        #z           = load_wav1(image, start, edn, fs)
        z          = tmp[start:edn]
    else:
        z = tmp
    SPEC = runSpec(z, window, fs,Tw, Ts, alpha, R, M, C, L)
    fft_norm = normalize_frames(SPEC.T)
    #rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    #rstart = int((fft_norm.shape[1]-rsize)/2)
    #out = fft_norm[:, rstart-1:rstart+rsize-1]
    out = fft_norm
    
    return out

In [203]:
import time
 
start = time.clock()

out = getinput1(dpath)
#print(round(len(load_wav(dpath, 16000)) / 16000))


elapsed = (time.clock() - start)
print("Time used:",elapsed)

Time used: 0.5682828002263705


  out = out_full[ind]


In [204]:
import time
 
start = time.clock()

#print(round(librosa.get_duration(load_wav(dpath, 16000), 16000)))
#out = getinput(dpath,window, fs,Tw, Ts, alpha, R, M, C, L, buckets)
out = inputt(dpath)

elapsed = (time.clock() - start)
print("Time used:",elapsed)

Time used: 0.1341517119008131


  out = out_full[ind]


In [122]:
tmp         = load_wav(dpath, fs)

In [123]:
tmp.shape

(151552,)

In [124]:
librosa.get_duration(tmp, fs)

9.472

In [125]:
151552/fs

9.472

In [131]:
np.random.randint(1, 7)

2

In [188]:
t = load_wav1(dpath, 2, 3*fs+300, fs)

In [189]:
len(t)

48298

In [190]:
len(t)/fs

3.018625

In [191]:
SPEC = runSpec(t, window, fs,Tw, Ts, alpha, R, M, C, L)

  out = out_full[ind]


In [192]:
SPEC.shape

(300, 512)

In [155]:
fft_norm = normalize_frames(SPEC.T)

In [156]:
fft_norm.shape

(512, 298)

In [208]:
fft_norm.transpose().shape

(298, 512)

In [6]:
import numpy as np

In [28]:
t = {'a':1,'b':3,'c':2,'d':3,'e':8,'f':6}

In [2]:
tt = t.values()

In [7]:
tt

dict_values([1, 3, 2, 3, 8, 6])

In [8]:
tl =np.array(list(tt))

In [9]:
tl

array([1, 3, 2, 3, 8, 6])

In [14]:
index = np.where(tl == tu[2])

In [15]:
index

(array([1, 3]),)

In [16]:
tl[index] = 2

In [17]:
tl

array([1, 2, 2, 2, 8, 6])

In [366]:
tl.sort()

In [11]:
tl

array([1, 3, 2, 3, 8, 6])

In [19]:
tu = np.unique(tl)

In [20]:
tu

array([1, 2, 6, 8])

In [None]:
def process_labels(labels):
    # labels processing, relabel the labels from min to max
    labels_values = list(labels.values())
    List_of_unique_labels = np.unique(labels_values)
    List_of_unique_labels.sort()
    
    new_labels = labels.copy()
    for i in range(len(labels_values)):
        index = np.where(List_of_unique_labels == labels_values[i])
        for k, v in labels.items():
            if v == labels_values[i]:
                new_labels[k] = index
    
    np.testing.assert_array_equal(len(List_of_unique_labels), int(max(list(new_labels.values()))[0])+1) #checking
                
    return new_labels

In [47]:
def process_labels1(labels):
    # labels processing, relabel the labels from min to max
    labels_values = np.array(list(labels.values()))
    List_of_unique_labels = np.unique(labels_values)
    List_of_unique_labels.sort()
    
    new_label_vals = labels_values.copy()
    for i in range(len(List_of_unique_labels)):
        index = np.where(new_label_vals == List_of_unique_labels[i])
        new_label_vals[index] = i
        
    new_labels = dict(zip(labels.keys(), new_label_vals))
    #print(max(list(new_labels.values())))
    np.testing.assert_array_equal(len(List_of_unique_labels), int(max(list(new_labels.values())))+1)
                
    return new_labels

In [48]:
res = process_labels1(t)
res

{'a': 0, 'b': 2, 'c': 1, 'd': 2, 'e': 4, 'f': 3}

In [21]:
t.keys()

dict_keys(['a', 'b', 'c', 'd', 'e', 'f'])

In [25]:
dictionary = dict(zip(t.keys(), res))

In [26]:
dictionary

{'a': 0, 'b': 2, 'c': 1, 'd': 2, 'e': 4, 'f': 3}

In [317]:
fw = open("test.txt",'w+')
fw.write(str(t))     
fw.close()

In [318]:
fr = open("test.txt",'r+')
dic = eval(fr.read()) 
print(dic)
fr.close()

{'a': 1, 'b': 3, 'c': 2, 'd': 3, 'e': 8, 'f': 6}
