In [None]:
import sys
import json
import numpy as np
import scipy.io.wavfile as wf
from sys import exit
from IPython.display import Audio

In [None]:
# Input recording and the JSON file in which labelled templates are stored.
path_sample = '/content/drive/My Drive/Colab Notebooks/PG/data/female/1.wav' #input("WAV path: ")
path_templates = '/content/templates.txt' #input("Templates path: ")

# When True, load_wav treats noise_len, window_len and hop_size (below) as
# milliseconds and converts them to sample counts.
ms_units = True #True if input("Use milliseconds as measurement units (yes, no): ") == 'yes' else False

# Endpointing: p and q are the run-length thresholds handed to create_scaffolds
# (p for runs of below-threshold windows, q for runs of above-threshold
# windows); noise_len is the leading part of the recording used to estimate
# the noise threshold.
p = 50 #int(input("Endpointing P: "))
q = 10 #int(input("Endpointing Q: "))
noise_len = 100 #float(input("Noise length: "))

# Coefficient type used by the matching cell ('lpcc' or 'mfcc').
coef_fun = 'lpcc' #input("Coefficients (lpcc, mfcc): ")

# Number of LPCC coefficients per window.
lpcc_len = 13 #int(input("LPCC length: "))

# Framing (window function, window length, hop) and MFCC settings.
window_fun = 'hanning' #input("Windowing function (hanning, hamming, none): ")
window_len = 10 #int(input("Window length: "))
hop_size = 10 #int(input("Hop size: "))
filter_num = 10 #int(input("Total filters: "))
mfcc_len = 13 #int(input("MFCC length: "))
mfcc_delta = True #True if input("Use MFCC delta parameters (yes, no): ") == 'yes' else False
mfcc_tau = 3 #int(input("MFCC tau: "))

In [None]:
def load_wav(path):
    """Read a WAV file and, if configured, convert the length settings to samples.

    When the notebook-level flag ``ms_units`` is true, the globals
    ``noise_len``, ``window_len`` and ``hop_size`` are interpreted as
    milliseconds and overwritten with the equivalent number of samples for the
    sample rate of the file that was just read.

    The conversion is idempotent: the millisecond values and the sample counts
    derived from them are remembered in the global ``_ms_conversion``.  If the
    three globals still hold the sample counts written by a previous call, the
    remembered millisecond values are converted again instead of re-scaling
    numbers that are already expressed in samples.  If the globals were changed
    in between (for example the configuration cell was re-run), they are
    treated as fresh millisecond values.

    Args:
        path: Path of the WAV file to read.

    Returns:
        Tuple ``(data, rate)``: the sample array and the sample rate reported
        by ``scipy.io.wavfile.read``.
    """
    global noise_len, window_len, hop_size, _ms_conversion

    rate, data = wf.read(path)

    if ms_units:
        current = (noise_len, window_len, hop_size)
        previous = globals().get('_ms_conversion')

        # Re-use the original millisecond values when the globals are simply
        # the output of the previous conversion.
        if previous is not None and previous['samples'] == current:
            ms_values = previous['ms']
        else:
            ms_values = current

        converted = tuple(ms_to_samples(value, rate) for value in ms_values)
        _ms_conversion = {'ms': ms_values, 'samples': converted}
        noise_len, window_len, hop_size = converted

    return data, rate


def play_wav(data, rate):
    """Show an inline audio player for ``data`` sampled at ``rate``.

    Relies on ``display`` being available in the notebook namespace.
    """
    player = Audio(data, rate=rate)
    display(player)


def load_templates(path):
    """Load the template list stored as JSON at ``path``.

    Returns:
        The decoded JSON content, or an empty list when the file does not
        exist.
    """
    try:
        source = open(path, 'r')
    except FileNotFoundError:
        # No template file yet: start with an empty collection.
        return []

    with source:
        return json.load(source)


def save_templates(path, templates):
    """Serialise ``templates`` as JSON (4-space indentation) into ``path``.

    An existing file at ``path`` is overwritten.
    """
    with open(path, 'w') as target:
        json.dump(templates, target, indent=4)


def ms_to_samples(value, rate):
    """Convert a duration in milliseconds to a whole number of samples.

    Args:
        value: Duration in milliseconds (int or float).
        rate: Sample rate in samples per second.

    Returns:
        ``rate * value / 1000`` rounded down, always as a built-in ``int`` so
        the result can be used directly as a slice bound or ``range`` step
        even when ``value`` is a float.
    """
    return int(rate * value // 1000)


def create_scaffolds(windows, curr, sub, length):
    """Find runs of ``curr`` in ``windows``; erase short ones, report long ones.

    ``windows`` is scanned from left to right.  Every maximal run of ``curr``
    that is terminated by ``sub`` (or by the end of the list) is handled as
    follows:

    * runs shorter than ``length`` are overwritten in place with ``sub``;
    * all other runs are reported as ``(start, end)`` index pairs, ``end``
      being exclusive.

    Apart from that in-place relabelling the list is left unchanged: the
    temporary sentinel used during the scan is removed again, so the list keeps
    its original length and can safely be passed to a second call.

    Args:
        windows: Mutable list of labels (modified in place).
        curr: Label whose runs are searched for.
        sub: Label that terminates a run and replaces short runs.
        length: Minimum run length for a run to be reported.

    Returns:
        List of ``(start, end)`` tuples of the reported runs; empty when
        ``windows`` is empty.
    """
    total = len(windows)
    if total == 0:
        return []

    # Sentinel: guarantees that index() always finds a terminator, even for a
    # run that reaches the end of the list.
    windows.append(sub)

    scaffolds = []
    i = 0
    while i < total:
        if windows[i] != curr:
            i += 1
            continue

        j = windows.index(sub, i + 1)
        if j - i < length:
            # Same-length slice assignment, so the sentinel stays last.
            windows[i:j] = [sub] * (j - i)
        else:
            scaffolds.append((i, j))
        # windows[j] is known to be `sub`, continue right after it.
        i = j + 1

    windows.pop()  # drop the sentinel again
    return scaffolds


def apply_slicing(data, rate, window_len, hop_size):
    """Cut ``data`` into overlapping frames of exactly ``window_len`` samples.

    The signal is first reflect-padded by ``window_len // 2`` samples on both
    sides, which makes the first frame centred on the first sample of ``data``
    and lets the frames near the end still be full length.

    Only complete frames are produced.  Start positions whose frame would run
    past the end of the padded signal are skipped, so the result is always a
    regular 2-D array rather than a ragged collection of shorter tails.

    Args:
        data: 1-D sample array.
        rate: Sample rate (unused; kept for a uniform call signature).
        window_len: Frame length in samples.
        hop_size: Distance between consecutive frame starts in samples.

    Returns:
        Array of shape ``(n_frames, window_len)``.
    """
    padded = np.pad(data, window_len // 2, mode='reflect')
    last_start = len(padded) - window_len
    starts = range(0, last_start + 1, hop_size)
    return np.array([padded[start:start + window_len] for start in starts])


def apply_endpointing(data, rate, window_len, p, q):
    """Split ``data`` into the segments whose level exceeds the noise threshold.

    The first ``noise_len`` samples (notebook-level global) are taken as noise;
    the threshold is their mean absolute value plus twice their standard
    deviation.  The signal is then cut into consecutive, non-overlapping
    windows of ``window_len`` samples, each labelled 1 when its mean absolute
    value is above the threshold and 0 otherwise.  ``create_scaffolds`` is run
    twice on these labels: first on the 0-runs with threshold ``p``, then on
    the 1-runs with threshold ``q``; the runs returned by the second pass are
    the detected segments.

    Args:
        data: 1-D sample array.
        rate: Sample rate (unused).
        window_len: Window length in samples.
        p: Run-length threshold for the first (label 0) pass.
        q: Run-length threshold for the second (label 1) pass.

    Returns:
        List of slices of ``data``, one per detected segment.  The process
        exits via ``sys.exit`` when no segment is found.
    """
    noise_part = np.abs(data[:noise_len])
    threshold = np.mean(noise_part) + 2 * np.std(noise_part)

    activity = []
    for start in range(0, len(data), window_len):
        level = np.mean(np.abs(data[start:start + window_len]))
        activity.append(1 if level > threshold else 0)

    create_scaffolds(activity, 0, 1, p)
    segments = create_scaffolds(activity, 1, 0, q)

    if not segments:
        sys.exit("Error: Only silence detected")

    return [data[first * window_len:last * window_len] for first, last in segments]


def apply_windowing(data, rate, fun, size):
    """Multiply every frame of ``data`` in place by the selected window.

    Args:
        data: Array of frames, one frame per row (modified in place).
        rate: Sample rate (unused).
        fun: One of ``'hanning'``, ``'hamming'`` or ``'none'``.
        size: Unused; the window length is taken from each frame.

    Returns:
        The same ``data`` object, after windowing.
    """
    window_makers = {'hanning': np.hanning, 'hamming': np.hamming, 'none': np.ones}

    frame_count = data.shape[0]
    for idx in range(frame_count):
        frame = data[idx]
        # In-place multiply: `frame` refers to the row's storage inside `data`.
        frame *= window_makers[fun](frame.shape[0])

    return data


def apply_dft(data, window_len):
    """Compute the power spectrum of every frame.

    For each row of ``data`` the FFT is taken and its first
    ``window_len // 2 + 1`` bins are stored (as complex64) in one column of
    the result.

    Args:
        data: Array of frames, one frame per row.
        window_len: Frame length used to derive the number of kept bins.

    Returns:
        Array of shape ``(window_len // 2 + 1, n_frames)`` holding the squared
        magnitude of the kept FFT bins.
    """
    bins = window_len // 2 + 1
    spectrum = np.empty((bins, data.shape[0]), dtype=np.complex64)

    for column, frame in enumerate(data):
        spectrum[:, column] = np.fft.fft(frame)[:bins]

    magnitude = np.abs(spectrum)
    return magnitude * magnitude


def freq_to_mel(freq):
    """Map a frequency to the mel scale: ``2595 * log10(1 + freq / 700)``."""
    ratio = freq / 700.0
    return 2595.0 * np.log10(1.0 + ratio)


def mel_to_freq(mels):
    """Inverse of the mel mapping: ``700 * (10 ** (mels / 2595) - 1)``."""
    exponent = mels / 2595.0
    return 700.0 * (10.0**exponent - 1.0)


def get_filter_points(rate, filter_num, window_len):
    """Place ``filter_num + 2`` filter edge points evenly on the mel scale.

    The points span the range from 0 to ``rate // 2``, are spaced linearly in
    mel and converted back to frequencies.  Each frequency is then mapped to a
    spectrum bin index via ``floor((window_len + 1) / rate * freq)``.

    Returns:
        Tuple ``(points, freqs)``: integer bin indices and the corresponding
        frequencies, both of length ``filter_num + 2``.
    """
    lowest_mel = freq_to_mel(0)
    highest_mel = freq_to_mel(rate // 2)

    mel_grid = np.linspace(lowest_mel, highest_mel, filter_num + 2)
    freqs = mel_to_freq(mel_grid)

    bins_per_hz = (window_len + 1) / rate
    points = np.floor(bins_per_hz * freqs).astype(int)

    return points, freqs


def get_filters(filter_points, window_len):
    """Build triangular filters from consecutive triples of edge points.

    Filter ``k`` rises linearly from 0 to 1 between ``filter_points[k]`` and
    ``filter_points[k + 1]`` and falls from 1 to 0 between
    ``filter_points[k + 1]`` and ``filter_points[k + 2]``.

    Args:
        filter_points: Integer array of bin indices.
        window_len: Frame length; each filter has ``window_len // 2 + 1`` bins.

    Returns:
        Array of shape ``(len(filter_points) - 2, window_len // 2 + 1)``.
    """
    n_filters = filter_points.shape[0] - 2
    bank = np.zeros((n_filters, window_len // 2 + 1))

    edge_triples = zip(filter_points[:-2], filter_points[1:-1], filter_points[2:])
    for row, (left, peak, right) in enumerate(edge_triples):
        bank[row, left:peak] = np.linspace(0, 1, peak - left)
        bank[row, peak:right] = np.linspace(1, 0, right - peak)

    return bank


def apply_filters(data, filter_num, window_len, sample_rate=None):
    """Apply a normalised triangular mel filter bank to a power spectrum.

    The filter bank is built with ``get_filter_points`` and ``get_filters``.
    Every filter is scaled by ``2 / (f_upper - f_lower)``, where ``f_lower``
    and ``f_upper`` are the frequencies of its outer edge points.

    Args:
        data: Power spectrum of shape ``(window_len // 2 + 1, n_frames)``.
        filter_num: Number of filters.
        window_len: Frame length in samples.
        sample_rate: Sample rate used to place the filters.  When omitted, the
            notebook-level global ``rate`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        Array of shape ``(filter_num, n_frames)`` with the filter-bank
        energies.
    """
    if sample_rate is None:
        # Backward-compatible fallback to the global set by the loading cell.
        sample_rate = rate

    filter_points, mel_freqs = get_filter_points(sample_rate, filter_num, window_len)
    filters = get_filters(filter_points, window_len)

    enorm = 2.0 / (mel_freqs[2:filter_num+2] - mel_freqs[:filter_num])
    filters *= enorm[:, np.newaxis]

    return np.dot(filters, data)


def apply_dct(data, filter_num, filter_len):
    """Apply an orthonormal DCT-II to the log (dB) of ``data``.

    Args:
        data: Non-negative array of shape ``(filter_len, n_frames)``.
        filter_num: Number of DCT basis rows, i.e. output coefficients per
            frame.
        filter_len: Number of input rows the DCT is taken over.

    Returns:
        Array of shape ``(filter_num, n_frames)``: the DCT basis multiplied
        with ``10 * log10(data)``.
    """
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    basis = np.empty((filter_num, filter_len))
    # Row 0 is the constant (DC) basis vector; slicing keeps this valid even
    # when no rows are requested.
    basis[:1] = 1.0 / np.sqrt(filter_len)
    # Remaining rows built in one go; also correct when there are none.
    orders = np.arange(1, filter_num)
    basis[1:] = np.cos(np.outer(orders, samples)) * np.sqrt(2.0 / filter_len)

    # Floor the energies at the smallest positive normal float so that an
    # exactly-zero energy yields a large negative but finite dB value instead
    # of -inf (which would turn the whole frame into NaN/inf after the dot).
    energies = np.maximum(np.asarray(data, dtype=float), np.finfo(float).tiny)

    return np.dot(basis, 10.0 * np.log10(energies))


def get_delta(data, tau):
    """Compute centred difference (delta) features along the frame axis.

    For every row ``q`` and every frame ``m`` with ``tau <= m < n_frames - tau``:

        delta[q, m] = data[q, m + tau] - data[q, m - tau]

    Frames closer than ``tau`` to either end have no complete neighbourhood
    and are set to 0.  The result is aligned with ``data``: the delta stored
    at index ``m`` belongs to frame ``m``.

    Args:
        data: Array of shape ``(n_coefficients, n_frames)``.
        tau: Distance (in frames) to the neighbours used for the difference.

    Returns:
        Float array with the same shape as ``data``.
    """
    frames = data.shape[1]
    delta = np.zeros(data.shape)

    # TODO: wrap around (circular indexing) instead of leaving the edges at 0.
    if frames > 2 * tau:
        delta[:, tau:frames - tau] = data[:, 2 * tau:] - data[:, :frames - 2 * tau]

    return delta


def _autocorrelation(frame, max_lag):
    """Return the autocorrelation of ``frame`` for lags ``0 .. max_lag``."""
    n = frame.shape[0]
    return np.array([
        np.dot(frame[lag:], frame[:n - lag]) if lag < n else 0.0
        for lag in range(max_lag + 1)
    ])


def _lpc_to_cepstrum(a):
    """Convert LPC predictor coefficients ``a[0..p-1]`` to ``p`` cepstral ones.

    Uses the recursion ``c_n = a_n + sum_{k=1}^{n-1} (k / n) * c_k * a_{n-k}``
    for the all-pole model ``1 / (1 - sum_k a_k z^-k)``.
    """
    order = a.shape[0]
    cepstrum = np.zeros(order)
    for n in range(1, order + 1):
        acc = a[n - 1]
        for k in range(1, n):
            acc += (k / n) * cepstrum[k - 1] * a[n - k - 1]
        cepstrum[n - 1] = acc
    return cepstrum


def get_lpcc(data, rate, lpcc_len):
    """Compute ``lpcc_len`` LPC cepstral coefficients for every frame of ``data``.

    The signal is peak-normalised, framed with ``apply_slicing`` and windowed
    with ``apply_windowing`` using the notebook-level ``window_len``,
    ``hop_size`` and ``window_fun``.  For each frame:

    1. the autocorrelation ``r[0..lpcc_len]`` of *that frame* is computed;
    2. the normal equations ``R a = r[1..lpcc_len]`` are solved, where ``R`` is
       the symmetric Toeplitz matrix built from ``r[0..lpcc_len-1]``;
    3. the predictor coefficients ``a`` are converted to cepstral coefficients.

    Frames without energy (``r[0] == 0``) yield an all-zero coefficient row.

    Args:
        data: 1-D sample array.
        rate: Sample rate, forwarded to the framing helpers.
        lpcc_len: Prediction order and number of coefficients per frame.

    Returns:
        Array of shape ``(n_frames, lpcc_len)``.
    """
    peak = np.max(np.abs(data))
    if peak > 0:
        data = data / peak

    data_slices = apply_slicing(data, rate, window_len, hop_size)
    data_windows = apply_windowing(data_slices, rate, window_fun, window_len)

    lpcc = np.zeros((data_windows.shape[0], lpcc_len))

    # Index pattern |i - j| of a symmetric Toeplitz matrix, shared by all frames.
    lags = np.arange(lpcc_len)
    toeplitz_idx = np.abs(lags[:, np.newaxis] - lags[np.newaxis, :])

    for row, w in enumerate(data_windows):
        w = np.asarray(w, dtype=float)
        r = _autocorrelation(w, lpcc_len)

        if r[0] <= 0.0:
            # Silent frame: no predictor can be estimated, keep the zero row.
            continue

        # The right-hand side is r[1..p]; using r[0..p-1] (the first column of
        # R) would always give the trivial solution [1, 0, ..., 0].
        # lstsq also copes with a singular or ill-conditioned R.
        a = np.linalg.lstsq(r[toeplitz_idx], r[1:], rcond=None)[0]
        lpcc[row] = _lpc_to_cepstrum(a)

    return lpcc


def get_mfcc(data, rate, mfcc_len, filter_num, add_delta, tau):
    """Compute MFCC features (optionally with delta and delta-delta) per frame.

    Pipeline: peak normalisation, framing, windowing, power spectrum, mel
    filter bank, DCT of the log energies.  Framing and windowing use the
    notebook-level ``window_len``, ``hop_size`` and ``window_fun``.

    Args:
        data: 1-D sample array.
        rate: Sample rate, forwarded to the framing and windowing helpers.
        mfcc_len: Number of cepstral coefficients per frame.
        filter_num: Number of mel filters.
        add_delta: When true, first- and second-order deltas are appended.
        tau: Frame distance passed to ``get_delta``.

    Returns:
        Array with one row per frame; ``mfcc_len`` columns, or
        ``3 * mfcc_len`` columns when ``add_delta`` is true.
    """
    normalized = data / np.max(np.abs(data))

    frames = apply_slicing(normalized, rate, window_len, hop_size)
    tapered = apply_windowing(frames, rate, window_fun, window_len)
    power = apply_dft(tapered, window_len)
    mel_energies = apply_filters(power, filter_num, window_len)
    cepstra = apply_dct(mel_energies, mfcc_len, filter_num)

    if not add_delta:
        return np.transpose(cepstra)

    # The second-order delta is the delta of the first-order delta.
    first_order = get_delta(cepstra, tau)
    second_order = get_delta(first_order, tau)
    stacked = np.concatenate((cepstra, first_order, second_order), axis=0)

    return np.transpose(stacked)


def apply_coding(data, rate, coef):
    """Compute the feature matrix of the requested coefficient type.

    Args:
        data: 1-D sample array.
        rate: Sample rate.
        coef: ``'lpcc'`` or ``'mfcc'``; the remaining settings come from the
            notebook-level configuration variables.

    Returns:
        The result of ``get_lpcc`` or ``get_mfcc``.  Any other ``coef`` ends
        the process via ``sys.exit``.
    """
    if coef == 'lpcc':
        return get_lpcc(data, rate, lpcc_len)

    if coef == 'mfcc':
        return get_mfcc(data, rate, mfcc_len, filter_num, mfcc_delta, mfcc_tau)

    sys.exit("Error: Invalid coefficients")


def apply_dtw(s, t):
    """Return the dynamic-time-warping distance between two sequences.

    The local cost of aligning ``s[i]`` with ``t[j]`` is the Euclidean norm of
    their difference; allowed steps are insertion, deletion and match.

    The accumulated-cost matrix has one extra leading row and column (the
    usual ``(n + 1) x (m + 1)`` layout), so the cost of the very first pair
    ``(s[0], t[0])`` is included and alignments that repeat the first element
    of either sequence are possible.

    Args:
        s: Array whose first axis indexes the frames of the first sequence.
        t: Array whose first axis indexes the frames of the second sequence.

    Returns:
        Accumulated cost of the cheapest alignment (``inf`` if exactly one of
        the sequences is empty, ``0`` if both are).
    """
    n, m = s.shape[0], t.shape[0]

    dtw = np.full((n + 1, m + 1), np.inf)
    dtw[0, 0] = 0.0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(s[i - 1] - t[j - 1])
            best_prev = min(dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1])
            dtw[i, j] = best_prev + cost

    return dtw[n, m]

In [None]:
# Load the stored templates, read the sample recording and split it into the
# segments found by endpointing.
templates = load_templates(path_templates)
data, rate = load_wav(path_sample)
samples = apply_endpointing(data, rate, window_len, p, q)

# Play back every detected segment.
for i, s in enumerate(samples):
    play_wav(s, rate)

SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Ask for a name for every detected segment and store its LPCC and MFCC
# features (converted to nested lists so they can be written as JSON) under
# that name.
for i, s in enumerate(samples):
    name = input("Sample #{} name: ".format(i+1))
    lpcc, mfcc = apply_coding(s, rate, 'lpcc').tolist(), apply_coding(s, rate, 'mfcc').tolist()
    templates.append({'name': name, 'lpcc': lpcc, 'mfcc': mfcc})

# Write the extended template list back to the template file.
save_templates(path_templates, templates)

Sample #1 name: river




Sample #2 name: imp
Sample #3 name: guitaaa


In [None]:
# Reload the templates and the recording, then detect the segments again.
templates = load_templates(path_templates)
data, rate = load_wav(path_sample)
samples = apply_endpointing(data, rate, window_len, p, q)

for i, s in enumerate(samples):
    # Features of this segment, using the coefficient type chosen in coef_fun.
    coefs = apply_coding(s, rate, coef_fun)
    # (template name, DTW distance) for every template, smallest distance first.
    matches = sorted([(t['name'], apply_dtw(coefs, np.array(t[coef_fun]))) for t in templates], key = lambda x: x[1])
    print("Sample #{}".format(i+1))
    for t, dtw in matches:
        # Templates at distance 0 (or below) are not printed.
        if dtw > 0:
            print("Match: ({})\t{}".format(dtw, t))



Sample #1
Match: (7.89869866932439e-13)	river
Match: (7.89869866932439e-13)	river
Match: (7.938020838961148e-13)	river
Match: (7.938020838961148e-13)	river
Match: (1.440823457050551e-12)	river
Match: (1.584771342469163e-12)	manufacturer
Match: (2.23137097481238e-12)	river
Match: (2.23137097481238e-12)	river
Match: (2.23137097481238e-12)	river
Match: (2.23137097481238e-12)	river
Match: (2.23137097481238e-12)	river
Match: (2.2962435536056968e-12)	imp
Match: (2.689073620701884e-12)	guitaaa
Match: (2.7238742876096513e-12)	river
Match: (2.7238742876096513e-12)	guitar
Match: (3.023777651241763e-12)	river
Match: (3.023777651241763e-12)	impression
Match: (3.2928950470420855e-12)	river
Match: (3.2928950470420855e-12)	river
Match: (3.872654225087584e-12)	pollution
Match: (4.480337354070901e-12)	possesion
