# Query-by-Example (QbE) Search Using DTW

## Preliminary

In [8]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from __future__ import division
from __future__ import print_function
from os import path
from python_speech_features import delta
from python_speech_features import mfcc
import glob
import matplotlib.pyplot as plt
import numpy as np
import scipy.io.wavfile as wav
import sys
import simpleaudio as sa
from pydub import AudioSegment
from pydub.playback import play
from shennong.audio import Audio
from shennong.features.processor.mfcc import MfccProcessor
from shennong.features.postprocessor.delta import DeltaPostProcessor
from shennong.features.processor.plp import PlpProcessor
from shennong.features.postprocessor.cmvn import CmvnPostProcessor

sys.path.append("..")
sys.path.append(path.join("..", "utils"))

from speech_dtw import qbe

## Extract features

In [10]:
def get_mfcc_dd(wav_fn, cmvn=True):
    """Return the MFCCs with deltas and delta-deltas for an audio file.

    Parameters
    ----------
    wav_fn : str
        Path of the audio file; it is loaded with `Audio.load`.
    cmvn : bool
        If True (default), normalise each feature dimension by subtracting
        its mean and dividing by its standard deviation, both computed over
        axis 0 of the feature matrix.

    Returns
    -------
    numpy.ndarray
        float64 array with the output of the order-2 delta post-processor
        applied to the static MFCCs, optionally mean/variance normalised.
    """
    audio = Audio.load(wav_fn)
    processor = MfccProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, cepstral_lifter=26.0,
        low_freq=0, vtln_low=60, vtln_high=7200,
        high_freq=audio.sample_rate/2
        )
    d_processor = DeltaPostProcessor(order=2)
    mfcc_static = processor.process(audio, vtln_warp=1.0)
    mfcc_deltas = d_processor.process(mfcc_static)
    # Read the feature matrix through the public `data` attribute rather than
    # the private `_to_dict()` helper.
    features = np.float64(mfcc_deltas.data)

    if cmvn:
        mean = np.mean(features, axis=0)
        std = np.std(features, axis=0)
        # A constant dimension has zero standard deviation; divide by 1 there
        # so it becomes all zeros instead of NaN/inf.
        std = np.where(std == 0, 1.0, std)
        features = (features - mean) / std
    return features

## Intermediate analysis

Align a query to a search utterance from the same speaker that contains the keyword.

## Take Common English Words

In [11]:
import json

# Build one entry per recorded common word: its label, duration (ms), MFCC
# features and audio segment.

# Read the lexicon inside a context manager so the file handle is closed, and
# split on any whitespace so a trailing newline or repeated spaces cannot end
# up inside (or between) the words.
with open("english_words/common_words_100.txt", "r") as txt_file:
    word_dict = txt_file.read().split()

common_word_list = []
common_word_fns = sorted(
    glob.glob(path.join("english_words/common", "common_word_??.wav"))
    )
# The i-th recording (in sorted filename order) is labelled with the i-th
# word of the lexicon; more recordings than words raises an IndexError.
for i, wav_fn in enumerate(common_word_fns):
    rate, signal = wav.read(wav_fn)
    length = len(signal) / rate * 1000  # samples / (samples per second) -> ms
    query_mfcc = get_mfcc_dd(wav_fn)
    print(word_dict[i], end=' ')
    common_word_list.append({
        "length": length,
        "data": query_mfcc,
        "word": word_dict[i],
        "audio": AudioSegment.from_file(wav_fn, format="wav"),
        "thres": 1,
        })
    

NameError: name 'DeltaPostProcessor' is not defined

In [None]:
# Spot-check entry 99 of the common-word list: show its label, then play it.
entry_99 = common_word_list[99]
print(entry_99["word"])
play(entry_99["audio"])

In [None]:
# Load the search utterances: keep the audio segment of each file for playback
# and its MFCC features for the DTW sweep. Both lists share the same order.
rudd_original_mfcc_list = []
rudd_original_wav = []
# Files with a one-character suffix are globbed before those with a
# two-character suffix; presumably the suffix is a number, in which case this
# keeps e.g. "_9" ahead of "_10", which a single sorted glob would not.
for pattern in ("rudd-apology_?.wav", "rudd-apology_??.wav"):
    for wav_fn in sorted(glob.glob(path.join("utterances", pattern))):
        print("Reading:", wav_fn)
        rudd_original_wav.append(AudioSegment.from_file(wav_fn, format="wav"))
        rudd_original_mfcc_list.append(get_mfcc_dd(wav_fn))

In [None]:
# Sweep the first common-word query over the first utterance, plot the result
# of the sweep in green, then play the query followed by the utterance.
first_query = common_word_list[0]
sweep_original = qbe.dtw_sweep(first_query["data"], rudd_original_mfcc_list[0])
plt.plot(sweep_original, color="green")
play(first_query["audio"])
play(rudd_original_wav[0])