# Speech recognition

Simple speech recognition system can be implemented using DTW + MFCC.
based on: https://github.com/pierre-rouanet/dtw/blob/master/examples/speech-recognition.ipynb 

In [60]:
import os
import glob
import librosa
import librosa.display
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.fftpack import dct
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt
import python_speech_features

%matplotlib inline

We will use the [data-speech-commands database](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz) composed of 105,000 WAVE audio files of people saying thirty different words. We will use only a subset of this database.

We assume that you have previously downloaded and extracted the database. You need to specify the path to the folder where you extracted it.

In [61]:
DATABASE_PATH = '/home/joao/Desktop/datasets/data_speech_commands_v0.02'

In [62]:
labels = {'cat', 'dog', 'house', 'happy', 'zero'}
labels

{'cat', 'dog', 'happy', 'house', 'zero'}

## Define MFCC

In [64]:
def alt_mfcc(audio, sr, n_fft=2048, hop_length=512, n_mels=128, num_ceps=13, cep_lifter=22):
    ##### Getting melspectrogram #####
    fft_windows = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(fft_windows)**2
    mel_filter_banks = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    melspectrogram = mel_filter_banks.dot(magnitude)
    ##### Getting the MFCC #####
    melspectrogram_db = librosa.power_to_db(melspectrogram)
    mfcc = dct(melspectrogram_db, type=2, axis=0, norm='ortho')[:num_ceps]
    if cep_lifter > 0:
        nframes,ncoeff = np.shape(mfcc)
        n = np.arange(ncoeff)
        cep_lifter = 1 + (cep_lifter/2.)*np.sin(np.pi*n/cep_lifter)
        mfcc = cep_lifter*mfcc 
        # mfcc *= (1 + (cep_lifter / 2)* np.sin(np.pi * np.arange(1, 1 + n_mels, dtype=mfcc.dtype) / cep_lifter)[:, np.newaxis])
        return mfcc
    elif cep_lifter == 0:
        return mfcc

## Precompute all MFCCs

In [65]:
# We will use only N occurences per word
N = 25

mfccs = []
true_labels = []

for l in labels:
    sounds = glob.glob(os.path.join(DATABASE_PATH, l, '*.wav'))
    np.random.shuffle(sounds)
    sounds = sounds[:N]

    for s in sounds:    
        y, sr = librosa.load(s)
        mfcc = alt_mfcc(y, sr, num_ceps=13, cep_lifter=22)
        # mfcc = librosa.feature.mfcc(y, sr, n_mfcc=13, lifter=22)
        mfccs.append(mfcc.T)
        true_labels.append(l)
        
mfccs = np.array(mfccs)
true_labels = np.array(true_labels)


## Optional test

In [66]:
# a = alt_mfcc(y, sr, num_ceps=20, cep_lifter=0)
# b = librosa.feature.mfcc(y,sr, n_mfcc=20, lifter=0)
# assert (a == b).all()

# plt.figure(figsize=(25, 10))
# librosa.display.specshow(b, 
#                          x_axis="time", 
#                          sr=sr)
# plt.colorbar(format="%+2.f")
# plt.show()

## Prepare train/val dataset

In [67]:
val_percent = 0.2
n_val = int(val_percent * len(true_labels))

I = np.random.permutation(len(true_labels))
I_val, I_train = I[:n_val], I[n_val:]

## Leave P Out Cross Validation with DTW

In [68]:
from dtw import dtw

def cross_validation(train_indices, val_indices):
    score = 0.0

    for i in val_indices:
        x = mfccs[i]

        dmin, jmin = np.inf, -1
        for j in train_indices:
            y = mfccs[j]
            d, _, _, _ = dtw(x, y, dist=lambda x, y: np.linalg.norm(x - y, ord=1))

            if d < dmin:
                dmin = d
                jmin = j

        score += 1.0 if (true_labels[i] == true_labels[jmin]) else 0.0
        
    return score / len(val_indices)

In [69]:
rec_rate = cross_validation(I_train, I_val)
print('Recognition rate {}%'.format(100. * rec_rate))

Recognition rate 60.0%
