In [1]:
import os
import math
import random
import time

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as nnF
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset,random_split
from ipywidgets import IntProgress
import scipy.io.wavfile
import librosa
from skimage.transform import resize
import scipy.misc
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
import IPython.display as ipd
import wave
import pyaudio
from sklearn.metrics import accuracy_score
import sounddevice as sd
from IPython.display import clear_output
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

# Loading models

## 1.Gender and number recognition model

In [2]:
resnet = models.resnet34(pretrained=True)

In [3]:
class ModifiedResNet(nn.Module):
            def __init__(self):
                super(ModifiedResNet, self).__init__()
                self.features = nn.Sequential(
                    #remove last fully connected layer
                    *list(resnet.children())[:-1]
                )
                self.dense1 = nn.Linear(512,128)
                self.dense_number = nn.Linear(128,10)
                self.dense_gender = nn.Linear(128,1)
            def forward(self, x):
              # input shape = batch_size,n_channels,height,width
                x = self.features(x) # output shape = batch_size * 512 * 1 * 1
                x = x.view(-1,512)
                x = nn.ReLU()(self.dense1(x)) # ouput shape = batch_size * 128
                number_pred = nn.Softmax()(self.dense_number(x))
                gender_pred = nn.Sigmoid()(self.dense_gender(x))

                return [number_pred,gender_pred]

In [4]:
gender_number_recognition_model = ModifiedResNet()
gender_number_recognition_model = gender_number_recognition_model.to(device)

In [7]:
gender_number_recognition_model.load_state_dict(torch.load(".\\gender_recognition\\models\\ResNet\\best_model_both_gender_number_recognition",map_location=device))

<All keys matched successfully>

## 2. Trigger word detection model

In [8]:
class DetectionModel(nn.Module):
    """The CNN model"""
    def __init__(self):
        
        super(DetectionModel, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64,
                               kernel_size=(5, 5), bias=False,padding="same")

        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128,
                               kernel_size=(3, 3), bias=False,padding="same")

        self.conv3 = nn.Conv2d(in_channels=128, out_channels=128,
                               kernel_size=(3, 3), bias=False)

        self.fc1 = nn.Linear(128, 128, bias=True)
        self.fc2 = nn.Linear(128, 1, bias=True)

        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(128)


    def forward(self, x):
        (_, time_len, mel_bins) = x.shape

        x = x.view(-1, 1, time_len, mel_bins)
        # print('Input')
        # print(x.size())

        x = nnF.relu(self.bn1(self.conv1(x)))
        # print('Conv1')
        # print(x.size())
        x = nnF.max_pool2d(x,kernel_size=(2,4),padding=(0,2))
        # print('Pool1')
        # print(x.size())
        
        x = nnF.relu(self.bn2(self.conv2(x)))
        # print('Conv2')
        # print(x.size())
        x = nnF.max_pool2d(x,kernel_size=(3,3),padding=(1,1))
        # print('Pool2')
        # print(x.size())
        
        x = nnF.relu(self.bn3(self.conv3(x)))
        # print('Conv3')
        # print(x.size())
        x = nnF.max_pool2d(x,kernel_size=(5,4))
        # print('Pool3')
        # print(x.size())
        
        # flatten
        x = x.view(-1, self.num_flat_features(x))
        x = nnF.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))

        return x

    def forward_and_convert(self, x):
        "Handles the torch<--->numpy tensor conversion, for convenience"
        x_torch = torch.FloatTensor(x)
        y_torch = self.forward(x_torch)
        return y_torch.detach().numpy()
        
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [9]:
trigger_word_detection_model = DetectionModel()
trigger_word_detection_model.load_state_dict(torch.load(".\\trigger\\simple_approach\\best_model"))

<All keys matched successfully>

## Define Audio preprocessing for both models

### 1.word and gender detector model preprocessing

In [10]:
freq = 48000
sample_rate = 48000
duration = 0.9999583333333333
max_samples=47998
kPRE_EMPHASIS_COEFF = 0.97
def pad_signal(signal, target_len):
    
    num_zeros_needed = target_len - len(signal)
    
    if num_zeros_needed > 0:

        num_zeros_front = np.random.randint(num_zeros_needed)
        num_zeros_back = num_zeros_needed - num_zeros_front
        return np.pad(signal, (num_zeros_front, num_zeros_back), mode='constant')

    else:
        return signal
    
def pre_emphasis(signal):
    first_amp = signal[0]
    all_amps_without_first = signal[1:]
    all_amps_without_last = signal[:-1]
    emphasized_signal = np.append(first_amp, all_amps_without_first - kPRE_EMPHASIS_COEFF * all_amps_without_last)
    return emphasized_signal

def pipeline(signal):
    
    emphasized_signal = pre_emphasis(signal)
    # the following code applies dft, mel filter banks, logging, dct and normalization all at once
    # truly convenient
    lifted_mfcc = librosa.feature.mfcc(
        y=emphasized_signal.astype(float), 
        sr=sample_rate, 
        n_mfcc=12, 
        dct_type=2, 
        norm='ortho', 
        lifter=22,
        n_fft = int(sample_rate * 0.025),
        hop_length= int(sample_rate * 0.01),
        power=2,
        center=False,
        window='hanning',
        n_mels=40
    )

    return lifted_mfcc

def bytescale(data, cmin=None, cmax=None, high=255, low=0):
    """
    Byte scales an array (image).

    Byte scaling means converting the input image to uint8 dtype and scaling
    the range to ``(low, high)`` (default 0-255).
    If the input image already has dtype uint8, no scaling is done.

    Parameters
    ----------
    data : ndarray
        PIL image data array.
    cmin : scalar, optional
        Bias scaling of small values. Default is ``data.min()``.
    cmax : scalar, optional
        Bias scaling of large values. Default is ``data.max()``.
    high : scalar, optional
        Scale max value to `high`.  Default is 255.
    low : scalar, optional
        Scale min value to `low`.  Default is 0.

    Returns
    -------
    img_array : uint8 ndarray
        The byte-scaled array.

    Examples
    --------
    >>> img = array([[ 91.06794177,   3.39058326,  84.4221549 ],
                     [ 73.88003259,  80.91433048,   4.88878881],
                     [ 51.53875334,  34.45808177,  27.5873488 ]])
    >>> bytescale(img)
    array([[255,   0, 236],
           [205, 225,   4],
           [140,  90,  70]], dtype=uint8)
    >>> bytescale(img, high=200, low=100)
    array([[200, 100, 192],
           [180, 188, 102],
           [155, 135, 128]], dtype=uint8)
    >>> bytescale(img, cmin=0, cmax=255)
    array([[91,  3, 84],
           [74, 81,  5],
           [52, 34, 28]], dtype=uint8)

    """
    if data.dtype == np.uint8:
        return data

    if high < low:
        raise ValueError("`high` should be larger than `low`.")

    if cmin is None:
        cmin = data.min()
    if cmax is None:
        cmax = data.max()

    cscale = cmax - cmin
    if cscale < 0:
        raise ValueError("`cmax` should be larger than `cmin`.")
    elif cscale == 0:
        cscale = 1

    scale = float(high - low) / cscale
    bytedata = (data * 1.0 - cmin) * scale + 0.4999
    bytedata[bytedata > high] = high
    bytedata[bytedata < 0] = 0
    return np.cast[np.uint8](bytedata) + np.cast[np.uint8](low)

In [11]:
def preprocess_audio(audio):
    audio = pad_signal(audio, target_len=48000)
    mfc = pipeline(audio)
    mfc_3d = resize(np.rollaxis(np.array([mfc] * 3), 0, 3), (224, 224, 3))
    mfc_3d=bytescale(mfc_3d,cmin=0,cmax=255)
    return mfc_3d

### 2. trigger word detector model preprocessing

In [12]:
def extract_melspectrogram(audio,sr=22050, win_len=0.05, hop_len=0.025, n_mels=64):
#     audio, sr = librosa.load("{}".format(filename), sr=22050)
    win_len = int(win_len*sr)
    hop_len = int(hop_len*sr)
    spec = librosa.feature.melspectrogram(audio, sr, n_mels=n_mels, n_fft=2048, win_length=win_len, hop_length=hop_len)
    return spec.transpose((1,0))

## Load case study datasets

In [13]:
audio_MNIST_male_path = ".\\dataset\\gender_recognition_data\\male\\"
audio_MNIST_female_path = ".\\dataset\\gender_recognition_data\\female\\"
speech_command_dataset_path = ".\\dataset\\speech_command_dataset\\"

In [14]:
male_number_audio_path = os.listdir(audio_MNIST_male_path)
female_number_audio_path = os.listdir(audio_MNIST_female_path)
speech_command_audio_path = os.listdir(speech_command_dataset_path)
# make path varibales point to the full path
male_number_audio_path = list(map(lambda male_path: audio_MNIST_male_path+male_path , male_number_audio_path))
female_number_audio_path = list(map(lambda female_path: audio_MNIST_female_path+female_path , female_number_audio_path))
speech_command_audio_path = list(map(lambda command_path: speech_command_dataset_path+command_path , speech_command_audio_path))

In [15]:
options = ["male saying a number","female saying a number","speech command dataset"]
audio_dict = {"male saying a number":male_number_audio_path,"female saying a number":female_number_audio_path,"speech command dataset":speech_command_audio_path}

In [16]:
# helper function to play audio
chunk = 1024  
def play_audio(audio_path):
    #open a wav format music  
    f = wave.open(audio_path,"rb") 
    p = pyaudio.PyAudio()  
    #open stream  
    stream = p.open(format = p.get_format_from_width(f.getsampwidth()),  
                    channels = f.getnchannels(),  
                    rate = f.getframerate(),  
                    output = True)  
    #read data  
    data = f.readframes(chunk)  

    #play stream  
    while data:  
        stream.write(data)  
        data = f.readframes(chunk)  

    #stop stream  
    stream.stop_stream()  
    stream.close()  

    #close PyAudio  
    p.terminate()  
    


In [22]:
# wait for activation word
instructions = True
while True:
    if instructions:
        print("I am listening, say the word 'on' to activate.")
        instructions= False
    is_triggered = False
    recording = sd.rec(int(1 * 22050), 
                   samplerate=22050, channels=1)
  
    # Record audio for the given number of seconds
    sd.wait()
    spectogram = extract_melspectrogram(recording[:,0])
    with torch.no_grad():
        pred = trigger_word_detection_model(torch.FloatTensor(spectogram[None,:]))
    if pred >=0.9:
        print()
        print("Trigger word detected !!")
        is_triggered = True
        print()
    else:
        print(".",end=" ")
    if is_triggered:
        # main loop
        for _ in range(5):
            random_choice_idx = random.randint(0, 1)
            choice = options[random_choice_idx]
            candidate_audio = audio_dict[choice]
            chosen_audio_path_idx = random.randint(0, len(candidate_audio)-1)
            chosen_audio_path = candidate_audio[chosen_audio_path_idx]
            print("---------------------------------------------------")
            time.sleep(2)#
            play_audio(chosen_audio_path)
            # model inference
            sr, signal = scipy.io.wavfile.read(chosen_audio_path)  # faster than librosa
            mfc_3d = preprocess_audio(signal)
            mfc_3d = torch.tensor(mfc_3d.reshape((3,224,224)),dtype = torch.float32)
            with torch.no_grad():
                pred = gender_number_recognition_model(mfc_3d[None,:])

            if random_choice_idx==2:
                spoken_number="Not a number"
            else:
                spoken_number = np.argmax(pred[0]).item()
            if pred[1]>=0.5:
                gender = "Female"
                gender_pred_acc = pred[1].item()*100
            else:
                gender = "Male"
                gender_pred_acc = (1-pred[1]).item()*100
            print("number : ",spoken_number)
            print("gender : {}  ".format(gender))
            instructions = True
        time.sleep(4)
        clear_output(wait=True)

I am listening, say the word 'on' to activate.
. . 
Trigger word detected !!

---------------------------------------------------
number :  3
gender : Female  
---------------------------------------------------


KeyboardInterrupt: 

In [27]:

for _ in range(10):
    random_choice_idx = 2
    choice = options[random_choice_idx]
    candidate_audio = audio_dict[choice]
    chosen_audio_path_idx = random.randint(0, len(candidate_audio)-1)
    chosen_audio_path = candidate_audio[chosen_audio_path_idx]
    print("---------------------------------------------------")
    time.sleep(2)#
    play_audio(chosen_audio_path)
    # model inference
    sr, signal = scipy.io.wavfile.read(chosen_audio_path)  # faster than librosa
    mfc_3d = preprocess_audio(signal)
    mfc_3d = torch.tensor(mfc_3d.reshape((3,224,224)),dtype = torch.float32)
    with torch.no_grad():
        pred = gender_number_recognition_model(mfc_3d[None,:])

    if random_choice_idx==2:
        spoken_number="Not a number"
    else:
        spoken_number = np.argmax(pred[0]).item()
    if pred[1]>=0.5:
        gender = "Female"
        gender_pred_acc = pred[1].item()*100
    else:
        gender = "Male"
        gender_pred_acc = (1-pred[1]).item()*100
    print("gender : {}  ".format(gender))
    instructions = True
time.sleep(4)
clear_output(wait=True)

---------------------------------------------------
gender : Female  
---------------------------------------------------
gender : Male  
---------------------------------------------------
gender : Female  
---------------------------------------------------
gender : Male  
---------------------------------------------------
gender : Female  
---------------------------------------------------
gender : Male  
---------------------------------------------------
gender : Male  
---------------------------------------------------
gender : Male  
---------------------------------------------------
gender : Female  
---------------------------------------------------
gender : Male  
