# Keyword spotting with CLAP-IPA
This example demonstrates how to search for phonetic keywords in speech recordings.

## Load CLAP-IPA

CLAP-IPA has two components: 
1. The speech encoder, for the recording you want to search (e.g., `sample.wav`).
2. The phone encoder, for the phone sequence you want to search for (e.g., [bo:do:]).

In [None]:
import torch
from clap.encoders import SpeechEncoder, PhoneEncoder

# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the pretrained CLAP-IPA checkpoints, one encoder per modality.
phone_encoder = PhoneEncoder.from_pretrained('anyspeech/clap-ipa-tiny-phone')
speech_encoder = SpeechEncoder.from_pretrained('anyspeech/clap-ipa-tiny-speech')

# Switch both encoders to eval mode and move them onto the chosen device.
phone_encoder.eval().to(device)
speech_encoder.eval().to(device)

## Load example recording
The transcript is something like:

['m a m o n a k ɯ b o ː d o ː t ɕ i ɴ a t s ɯ j o ː n o s o ː b i o t ɕ a k ɯ j o ː ɕ i t a k e ː k a ɴ t a t ɕ i ɡ a ɕ ɯ ː j o ː ɕ o n i h a i ɾ i s a i ɾ ɯ i ɡ a s ɯ d e d ʑ ɯ k e ː ɕ a o o i t s ɯ m e m a ɕ i t a']

In [75]:
import numpy as np
from IPython.display import Audio
import librosa

# Load the example recording, requesting a 16 kHz sample rate from librosa.
sample_rate_hz = 16000
audio, rate = librosa.load('sample.wav', sr=sample_rate_hz)

# Show an inline player for the whole recording.
Audio(data=audio, rate=rate)

## Specify the keyword 
We will search for the keyword "[bo:do:]".

In [78]:
from transformers import AutoProcessor, DebertaV2Tokenizer
# Encode the target phone sequence
tokenizer = DebertaV2Tokenizer.from_pretrained('charsiu/IPATokenizer')
target_phones = "boːdoː"
# Request only what the encoder consumes. `return_length` is left at its
# default (off), so there is no extra 'length' key that would have to be
# deleted before the tokens are unpacked into the encoder call.
transcript_tokens = tokenizer(
    target_phones,
    return_attention_mask=False,
    return_token_type_ids=False,
    add_special_tokens=False,
    return_tensors='pt',
)
# The encoder was moved to `device`, so its inputs must be moved there too;
# otherwise the forward pass fails with a device mismatch on a CUDA machine.
transcript_tokens = transcript_tokens.to(device)
# Inference only: skip autograd tracking so the embedding carries no graph.
with torch.no_grad():
    phone_embed = phone_encoder(**transcript_tokens)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.8022,  0.5485,  0.2948,  ...,  0.5279, -0.2149,  0.9901],
         [-1.1010,  0.5296,  0.4064,  ...,  0.2729,  0.0705,  1.2811],
         [-1.1074,  0.4180, -0.3678,  ...,  0.2447,  0.3261,  1.0456],
         [-0.3884, -0.3715, -0.4631,  ...,  0.7947,  0.2837,  0.6106],
         [-0.9363,  0.1004,  0.1066,  ...,  0.6031,  0.5088,  0.5038],
         [-1.2608,  0.5247, -0.1958,  ...,  0.5059,  0.4659,  0.6550]]],
       grad_fn=<ViewBackward0>), pooler_output=tensor([[-0.9327,  0.2916, -0.0365,  0.1230,  0.3921, -0.5966,  1.5406,  0.5430,
          1.1329, -0.1795, -0.6738,  0.9080, -0.3948, -0.0933, -0.2591, -1.1836,
          0.7762, -0.1050,  0.7797,  0.2967, -0.4467, -1.4112, -1.4448, -0.0995,
         -0.5192,  1.1349,  0.7155, -0.2648, -0.5750,  1.2773,  0.1291, -0.6377,
         -0.7608,  1.8740, -0.4377, -0.1438, -1.1061, -0.2840,  0.0855, -0.0306,
         -0.1570,  0.3947,  0.2253,  0.5078, -0.3367,  0.

## Slice up the recording 
We know that the phone sequence we're looking for lasts about 0.5 seconds, so we will slice the recording into 0.5-second clips.

In [79]:
# Split into non-overlapping 0.5 second clips
clip_len = 8000  # samples per clip: 0.5 s at the 16 kHz rate used for playback below
audio_clips = np.array([
    audio[start:start + clip_len]
    # Stop at the last offset that still leaves a full clip. The "+ 1" keeps a
    # final clip of exactly `clip_len` samples; a shorter remainder is dropped
    # so that every row has the same length.
    for start in range(0, len(audio) - clip_len + 1, clip_len)
])
Audio(audio_clips[5], rate=16000) # contains the substring '[bo:do:]'

In [80]:
audio_clips.shape # (number of clips, samples per clip): 25 clips of 8000 samples, i.e. 0.5 seconds each at 16 kHz

(25, 8000)

In [81]:
# encode the speech
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained('openai/whisper-base')
audio_encoded = processor(audio_clips, sampling_rate=16000, return_tensors="pt", return_attention_mask=True, padding=True)
# The speech encoder was moved to `device`, so its inputs must be moved there
# too; otherwise the forward pass fails with a device mismatch on a CUDA machine.
audio_encoded = audio_encoded.to(device)
# Inference only: skip autograd tracking so the embeddings carry no graph.
with torch.no_grad():
    speech_features = speech_encoder(**audio_encoded, return_dict=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [87]:
speech_features.pooler_output.shape # one pooled vector per audio clip: 25 clips, each a 384-dimensional embedding

torch.Size([25, 384])

In [88]:
import torch.nn.functional as F
import numpy as np
from librosa.sequence import dtw

def cos_sim(a, b):
    """Return the cosine similarity between vectors `a` and `b`.

    The result is the dot product of the inputs divided by the product of
    their Euclidean norms. No guard is applied for zero-norm inputs, which
    lead to a division by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# Convert both embeddings to NumPy once, up front. `.detach()` is required
# because `torch.no_grad()` only affects operations run inside it: tensors that
# were produced earlier with autograd enabled still require grad, and calling
# `.numpy()` on such a tensor raises a RuntimeError.
phone_vec = phone_embed.pooler_output.detach().flatten().cpu().numpy()
speech_vecs = speech_features.pooler_output.detach().cpu().numpy()

# One cosine score per audio clip, in clip order.
similarities = [cos_sim(clip_vec, phone_vec) for clip_vec in speech_vecs]


In [92]:
print(similarities)
best_clip = np.argmax(similarities)  # position of the highest similarity score
print("Most similar clip index:", best_clip)
# Play the most similar clip, which will contain the target string [bo:do:]
Audio(audio_clips[best_clip], rate=16000)

[-0.02053195, -0.14307617, -0.1187448, -0.17794196, 0.25669512, 0.62581825, -0.1602566, 0.117031164, 0.12811708, -0.10994876, -0.08763444, -0.117799, -0.21920726, 0.09830929, -0.2230632, 0.032529112, -0.014139796, 0.05216112, -0.03391095, 0.033862904, 0.17658415, -0.08485473, -0.22108813, 0.09134472, 0.15925027]
Most similar clip index: 5
