Install dependencies and the multilingual_kws module

In [None]:
!apt-get -qq install -y sox ffmpeg && pip install --quiet sox
!git clone --quiet https://github.com/harvard-edge/multilingual_kws
import sys
sys.path.append("/content/multilingual_kws/")
import pickle

Import tools for streaming keyword search:

In [None]:
from multilingual_kws.embedding import batch_streaming_analysis as sa

Upload an MP3 of a streaming audio file:

In [None]:
# Prompt for a file upload through Colab. The conversion step that follows reads
# mask_stream.mp3 from the working directory, so upload the recording under that name.
from google.colab import files
uploaded = files.upload()

Saving mask_stream.mp3 to mask_stream.mp3


Convert MP3 to WAV:

In [None]:
# Re-encode the MP3 as a mono (-ac 1), 16 kHz (-ar 16000), 16-bit PCM (pcm_s16le) WAV.
# -y overwrites an existing mask_stream.wav without prompting.
!ffmpeg -hide_banner -loglevel error -y -i mask_stream.mp3 -acodec pcm_s16le -ac 1 -ar 16000 mask_stream.wav

Upload the KWS model archive (`masiki_model.zip`) to the working directory, then unzip it:

In [None]:
!unzip -q masiki_model.zip

We don't have ground-truth knowledge of when keywords occur in the streaming audio file, so provide an empty file as the ground-truth input:

In [None]:
# Create an empty placeholder file in the working directory (existing content is left untouched).
!touch empty.txt

Configure the KWS model with:
* Detection threshold
* WAV file location
* Keyword to use as a label
* Filenames to save results to

In [None]:
# Keyword to search for; reused as the label in both configuration objects below.
target_word = "masiki"

# Settings for analysing the single converted recording.
flags = sa.StreamFlags(
    target_keyword=target_word,
    wav="/content/mask_stream.wav",
    ground_truth="/content/empty.txt",  # empty file: no labelled keyword timings available
    detection_thresholds=[0.9],
    average_window_duration_ms=100,
    suppression_ms=500,
    time_tolerance_ms=750,  # only used when graphing
)

# Model location, the result destinations, and the stream settings to evaluate.
streamtarget = sa.StreamTarget(
    target_word=target_word,
    target_lang="luganda",
    model_path="/content/xfer_epochs_4_bs_64_nbs_2_val_acc_1.00_target_mask/",
    stream_flags=[flags],
    destination_result_pkl="/content/results.pkl",
    destination_result_inferences="/content/inferences.npy",
)

Perform inference:

In [None]:
# Run the streaming evaluation configured in `streamtarget`.
# NOTE(review): presumably this writes the result files named in `streamtarget`
# (the pickle is read back in the next step) — confirm against the library.
sa.eval_stream_test(streamtarget)

Inspect the inference results:

In [None]:
# Read back the pickled results. Unpickling can execute arbitrary code, so only
# load a file that this notebook itself produced.
results_path = "/content/results.pkl"
with open(results_path, mode="rb") as results_file:
    results = pickle.load(results_file)

In [None]:
# Look the keyword up via `target_word` (set in the configuration cell) instead of
# repeating the literal, so this lookup follows a change of keyword.
# Index path: keyword -> first entry -> element 1 -> threshold 0.9 -> element 0.
# NOTE(review): 0.9 must match the value given in detection_thresholds.
detections = results[target_word][0][1][0.9][0]

In [None]:
# Number of entries selected for the 0.9 threshold.
len(detections)

35

In [None]:
# Second field of each detection; the extraction step treats it as a time in milliseconds.
[time_ms for _, time_ms in detections]

[9060,
 87680,
 145460,
 168460,
 193620,
 240960,
 244680,
 371240,
 408680,
 429280,
 432020,
 469280,
 471240,
 488600,
 557360,
 643200,
 714680,
 767480,
 799340,
 811720,
 855500,
 863180,
 865300,
 873320,
 924360,
 1030080,
 1106680,
 1199280,
 1200600,
 1228260,
 1243620,
 1310240,
 1321400,
 1323880,
 1386420]

In [None]:
import sox
import os
from pathlib import Path

# Directory that receives one short WAV clip per detection.
extractions = Path("/content/extractions")
# exist_ok lets the cell be re-run without raising FileExistsError.
os.makedirs(extractions, exist_ok=True)

for ix, (_, time_ms) in enumerate(detections):
    dest_wav = str(
        extractions
        / f"{ix:03d}_{target_word}_detection_{time_ms}ms.wav"
    )
    print(dest_wav)
    time_s = time_ms / 1000.0

    # Cut a window of one second either side of the detection. The start is
    # clamped at 0 because a detection within the first second would otherwise
    # request a negative trim offset, which sox rejects.
    transformer = sox.Transformer()
    transformer.convert(samplerate=16000)
    transformer.trim(max(0.0, time_s - 1), time_s + 1)
    transformer.build("/content/mask_stream.wav", dest_wav)

/content/extractions/000_masiki_detection_9060ms.wav
/content/extractions/001_masiki_detection_87680ms.wav
/content/extractions/002_masiki_detection_145460ms.wav
/content/extractions/003_masiki_detection_168460ms.wav
/content/extractions/004_masiki_detection_193620ms.wav
/content/extractions/005_masiki_detection_240960ms.wav
/content/extractions/006_masiki_detection_244680ms.wav
/content/extractions/007_masiki_detection_371240ms.wav
/content/extractions/008_masiki_detection_408680ms.wav
/content/extractions/009_masiki_detection_429280ms.wav
/content/extractions/010_masiki_detection_432020ms.wav
/content/extractions/011_masiki_detection_469280ms.wav
/content/extractions/012_masiki_detection_471240ms.wav
/content/extractions/013_masiki_detection_488600ms.wav
/content/extractions/014_masiki_detection_557360ms.wav
/content/extractions/015_masiki_detection_643200ms.wav
/content/extractions/016_masiki_detection_714680ms.wav
/content/extractions/017_masiki_detection_767480ms.wav
/content/extr

In [None]:
import IPython.display as ipd
import numpy as np

In [None]:
# listen to random extraction
wavs = os.listdir(extractions)
w = extractions / np.random.choice(wavs)
ipd.Audio(filename=w)

In [None]:
# Recursively (-r) archive the extractions directory into extractions.zip in the working directory.
!zip -r extractions.zip extractions

  adding: extractions/ (stored 0%)
  adding: extractions/000_masiki_detection_9060ms.wav (deflated 6%)
  adding: extractions/021_masiki_detection_863180ms.wav (deflated 9%)
  adding: extractions/018_masiki_detection_799340ms.wav (deflated 14%)
  adding: extractions/014_masiki_detection_557360ms.wav (deflated 6%)
  adding: extractions/023_masiki_detection_873320ms.wav (deflated 6%)
  adding: extractions/013_masiki_detection_488600ms.wav (deflated 7%)
  adding: extractions/010_masiki_detection_432020ms.wav (deflated 3%)
  adding: extractions/006_masiki_detection_244680ms.wav (deflated 8%)
  adding: extractions/019_masiki_detection_811720ms.wav (deflated 6%)
  adding: extractions/033_masiki_detection_1323880ms.wav (deflated 8%)
  adding: extractions/034_masiki_detection_1386420ms.wav (deflated 4%)
  adding: extractions/028_masiki_detection_1200600ms.wav (deflated 6%)
  adding: extractions/015_masiki_detection_643200ms.wav (deflated 4%)
  adding: extractions/017_masiki_detection_767480ms.w