In [1]:
import numpy as np
import os
import pandas as pd
import deepspeech
import sox
import sys
import wave
import logging
import pydub 
from pydub.playback import play
import time

In [2]:
from IPython.display import clear_output

In [3]:
# TODO(MMAZ) lower() and remove punctuation, etc
# pasted from https://stackoverflow.com/a/32558749
def levenshteinDistance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-item insertions, deletions
    and substitutions needed to turn one sequence into the other. Only two
    rows of the dynamic-programming table are kept at any time.

    Args:
        s1: first sequence (typically a string).
        s2: second sequence (typically a string).

    Returns:
        The edit distance as an integer.
    """
    # Keep the shorter sequence along the row so each row stays as small as possible.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    # Row for the empty prefix of `longer`: distance equals the prefix length of `shorter`.
    previous_row = list(range(len(shorter) + 1))
    for row_ix, longer_item in enumerate(longer, start=1):
        current_row = [row_ix]
        for col_ix, shorter_item in enumerate(shorter):
            if shorter_item == longer_item:
                # Matching items cost nothing beyond the diagonal neighbour.
                cost = previous_row[col_ix]
            else:
                # Substitution, deletion or insertion, whichever is cheapest.
                cost = 1 + min(
                    previous_row[col_ix], previous_row[col_ix + 1], current_row[-1]
                )
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

In [4]:
# Load the DeepSpeech model once from a path relative to the notebook's working
# directory; the inference cells below all reuse this single `model` instance.
model = deepspeech.Model("../deepspeech/deepspeech-0.7.1-models.pbmm")

In [5]:
def deepspeech_inference(
    dataset_ix, df, clips_path, model, tmp_wav_filepath="./tmp/tmp.wav", rate=16000
):
    """Transcribe one clip of the dataset with DeepSpeech.

    The clip is converted with sox to a WAV file at ``rate`` Hz, written to
    ``tmp_wav_filepath`` (overwritten on every call), and the decoded samples
    are passed to the model.

    Args:
        dataset_ix: positional row index into ``df`` (used with ``df.iloc``).
        df: DataFrame with a ``path`` column holding the clip's filename.
        clips_path: directory prefix. It is joined to the filename by plain
            string concatenation, so it must end with a path separator.
        model: object exposing ``sttWithMetadata(audio, num_results)``.
        tmp_wav_filepath: scratch location for the converted WAV file.
        rate: sample rate (Hz) requested for the converted WAV file.

    Returns:
        ``(mp3_path, metadata)`` where ``metadata`` is whatever
        ``model.sttWithMetadata(audio, 1)`` returns.
    """
    mp3_path = clips_path + df.iloc[dataset_ix].path
    # convert to wav (will overwrite)
    transformer = sox.Transformer()
    transformer.convert(samplerate=rate)
    transformer.build(mp3_path, tmp_wav_filepath)
    # Read every frame, then release the file handle right away so repeated
    # calls do not accumulate open descriptors on the scratch file.
    # NOTE(review): frames are interpreted as int16 samples; this assumes sox
    # wrote 16-bit PCM -- confirm against the sox defaults in use.
    with wave.open(tmp_wav_filepath, "rb") as wav:
        audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
    # choose the likeliest inference
    return mp3_path, model.sttWithMetadata(audio, 1)


def metadata_to_string(metadata):
    """Concatenate the ``text`` of every token in ``metadata.tokens`` into one string."""
    characters = [token.text for token in metadata.tokens]
    return "".join(characters)


def listen_to_split_on_boundaries(inference, clip, extent_ms=0):
    """Play a clip word by word, using the token timings of an inference.

    Words are delimited by the space tokens of the first transcript. Each word
    except the last is played from the space preceding it up to the space
    following it, with a one-second pause afterwards; the last word is played
    from its first token to the end of the clip.

    Args:
        inference: object whose ``transcripts[0].tokens`` is a sequence of
            tokens with ``text`` and ``start_time`` (seconds) attributes.
        clip: audio object sliceable by milliseconds and accepted by ``play``.
        extent_ms: extra milliseconds of audio to include on both sides of
            every word.

    Returns:
        None. The function only prints and plays audio.
    """
    transcript = inference.transcripts[0]
    tokens = transcript.tokens
    if len(tokens) == 0:
        # Nothing was recognised, so there is no timing information to split on.
        logging.warning("deepspeech.stt returned an empty inference")
        return

    spaces = [ix for ix, token in enumerate(tokens) if token.text == " "]
    word_boundaries = [0] + [ix + 1 for ix in spaces]
    # A trailing space puts the final boundary at len(tokens), one past the
    # last valid token index, hence ">=" rather than ">".
    if word_boundaries[-1] >= len(tokens):
        logging.warning("deepspeech.stt returned an inference ending in a space")
        word_boundaries = word_boundaries[:-1]

    text = metadata_to_string(transcript)
    last_start = word_boundaries[-1]

    # everything but the last word
    for ix, (start_token, end_token) in enumerate(zip(word_boundaries[:-1], spaces)):
        # end_token indexes the space that follows the current word
        print("word", ix, ": ", text[start_token:end_token])
        # include the space before the word boundary
        start_token = max(0, start_token - 1)
        # don't include audio before the start
        start_ms = max(0, tokens[start_token].start_time * 1000 - extent_ms)
        end_ms = tokens[end_token].start_time * 1000 + extent_ms
        play(clip[start_ms:end_ms])
        time.sleep(1)

    # the last word: sliced from its own boundary so it works for single-word
    # transcripts too, and stripped of any trailing space
    print("word", len(word_boundaries) - 1, ": ", text[last_start:].rstrip(" "))
    start_ms = max(0, tokens[last_start].start_time * 1000 - extent_ms)
    play(clip[start_ms:])


def listen_to_sample(df, ix, clips_path, model):
    """Print a groundtruth/inference comparison for one clip and play it.

    Args:
        df: DataFrame with ``sentence`` and ``path`` columns.
        ix: positional row index of the sample in ``df``.
        clips_path: directory prefix of the mp3 clips.
        model: DeepSpeech model handed to ``deepspeech_inference``.

    Returns:
        ``(inference, clip)``: the metadata returned by the model and the
        decoded audio segment of the original mp3.
    """
    print("Sample Index:", ix)
    reference = df.iloc[ix].sentence
    print("Groundtruth:", reference)
    # Word count is approximated as the number of space-separated pieces.
    print("Groundtruth # words:", len(reference.split(" ")))

    mp3_path, inference = deepspeech_inference(ix, df, clips_path, model)
    hypothesis = metadata_to_string(inference.transcripts[0])
    print("Inference:", hypothesis)
    print("Inference # words", len(hypothesis.split(" ")))
    print("Levenshtein Distance:", levenshteinDistance(reference, hypothesis))

    # Play the untouched mp3 rather than the converted scratch WAV.
    clip = pydub.AudioSegment.from_mp3(mp3_path)
    play(clip)
    return inference, clip


In [6]:
# Directory of the Common Voice mp3 clips. The trailing slash matters: clip
# filenames are appended to it with plain string concatenation.
clips_path = "../mozilla_common_voice/clips/" # english corpus
# train.tsv is tab-separated; the cells below read its `path` and `sentence` columns.
df = pd.read_csv("../mozilla_common_voice/train.tsv", sep="\t")
print("number of mp3s in training set: ", df.shape[0])

number of mp3s in training set:  232975


Listen to an example clip:

In [7]:
# Pick one row by position and show its transcript.
# NOTE(review): 3829 in the trailing comment looks like an alternative sample index -- confirm.
ix = 194633 #3829
print(ix, '---', df.iloc[ix].sentence)
mp3_path = clips_path + df.iloc[ix].path
# Left as the cell's last expression so the notebook displays the decoded clip.
pydub.AudioSegment.from_mp3(mp3_path)

194633 --- All work was done by robots.


In [None]:
# Audition randomly chosen clips: play the whole clip, then each word on its own.
# No random seed is set in the visible cells, so every run picks different samples.
for example in range(1):
    print("Example", example)
    # Draw a positional row index in [0, number of rows).
    sample = np.random.randint(df.shape[0])
    inference, clip = listen_to_sample(df=df, ix=sample, clips_path=clips_path, model=model)
    listen_to_split_on_boundaries(inference, clip)
    print(inference)
    # Short pause before the output of this example is cleared.
    time.sleep(1)
    clear_output(wait=True)

## Extracting keywords for the micro dataset

In [6]:
def find_keyword(inference, keyword):
    """Locate the first occurrence of ``keyword`` in an inference.

    Words are delimited by the space tokens of the first transcript and are
    compared to ``keyword`` by exact string equality.

    Args:
        inference: object whose ``transcripts[0].tokens`` is a sequence of
            tokens with ``text`` and ``start_time`` (seconds) attributes.
        keyword: the word to look for.

    Returns:
        ``(start, end)`` in seconds, or ``None`` when the keyword is absent.
        ``start`` is the start time of the token just before the word (the
        preceding space, when there is one). ``end`` is the start time of the
        space following the word, or of the last token when the word ends the
        transcript.
    """
    transcript = inference.transcripts[0]
    tokens = transcript.tokens
    if len(tokens) == 0:
        # Nothing was recognised, so the keyword cannot be present.
        return None

    spaces = [ix for ix, token in enumerate(tokens) if token.text == " "]
    word_boundaries = [0] + [ix + 1 for ix in spaces]
    # A trailing space (seen occasionally when testing with 0.7.0) puts the
    # final boundary at len(tokens); drop it so no empty word is produced.
    if word_boundaries[-1] >= len(tokens):
        word_boundaries = word_boundaries[:-1]

    text = metadata_to_string(transcript)

    # Sentinel end index so the last word is covered as well.
    word_ends = spaces + [len(tokens)]
    for start_token, end_token in zip(word_boundaries, word_ends):
        # note: end_token actually indexes the space after the current word
        # TODO(MMAZ) note this only extracts the first word for now
        # i.e., "tap up, then up, then down" will only return the first "up"
        if text[start_token:end_token] != keyword:
            continue

        # include the space before the word boundary:
        start_token = max(0, start_token - 1)
        # and the space after the word boundary (clamped to the last token):
        end_token = min(len(tokens) - 1, end_token)
        return tokens[start_token].start_time, tokens[end_token].start_time
    return None

In [7]:
parent_dir = "two_word_dataset"

log_location = f"{parent_dir}/logs/deepspeech.log"
# FileHandler opens the log file as soon as it is constructed and fails when
# the directory is missing, so create it first (no-op if it already exists).
os.makedirs(os.path.dirname(log_location), exist_ok=True)
# Send INFO and above to both the log file and the notebook output.
# Note: basicConfig leaves the root logger untouched if it already has handlers.
logging.basicConfig(
    level=logging.INFO, 
    format='[%(asctime)s ::: %(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(filename=log_location),
        logging.StreamHandler(sys.stdout)
    ]
)

In [8]:
def extract_one_second(duration_s: float, start_s: float, end_s: float):
    """Choose a one-second window centred on the span ``[start_s, end_s]``.

    Args:
        duration_s: total length of the clip, in seconds.
        start_s: start of the span of interest, in seconds.
        end_s: end of the span of interest, in seconds.

    Returns:
        ``(window_start, window_end)`` in seconds. Clips shorter than one
        second are returned whole as ``(0, duration_s)``. Otherwise the window
        is one second long and shifted as needed to stay inside the clip.
    """
    # Too short to cut a full second out of: keep everything.
    if duration_s < 1:
        return (0, duration_s)

    midpoint_s = start_s + ((end_s - start_s) / 2.0)
    window_start_s, window_end_s = midpoint_s - 0.5, midpoint_s + 0.5

    # Window runs past the end of the clip: slide it back.
    if window_end_s > duration_s:
        window_start_s, window_end_s = duration_s - 1.0, duration_s

    # Window starts before the clip does: slide it forward.
    if window_start_s < 0:
        window_start_s = 0
        window_end_s = np.minimum(duration_s, window_start_s + 1.0)

    return (window_start_s, window_end_s)

In [None]:
keyword = "up"
# NOTE: this rebinds `df`, replacing the frame loaded from train.tsv earlier in
# the notebook; from here on `df` holds the rows of the per-keyword TSV.
df = pd.read_csv(f"./{parent_dir}/{keyword}.tsv", sep="\t")
logging.info(f"{keyword}.tsv # of examples: {df.shape[0]}")

tmp_wav_filepath = "./tmp/tmp.wav"

# Make sure the scratch and output directories exist before sox writes to them.
os.makedirs(os.path.dirname(tmp_wav_filepath), exist_ok=True)
os.makedirs(f"./{parent_dir}/extractions_deepspeech/{keyword}", exist_ok=True)

for ix, (_, r) in enumerate(df.iterrows()):
    if ix % 10 == 0:
        print("progress", ix)
    mp3_path = f"./{parent_dir}/clips/{keyword}/" + r.path
    
    # convert to wav (will overwrite)
    transformer = sox.Transformer()
    transformer.convert(samplerate=16000)
    if os.path.exists(tmp_wav_filepath):
        os.remove(tmp_wav_filepath)
    transformer.build(mp3_path, tmp_wav_filepath)
    # Close the handle once the frames are read: the scratch file is deleted
    # and rewritten on the next iteration, and an open handle would leak.
    with wave.open(tmp_wav_filepath, "rb") as wav:
        audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
    
    # choose the likeliest inference
    inference = model.sttWithMetadata(audio, 1)
    
    text = metadata_to_string(inference.transcripts[0])
    result = find_keyword(inference, keyword)
    if result is None:
        logging.warning(f"keyword {keyword} not found in inference for sentence {ix}:\n - {text}")
        continue
        
    start_s, end_s = result
    if end_s - start_s > 1:
        logging.warning(f"clip ix: {ix} timestamps are longer than one second")
    duration = sox.file_info.duration(tmp_wav_filepath)
    if duration < 1:
        logging.warning(f"clip ix {ix} shorter than 1s")
    
    # Widen or narrow the keyword span to a one-second window inside the clip.
    start_s, end_s = extract_one_second(duration, start_s, end_s)
        
    # to listen:
    #clip = pydub.AudioSegment.from_wav(tmp_wav_filepath)
    #play(clip)
    #extraction = clip[start_s * 1000:end_s * 1000]
    #play(extraction)
    
    transformer = sox.Transformer()
    transformer.trim(start_s, end_s)
    transformer.fade(fade_in_len=0.1, fade_out_len=0.1)
    # The output keeps the source filename and appends ".wav" to it.
    dest = f"./{parent_dir}/extractions_deepspeech/{keyword}/{r.path}.wav"
    transformer.build(tmp_wav_filepath, dest)
    # run 'soxi' on the output wav to inspect encoding parameters
    
    # listen to the processed audio:
    #play(pydub.AudioSegment.from_wav(dest))
    