In [1]:
import numpy as np
import os
import pandas as pd
import deepspeech
import sox
import sys
import wave
import logging
import pydub 
from pydub.playback import play
import time

In [3]:
from IPython.display import clear_output

In [2]:
# TODO(MMAZ) lower() and remove punctuation, etc
# pasted from https://stackoverflow.com/a/32558749
def levenshteinDistance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

In [4]:
model = deepspeech.Model("../deepspeech/deepspeech-0.7.1-models.pbmm")

In [5]:
def deepspeech_inference(
    dataset_ix, df, clips_path, model, tmp_wav_filepath="./tmp/tmp.wav", rate=16000
):
    mp3_path = clips_path + df.iloc[dataset_ix].path
    # convert to wav (will overwrite)
    transformer = sox.Transformer()
    transformer.convert(samplerate=rate)
    transformer.build(mp3_path, tmp_wav_filepath)
    wav = wave.open(tmp_wav_filepath, "rb")
    audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
    # choose the likeliest inference
    return mp3_path, model.sttWithMetadata(audio, 1)


def metadata_to_string(metadata):
    return "".join(token.text for token in metadata.tokens)


def listen_to_split_on_boundaries(inference, clip, extent_ms=0):
    tokens = inference.transcripts[0].tokens
    spaces = [ix for ix, token in enumerate(tokens) if token.text == " "]
    word_boundaries = [0] + list(map(lambda ix: ix + 1, spaces))
    if word_boundaries[-1] > len(tokens):
        logging.warning("deepspeech.stt returned an inference ending in a space")
        word_boundaries = word_boundaries[:-1]

    text = metadata_to_string(inference.transcripts[0])

    # everything but the last word
    # TODO(MMAZ) look for bugs/can this be improved?
    ix = 0
    for start_token, end_token in zip(word_boundaries, spaces):
        print("word", ix, ": ", text[start_token:end_token])
        ix += 1
        # include the space before the word boundary
        start_token = np.maximum(0, start_token - 1)
        start_ms = tokens[start_token].start_time * 1000 - extent_ms
        # don't include audio before the start
        start_ms = np.maximum(0, start_ms)
        end_ms = tokens[end_token].start_time * 1000 + extent_ms
        play(clip[start_ms:end_ms])
        time.sleep(1)

    # the last word
    print("word", ix, ": ", text[end_token:])
    start_ms = tokens[word_boundaries[-1]].start_time * 1000 - extent_ms
    start_ms = np.maximum(0, start_ms)
    play(clip[start_ms:])


def listen_to_sample(df, ix, clips_path, model):
    print("Sample Index:", ix)
    gt = df.iloc[ix].sentence
    print("Groundtruth:", gt)
    gt_wc = gt.count(" ") + 1
    print("Groundtruth # words:", gt_wc)

    mp3_path, inference = deepspeech_inference(ix, df, clips_path, model)
    text = metadata_to_string(inference.transcripts[0])
    print("Inference:", text)
    text_wc = text.count(" ") + 1
    print("Inference # words", text_wc)
    print("Levenshtein Distance:", levenshteinDistance(gt, text))
    clip = pydub.AudioSegment.from_mp3(mp3_path)
    play(clip)
    return inference, clip


In [6]:
clips_path = "../mozilla_common_voice/clips/" # english corpus
df = pd.read_csv("../mozilla_common_voice/train.tsv", sep="\t")
print("number of mp3s in training set: ", df.shape[0])

number of mp3s in training set:  232975


Listen to an example clip:

In [7]:
ix = 194633 #3829
print(ix, '---', df.iloc[ix].sentence)
mp3_path = clips_path + df.iloc[ix].path
pydub.AudioSegment.from_mp3(mp3_path)

194633 --- All work was done by robots.


In [None]:
for example in range(1):
    print("Example", example)
    sample = np.random.randint(df.shape[0])
    inference, clip = listen_to_sample(df=df, ix=sample, clips_path=clips_path, model=model)
    listen_to_split_on_boundaries(inference, clip)
    print(inference)
    time.sleep(1)
    clear_output(wait=True)

## Extracting keywords for the micro dataset

In [6]:
def find_keyword(inference, keyword):
    """returns either (start,end) in seconds, or None"""
    tokens = inference.transcripts[0].tokens
    spaces = [ix for ix, token in enumerate(tokens) if token.text == " "]
    word_boundaries = [0] + list(map(lambda ix: ix + 1, spaces))
    if word_boundaries[-1] > len(tokens):
        # this happens occasionally (when testing with 0.7.0)
        #logging.warning("deepspeech.stt returned an inference ending in a space")
        word_boundaries = word_boundaries[:-1]

    text = metadata_to_string(inference.transcripts[0])
    
    spaces = spaces + [len(tokens)]
    for start_token, end_token in zip(word_boundaries, spaces):
        # note: end_token actually indexes the space after the current word
        word = text[start_token:end_token]
        
        # TODO(MMAZ) note this only extracts the first word for now
        # i.e., "tap up, then up, then down" will only return the first "up"
        if word == keyword:
            # include the space before the word boundary:
            start_token = np.maximum(0, start_token - 1)
            # and the space after the word boundary:
            end_token = np.minimum(len(tokens) - 1, end_token)

            return tokens[start_token].start_time, tokens[end_token].start_time
    return None

In [7]:
parent_dir = "eleven_word_dataset"

log_location = f"{parent_dir}/logs/deepspeech.log"
logging.basicConfig(
    level=logging.INFO, 
    format='[%(asctime)s ::: %(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(filename=log_location),
        logging.StreamHandler(sys.stdout)
    ]
)

In [8]:
def extract_one_second(duration_s: float, start_s: float, end_s: float):
    """
    return one second around the midpoint between start_s and end_s
    """
    if duration_s < 1:
        return (0, duration_s)

    center_s = start_s + ((end_s - start_s) / 2.0)

    new_start_s = center_s - 0.5
    new_end_s = center_s + 0.5

    if new_end_s > duration_s:
        new_end_s = duration_s
        new_start_s = duration_s - 1.0

    if new_start_s < 0:
        new_start_s = 0
        new_end_s = np.minimum(duration_s, new_start_s + 1.0)

#     print(
#         "start",
#         new_start_s,
#         "end",
#         new_end_s,
#         "\nduration",
#         new_end_s - new_start_s,
#         "midpoint",
#         new_start_s + ((new_end_s - new_start_s) / 2.0),
#     )
    return (new_start_s, new_end_s)

In [9]:
#keywords_set = ["up", "down", "three", "yes", "no", "left", "right", "on", "off", "stop", "go"]
# processed up/down/three in an earlier batch
keywords_set = ["yes", "no", "left", "right", "on", "off", "stop", "go"]

for keyword in keywords_set:
    # keep track of current keyword in separate file
    with open("./tmp/current.txt", 'a') as fh:
        fh.write(f"{keyword}\n")
        
    df = pd.read_csv(f"./{parent_dir}/{keyword}.tsv", sep="\t")
    logging.info(f"{keyword}.tsv # of examples: {df.shape[0]}")

    tmp_wav_filepath = "./tmp/tmp.wav"

    for ix, (_, r) in enumerate(df.iterrows()):
        if ix % 100 == 0:
            with open("./tmp/current.txt", 'a') as fh:
                fh.write(f"{ix}\n")
        mp3_path = f"./{parent_dir}/clips/{keyword}/" + r.path

        # convert to wav (will overwrite)
        transformer = sox.Transformer()
        transformer.convert(samplerate=16000)
        if os.path.exists(tmp_wav_filepath):
            os.remove(tmp_wav_filepath)
        transformer.build(mp3_path, tmp_wav_filepath)
        wav = wave.open(tmp_wav_filepath, "rb")
        audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)

        # choose the likeliest inference
        inference = model.sttWithMetadata(audio, 1)

        text = metadata_to_string(inference.transcripts[0])
        result = find_keyword(inference, keyword)
        if result is None:
            logging.warning(f"keyword {keyword} not found in inference for sentence {ix}:\n - {text}")
            continue

        start_s, end_s = result
        if end_s - start_s > 1:
            logging.warn(f"clip ix: {ix} timestamps are longer than one second")
        duration = sox.file_info.duration(tmp_wav_filepath)
        if duration < 1:
            logging.warning(f"clip ix {ix} shorter than 1s")

        start_s, end_s = extract_one_second(duration, start_s, end_s)

        # to listen:
        #clip = pydub.AudioSegment.from_wav(tmp_wav_filepath)
        #play(clip)
        #extraction = clip[start_s * 1000:end_s * 1000]
        #play(extraction)

        transformer = sox.Transformer()
        transformer.trim(start_s, end_s)
        transformer.fade(fade_in_len=0.1, fade_out_len=0.1)
        dest = f"./{parent_dir}/extractions_deepspeech/{keyword}/{r.path}.wav"
        transformer.build(tmp_wav_filepath, dest)
        # run 'soxi' on the output wav to inspect encoding parameters

        # listen to the processed audio:
        #play(pydub.AudioSegment.from_wav(dest))

[2020-06-01 15:53:21,379 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/left/common_voice_en_39141.mp3.wav trim 1.040000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 15:53:21,390 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/left/common_voice_en_39141.mp3.wav with effects: trim fade
[2020-06-01 15:53:21,391 ::: INFO] [SoX] 
[2020-06-01 15:53:21,392 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/left/common_voice_en_17921986.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 15:53:21,408 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 15:53:21,409 ::: INFO] [SoX] 
 - i fel a shop painted by lext arm
[2020-06-01 15:53:22,887 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/left/common_voice_en_180179.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 15:53:22,903 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 15:53:22,904 ::: INFO] [SoX] 
[2020-06-01

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 15:54:06,021 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/left/common_voice_en_195164.mp3.wav trim 1.790000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 15:54:06,032 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/left/common_voice_en_195164.mp3.wav with effects: trim fade
[2020-06-01 15:54:06,033 ::: INFO] [SoX] 
[2020-06-01 15:54:06,034 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/left/common_voice_en_14827.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 15:54:06,050 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 15:54:06,051 ::: INFO] [SoX] 
[2020-06-01 15:54:07,778 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/left/common_voice_en_14827.mp3.wav trim 1.640000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 15:54:07,789 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/left/com

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 16:06:26,256 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_131673.mp3.wav trim 1.860000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:06:26,267 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_131673.mp3.wav with effects: trim fade
[2020-06-01 16:06:26,268 ::: INFO] [SoX] 
[2020-06-01 16:06:26,269 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_481726.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:06:26,288 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:06:26,289 ::: INFO] [SoX] 
[2020-06-01 16:06:28,372 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_481726.mp3.wav trim 1.550000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:06:28,383 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/ri

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 16:10:27,139 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_2925.mp3.wav trim 1.390000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:10:27,150 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_2925.mp3.wav with effects: trim fade
[2020-06-01 16:10:27,151 ::: INFO] [SoX] 
[2020-06-01 16:10:27,152 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_100858.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:10:27,168 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:10:27,169 ::: INFO] [SoX] 
 - i'm sure shel be alright
[2020-06-01 16:10:28,541 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_33207.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:10:28,564 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:10:28,565 ::: INFO] [SoX] 
[2020-06-01 16:10:31

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 16:21:00,020 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_504503.mp3.wav trim 4.064000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:21:00,031 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_504503.mp3.wav with effects: trim fade
[2020-06-01 16:21:00,032 ::: INFO] [SoX] 
[2020-06-01 16:21:00,033 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_19248985.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:21:00,048 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:21:00,049 ::: INFO] [SoX] 
 - as avonmaking tha idicitions
[2020-06-01 16:21:01,118 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_522943.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:21:01,134 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:21:01,135 ::: INFO] [SoX] 
[2020-06-

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 16:29:42,983 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_15905058.mp3.wav trim 1.880000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:29:42,993 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/right/common_voice_en_15905058.mp3.wav with effects: trim fade
[2020-06-01 16:29:42,994 ::: INFO] [SoX] 
[2020-06-01 16:29:42,996 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_19501957.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:29:43,019 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:29:43,020 ::: INFO] [SoX] 
 - however unstandard appication forigms tuents arcubanyaption to live the thriht
[2020-06-01 16:29:46,147 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/right/common_voice_en_579434.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:29:46,168 ::: INFO] Created ./tmp/tmp.wav with effects: ra

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 16:56:39,004 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18147648.mp3.wav trim 1.360000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:56:39,015 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18147648.mp3.wav with effects: trim fade
[2020-06-01 16:56:39,016 ::: INFO] [SoX] 
[2020-06-01 16:56:39,017 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_18441548.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:56:39,036 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:56:39,037 ::: INFO] [SoX] 
[2020-06-01 16:56:41,514 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18441548.mp3.wav trim 3.580000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:56:41,525 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/com

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 16:57:46,924 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_532591.mp3.wav trim 2.420000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:57:46,935 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_532591.mp3.wav with effects: trim fade
[2020-06-01 16:57:46,936 ::: INFO] [SoX] 
[2020-06-01 16:57:46,937 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_688051.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 16:57:46,958 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 16:57:46,958 ::: INFO] [SoX] 
[2020-06-01 16:57:49,519 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_688051.mp3.wav trim 4.680000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 16:57:49,529 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_voic

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 17:25:42,543 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18109079.mp3.wav trim 2.530000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 17:25:42,554 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18109079.mp3.wav with effects: trim fade
[2020-06-01 17:25:42,555 ::: INFO] [SoX] 
[2020-06-01 17:25:42,557 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_256347.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 17:25:42,574 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 17:25:42,575 ::: INFO] [SoX] 
[2020-06-01 17:25:44,380 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_256347.mp3.wav trim 2.410000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 17:25:44,390 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 17:39:32,876 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_19205173.mp3.wav trim 1.220000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 17:39:32,887 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_19205173.mp3.wav with effects: trim fade
[2020-06-01 17:39:32,888 ::: INFO] [SoX] 
[2020-06-01 17:39:32,889 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_54910.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 17:39:32,910 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 17:39:32,911 ::: INFO] [SoX] 
 - the strange horsemandry any normous corof stored from the scavberd monte of his sidel
[2020-06-01 17:39:35,712 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_681302.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 17:39:35,733 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


 - what emperse are youren binkbook
[2020-06-01 17:41:09,069 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_18859806.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 17:41:09,092 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 17:41:09,093 ::: INFO] [SoX] 
[2020-06-01 17:41:12,572 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18859806.mp3.wav trim 2.380000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 17:41:12,582 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/on/common_voice_en_18859806.mp3.wav with effects: trim fade
[2020-06-01 17:41:12,583 ::: INFO] [SoX] 
[2020-06-01 17:41:12,585 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/on/common_voice_en_591867.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 17:41:12,600 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 17:41:12,601 ::: INFO] [SoX] 
[2020-06-01 1

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 18:18:38,882 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_17979849.mp3.wav trim 2.240000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 18:18:38,893 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_17979849.mp3.wav with effects: trim fade
[2020-06-01 18:18:38,894 ::: INFO] [SoX] 
[2020-06-01 18:18:38,896 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/off/common_voice_en_684651.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 18:18:38,912 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 18:18:38,913 ::: INFO] [SoX] 
[2020-06-01 18:18:40,355 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_684651.mp3.wav trim 1.050000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 18:18:40,366 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/off/co

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 18:31:52,123 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_609032.mp3.wav trim 2.420000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 18:31:52,134 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_609032.mp3.wav with effects: trim fade
[2020-06-01 18:31:52,135 ::: INFO] [SoX] 
[2020-06-01 18:31:52,136 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/off/common_voice_en_657306.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 18:31:52,159 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 18:31:52,160 ::: INFO] [SoX] 
 - the large pieace of likingmof had selld a loud sharp dors
[2020-06-01 18:31:55,300 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/off/common_voice_en_595307.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 18:31:55,316 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 18:31:55,317 ::: INF

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 18:44:35,737 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_680125.mp3.wav trim 1.730000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 18:44:35,748 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/off/common_voice_en_680125.mp3.wav with effects: trim fade
[2020-06-01 18:44:35,749 ::: INFO] [SoX] 
[2020-06-01 18:44:35,750 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/off/common_voice_en_17736847.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 18:44:35,770 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 18:44:35,771 ::: INFO] [SoX] 
 - fy am robber in jewarcle your words pousaff me an stick you
[2020-06-01 18:44:38,340 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/off/common_voice_en_20150391.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 18:44:38,363 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 18:44:38,364 :

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


 - some calling me that
[2020-06-01 19:35:25,501 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_300816.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 19:35:25,518 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 19:35:25,519 ::: INFO] [SoX] 
 - comon just stopped thinking about it
[2020-06-01 19:35:27,195 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_17776133.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 19:35:27,211 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 19:35:27,212 ::: INFO] [SoX] 
[2020-06-01 19:35:28,772 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_17776133.mp3.wav trim 1.370000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 19:35:28,783 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_17776133.mp3.wav with effects: trim fade
[2020-06-01 19:35:2

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 19:41:34,862 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_534617.mp3.wav trim 5.130000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 19:41:34,872 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_534617.mp3.wav with effects: trim fade
[2020-06-01 19:41:34,873 ::: INFO] [SoX] 
[2020-06-01 19:41:34,874 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_27385.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 19:41:34,891 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 19:41:34,892 ::: INFO] [SoX] 
 - never stoc riving the old king had said
[2020-06-01 19:41:36,494 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_17881700.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 19:41:36,510 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 19:41:36,511 ::: INFO] [SoX] 
[20

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


 - sat getting takeaways the start to cook yourself
[2020-06-01 19:51:52,287 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_169510.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 19:51:52,308 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 19:51:52,309 ::: INFO] [SoX] 
[2020-06-01 19:51:55,004 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_169510.mp3.wav trim 5.144000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 19:51:55,015 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_169510.mp3.wav with effects: trim fade
[2020-06-01 19:51:55,016 ::: INFO] [SoX] 
[2020-06-01 19:51:55,017 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_568456.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 19:51:55,032 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 19:51:55,033 ::: INFO] [S

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 20:03:45,145 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_17366693.mp3.wav trim 1.080000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 20:03:45,156 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_17366693.mp3.wav with effects: trim fade
[2020-06-01 20:03:45,156 ::: INFO] [SoX] 
[2020-06-01 20:03:45,158 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_624870.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 20:03:45,171 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 20:03:45,172 ::: INFO] [SoX] 
[2020-06-01 20:03:45,759 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_624870.mp3.wav trim 0.392000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 20:03:45,770 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/st

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 20:29:32,731 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_17246737.mp3.wav trim 4.390000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 20:29:32,742 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_17246737.mp3.wav with effects: trim fade
[2020-06-01 20:29:32,742 ::: INFO] [SoX] 
[2020-06-01 20:29:32,744 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_411155.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 20:29:32,762 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 20:29:32,763 ::: INFO] [SoX] 
 - stoppinstair at the heart working man
[2020-06-01 20:29:34,739 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_17260061.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 20:29:34,761 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 20:29:34,762 ::: INFO] [SoX] 


  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 20:29:48,825 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_654484.mp3.wav trim 6.220000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 20:29:48,835 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/stop/common_voice_en_654484.mp3.wav with effects: trim fade
[2020-06-01 20:29:48,836 ::: INFO] [SoX] 
[2020-06-01 20:29:48,838 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_579214.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 20:29:48,853 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 20:29:48,854 ::: INFO] [SoX] 
 - a buth heateden stoppe
[2020-06-01 20:29:50,223 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/stop/common_voice_en_19204155.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 20:29:50,242 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 20:29:50,243 ::: INFO] [SoX] 
[2020-06-01 20:29:5

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


 - think uglholm ind see what the families doing
[2020-06-01 21:07:20,149 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/go/common_voice_en_17272059.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 21:07:20,164 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 21:07:20,164 ::: INFO] [SoX] 
 - he sa come easy ger
[2020-06-01 21:07:21,277 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/go/common_voice_en_66945.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 21:07:21,292 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 21:07:21,293 ::: INFO] [SoX] 
[2020-06-01 21:07:22,378 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_66945.mp3.wav trim 1.230000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 21:07:22,389 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_66945.mp3.wav with effects: trim fade
[2020-06-01 21:07:22,390 :

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 21:12:26,988 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_539763.mp3.wav trim 1.440000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 21:12:26,999 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_539763.mp3.wav with effects: trim fade
[2020-06-01 21:12:26,999 ::: INFO] [SoX] 
[2020-06-01 21:12:27,001 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/go/common_voice_en_9105.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 21:12:27,017 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 21:12:27,018 ::: INFO] [SoX] 
[2020-06-01 21:12:28,598 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_9105.mp3.wav trim 2.552000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 21:12:28,609 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en

  logging.warn(f"clip ix: {ix} timestamps are longer than one second")


[2020-06-01 21:22:58,150 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_22722.mp3.wav trim 0.640000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 21:22:58,161 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_22722.mp3.wav with effects: trim fade
[2020-06-01 21:22:58,161 ::: INFO] [SoX] 
[2020-06-01 21:22:58,163 ::: INFO] Executing: sox -D -V2 ./eleven_word_dataset/clips/go/common_voice_en_81341.mp3 ./tmp/tmp.wav rate -h 16000.000000
[2020-06-01 21:22:58,179 ::: INFO] Created ./tmp/tmp.wav with effects: rate
[2020-06-01 21:22:58,179 ::: INFO] [SoX] 
[2020-06-01 21:22:59,521 ::: INFO] Executing: sox -D -V2 ./tmp/tmp.wav ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en_81341.mp3.wav trim 1.890000 1.000000 fade q 0.100000 reverse fade q 0.100000 reverse
[2020-06-01 21:22:59,533 ::: INFO] Created ./eleven_word_dataset/extractions_deepspeech/go/common_voice_en