In [5]:
import numpy as np
import os
import pandas as pd
import sox
import wave
import logging
import pydub 
from pydub.playback import play
import time
import csv
import glob
import shutil
import json
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from typing import Set, List, Dict
import functools

In [2]:
from IPython.display import clear_output

In [3]:
# English Common Voice corpus: the mp3 clip directory and its training manifest.
clips_path = "../mozilla_common_voice/clips/"
df = pd.read_csv("../mozilla_common_voice/train.tsv", sep="\t")
print("number of mp3s in training set: ", len(df))

number of mp3s in training set:  232975


### Generating a micro dataset of 50 clips

In [52]:
# Accumulates the row positions of hand-picked clips; later cells append to it.
selected_clips=[]

In [44]:
def listen_to_sample(df, ix, clips_path):
    """Play one mp3 clip listed in ``df`` and return the loaded audio.

    Args:
        df: DataFrame with a ``path`` column holding the clip's file name
            relative to ``clips_path``.
        ix: Integer row position of the clip (looked up with ``iloc``).
        clips_path: Directory that contains the mp3 files. A trailing path
            separator is accepted but not required.

    Returns:
        The object returned by ``pydub.AudioSegment.from_mp3`` for the clip,
        after it has been played.
    """
    # os.path.join works whether or not clips_path ends with a separator;
    # plain concatenation silently produced a wrong path without one.
    mp3_path = os.path.join(clips_path, df.iloc[ix]["path"])
    clip = pydub.AudioSegment.from_mp3(mp3_path)
    play(clip)
    return clip

In [None]:
# Draw a random row position, audition that clip, then record the pick.
sample = np.random.randint(len(df))
listen_to_sample(df, sample, clips_path)
selected_clips.append(sample)
print(len(selected_clips))

In [None]:
# Persist the hand-picked clips, but never clobber an existing selection:
# the file is written only when it is absent (delete it manually to redo).
# This replaces an unconditional `raise` that made the export unreachable.
selected_clips_tsv = "selected_clips.tsv"
if os.path.exists(selected_clips_tsv):
    print(f"{selected_clips_tsv} already exists; not overwriting")
else:
    df.iloc[selected_clips].to_csv(
        selected_clips_tsv, sep="\t", quoting=csv.QUOTE_MINIMAL, index=False
    )

In [51]:
# Reload the saved selection from disk (tab-separated).
selected = pd.read_csv("selected_clips.tsv", sep="\t")

In [26]:
# Ensure the destination is a directory. If it were missing, shutil.copy2
# would treat "./micro_dataset" as a file name and every iteration would
# overwrite that single file instead of collecting the clips.
os.makedirs("./micro_dataset", exist_ok=True)
for _, r in selected.iterrows():
    mp3_path = clips_path + r.path
    shutil.copy2(mp3_path, "./micro_dataset")

Next, run `word_separator/transcribe_word_time_offsets.py` on the `./micro_dataset` directory.

## Select keywords for word separation

In [6]:
def clean_and_filter(keyword_set: Set[str], sentence: str) -> List[str]:
    """Return the keywords that occur in ``sentence``.

    The sentence is tokenized with ``nltk.word_tokenize``; purely alphabetic
    tokens are lower-cased and kept when they are members of ``keyword_set``.
    Order of appearance and repeated occurrences are preserved.
    """
    matches = []
    for token in nltk.word_tokenize(sentence):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in keyword_set:
            matches.append(lowered)
    return matches

In [9]:
keyword_set = {"up", "no", "three"}

# TODO(MMAZ) inefficient
# Attach, per clip, the list of keywords found in its sentence.
df['keywords'] = df.sentence.apply(functools.partial(clean_and_filter, keyword_set))

# No dropna step is needed: clean_and_filter always returns a list, and the
# former `df.keywords.dropna(inplace=True)` acted on a temporary Series, so
# it never changed df.
usable = df.keywords.transform(len)
print("mp3s containing speechcommands keywords", usable[usable > 0].shape[0])

mp3s containing speechcommands keywords 8574


In [34]:
# Export the clips that contain at least one keyword, but never clobber an
# existing export: the file is written only when it is absent (delete it
# manually to regenerate). This replaces an unconditional `raise` that made
# the export unreachable.
keywords_train_tsv = "keywords_train.tsv"
if os.path.exists(keywords_train_tsv):
    print(f"{keywords_train_tsv} already exists; not overwriting")
else:
    keyword_rows = df.loc[df.keywords.transform(len) > 0]
    # The helper 'keywords' column is dropped so the file keeps the manifest's columns.
    keyword_rows.drop(['keywords'], axis=1).to_csv(
        keywords_train_tsv, sep="\t", quoting=csv.QUOTE_MINIMAL, index=False
    )

In [35]:
# Reload the keyword-bearing subset of the training manifest from disk.
keywords = pd.read_csv("keywords_train.tsv", sep="\t")

In [103]:
# Pick a random keyword-bearing clip, show its sentence, and play it.
sample = np.random.randint(len(keywords))
print(keywords.iloc[sample]["sentence"])
listen_to_sample(keywords, sample, clips_path)

Soon, Alexander got fed up with the jumper and gave it back.


In [102]:
# Keep the clip that was just auditioned and report how many are selected so far.
selected_clips.append(sample)
print(len(selected_clips))

25


In [104]:
# Persist the hand-picked keyword clips, but never clobber an existing
# selection: the file is written only when it is absent (delete it manually
# to redo). This replaces an unconditional `raise` that made the export
# unreachable.
selected_keywords_tsv = "selected_clips_yes_no_three.tsv"
if os.path.exists(selected_keywords_tsv):
    print(f"{selected_keywords_tsv} already exists; not overwriting")
else:
    keywords.iloc[selected_clips].to_csv(
        selected_keywords_tsv, sep="\t", quoting=csv.QUOTE_MINIMAL, index=False
    )

In [105]:
# Reload the saved keyword-clip selection from disk (tab-separated).
selected = pd.read_csv("selected_clips_yes_no_three.tsv", sep="\t")

In [111]:
# Ensure the destination is a directory. If it were missing, shutil.copy2
# would treat "./keywords_micro_dataset" as a file name and every iteration
# would overwrite that single file instead of collecting the clips.
os.makedirs("./keywords_micro_dataset", exist_ok=True)
for _, r in selected.iterrows():
    mp3_path = clips_path + r.path
    shutil.copy2(mp3_path, "./keywords_micro_dataset")

### Split words on boundaries

In [112]:
# Read the keyword-clip selection so this section can start from the saved file.
selected = pd.read_csv("selected_clips_yes_no_three.tsv", sep="\t")

In [113]:
# TODO(MMAZ): apply lower(), remove punctuation, etc.
# Copied from https://stackoverflow.com/a/32558749
def levenshteinDistance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance counts single-element insertions, deletions and
    substitutions. The comparison is exact: case and punctuation count.
    Only two rows of the dynamic-programming table are kept at a time.
    """
    # Use the shorter sequence for the row dimension to keep rows small.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    prev_row = list(range(len(shorter) + 1))
    for row_no, long_item in enumerate(longer, start=1):
        cur_row = [row_no]
        for col, short_item in enumerate(shorter):
            if short_item == long_item:
                # Elements match: carry the diagonal cost over unchanged.
                cost = prev_row[col]
            else:
                # Cheapest of substitution, deletion, insertion, plus one edit.
                cost = 1 + min(prev_row[col], prev_row[col + 1], cur_row[col])
            cur_row.append(cost)
        prev_row = cur_row
    return prev_row[-1]

In [143]:
def listen_to_split_on_boundaries(timings, clip, keywords_set=None):
    """Play a clip word by word using per-word timing entries.

    Args:
        timings: Sequence of dicts with ``word``, ``start_time`` and
            ``end_time`` keys; the times are multiplied by 1000 before
            being used to slice ``clip``.
        clip: Audio object that supports slicing and can be passed to ``play``.
        keywords_set: Optional collection of words. When given and non-empty,
            only words it contains are played. When given and it shares no
            word with the transcription, a notice is printed.
    """
    if keywords_set is not None:
        transcribed = {entry["word"] for entry in timings}
        if transcribed.isdisjoint(keywords_set):
            print("No keywords found in the transcription")

    restrict_to_keywords = bool(keywords_set)
    for ix, entry in enumerate(timings):
        word = entry['word']
        if restrict_to_keywords and word not in keywords_set:
            continue
        begin, finish = entry['start_time'], entry['end_time']
        print(f"word {ix}: {word} \n - start_time: {begin:.3f} - end_time: {finish:.3f}")
        play(clip[begin * 1000:finish * 1000])
        # Short gap so consecutive words are heard separately.
        time.sleep(1)

### Selected keywords

In [136]:
# Display the keyword set currently in use.
keyword_set

{'no', 'three', 'up'}

In [150]:
# Review each selected clip: compare its ground-truth sentence with the
# transcription, play the full clip, then play only the keyword segments.
for ix, row in selected.iterrows():
    print(f"Sample: {ix}")
    # The transcription for a clip is stored as <clip file name>.json.
    fname = "./keywords_transcriptions/" + row.path + ".json"
    with open(fname, 'r') as fh:
        transcription = json.load(fh)
    gt = row.sentence
    print(f"Groundtruth: {gt}")
    text = transcription['transcript']
    print(f"Inference: {text}")
    # Raw character-level distance: case and punctuation differences count.
    print("Edit Distance:", levenshteinDistance(gt, text))
    print("\n--\n")
    # NOTE(review): ix is the index label from iterrows but listen_to_sample
    # uses it positionally via iloc — this assumes a default 0..n-1 index; confirm.
    clip = listen_to_sample(df=selected, ix=ix, clips_path=clips_path)
    listen_to_split_on_boundaries(transcription['timings'], clip, keywords_set=keyword_set)
    # Pause, then clear the cell output before the next sample is printed.
    time.sleep(4)
    clear_output()

Sample: 10
Groundtruth: He has a wife, Sherri, and three daughters.
Inference: he has a wife Sherry and three daughters
Edit Distance: 5

--

word 6: three 
 - start_time: 2.200 - end_time: 2.400
