In [1]:
import pandas as pd
import logging
import subprocess
import os
import shutil
import shlex
import glob
import textgrid
import sox

In [2]:
# Target keywords; each one is also the name of a column in the keywords table.
keywords_set = {
    "up", "down", "left", "right",
    "on", "off", "stop", "go",
    "yes", "no", "three",
}

In [3]:
# Load the tab-separated utterance table and report how many rows it holds.
keywords = pd.read_csv("../keywords_listen.tsv", delimiter="\t")
print(len(keywords.index))

112008


In [4]:
# Select up to NUM_SAMPLES utterances per keyword, i.e. rows where that
# keyword's column is True.
NUM_SAMPLES = 100
SAMPLE_SEED = 42  # fixed seed so the sampled rows are reproducible across runs
samples = {}
for k in keywords_set:
    # Rows flagged for this keyword; len() is 0 (no KeyError) if none are flagged.
    matches = keywords[keywords[k]]
    if len(matches) > NUM_SAMPLES:
        # more than NUM_SAMPLES examples: draw a random subset
        matches = matches.sample(n=NUM_SAMPLES, random_state=SAMPLE_SEED)
    elif len(matches) < NUM_SAMPLES:
        # use them all, and flag the shortfall
        logging.warning(f"for keyword {k}, there are not enough examples to sample")
    # Drop the keyword flag columns. drop() returns a new frame, so the
    # filtered slice of `keywords` is never mutated in place.
    samples[k] = matches.drop(columns=list(keywords_set))
    print("Keyword", k, ":", samples[k].shape[0])

Keyword up : 100
Keyword three : 100
Keyword off : 100
Keyword right : 100
Keyword left : 100
Keyword yes : 100
Keyword on : 100
Keyword go : 100
Keyword down : 100
Keyword no : 100
Keyword stop : 100


In [5]:
# Inspect the rows sampled for the keyword "yes" (keyword flag columns were dropped above).
samples["yes"]

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
36427,d3360af4c533a298411d4666ea84b8fafa928add9a8fd5...,common_voice_en_498172.mp3,"Yes, that's what love is.",2,0,thirties,male,us
22852,7f0f33a88f90b8b1dba2746b63dcc6057a63503c553bc3...,common_voice_en_18457232.mp3,"Yes, Saturday should be fine.",2,0,,,
94174,90a2539a7010c22f8d1379231f2b3e5829a0e645060d82...,common_voice_en_17892065.mp3,"Yes, this is the right place.",2,0,fourties,male,
84366,344908c02bd9ae437d19d5b7ba3fe238a770da115d0a54...,common_voice_en_18598786.mp3,"""Yes,"" he exclaimed, ""what have you done?""",2,0,thirties,male,us
50878,cb5b2099e6c052672cbe9cb4f4118266e16ddd09060216...,common_voice_en_17255675.mp3,"Yes, that is what I was thinking too.",2,0,fifties,male,england
...,...,...,...,...,...,...,...,...
20473,97ea49f9346daf626d1526914b4f5d40e094a450421406...,common_voice_en_17265425.mp3,"Yes, this is the right place.",2,0,,,
33562,a9fc8a211566e63b0b60e124c5ed5b0ae01d3640699a80...,common_voice_en_17270605.mp3,"Yes, it would be.",2,1,fourties,male,us
102997,379dd19cb981e4e0c55e088d74083093ee704d0d2332e6...,common_voice_en_18536587.mp3,"Yes, we know — what do we know, monsieur?",2,0,fourties,male,australia
80518,a7b1db5f82512a77c669d789c4d8d94060eca2838e3ffa...,common_voice_en_18649833.mp3,"Yes, I saw Angela's point of view.",2,0,thirties,male,us


In [7]:
# Show the working directory; the relative paths used in this notebook resolve against it.
print(os.getcwd())

/home/mark/tinyspeech_harvard/tinyspeech/alignment


In [6]:
# Directory holding the Common Voice clips; prepended to each row's `path` value.
COMMON_VOICE = "../../common_voice/en/clips/"
# Scratch directory bind-mounted into the aligner container as /work/
# (the aligner reads /work/input and writes /work/output/).
WORKDIR = "../../alignment_processing/"

In [8]:
# Build the `docker run` command that invokes the Montreal Forced Aligner.
# Two host directories are bind-mounted into the container:
#   WORKDIR (absolute)      -> /work/     (aligner input and output)
#   this notebook's folder  -> /lexicon/  (pronunciation lexicon)
alignment_dir = os.path.abspath(".")
abs_workdir = os.path.abspath(WORKDIR)
docker_args = [
    "docker", "run", "--rm",
    "-v", f"{abs_workdir}:/work/",
    "-v", f"{alignment_dir}:/lexicon/",
    "-t", "montreal",
    "bin/mfa_align",
    "/work/input",
    "/lexicon/librispeech-lexicon.txt",
    "pretrained_models/english.zip",
    "/work/output/",
]
# Quote every argument so host paths containing spaces or shell metacharacters
# survive the later shlex.split(cmd) as single arguments.
cmd = " ".join(shlex.quote(arg) for arg in docker_args)
print(cmd)
shlex.split(cmd)

docker run --rm -v /home/mark/tinyspeech_harvard/alignment_processing:/work/           -v /home/mark/tinyspeech_harvard/tinyspeech/alignment:/lexicon/ -t montreal           bin/mfa_align /work/input /lexicon/librispeech-lexicon.txt pretrained_models/english.zip /work/output/


['docker',
 'run',
 '--rm',
 '-v',
 '/home/mark/tinyspeech_harvard/alignment_processing:/work/',
 '-v',
 '/home/mark/tinyspeech_harvard/tinyspeech/alignment:/lexicon/',
 '-t',
 'montreal',
 'bin/mfa_align',
 '/work/input',
 '/lexicon/librispeech-lexicon.txt',
 'pretrained_models/english.zip',
 '/work/output/']

In [37]:
# Start the aligner container, piping both output streams so they can be
# collected with communicate() afterwards.
p = subprocess.Popen(
    args=shlex.split(cmd),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

In [38]:
# Wait for the aligner to finish and show its stdout, stderr and exit status.
sout, serr = p.communicate()
# errors="replace" keeps the log printable even if the tool emits bytes that
# are not valid UTF-8.
print(
    sout.decode("UTF-8", errors="replace"),
    serr.decode("UTF-8", errors="replace"),
    p.returncode,
)
# Stop here on failure so later cells do not read stale or missing alignments.
if p.returncode != 0:
    raise RuntimeError(f"aligner exited with status {p.returncode}; see output above")

Setting up corpus information...
Number of speakers in corpus: 1, average number of utterances per speaker: 1.0
Creating dictionary information...
Setting up training data...
Calculating MFCCs...
Calculating CMVN...
Number of speakers in corpus: 1, average number of utterances per speaker: 1.0
Done with setup.
100% 2/2 [00:01<00:00,  1.64it/s]
Done! Everything took 3.6235108375549316 seconds
  0


In [40]:
# Collect the TextGrid files the aligner wrote under WORKDIR/output/input.
# os.path.join avoids the doubled slash (WORKDIR already ends with "/"), and
# sorting makes tgs[0] deterministic because glob returns files in arbitrary order.
tgs = sorted(glob.glob(os.path.join(WORKDIR, "output", "input", "*.TextGrid")))
tgs

['../../alignment_processing//output/input/common_voice_en_19620878.TextGrid']

The alignment for this clip came out wrong: `common_voice_en_18668331.mp3` — there is a lot of noise in the background.

In [41]:
# Parse the first TextGrid and print every interval of its first tier
# (one "Interval(start, end, label)" per line).
tg = textgrid.TextGrid.fromFile(tgs[0])

first_tier = tg[0]
for interval in first_tier:
    print(interval)

Interval(0.0, 1.11, None)
Interval(1.11, 1.22, the)
Interval(1.22, 1.79, monster)
Interval(1.79, 2.15, tells)
Interval(2.15, 2.75, henry)
Interval(2.75, 2.97, and)
Interval(2.97, 3.62, elizabeth)
Interval(3.62, 4.34, yes)
Interval(4.34, 5.64, None)


In [35]:
# Draw one random "yes" utterance (a single-row DataFrame) to stage for alignment.
row = samples["yes"].sample(n=1)
row

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
27556,4446019f2b7fe613de60a3b080993693ea6f453be9ffb6...,common_voice_en_19620878.mp3,The Monster tells Henry and Elizabeth Yes!,2,0,sixties,female,us


In [31]:
# Reset the aligner scratch area: delete staged .wav/.txt inputs, generated
# TextGrids, the per-run output sub-directory and the out-of-vocabulary report.
# All paths derive from WORKDIR rather than a machine-specific absolute path.
input_dir = os.path.join(WORKDIR, "input")
output_dir = os.path.join(WORKDIR, "output")
aligned_dir = os.path.join(output_dir, "input")

wavs = glob.glob(os.path.join(input_dir, "*.wav"))
txts = glob.glob(os.path.join(input_dir, "*.txt"))
tgs = glob.glob(os.path.join(aligned_dir, "*.TextGrid"))
for f in wavs + txts + tgs:
    os.remove(f)
# Either target may be missing if the aligner has not run yet; that is fine.
try:
    os.rmdir(aligned_dir)
except FileNotFoundError:
    pass
try:
    os.remove(os.path.join(output_dir, "oovs_found.txt"))
except FileNotFoundError:
    pass

In [36]:
# Stage the selected utterance for the aligner: convert its mp3 clip to a
# 16 kHz wav and write the sentence to a .txt file with the same basename,
# both under WORKDIR/input.
mp3_path = os.path.join(COMMON_VOICE, row.path.item())
print(mp3_path)
filename_noext = os.path.basename(os.path.splitext(mp3_path)[0])

input_dir = os.path.join(WORKDIR, "input")
# Create the staging directory on first use instead of assuming it exists.
os.makedirs(input_dir, exist_ok=True)
dest = os.path.join(input_dir, f"{filename_noext}.wav")

transformer = sox.Transformer()
transformer.convert(samplerate=16000)  # from 48K mp3s
#transformer.trim(start_s, end_s)
#transformer.fade(fade_in_len=0.1, fade_out_len=0.1)
transformer.build(mp3_path, dest)

utterance = row.sentence.item()
transcription = os.path.join(input_dir, f"{filename_noext}.txt")
print(transcription)
# Sentences can contain non-ASCII characters (e.g. dashes), so write UTF-8
# explicitly rather than relying on the platform's default encoding.
with open(transcription, 'w', encoding="utf-8") as fh:
    fh.write(utterance)

../../common_voice/en/clips/common_voice_en_19620878.mp3
../../alignment_processing//input/common_voice_en_19620878.txt
