This notebook uses Sphinx (via pocketsphinx) to convert a continuous recording of speech into a set of phonemes. On Mac you can install pocketsphinx with `brew install cmu-pocketsphinx`. This only works on audio that is sampled at 16 kHz, and it runs at something like 5% of realtime, so a 10 minute file might take 3 to 4 hours.

In [None]:
# Folder holding the pocketsphinx model files; set it to the directory
# named "model". The 'en-us/...' model paths are resolved relative to it.
sphinx_model_root = 'model'

# Base folder for the speech data: the input is read from
# '<data_root>/raw/audio.wav' and the per-segment clips are written
# under '<data_root>/samples/'.
data_root = 'data/speech'

In [None]:
from os import environ, path
from os.path import join
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
from tqdm import tqdm
from utils import *
import librosa
import os
import numpy as np

In [None]:
# Resolve the two model locations first, then build the decoder from the
# library's default configuration with those two options overridden.
hmm_dir = join(sphinx_model_root, 'en-us/en-us/')
allphone_lm = join(sphinx_model_root, 'en-us/en-us-phone.lm.bin')

config = Decoder.default_config()
# '-hmm': model directory passed to the decoder.
config.set_string('-hmm', hmm_dir)
# '-allphone': phone language model file (presumably what switches the
# decoder to phoneme output rather than words -- confirm against the
# pocketsphinx documentation).
config.set_string('-allphone', allphone_lm)
decoder = Decoder(config)

In [None]:
# Stream the recording through the decoder as a single utterance, reading
# the file in fixed-size chunks and showing progress in bytes.
#
# NOTE(review): the file is fed byte-for-byte, so the WAV header at the
# start of the file is presumably decoded as if it were audio -- confirm
# whether it should be skipped.
buf_size = 1024
# Open in binary mode: audio is raw bytes, and text mode can mangle them
# (newline translation on Windows, decoding errors on Python 3). The
# `with` block guarantees the file handle is released.
with open(join(data_root, 'raw/audio.wav'), 'rb') as stream:
    total = os.fstat(stream.fileno()).st_size
    pbar = tqdm(total=total)
    try:
        decoder.start_utt()
        while True:
            buf = stream.read(buf_size)
            if not buf:
                # An empty read means end of file.
                break
            decoder.process_raw(buf, False, False)
            # Advance by the bytes actually read so that the bar ends
            # exactly at `total` instead of overshooting on the final
            # short chunk.
            pbar.update(len(buf))
        decoder.end_utt()
    finally:
        pbar.close()

In [None]:
# Load the same recording that was decoded, resampled to 16 kHz, and work
# out how many audio samples correspond to one decoder frame.
# `fn` was previously used without being defined in this notebook; it is
# bound here to the WAV file that was streamed to the decoder.
fn = join(data_root, 'raw/audio.wav')
y, sr = librosa.load(fn, sr=16000)
# Floor division keeps frame_size an integer on both Python 2 and 3, so it
# can be used to compute slice indices directly.
frame_size = len(y) // decoder.n_frames()
print(frame_size)

In [None]:
# Cut the loaded audio into one clip per decoded segment and save each
# clip as '<data_root>/samples/<index>.wav'.
for i, seg in enumerate(tqdm(decoder.seg(), leave=True)):
    # Convert frame indices to sample offsets; int() keeps the slice
    # bounds valid even if frame_size is a float.
    start_sample = int(frame_size * seg.start_frame)
    # pocketsphinx reports end_frame as the last frame belonging to the
    # segment (inclusive), so the slice has to extend one frame past it;
    # otherwise the final frame of every segment is dropped.
    stop_sample = int(frame_size * (seg.end_frame + 1))
    cur = y[start_sample:stop_sample]
    ffmpeg_save_audio(join(data_root, 'samples/{}.wav'.format(i)), cur, sr=sr)