## Pocketsphinx Speech to Text
Use `Stream.start()` and `Stream.stop()` (from `sounddevice`) to record a snippet of audio from the microphone to be decoded.

In [12]:
import numpy as np
import sys
import queue
import math
import os
import wave
import audioop
from collections import deque
import math
import sounddevice as sd # a nice wrapper around pyaudio that simplifies some things
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *

In [13]:
# Where the pocketsphinx models and the test data live; adjust to your setup.
MODELDIR, DATADIR = "./models/", "../corpus/"
# Remaining directory settings are left as empty strings in this cell.
HMMDIR = LMDIR = DICTDIR = ""

In [18]:
# Audio input stream config.
print(sd.query_devices())  # show the audio devices sounddevice can see

# Device and sample format.
DEV = sd.default.device  # sounddevice's current default device setting
SR = 16000               # sample rate
CH = 1                   # num channels
DTYPE = 'int16'          # sphinx expects int16 audio format

# Buffering.
CHUNK = 1024   # block size read from the mic each time (used as the stream blocksize)
LATENCY = 0.1

# Silence limit in seconds: the maximum amount of time where only silence is
# recorded. When this time passes the recording finishes and the audio snippet
# is decoded.
SILENCE_LIMIT = 1

# Previous audio (in seconds) to prepend. When noise is detected, this much of
# the previously recorded audio is prepended, which helps to prevent chopping
# the beginning of the phrase.
PREV_AUDIO = 0.5

# Integer sample value threshold for 'non-silence'.
THRESHOLD = 4500

num_phrases = -1

>  0 Built-in Microphone, Core Audio (2 in, 0 out)
<  1 Built-in Output, Core Audio (0 in, 2 out)
   2 Soundflower (2ch), Core Audio (2 in, 2 out)
   3 Soundflower (64ch), Core Audio (64 in, 64 out)
   4 Premiere Pro 5.0, Core Audio (0 in, 0 out)
   5 H2Core, Core Audio (0 in, 2 out)
   6 USBMixer, Core Audio (0 in, 0 out)
   7 Soundblaster PLAY!, Core Audio (0 in, 0 out)
   8 Builtin+SF, Core Audio (64 in, 66 out)
   9 Saffire+SF, Core Audio (64 in, 64 out)
  10 FA101+SF, Core Audio (64 in, 64 out)


In [19]:
# Build the decoder configuration: start from the defaults, then point each
# model option at its file under MODELDIR.
config = Decoder.default_config()
for option, model_file in (
    ('-hmm', 'en-us/en-us'),
    ('-lm', 'en-us/en-us.lm.bin'),
    ('-dict', 'en-us/cmudict-en-us.dict'),
):
    config.set_string(option, os.path.join(MODELDIR, model_file))

In [20]:
# Create the decoder object for streaming data, using the config built above.
decoder = Decoder(config)

In [25]:
def AUDIOCB(indata, outdata, frames, time, status):
    """Audio stream callback: report any stream status and pass input through.

    Parameters
    ----------
    indata
        Buffer of captured input; its contents are copied to ``outdata``.
    outdata
        Output buffer; overwritten in place via slice assignment.
    frames
        Unused by this callback.
    time
        Unused by this callback.
    status
        Stream status; when truthy it is printed to ``sys.stderr``.

    Returns
    -------
    None
    """
    # Surface any reported stream status without interrupting the stream.
    if status:
        print(status, file=sys.stderr)
    # Slice assignment writes into the caller's buffer instead of rebinding
    # the local name, so the caller sees the copied data.
    outdata[:] = indata

In [26]:
# Set up the mic input: open a stream on the configured device and route
# every block through AUDIOCB.
print("Getting intensity values from mic.")
audiost = sd.Stream(
    device=(DEV, DEV),
    samplerate=SR,
    channels=CH,
    dtype=DTYPE,
    blocksize=CHUNK,
    latency=LATENCY,
    callback=AUDIOCB,
)

# Begin streaming; the callback fires from here on.
audiost.start()

Getting intensity values from mic.


input underflow
input underflow


In [27]:
# Stop the audio stream that was started earlier in the notebook.
audiost.stop()

In [28]:
from pocketsphinx import LiveSpeech, get_model_path

# Model directory as reported by pocketsphinx.
model_path = get_model_path()

# Recognizer settings, collected in one place before constructing LiveSpeech.
live_speech_options = dict(
    verbose=True,
    sampling_rate=16000,
    buffer_size=2048,
    no_search=False,
    full_utt=False,
    hmm=os.path.join(model_path, 'en-us'),
    lm=os.path.join(model_path, 'en-us.lm.bin'),
    dic=os.path.join(model_path, 'cmudict-en-us.dict'),
)
speech = LiveSpeech(**live_speech_options)

# Print the detailed segments of every phrase the recognizer yields.
for phrase in speech:
    print(phrase.segments(detailed=True))

[('<s>', -9, 16673, 16733), ('<sil>', -3, 16734, 16770), ('just', -70, 16771, 16815), ('clothes', -11823, 16816, 16878), ('at', -5853, 16879, 16938), ('<sil>', -464, 16939, 16955), ('blue', -34052, 16956, 16991), ('and', -29433, 16992, 17050), ('baja', 0, 17051, 17176)]
[('<s>', 0, 17175, 17177), ('yeah', 0, 17178, 17307)]
[('<s>', -3, 171420, 171425), ('the', -23284, 171426, 171445), ('eye', -28594, 171446, 171505), ('</s>', 0, 171506, 171538)]
[('<s>', 0, 182636, 182638), ('on(2)', -1416, 182639, 182679), ('a', -4884, 182680, 182683), ('blog', 0, 182684, 182745)]
[('<s>', 0, 182745, 182747), ('dude', 0, 182748, 182887)]
[('<s>', -2, 185126, 185148), ('because(3)', -11741, 185149, 185201), ('i', -7649, 185202, 185216), ('got', -335, 185217, 185357), ('</s>', 0, 185358, 185362)]
[('<s>', -8, 312444, 312463), ('the', -6370, 312464, 312481), ('fact', -2852, 312482, 312532), ('is', -5769, 312533, 312557), ('gratifying', 0, 312558, 312690)]
[('<s>', 0, 312694, 312696), ('when', -8962, 3126