In [2]:
import ipywidgets as widgets
from IPython.display import display
from queue import Queue
from threading import Thread

# Control channel: while this queue holds an item, the worker threads keep
# looping; the Stop handler removes the item to tell them to finish.
messages = Queue()

# Data channel: the recorder puts lists of raw audio chunks here and the
# transcriber takes them off.
recordings = Queue()

# Green "Record" button; wired to start_recording below.
record_button = widgets.Button(
    icon="microphone",
    button_style="success",
    description="Record",
    disabled=False,
)

# Orange "Stop" button; wired to stop_recording below.
stop_button = widgets.Button(
    icon="stop",
    button_style="warning",
    description="Stop",
    disabled=False,
)

# Output area that receives status messages and the transcribed text.
output = widgets.Output()

def start_recording(data):
    """Click handler for the Record button: start the recorder and transcriber.

    Puts a token on ``messages`` (the worker loops run while that queue is
    non-empty) and launches two background threads: one running
    ``record_mic`` and one running ``speech_recognition`` with the shared
    ``output`` widget.

    If a session is already active (a token is still on ``messages``) the
    click is ignored, so repeated clicks cannot spawn competing worker
    threads or require several Stop clicks to end the session.

    Args:
        data: Argument supplied by the button's ``on_click`` callback; unused.
    """
    with output:
        # A pending token means workers are already running.
        if not messages.empty():
            display("Already recording.")
            return

        # Token that tells both worker loops to keep running.
        messages.put(True)
        display("Starting...")

        # Capture microphone audio in the background.
        record = Thread(target=record_mic)
        record.start()

        # Turn recorded audio into text and write it to the output widget.
        transcribe = Thread(target=speech_recognition, args=(output,))
        transcribe.start()
    

def stop_recording(data):
    """Click handler for the Stop button: signal the worker threads to finish.

    Removes the token from ``messages``; the worker loops check that queue
    and stop once it is empty. The removal is non-blocking, so clicking Stop
    when nothing is being recorded reports that fact instead of blocking
    forever on an empty queue.

    Args:
        data: Argument supplied by the button's ``on_click`` callback; unused.
    """
    from queue import Empty  # local import: only needed by this handler

    with output:
        try:
            # Take the "keep running" token off the queue without waiting.
            messages.get_nowait()
        except Empty:
            display("Not recording.")
            return
        display("Stopped.")

# Wire each button to its click handler.
record_button.on_click(start_recording)
stop_button.on_click(stop_recording)

# Show the controls together with the output area; without displaying
# `output`, the status messages and transcript written to it are never visible.
display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle())



In [3]:
import pyaudio #microphone index = 0
# Print the info dict of every audio device so the microphone's index can be
# looked up for the recorder.
p = pyaudio.PyAudio()
try:
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    # Always release the PyAudio instance, even if enumeration fails.
    p.terminate()

{'index': 0, 'structVersion': 2, 'name': 'MacBook Pro Microphone', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.03235416666666667, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.0416875, 'defaultHighOutputLatency': 0.1, 'defaultSampleRate': 48000.0}
{'index': 1, 'structVersion': 2, 'name': 'MacBook Pro Speakers', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.017958333333333333, 'defaultHighInputLatency': 0.1, 'defaultHighOutputLatency': 0.027291666666666665, 'defaultSampleRate': 48000.0}


In [4]:
# --- Audio capture settings -------------------------------------------------
AUDIO_FORMAT = pyaudio.paInt16  # sample format passed to PyAudio's open()
SAMPLE_SIZE = 2                 # presumably bytes per 16-bit sample; not referenced in the visible cells
CHANNELS = 1                    # channel count passed to PyAudio's open()
FRAME_RATE = 16000              # sampling rate, shared by the recorder and the recognizer
RECORD_SECONDS = 20             # seconds of audio gathered before a segment is queued

def record_mic(chunk=1024, device_index=0):
    """Record from the microphone and queue the audio for transcription.

    Reads ``chunk`` frames at a time for as long as ``messages`` is
    non-empty. Each time roughly ``RECORD_SECONDS`` of audio has been
    collected, the list of raw chunks is put on ``recordings``. When the loop
    ends, any audio gathered since the last full segment is queued as well,
    so a session shorter than ``RECORD_SECONDS`` is not lost.

    Args:
        chunk: Number of frames read from the stream per call.
        device_index: Index of the input device to open (defaults to 0, the
            value previously hard-coded here).
    """
    # Number of chunk-sized reads that make up one RECORD_SECONDS segment.
    chunks_per_segment = (FRAME_RATE * RECORD_SECONDS) / chunk

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=AUDIO_FORMAT,
                        channels=CHANNELS,
                        rate=FRAME_RATE,
                        input=True,
                        input_device_index=device_index,
                        frames_per_buffer=chunk)
        try:
            frames = []
            while not messages.empty():
                # Do not raise on input overflow: an exception here would
                # silently end the recorder thread mid-session.
                data = stream.read(chunk, exception_on_overflow=False)
                frames.append(data)

                if len(frames) >= chunks_per_segment:
                    # Hand the finished segment over and start a new list.
                    recordings.put(frames)
                    frames = []

            # Flush the partial segment captured before recording stopped.
            if frames:
                recordings.put(frames)
        finally:
            # Close the stream even if reading failed.
            stream.stop_stream()
            stream.close()
    finally:
        # Release the PyAudio instance even if opening the stream failed.
        p.terminate()
    

In [5]:
import subprocess
import json
from vosk import Model, KaldiRecognizer

# Name of the Vosk model to load.
MODEL_NAME = "vosk-model-en-us-0.22"

# Load the model once at module level and build a recognizer that expects
# audio at the recorder's sampling rate.
model = Model(model_name=MODEL_NAME)
rec = KaldiRecognizer(model, FRAME_RATE)

# NOTE(review): SetWords(True) presumably adds per-word details to the
# recognizer's results -- confirm against the Vosk API docs.
rec.SetWords(True)

def speech_recognition(output):
    """Transcribe queued audio segments and append the text to ``output``.

    Takes lists of raw audio chunks off ``recordings``, feeds each joined
    segment to the shared recognizer ``rec``, pipes the recognized text
    through the recasepunc script, and appends that script's output to the
    widget.

    The queue is polled with a timeout rather than a blocking ``get()``, so
    the thread can notice that recording has stopped (``messages`` is empty)
    and exit instead of waiting forever. Segments still waiting in the queue
    at that point are processed before the thread returns.

    Args:
        output: Object with an ``append_stdout(text)`` method (the notebook
            passes its ``widgets.Output`` instance).

    Raises:
        subprocess.CalledProcessError: If the recasepunc command exits with a
            non-zero status.
    """
    from queue import Empty  # local import: only needed by this worker

    while True:
        try:
            # Wait briefly for the next segment from the recorder.
            frames = recordings.get(timeout=0.5)
        except Empty:
            # Nothing queued: stop once recording has ended, otherwise retry.
            if messages.empty():
                break
            continue

        rec.AcceptWaveform(b''.join(frames))
        result = rec.Result()
        text = json.loads(result)["text"]

        # Nothing recognized: skip the recasepunc subprocess.
        if not text:
            continue

        # Fixed command passed as an argument list, so no shell is needed.
        cased = subprocess.check_output(
            ["python", "recasepunc/recasepunc.py", "predict", "recasepunc/checkpoint"],
            text=True,
            input=text,
        )
        output.append_stdout(cased)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:11:12:13:14:15
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /Users/jasonqiu/.cache/vosk/vosk-model-en-us-0.22/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from /Users/jasonqiu/.cache/vosk/vosk-model-en-us-0.22/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from /Users/jasonqiu/.cache/vosk/vosk-model-en-us-0.22/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo /Users/jasonqiu/.cache/vosk/v