Source: Gradio guide on real-time speech recognition — https://www.gradio.app/guides/real-time-speech-recognition

In [1]:
import gradio as gr
from transformers import pipeline
import numpy as np

# Speech-to-text pipeline built once at cell scope and reused by transcribe().
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)


def transcribe(stream, new_chunk):
    """Append one microphone chunk to the running audio and transcribe all of it.

    Args:
        stream: Audio accumulated by previous calls (a NumPy array), or
            ``None`` on the first call.
        new_chunk: ``(sampling_rate, samples)`` tuple delivered by the
            streaming audio input; ``samples`` is a NumPy array.

    Returns:
        A ``(stream, text)`` tuple: the accumulated audio including this
        chunk (passed back in as ``stream`` on the next call) and the
        transcription of the whole accumulated audio.
    """
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Multi-channel audio is expected as (samples, channels); average the
    # channels so the accumulated stream stays one-dimensional.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalise the chunk. An empty or all-zero (silent) chunk has no
    # peak to scale by; dividing anyway would raise or fill the stream with NaN.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    print("stream", stream)
    print("new_chunk", new_chunk)
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]


# Live streaming UI: the "state" slot carries the accumulated audio between
# calls, and the microphone component streams chunks into transcribe().
demo = gr.Interface(
    fn=transcribe,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "text"],
    live=True,
)

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




stream None
new_chunk (48000, array([212, 227, 208, ..., -16, -25, -20], dtype=int16))
stream [ 0.05237154  0.05607707  0.0513834  ... -0.00395257 -0.00617589
 -0.00494071]
new_chunk (48000, array([-17, -18, -18, ...,  63,  84,  71], dtype=int16))
stream [0.05237154 0.05607707 0.0513834  ... 0.00736928 0.00982571 0.00830507]
new_chunk (48000, array([ 76,  72,  80, ..., 703, 726, 768], dtype=int16))
stream [0.05237154 0.05607707 0.0513834  ... 0.09410977 0.09718876 0.10281125]
new_chunk (48000, array([  790,   784,   781, ..., -3505, -3512, -3583], dtype=int16))
stream [ 0.05237154  0.05607707  0.0513834  ... -0.6946096  -0.6959968
 -0.7100674 ]
new_chunk (48000, array([-3597, -3584, -3645, ...,     6,     3,    17], dtype=int16))
stream [0.05237154 0.05607707 0.0513834  ... 0.0016225  0.00081125 0.00459708]
new_chunk (48000, array([ 10,  19,  22, ..., -19, -18, -15], dtype=int16))
stream [ 0.05237154  0.05607707  0.0513834  ... -0.22891566 -0.21686748
 -0.18072289]
new_chunk (48000, ar

### Update the interface only when a special word occurs

In [3]:
import gradio as gr
from transformers import pipeline
import numpy as np

# Speech-to-text pipeline built once at cell scope and reused by transcribe().
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)


def transcribe(stream, new_chunk):
    """Append one microphone chunk to the session history and transcribe it all.

    Args:
        stream: Session state dict with keys ``"history"`` (NumPy array of
            the audio accumulated so far) and ``"prev_text"`` (the last
            transcription). It is updated in place.
        new_chunk: ``(sampling_rate, samples)`` tuple delivered by the
            streaming audio input; ``samples`` is a NumPy array.

    Returns:
        A ``(stream, text)`` tuple: the updated state dict and the
        transcription of the whole accumulated history.
    """
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Multi-channel audio is expected as (samples, channels); average the
    # channels so the history stays one-dimensional.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalise the chunk. An empty or all-zero (silent) chunk has no
    # peak to scale by; dividing anyway would raise or fill the history with NaN.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    print("stream", stream)
    print("new_chunk", new_chunk)

    stream["history"] = np.concatenate([stream["history"], y])
    predicted_text = transcriber(
        {
            "sampling_rate": sr,
            "raw": stream["history"],
        }
    )["text"]

    # Remember the latest transcription in the state. The text shown is always
    # the newest transcription, whether or not it differs from the previous one.
    stream["prev_text"] = predicted_text

    return (
        stream,
        predicted_text,
    )


# Streaming UI for the dict-based variant: the input State seeds each session
# with an empty audio history and an empty previous transcription.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.State(value={"history": np.array([]), "prev_text": ""}),
        gr.Audio(sources=["microphone"], streaming=True),
    ],
    outputs=[gr.State(), "text"],
    # NOTE(review): live mode is switched off in this variant — confirm this
    # is intentional before re-enabling the line below.
    # live=True,
)

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




stream {'history': array([], dtype=float64)}
new_chunk (48000, array([  0,   0,   0, ...,  -3,  -9, -15], dtype=int16))
stream {'history': array([ 0.       ,  0.       ,  0.       , ..., -0.0056926, -0.0170778,
       -0.028463 ])}
new_chunk (48000, array([  -14,   -20,   -16, ..., -3350, -3489, -3486], dtype=int16))
stream {'history': array([ 0.        ,  0.        ,  0.        , ..., -0.55089623,
       -0.57375431, -0.57326096])}
new_chunk (48000, array([-3497, -3539, -3456, ...,    16,    22,    40], dtype=int16))
stream {'history': array([0.        , 0.        , 0.        , ..., 0.00442356, 0.00608239,
       0.01105889])}
new_chunk (48000, array([ 33,  28,  24, ..., -30, -43, -40], dtype=int16))
stream {'history': array([ 0.        ,  0.        ,  0.        , ..., -0.34090909,
       -0.48863637, -0.45454547])}
new_chunk (48000, array([-47, -53, -55, ..., -21, -14, -19], dtype=int16))
stream {'history': array([ 0.        ,  0.        ,  0.        , ..., -0.28      ,
       -0.186