Adding utilities to chunk large audio files and read directly from microphone #14250
@@ -0,0 +1,312 @@
import collections
import platform
import subprocess
from typing import Optional, Tuple, Union

import numpy as np


def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    except FileNotFoundError:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename")
    output_stream = ffmpeg_process.communicate(bpayload)
    out_bytes = output_stream[0]

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio
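
For context, a minimal usage sketch of `ffmpeg_read`; the file name `sample.flac` and the 16 kHz rate are illustrative placeholders, not part of the PR:

# Hypothetical usage: decode a local file into a float32 waveform.
with open("sample.flac", "rb") as f:
    payload = f.read()
audio = ffmpeg_read(payload, sampling_rate=16000)
print(audio.shape, audio.dtype)  # (n_samples,) float32, mono, resampled to 16 kHz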


def ffmpeg_stream(filename: str, sampling_rate: int, format_for_conversion: str, chunk_length_s: float):
    """
    Helper function to stream an audio file through ffmpeg, yielding chunks of roughly `chunk_length_s` seconds.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    bufsize = 10 ** 8
    if format_for_conversion == "s16le":
        dtype = np.int16
        size_of_sample = 2
    elif format_for_conversion == "f32le":
        dtype = np.float32
        size_of_sample = 4
    else:
        raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        filename,
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    buflen = int(sampling_rate * chunk_length_s * size_of_sample)
    return _ffmpeg_stream(ffmpeg_command, bufsize, buflen, dtype)
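
For context, a minimal sketch of consuming `ffmpeg_stream` (again with a placeholder file name); each yielded array carries at most `chunk_length_s` seconds of mono audio:

# Hypothetical usage: iterate over ~10 s float32 chunks of a long file.
for chunk in ffmpeg_stream("sample.flac", 16000, format_for_conversion="f32le", chunk_length_s=10.0):
    print(chunk.shape)  # at most 160000 samples (10 s at 16 kHz)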


def ffmpeg_microphone(
    sampling_rate: int, format_for_conversion: str, chunk_length_s: float, stream_chunk_ms: Optional[int] = None
):
    """
    Helper function to record audio from the microphone through ffmpeg, yielding chunks as they are captured.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    bufsize = 10 ** 8
    if format_for_conversion == "s16le":
        dtype = np.int16
        size_of_sample = 2
    elif format_for_conversion == "f32le":
        dtype = np.float32
        size_of_sample = 4
    else:
        raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")

    system = platform.system()
    if system == "Linux":
        format_ = "alsa"
        input_ = "default"
    elif system == "Darwin":
        format_ = "avfoundation"
        input_ = ":0"
    elif system == "Windows":
        format_ = "dshow"
        input_ = "default"
    else:
        raise ValueError(f"Unsupported platform `{system}` for microphone capture")

    ffmpeg_command = [
        "ffmpeg",
        "-f",
        format_,
        "-i",
        input_,
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-fflags",
        "nobuffer",
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    if stream_chunk_ms is not None:
        buflen = int(round(sampling_rate * stream_chunk_ms * size_of_sample / 1000))
        bufout = int(round(sampling_rate * chunk_length_s * size_of_sample))
    else:
        buflen = int(round(sampling_rate * chunk_length_s * size_of_sample))
        bufout = None
    return _ffmpeg_stream(ffmpeg_command, bufsize, buflen, dtype, bufout=bufout)

Review comment on the `buflen` computation above: Suggested change … would be nicer IMO.
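
A sketch of microphone capture with this helper; note that `stream_chunk_ms` is in milliseconds while `chunk_length_s` is in seconds (see the review comments at the end):

# Hypothetical usage: stream ~0.5 s updates until ~5 s of audio is buffered.
mic = ffmpeg_microphone(16000, "f32le", chunk_length_s=5.0, stream_chunk_ms=500)
for audio in mic:
    print(len(audio) / 16000, "seconds buffered")  # grows in ~0.5 s steps, resets near 5 s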


def _ffmpeg_stream(ffmpeg_command, bufsize, buflen, dtype, bufout=None):
    """
    Internal generator: reads `buflen` bytes at a time from the ffmpeg subprocess and yields the accumulated audio.
    """
    if bufout is None:
        bufout = buflen
    elif buflen > bufout:
        raise ValueError("bufout needs to be larger than buflen")

    try:
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize)
    except FileNotFoundError:
        raise ValueError("ffmpeg was not found but is required to stream audio files from filename")

    acc = b""
    while True:
        raw = ffmpeg_process.stdout.read(buflen)
        if raw == b"":
            break

        # Grow the accumulator until it would exceed `bufout` bytes, then restart it from the latest read.
        if len(acc) + len(raw) > bufout:
            acc = raw
        else:
            acc += raw
        audio = np.frombuffer(acc, dtype=dtype)
        yield audio


# Taken from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
class Frame(object):
    """Represents a "frame" of audio data."""

    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


Review comment on frame_generator: would also prefer …

# Taken from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
def frame_generator(frame_duration_ms, audio_generator, sample_rate):
    """
    Generates audio frames from PCM audio data. Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate. Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # 2 bytes per 16-bit sample
    duration = (float(n) / sample_rate) / 2.0
    timestamp = 0.0
    for audio in audio_generator:
        offset = 0
        while offset + n < len(audio):
            yield Frame(audio[offset : offset + n], timestamp, duration)
            timestamp += duration
            offset += n


# Taken from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Filters out non-voiced audio frames. Given a webrtcvad.Vad and a source of audio frames, yields only the voiced
    audio. Uses a padded, sliding window algorithm over the audio frames. When more than 90% of the frames in the
    window are voiced (as reported by the VAD), the collector triggers and begins yielding audio frames. Then the
    collector waits until 90% of the frames in the window are unvoiced to detrigger. The window is padded at the front
    and back to provide a small amount of silence or the beginnings/endings of speech around the voiced frames.

    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # sys.stdout.write("+(%s)" % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b"".join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b"".join([f.bytes for f in voiced_frames])


Review comment on vad_files: Maybe add in another PR; think the default streaming works quite well without it, no?

def vad_files(filenames, sampling_rate: int, chunk_length_s: float):
    """
    Streams audio files and yields voice-activity chunks as float32 arrays, each at most `chunk_length_s` long.
    """
    try:
        import webrtcvad
    except ImportError:
        raise ValueError(
            "webrtcvad was not found but is required to chunk on voice activation, `pip install webrtcvad`."
        )

    for filename in filenames:
        if not isinstance(filename, str):
            raise ValueError("Voice activity chunking can only operate on filenames")

        inputs = ffmpeg_stream(filename, sampling_rate, format_for_conversion="s16le", chunk_length_s=chunk_length_s)
        vad = webrtcvad.Vad(0)
        frames = frame_generator(10, inputs, sampling_rate)
        segments = vad_collector(sampling_rate, 10, 300, vad, frames)
        max_int16 = 2 ** 15
        max_len = int(round(chunk_length_s * sampling_rate))
        for segment in segments:
            audio = np.frombuffer(segment, dtype=np.int16).astype("float32") / max_int16
            for start in range(0, audio.shape[0], max_len):
                chunk = audio[start : start + max_len]
                yield chunk
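
A sketch of voice-activity chunking (placeholder file name again); chunks come out as float32 in [-1, 1]:

# Hypothetical usage: yield speech-only chunks of at most 10 s each.
for chunk in vad_files(["sample.flac"], sampling_rate=16000, chunk_length_s=10.0):
    print(chunk.shape)  # at most 160000 samples per chunk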


def chunk_files(
    filenames,
    sampling_rate: int,
    chunk_length_s: float,
    stride_length_s: Optional[Union[float, Tuple[float, float]]] = None,
):
    """
    Streams audio files and yields overlapping fixed-size chunks with `(stride_left, stride_right)` metadata.
    """
    if stride_length_s is None:
        stride_length_s = [chunk_length_s / 6, chunk_length_s / 6]
    if isinstance(stride_length_s, (float, int)):
        stride_length_s = [stride_length_s, stride_length_s]

    chunk_len = int(round(sampling_rate * chunk_length_s))
    stride_left_ = int(round(sampling_rate * stride_length_s[0]))
    stride_right = int(round(sampling_rate * stride_length_s[1]))
    stride_left = 0

    for filename in filenames:
        if not isinstance(filename, str):
            raise ValueError("File chunking can only operate on filenames")

        chunk = np.zeros((0,), dtype=np.float32)
        for audio in ffmpeg_stream(
            filename, sampling_rate, format_for_conversion="f32le", chunk_length_s=chunk_length_s
        ):
            chunk = np.concatenate([chunk, audio])
            while chunk.shape[0] >= chunk_len:
                to_send = chunk[:chunk_len]
                yield {"raw": to_send, "stride": (stride_left, stride_right)}
                # Start striding left
                if stride_left == 0:
                    stride_left = stride_left_
                chunk = chunk[chunk_len - stride_right - stride_left :]
        # last chunk only if there's actual content
        if chunk.shape[0] > stride_left:
            yield {"raw": chunk, "stride": (stride_left, 0)}
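
A sketch of how the stride metadata might be consumed downstream: `stride` gives the number of samples on each side that overlap the neighbouring chunks and should be dropped when merging predictions:

# Hypothetical usage: recover the non-overlapping core of each chunk.
for item in chunk_files(["sample.flac"], sampling_rate=16000, chunk_length_s=10.0):
    raw, (left, right) = item["raw"], item["stride"]
    core = raw[left : len(raw) - right]  # right == 0 on the final chunk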

Review comments:

Could we maybe have `stream_chunk_ms` in seconds as well @Narsil, given that everything else is in seconds?

Also, could we make use of the stride logic here?