From 5cd82914c1d3ce8cc856906643b8e61607af5e4e Mon Sep 17 00:00:00 2001 From: iory Date: Sat, 11 Jun 2022 21:14:47 +0900 Subject: [PATCH 01/18] [respeaker_ros] Publish audio_info --- respeaker_ros/scripts/respeaker_node.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index f95f60866..a40db9c90 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -16,6 +16,7 @@ import sys import time from audio_common_msgs.msg import AudioData +from audio_common_msgs.msg import AudioInfo from geometry_msgs.msg import PoseStamped from std_msgs.msg import Bool, Int32, ColorRGBA from dynamic_reconfigure.server import Server @@ -322,6 +323,8 @@ def __init__(self): self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True) self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True) self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10) + self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo, + queue_size=1, latch=True) self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10) # init config self.config = None @@ -337,6 +340,14 @@ def __init__(self): self.timer_led = None self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led) + info_msg = AudioInfo( + channels=self.n_channel, + sample_rate=self.respeaker_audio.rate, + sample_format='S16LE', + bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth, + coding_format='WAVE') + self.pub_audio_info.publish(info_msg) + def on_shutdown(self): try: self.respeaker.close() From 950ba8d58b4a08c2d3734a499f675981d25e661f Mon Sep 17 00:00:00 2001 From: iory Date: Sat, 11 Jun 2022 21:19:00 +0900 Subject: [PATCH 02/18] [respeaker_ros] Enable speech to text for multi channel. 
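Note on PATCH 01/18: the `audio_info` message is published latched, so a downstream node can read the stream format once instead of hard-coding it. A minimal sketch of how a consumer might derive byte widths from those fields; the probe node name is an assumption and not part of the patch:

```python
# Sketch: derive per-sample byte width and data rate from the latched
# audio_info topic added above. The patch sets bitrate = rate * bitdepth,
# so integer division recovers the 2-byte width of S16LE samples.
import rospy
from audio_common_msgs.msg import AudioInfo

rospy.init_node('audio_info_probe')                       # hypothetical consumer node
info = rospy.wait_for_message('audio_info', AudioInfo)    # latched topics re-deliver the last message to new subscribers
sample_width = info.bitrate // info.sample_rate // 8      # e.g. 16000 * 16 // 16000 // 8 = 2 bytes
bytes_per_second = info.sample_rate * sample_width * info.channels
print(sample_width, bytes_per_second)
```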
--- respeaker_ros/scripts/speech_to_text.py | 32 ++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/respeaker_ros/scripts/speech_to_text.py b/respeaker_ros/scripts/speech_to_text.py index 439652ba8..44b7fcb75 100755 --- a/respeaker_ros/scripts/speech_to_text.py +++ b/respeaker_ros/scripts/speech_to_text.py @@ -2,6 +2,8 @@ # -*- coding: utf-8 -*- # Author: Yuki Furuta +from __future__ import division + import actionlib import rospy try: @@ -9,8 +11,10 @@ except ImportError as e: raise ImportError(str(e) + '\nplease try "pip install speechrecognition"') +import numpy as np from actionlib_msgs.msg import GoalStatus, GoalStatusArray from audio_common_msgs.msg import AudioData +from audio_common_msgs.msg import AudioInfo from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal from speech_recognition_msgs.msg import SpeechRecognitionCandidates @@ -18,8 +22,26 @@ class SpeechToText(object): def __init__(self): # format of input audio data - self.sample_rate = rospy.get_param("~sample_rate", 16000) - self.sample_width = rospy.get_param("~sample_width", 2) + if rospy.get_param('~audio_info', None): + rospy.loginfo('Extract audio info params from {}'.format( + rospy.get_param('~audio_info'))) + audio_info_msg = rospy.wait_for_message( + rospy.get_param('~audio_info'), AudioInfo) + self.sample_rate = audio_info_msg.sample_rate + self.sample_width = audio_info_msg.bitrate // self.sample_rate // 8 + self.channels = audio_info_msg.channels + else: + self.sample_rate = rospy.get_param("~sample_rate", 16000) + self.sample_width = rospy.get_param("~sample_width", 2) + self.channels = rospy.get_param("~channels", 1) + if self.sample_width == 2: + self.dtype = 'int16' + elif self.sample_width == 4: + self.dtype = 'int32' + else: + raise NotImplementedError('sample_width {} is not supported' + .format(self.sample_width)) + self.target_channel = rospy.get_param("~target_channel", 0) # language of STT service self.language = rospy.get_param("~language", "ja-JP") # ignore voice input while the robot is speaking @@ -78,7 +100,11 @@ def audio_cb(self, msg): if self.is_canceling: rospy.loginfo("Speech is cancelled") return - data = SR.AudioData(msg.data, self.sample_rate, self.sample_width) + + data = SR.AudioData( + np.frombuffer(msg.data, dtype=self.dtype)[ + self.target_channel::self.channels].tobytes(), + self.sample_rate, self.sample_width) try: rospy.loginfo("Waiting for result %d" % len(data.get_raw_data())) result = self.recognizer.recognize_google( From dffe449e94acbfec513e3156e94233c5f3b50426 Mon Sep 17 00:00:00 2001 From: iory Date: Sat, 11 Jun 2022 21:30:30 +0900 Subject: [PATCH 03/18] [respeaker_ros] Add audio_info arg in sample --- respeaker_ros/launch/sample_respeaker.launch | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch index 31d083608..6741fd7c5 100644 --- a/respeaker_ros/launch/sample_respeaker.launch +++ b/respeaker_ros/launch/sample_respeaker.launch @@ -13,6 +13,8 @@ + + + respawn="true" respawn_delay="10" > + + publish_multichannel: $(arg publish_multichannel) + + - + + audio_info: $(arg audio_info) language: $(arg language) self_cancellation: $(arg self_cancellation) tts_tolerance: 0.5 From 227708eb855d3cf92b4854361c6521db2341604a Mon Sep 17 00:00:00 2001 From: iory Date: Sat, 11 Jun 2022 21:33:53 +0900 Subject: [PATCH 04/18] [respeaker_ros] Add publish_multichannel option for publishing multi channel audio --- 
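Note on PATCH 02/18: the channel selection in `audio_cb` relies on numpy strided slicing over interleaved PCM. A small self-contained sketch of that indexing, using made-up sample values rather than real microphone data:

```python
# Sketch: extract one channel from interleaved 16-bit PCM, mirroring
# np.frombuffer(msg.data, dtype=self.dtype)[self.target_channel::self.channels].
import numpy as np

channels = 6                                  # assumed 6-channel device frame
target_channel = 0
samples = np.arange(12, dtype=np.int16)       # two frames: [c0 c1 c2 c3 c4 c5  c0 c1 c2 c3 c4 c5]
interleaved = samples.tobytes()               # stand-in for msg.data

mono = np.frombuffer(interleaved, dtype='int16')[target_channel::channels]
print(mono)                                   # [0 6] -> every 6th sample, starting at channel 0
```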
respeaker_ros/scripts/respeaker_node.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index a40db9c90..66f49c256 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -222,7 +222,9 @@ def close(self): class RespeakerAudio(object): - def __init__(self, on_audio, channel=0, suppress_error=True): + def __init__(self, on_audio, channel=0, suppress_error=True, + publish_multichannel=False): + self.publish_multichannel = publish_multichannel self.on_audio = on_audio with ignore_stderr(enable=suppress_error): self.pyaudio = pyaudio.PyAudio() @@ -255,7 +257,6 @@ def __init__(self, on_audio, channel=0, suppress_error=True): if self.channels != 6: rospy.logwarn("%d channel is found for respeaker" % self.channels) rospy.logwarn("You may have to update firmware.") - self.channel = min(self.channels - 1, max(0, self.channel)) self.stream = self.pyaudio.open( input=True, start=False, @@ -285,7 +286,10 @@ def stream_callback(self, in_data, frame_count, time_info, status): data = np.frombuffer(in_data, dtype=np.int16) chunk_per_channel = int(len(data) / self.channels) data = np.reshape(data, (chunk_per_channel, self.channels)) - chan_data = data[:, self.channel] + if self.publish_multichannel is True: + chan_data = data + else: + chan_data = data[:, self.channel] # invoke callback self.on_audio(chan_data.tobytes()) return None, pyaudio.paContinue @@ -307,6 +311,7 @@ def __init__(self): self.doa_xy_offset = rospy.get_param("~doa_xy_offset", 0.0) self.doa_yaw_offset = rospy.get_param("~doa_yaw_offset", 90.0) self.speech_prefetch = rospy.get_param("~speech_prefetch", 0.5) + self.publish_multichannel = rospy.get_param("~publish_multichannel", False) self.speech_continuation = rospy.get_param("~speech_continuation", 0.5) self.speech_max_duration = rospy.get_param("~speech_max_duration", 7.0) self.speech_min_duration = rospy.get_param("~speech_min_duration", 0.1) @@ -330,9 +335,16 @@ def __init__(self): self.config = None self.dyn_srv = Server(RespeakerConfig, self.on_config) # start - self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error) + self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error, + publish_multichannel=self.publish_multichannel) + self.n_channel = 1 + if self.publish_multichannel: + self.n_channel = self.respeaker_audio.channels self.speech_prefetch_bytes = int( - self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0) + self.n_channel + * self.speech_prefetch + * self.respeaker_audio.rate + * self.respeaker_audio.bitdepth / 8.0) self.speech_prefetch_buffer = b"" self.respeaker_audio.start() self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate), @@ -434,7 +446,7 @@ def on_timer(self, event): self.speech_audio_buffer = b"" self.is_speeching = False duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth - duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth + duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth / self.n_channel rospy.loginfo("Speech detected for %.3f seconds" % duration) if self.speech_min_duration <= duration < self.speech_max_duration: From 206aaaadb7e1521fb0cd1db651be5f11f6e7cc3b Mon Sep 17 00:00:00 2001 From: iory Date: Sun, 12 Jun 2022 11:41:09 +0900 Subject: [PATCH 05/18] [respeaker_ros] Restore audio input to use argment's 
audio --- respeaker_ros/launch/sample_respeaker.launch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch index 6741fd7c5..33cb8c25c 100644 --- a/respeaker_ros/launch/sample_respeaker.launch +++ b/respeaker_ros/launch/sample_respeaker.launch @@ -33,7 +33,7 @@ - + audio_info: $(arg audio_info) From 7f40f504947de1c3612ae569440ebcc96cbc35c9 Mon Sep 17 00:00:00 2001 From: iory Date: Mon, 13 Jun 2022 16:21:04 +0900 Subject: [PATCH 06/18] [respeaker_ros] Add publish_multichannel option to fix launch file error. --- respeaker_ros/launch/sample_respeaker.launch | 2 ++ 1 file changed, 2 insertions(+) diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch index 33cb8c25c..d3e0badae 100644 --- a/respeaker_ros/launch/sample_respeaker.launch +++ b/respeaker_ros/launch/sample_respeaker.launch @@ -15,6 +15,8 @@ + + Date: Mon, 13 Jun 2022 16:30:11 +0900 Subject: [PATCH 07/18] [respeaker_ros] Add roslaunch_add_file_check to test launch file format --- respeaker_ros/CMakeLists.txt | 2 ++ respeaker_ros/package.xml | 1 + respeaker_ros/test/sample_respeaker.test | 1 + 3 files changed, 4 insertions(+) diff --git a/respeaker_ros/CMakeLists.txt b/respeaker_ros/CMakeLists.txt index 59c6034b0..f6b63469b 100644 --- a/respeaker_ros/CMakeLists.txt +++ b/respeaker_ros/CMakeLists.txt @@ -18,5 +18,7 @@ catkin_install_python(PROGRAMS ${PYTHON_SCRIPTS} if(CATKIN_ENABLE_TESTING) find_package(rostest REQUIRED) + find_package(roslaunch REQUIRED) add_rostest(test/sample_respeaker.test) + roslaunch_add_file_check(launch/sample_respeaker.launch) endif() diff --git a/respeaker_ros/package.xml b/respeaker_ros/package.xml index a8d193b72..41a08fa4f 100644 --- a/respeaker_ros/package.xml +++ b/respeaker_ros/package.xml @@ -15,6 +15,7 @@ flac geometry_msgs std_msgs + sound_play speech_recognition_msgs tf python-numpy diff --git a/respeaker_ros/test/sample_respeaker.test b/respeaker_ros/test/sample_respeaker.test index a1ad1d143..3a9cb2c79 100644 --- a/respeaker_ros/test/sample_respeaker.test +++ b/respeaker_ros/test/sample_respeaker.test @@ -3,6 +3,7 @@ + From 296cc6ae3c326ec4cdf6c8f8487155017fb56ffb Mon Sep 17 00:00:00 2001 From: iory Date: Mon, 13 Jun 2022 16:41:55 +0900 Subject: [PATCH 08/18] [respeaker_ros] Set audio_info topic name and check length of it. 
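Note on PATCH 04/18: `stream_callback` reshapes PyAudio's interleaved callback buffer into a (frames, channels) array before deciding what to publish. A standalone sketch of that reshape with dummy values (the numbers are illustrative only):

```python
# Sketch: how stream_callback splits an interleaved int16 buffer per channel.
import numpy as np

channels = 6
in_data = np.arange(18, dtype=np.int16).tobytes()    # 3 frames x 6 channels, interleaved
data = np.frombuffer(in_data, dtype=np.int16)
chunk_per_channel = len(data) // channels
data = data.reshape(chunk_per_channel, channels)

print(data.shape)                    # (3, 6)
print(data[:, 0])                    # [ 0  6 12] -> one channel, published when publish_multichannel is False
print(data.tobytes() == in_data)     # True -> the multichannel path republishes every channel unchanged
```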
--- respeaker_ros/scripts/speech_to_text.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/respeaker_ros/scripts/speech_to_text.py b/respeaker_ros/scripts/speech_to_text.py index 44b7fcb75..ea0e1c6fb 100755 --- a/respeaker_ros/scripts/speech_to_text.py +++ b/respeaker_ros/scripts/speech_to_text.py @@ -22,11 +22,12 @@ class SpeechToText(object): def __init__(self): # format of input audio data - if rospy.get_param('~audio_info', None): + audio_info_topic_name = rospy.get_param('~audio_info', '') + if len(audio_info_topic_name) > 0: rospy.loginfo('Extract audio info params from {}'.format( - rospy.get_param('~audio_info'))) + audio_info_topic_name)) audio_info_msg = rospy.wait_for_message( - rospy.get_param('~audio_info'), AudioInfo) + audio_info_topic_name, AudioInfo) self.sample_rate = audio_info_msg.sample_rate self.sample_width = audio_info_msg.bitrate // self.sample_rate // 8 self.channels = audio_info_msg.channels From 7d3e2c9f99c1770da884128aa66d35647126c887 Mon Sep 17 00:00:00 2001 From: iory Date: Mon, 13 Jun 2022 18:11:58 +0900 Subject: [PATCH 09/18] [respeaker_ros] Add comment for why we add publish_multichannel option. --- respeaker_ros/scripts/respeaker_node.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index 66f49c256..a4a093b4f 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -339,6 +339,12 @@ def __init__(self): publish_multichannel=self.publish_multichannel) self.n_channel = 1 if self.publish_multichannel: + # The respeaker has 4 or 6 microphones. + # Multiple microphones can be used for + # beam forming (strengthening the sound in a specific direction) + # and sound localization (the respeaker outputs the azimuth + # direction, but the multichannel can estimate + # the elevation direction). etc. self.n_channel = self.respeaker_audio.channels self.speech_prefetch_bytes = int( self.n_channel From 4b7632b529a220ee565a60f01b3f520a06cf89d5 Mon Sep 17 00:00:00 2001 From: iory Date: Mon, 13 Jun 2022 23:10:47 +0900 Subject: [PATCH 10/18] [respeaker_ros] Remove publish_multichannel option and publish raw multichannel audio as default. --- respeaker_ros/launch/sample_respeaker.launch | 5 -- respeaker_ros/scripts/respeaker_node.py | 74 +++++++++++++------- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch index d3e0badae..e2c43c557 100644 --- a/respeaker_ros/launch/sample_respeaker.launch +++ b/respeaker_ros/launch/sample_respeaker.launch @@ -15,8 +15,6 @@ - - - - publish_multichannel: $(arg publish_multichannel) - 1: + # The respeaker has 4 microphones. + # Multiple microphones can be used for + # beam forming (strengthening the sound in a specific direction) + # and sound localization (the respeaker outputs the azimuth + # direction, but the multichannel can estimate + # the elevation direction). etc. + + # Channel 0: processed audio for ASR + # Channel 1: mic1 raw data + # Channel 2: mic2 raw data + # Channel 3: mic3 raw data + # Channel 4: mic4 raw data + # Channel 5: merged playback + # (self.n_channel - 2) = 4 channels are multiple microphones. 
+ self.pub_audio_raw = rospy.Publisher("audio_raw", AudioData, + queue_size=10) + self.pub_audio_merged_playback = rospy.Publisher( + "audio_merged_playback", AudioData, + queue_size=10) + info_raw_msg = AudioInfo( + channels=self.n_channel - 2, + sample_rate=self.respeaker_audio.rate, + sample_format='S16LE', + bitrate=(self.respeaker_audio.rate * + self.respeaker_audio.bitdepth), + coding_format='WAVE') + self.pub_audio_raw_info.publish(info_raw_msg) + def on_shutdown(self): try: self.respeaker.close() @@ -403,13 +420,20 @@ def on_status_led(self, msg): oneshot=True) def on_audio(self, data): - self.pub_audio.publish(AudioData(data=data)) + # take processed audio for ASR. + processed_data = data[:, 0].tobytes() + self.pub_audio.publish(AudioData(data=processed_data)) + if self.n_channel > 1: + self.pub_audio_raw.publish( + AudioData(data=data[:, 1:5].reshape(-1).tobytes())) + self.pub_audio_merged_playback.publish( + AudioData(data=data[:, 5].tobytes())) if self.is_speeching: if len(self.speech_audio_buffer) == 0: self.speech_audio_buffer = self.speech_prefetch_buffer - self.speech_audio_buffer += data + self.speech_audio_buffer += processed_data else: - self.speech_prefetch_buffer += data + self.speech_prefetch_buffer += processed_data self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:] def on_timer(self, event): From f872a109f5f146540350802e08e1e61a7bf873d5 Mon Sep 17 00:00:00 2001 From: iory Date: Mon, 13 Jun 2022 23:19:55 +0900 Subject: [PATCH 11/18] [respeaker_ros] Publish spech_audio_raw --- respeaker_ros/scripts/respeaker_node.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index 70079ae5a..b4d40d194 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -383,6 +383,16 @@ def __init__(self): coding_format='WAVE') self.pub_audio_raw_info.publish(info_raw_msg) + self.speech_audio_raw_buffer = b"" + self.speech_raw_prefetch_buffer = b"" + self.pub_speech_audio_raw = rospy.Publisher( + "speech_audio_raw", AudioData, queue_size=10) + self.speech_raw_prefetch_bytes = int( + self.n_channel - 2 + * self.speech_prefetch + * self.respeaker_audio.rate + * self.respeaker_audio.bitdepth / 8.0) + def on_shutdown(self): try: self.respeaker.close() @@ -424,17 +434,26 @@ def on_audio(self, data): processed_data = data[:, 0].tobytes() self.pub_audio.publish(AudioData(data=processed_data)) if self.n_channel > 1: + raw_audio_data = data[:, 1:5].reshape(-1).tobytes() self.pub_audio_raw.publish( - AudioData(data=data[:, 1:5].reshape(-1).tobytes())) + AudioData(data=raw_audio_data)) self.pub_audio_merged_playback.publish( AudioData(data=data[:, 5].tobytes())) if self.is_speeching: if len(self.speech_audio_buffer) == 0: self.speech_audio_buffer = self.speech_prefetch_buffer + if self.n_channel > 1: + self.speech_audio_raw_buffer = self.speech_raw_prefetch_buffer self.speech_audio_buffer += processed_data + if self.n_channel > 1: + self.speech_audio_raw_buffer += raw_audio_data else: self.speech_prefetch_buffer += processed_data self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:] + if self.n_channel > 1: + self.speech_raw_prefetch_buffer += raw_audio_data + self.speech_raw_prefetch_buffer = self.speech_raw_prefetch_buffer[ + -self.speech_raw_prefetch_bytes:] def on_timer(self, event): stamp = event.current_real or rospy.Time.now() From 
6e49927983967dc75b9d88362ec8fd0ba5aa6af5 Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 03:14:28 +0900 Subject: [PATCH 12/18] [respeaker_ros] Add comment to know more defails --- respeaker_ros/scripts/respeaker_node.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index b4d40d194..cec2491f7 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -368,6 +368,8 @@ def __init__(self): # Channel 3: mic3 raw data # Channel 4: mic4 raw data # Channel 5: merged playback + # For more detail, please see + # https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/ # (self.n_channel - 2) = 4 channels are multiple microphones. self.pub_audio_raw = rospy.Publisher("audio_raw", AudioData, queue_size=10) From e8ae9fe7f4cacd468c6160900e7daacdc00838f1 Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 03:14:40 +0900 Subject: [PATCH 13/18] [respeaker_ros] Publish speech audio raw --- respeaker_ros/scripts/respeaker_node.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index cec2491f7..3cab9fc8f 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -495,13 +495,15 @@ def on_timer(self, event): elif self.is_speeching: buf = self.speech_audio_buffer self.speech_audio_buffer = b"" + buf_raw = self.speech_audio_raw_buffer + self.speech_audio_raw_buffer = b"" self.is_speeching = False duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth / self.n_channel rospy.loginfo("Speech detected for %.3f seconds" % duration) if self.speech_min_duration <= duration < self.speech_max_duration: - self.pub_speech_audio.publish(AudioData(data=buf)) + self.pub_speech_audio_raw.publish(AudioData(data=buf_raw)) if __name__ == '__main__': From 92477a52e0ece50853271d1596409b2ebbb8bb6a Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 03:15:16 +0900 Subject: [PATCH 14/18] [respeaker_ros] Add parameters for respaker ros --- respeaker_ros/README.md | 91 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/respeaker_ros/README.md b/respeaker_ros/README.md index 1bba594fc..39ae2e65b 100644 --- a/respeaker_ros/README.md +++ b/respeaker_ros/README.md @@ -87,6 +87,97 @@ A ROS Package for Respeaker Mic Array a: 0.3" ``` +## Parameters for respeaker_node.py + + - ### Publishing topics + + - `~audio` (`audio_common_msgs/AudioData`) + + Processed audio for ASR. 1 channel. + + - `~audio_info` (`audio_common_msgs/AudioInfo`) + + Audio info with respect to `~audio`. + + - `~audio_raw` (`audio_common_msgs/AudioData`) + + Micarray audio data has 4-channels. Maybe you need to update respeaker firmware. + + If the firmware isn't supported, this will not be output. + + - `~audio_info_raw` (`audio_common_msgs/AudioInfo`) + + Audio info with respect to `~audio_raw`. + + If the firmware isn't supported, this will not be output. + + - `~speech_audio` (`audio_common_msgs/AudioData`) + + Audio data while a person is speaking using the VAD function. + + - `~speech_audio_raw` (`audio_common_msgs/AudioData`) + + Audio data has 4-channels while a person is speaking using the VAD function. + + If the firmware isn't supported, this will not be output. 
+ + - `~audio_merged_playback` (`audio_common_msgs/AudioData`) + + Data that combines the sound of mic and speaker. + + If the firmware isn't supported, this will not be output. + + For more detail, please see https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/ + + - `~is_speeching` (`std_msgs/Bool`) + + Using VAD function, publish whether someone is speaking. + + - `~sound_direction` (`std_msgs/Int32`) + + Direction of sound. + + - `~sound_localization` (`geometry_msgs/PoseStamped`) + + Localized Sound Direction. The value of the position in the estimated direction with `~doa_offset` as the radius is obtained. + + - ### Parameters + + - `~update_rate` (`Double`, default: `10.0`) + + Publishing info data such as `~is_speeching`, `~sound_direction`, `~sound_localization`, `~speech_audio` and `~speech_audio_raw`. + + - `~sensor_frame_id` (`String`, default: `respeaker_base`) + + Frame id. + + - `~doa_xy_offset` (`Double`, default: `0.0`) + + `~doa_offset` is a estimated sound direction's radius. + + - `~doa_yaw_offset` (`Double`, default: `90.0`) + + Estimated DoA angle offset. + + - `~speech_prefetch` (`Double`, default: `0.5`) + + Time to represent how long speech is pre-stored in buffer. + + - `~speech_continuation` (`Double`, default: `0.5`) + + If the time between the current time and the time when the speech is stopped is shorter than this time, + it is assumed that someone is speaking. + + - `~speech_max_duration` (`Double`, default: `7.0`) + + - `~speech_min_duration` (`Double`, default: `0.1`) + + If the speaking interval is within these times, `~speech_audio` and `~speech_audio_raw` will be published. + + - `~suppress_pyaudio_error` (`Bool`, default: `True`) + + If this value is `True`, suppress error from pyaudio. + ## Use cases ### Voice Recognition From 8caae81693ea8c7bb6d41aec397aaa21e5add2b3 Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 03:19:16 +0900 Subject: [PATCH 15/18] [respeaker_ros] Fixed publishing audio topic's namespace --- respeaker_ros/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/respeaker_ros/README.md b/respeaker_ros/README.md index 39ae2e65b..bea276285 100644 --- a/respeaker_ros/README.md +++ b/respeaker_ros/README.md @@ -91,37 +91,37 @@ A ROS Package for Respeaker Mic Array - ### Publishing topics - - `~audio` (`audio_common_msgs/AudioData`) + - `audio` (`audio_common_msgs/AudioData`) Processed audio for ASR. 1 channel. - - `~audio_info` (`audio_common_msgs/AudioInfo`) + - `audio_info` (`audio_common_msgs/AudioInfo`) Audio info with respect to `~audio`. - - `~audio_raw` (`audio_common_msgs/AudioData`) + - `audio_raw` (`audio_common_msgs/AudioData`) Micarray audio data has 4-channels. Maybe you need to update respeaker firmware. If the firmware isn't supported, this will not be output. - - `~audio_info_raw` (`audio_common_msgs/AudioInfo`) + - `audio_info_raw` (`audio_common_msgs/AudioInfo`) Audio info with respect to `~audio_raw`. If the firmware isn't supported, this will not be output. - - `~speech_audio` (`audio_common_msgs/AudioData`) + - `speech_audio` (`audio_common_msgs/AudioData`) Audio data while a person is speaking using the VAD function. - - `~speech_audio_raw` (`audio_common_msgs/AudioData`) + - `speech_audio_raw` (`audio_common_msgs/AudioData`) Audio data has 4-channels while a person is speaking using the VAD function. If the firmware isn't supported, this will not be output. 
- - `~audio_merged_playback` (`audio_common_msgs/AudioData`) + - `audio_merged_playback` (`audio_common_msgs/AudioData`) Data that combines the sound of mic and speaker. From 383670f44610c6524dccde2959da7d04e3c69d42 Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 03:31:55 +0900 Subject: [PATCH 16/18] [respeaker_ros] Add parameters for speech_to_text --- respeaker_ros/README.md | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/respeaker_ros/README.md b/respeaker_ros/README.md index bea276285..c2392c935 100644 --- a/respeaker_ros/README.md +++ b/respeaker_ros/README.md @@ -178,6 +178,60 @@ A ROS Package for Respeaker Mic Array If this value is `True`, suppress error from pyaudio. +## Parameters for speech_to_text.py + + - ### Publishing topics + + - `~speech_to_text` (`speech_recognition_msgs/SpeechRecognitionCandidates`) + + Recognized text. + + - ### Subscribing topics + + - `audio` (`audio_common_msgs/AudioData`) + + Input audio. + + - ### Parameters + + - `~audio_info` (`String`, default: ``) + + audio_info (`audio_common_msgs/AudioInfo`) topic. If this value is specified, `~sample_rate`, `~sample_width` and `~channels` parameters are obtained from the topic. + + - `~sample_rate` (`Int`, default: `16000`) + + Sampling rate. + + - `~sample_width` (`Int`, default: `2`) + + Sample with. + + - `~channels` (`Int`, default: `1`) + + Number of channels. + + - `~target_channel` (`Int`, default: `0`) + + Target number of channel. + + - `~language` (`String`, default: `ja-JP`) + + language of speech to text service. For English users, you can specify `en-US`. + + - `~self_cancellation` (`Bool`, default: `True`) + + ignore voice input while the robot is speaking. + + - `~tts_tolerance` (`String`, default: `1.0`) + + time to assume as SPEAKING after tts service is finished. + + - `~tts_action_names` (`List[String]`, default: `['sound_play']`) + + If `~self_chancellation` is `True`, this value will be used. + + When the actions are active, do nothing with the callback that subscribes to `audio`. + ## Use cases ### Voice Recognition From 2f87dbbabb22f7d1975bd0d603ff059b2784c77f Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 12:10:58 +0900 Subject: [PATCH 17/18] [respeaker_ros] Avoid AudioInfo import for backward compatibility --- respeaker_ros/scripts/respeaker_node.py | 45 +++++++++++++++---------- respeaker_ros/scripts/speech_to_text.py | 15 ++++++++- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index 3cab9fc8f..8b5c16312 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -16,7 +16,13 @@ import sys import time from audio_common_msgs.msg import AudioData -from audio_common_msgs.msg import AudioInfo +enable_audio_info = True +try: + from audio_common_msgs.msg import AudioInfo +except Exception as e: + rospy.logwarn('audio_common_msgs/AudioInfo message is not exists.' 
+ ' AudioInfo message will not be published.') + enable_audio_info = False from geometry_msgs.msg import PoseStamped from std_msgs.msg import Bool, Int32, ColorRGBA from dynamic_reconfigure.server import Server @@ -321,8 +327,9 @@ def __init__(self): self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True) self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True) self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10) - self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo, - queue_size=1, latch=True) + if enable_audio_info is True: + self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo, + queue_size=1, latch=True) self.pub_audio_raw_info = rospy.Publisher("audio_info_raw", AudioInfo, queue_size=1, latch=True) self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10) @@ -346,13 +353,14 @@ def __init__(self): self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led) # processed audio for ASR - info_msg = AudioInfo( - channels=1, - sample_rate=self.respeaker_audio.rate, - sample_format='S16LE', - bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth, - coding_format='WAVE') - self.pub_audio_info.publish(info_msg) + if enable_audio_info is True: + info_msg = AudioInfo( + channels=1, + sample_rate=self.respeaker_audio.rate, + sample_format='S16LE', + bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth, + coding_format='WAVE') + self.pub_audio_info.publish(info_msg) if self.n_channel > 1: # The respeaker has 4 microphones. @@ -376,14 +384,15 @@ def __init__(self): self.pub_audio_merged_playback = rospy.Publisher( "audio_merged_playback", AudioData, queue_size=10) - info_raw_msg = AudioInfo( - channels=self.n_channel - 2, - sample_rate=self.respeaker_audio.rate, - sample_format='S16LE', - bitrate=(self.respeaker_audio.rate * - self.respeaker_audio.bitdepth), - coding_format='WAVE') - self.pub_audio_raw_info.publish(info_raw_msg) + if enable_audio_info is True: + info_raw_msg = AudioInfo( + channels=self.n_channel - 2, + sample_rate=self.respeaker_audio.rate, + sample_format='S16LE', + bitrate=(self.respeaker_audio.rate * + self.respeaker_audio.bitdepth), + coding_format='WAVE') + self.pub_audio_raw_info.publish(info_raw_msg) self.speech_audio_raw_buffer = b"" self.speech_raw_prefetch_buffer = b"" diff --git a/respeaker_ros/scripts/speech_to_text.py b/respeaker_ros/scripts/speech_to_text.py index ea0e1c6fb..a3a481919 100755 --- a/respeaker_ros/scripts/speech_to_text.py +++ b/respeaker_ros/scripts/speech_to_text.py @@ -4,6 +4,8 @@ from __future__ import division +import sys + import actionlib import rospy try: @@ -14,7 +16,13 @@ import numpy as np from actionlib_msgs.msg import GoalStatus, GoalStatusArray from audio_common_msgs.msg import AudioData -from audio_common_msgs.msg import AudioInfo +enable_audio_info = True +try: + from audio_common_msgs.msg import AudioInfo +except Exception as e: + rospy.logwarn('audio_common_msgs/AudioInfo message is not exists.' + ' AudioInfo message will not be published.') + enable_audio_info = False from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal from speech_recognition_msgs.msg import SpeechRecognitionCandidates @@ -24,6 +32,11 @@ def __init__(self): # format of input audio data audio_info_topic_name = rospy.get_param('~audio_info', '') if len(audio_info_topic_name) > 0: + if enable_audio_info is False: + rospy.logerr( + 'audio_common_msgs/AudioInfo message is not exists.' 
+ ' Giving ~audio_info is not valid in your environment.') + sys.exit(1) rospy.loginfo('Extract audio info params from {}'.format( audio_info_topic_name)) audio_info_msg = rospy.wait_for_message( From 4b99d012c196e155c4f97b4180d449cd4bacda5c Mon Sep 17 00:00:00 2001 From: iory Date: Tue, 14 Jun 2022 12:43:59 +0900 Subject: [PATCH 18/18] [respeaker_ros] Fixed bytes calculation 'self.n_channe - 2' -> '(self.n_channel - 2)' --- respeaker_ros/scripts/respeaker_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py index 8b5c16312..f8a4a6a00 100755 --- a/respeaker_ros/scripts/respeaker_node.py +++ b/respeaker_ros/scripts/respeaker_node.py @@ -399,7 +399,7 @@ def __init__(self): self.pub_speech_audio_raw = rospy.Publisher( "speech_audio_raw", AudioData, queue_size=10) self.speech_raw_prefetch_bytes = int( - self.n_channel - 2 + (self.n_channel - 2) * self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
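Note on PATCH 18/18: without the parentheses, `self.n_channel - 2 * ...` subtracts the whole product from the channel count instead of scaling the buffer size by the number of raw microphone channels. A quick check with illustrative numbers (6 channels, 16 kHz, 16-bit, 0.5 s prefetch, all assumed):

```python
# Sketch: effect of the precedence fix on speech_raw_prefetch_bytes.
n_channel, speech_prefetch, rate, bitdepth = 6, 0.5, 16000, 16

before = int(n_channel - 2 * speech_prefetch * rate * bitdepth / 8.0)    # 6 - 32000 = -31994 (nonsensical size)
after = int((n_channel - 2) * speech_prefetch * rate * bitdepth / 8.0)   # 4 ch * 16000 Hz * 2 B * 0.5 s = 64000

print(before, after)
```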