diff --git a/respeaker_ros/CMakeLists.txt b/respeaker_ros/CMakeLists.txt
index 59c6034b0..f6b63469b 100644
--- a/respeaker_ros/CMakeLists.txt
+++ b/respeaker_ros/CMakeLists.txt
@@ -18,5 +18,7 @@ catkin_install_python(PROGRAMS ${PYTHON_SCRIPTS}
 if(CATKIN_ENABLE_TESTING)
   find_package(rostest REQUIRED)
+  find_package(roslaunch REQUIRED)
   add_rostest(test/sample_respeaker.test)
+  roslaunch_add_file_check(launch/sample_respeaker.launch)
 endif()
diff --git a/respeaker_ros/README.md b/respeaker_ros/README.md
index 1bba594fc..c2392c935 100644
--- a/respeaker_ros/README.md
+++ b/respeaker_ros/README.md
@@ -87,6 +87,151 @@ A ROS Package for Respeaker Mic Array
     a: 0.3"
   ```
 
+## Parameters for respeaker_node.py
+
+- ### Publishing topics
+
+  - `audio` (`audio_common_msgs/AudioData`)
+
+    Processed audio for ASR. 1 channel.
+
+  - `audio_info` (`audio_common_msgs/AudioInfo`)
+
+    Audio info corresponding to `audio`.
+
+  - `audio_raw` (`audio_common_msgs/AudioData`)
+
+    Raw microphone array audio. 4 channels. You may need to update the respeaker firmware.
+
+    If the firmware does not support it, this topic is not published.
+
+  - `audio_info_raw` (`audio_common_msgs/AudioInfo`)
+
+    Audio info corresponding to `audio_raw`.
+
+    If the firmware does not support it, this topic is not published.
+
+  - `speech_audio` (`audio_common_msgs/AudioData`)
+
+    Audio captured while a person is speaking, as detected by the VAD function.
+
+  - `speech_audio_raw` (`audio_common_msgs/AudioData`)
+
+    4-channel audio captured while a person is speaking, as detected by the VAD function.
+
+    If the firmware does not support it, this topic is not published.
+
+  - `audio_merged_playback` (`audio_common_msgs/AudioData`)
+
+    Audio that merges the microphone and speaker (playback) signals.
+
+    If the firmware does not support it, this topic is not published.
+
+    For more detail, please see https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
+
+  - `~is_speeching` (`std_msgs/Bool`)
+
+    Whether someone is currently speaking, based on the VAD function.
+
+  - `~sound_direction` (`std_msgs/Int32`)
+
+    Direction of the detected sound.
+
+  - `~sound_localization` (`geometry_msgs/PoseStamped`)
+
+    Localized sound direction. The pose is placed along the estimated direction, with `~doa_xy_offset` as the radius. A minimal listener for these topics is sketched after the parameter list below.
+
+- ### Parameters
+
+  - `~update_rate` (`Double`, default: `10.0`)
+
+    Rate [Hz] for publishing info data such as `~is_speeching`, `~sound_direction`, `~sound_localization`, `~speech_audio` and `~speech_audio_raw`.
+
+  - `~sensor_frame_id` (`String`, default: `respeaker_base`)
+
+    Frame id.
+
+  - `~doa_xy_offset` (`Double`, default: `0.0`)
+
+    Radius used when projecting the estimated sound direction into `~sound_localization`.
+
+  - `~doa_yaw_offset` (`Double`, default: `90.0`)
+
+    Offset applied to the estimated DoA angle.
+
+  - `~speech_prefetch` (`Double`, default: `0.5`)
+
+    Duration [s] of audio kept buffered before speech is detected; it is prepended to `~speech_audio`.
+
+  - `~speech_continuation` (`Double`, default: `0.5`)
+
+    If speech resumes within this time [s] after it last stopped, it is treated as one continuous utterance.
+
+  - `~speech_max_duration` (`Double`, default: `7.0`)
+
+  - `~speech_min_duration` (`Double`, default: `0.1`)
+
+    If the utterance length falls within these bounds [s], `~speech_audio` and `~speech_audio_raw` are published.
+
+  - `~suppress_pyaudio_error` (`Bool`, default: `True`)
+
+    If `True`, suppress error output from pyaudio.
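+
+- ### Example
+
+  A minimal listener for the info topics above (a sketch: the node name,
+  callback bodies and unremapped topic names are illustrative):
+
+  ```python
+  import rospy
+  from std_msgs.msg import Bool, Int32
+
+  def on_vad(msg):        # ~is_speeching: True while someone is speaking
+      rospy.loginfo("speaking: %s" % msg.data)
+
+  def on_direction(msg):  # ~sound_direction: estimated DoA angle
+      rospy.loginfo("direction: %d" % msg.data)
+
+  rospy.init_node("respeaker_listener")
+  rospy.Subscriber("is_speeching", Bool, on_vad)
+  rospy.Subscriber("sound_direction", Int32, on_direction)
+  rospy.spin()
+  ```
+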
+## Parameters for speech_to_text.py
+
+- ### Publishing topics
+
+  - `~speech_to_text` (`speech_recognition_msgs/SpeechRecognitionCandidates`)
+
+    Recognized text.
+
+- ### Subscribing topics
+
+  - `audio` (`audio_common_msgs/AudioData`)
+
+    Input audio.
+
+- ### Parameters
+
+  - `~audio_info` (`String`, default: ``)
+
+    Name of an `audio_common_msgs/AudioInfo` topic. If this value is specified, the `~sample_rate`, `~sample_width` and `~channels` parameters are obtained from that topic instead.
+
+  - `~sample_rate` (`Int`, default: `16000`)
+
+    Sampling rate.
+
+  - `~sample_width` (`Int`, default: `2`)
+
+    Sample width in bytes.
+
+  - `~channels` (`Int`, default: `1`)
+
+    Number of channels.
+
+  - `~target_channel` (`Int`, default: `0`)
+
+    Index of the channel used for recognition (see the sketch after this list).
+
+  - `~language` (`String`, default: `ja-JP`)
+
+    Language of the speech-to-text service. For English, specify `en-US`.
+
+  - `~self_cancellation` (`Bool`, default: `True`)
+
+    Ignore voice input while the robot is speaking.
+
+  - `~tts_tolerance` (`Double`, default: `1.0`)
+
+    Time [s] to keep assuming the robot is speaking after the TTS service finishes.
+
+  - `~tts_action_names` (`List[String]`, default: `['sound_play']`)
+
+    Used when `~self_cancellation` is `True`: while any of these actions are active, the callback that subscribes to `audio` does nothing.
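+
+- ### Example
+
+  A sketch of how `~channels`, `~target_channel` and `~sample_width` interact
+  (the buffer below is a stand-in for `audio_common_msgs/AudioData.data`):
+
+  ```python
+  import numpy as np
+
+  channels = 4        # ~channels
+  target_channel = 0  # ~target_channel
+
+  # interleaved int16 frames: [ch0, ch1, ch2, ch3, ch0, ch1, ...]
+  msg_data = np.arange(8, dtype='int16').tobytes()
+
+  samples = np.frombuffer(msg_data, dtype='int16')
+  mono = samples[target_channel::channels].tobytes()
+
+  # when ~audio_info is set, the sample width is derived from AudioInfo:
+  #   sample_width = bitrate // sample_rate // 8
+  ```
+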
 
 ## Use cases
 
 ### Voice Recognition
diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch
index 31d083608..e2c43c557 100644
--- a/respeaker_ros/launch/sample_respeaker.launch
+++ b/respeaker_ros/launch/sample_respeaker.launch
@@ -13,6 +13,8 @@
+
+
+
+         respawn="true" respawn_delay="10" >
+
@@ -30,6 +33,7 @@
+        audio_info: $(arg audio_info)
         language: $(arg language)
         self_cancellation: $(arg self_cancellation)
         tts_tolerance: 0.5
diff --git a/respeaker_ros/package.xml b/respeaker_ros/package.xml
index a8d193b72..41a08fa4f 100644
--- a/respeaker_ros/package.xml
+++ b/respeaker_ros/package.xml
@@ -15,6 +15,7 @@
   flac
   geometry_msgs
   std_msgs
+  sound_play
   speech_recognition_msgs
   tf
   python-numpy
diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py
index f95f60866..f8a4a6a00 100755
--- a/respeaker_ros/scripts/respeaker_node.py
+++ b/respeaker_ros/scripts/respeaker_node.py
@@ -16,6 +16,13 @@ import sys
 import time
 from audio_common_msgs.msg import AudioData
+enable_audio_info = True
+try:
+    from audio_common_msgs.msg import AudioInfo
+except Exception as e:
+    rospy.logwarn('audio_common_msgs/AudioInfo message does not exist.'
+                  ' AudioInfo messages will not be published.')
+    enable_audio_info = False
 from geometry_msgs.msg import PoseStamped
 from std_msgs.msg import Bool, Int32, ColorRGBA
 from dynamic_reconfigure.server import Server
@@ -254,7 +261,6 @@ def __init__(self, on_audio, channel=0, suppress_error=True):
         if self.channels != 6:
             rospy.logwarn("%d channel is found for respeaker" % self.channels)
             rospy.logwarn("You may have to update firmware.")
-        self.channel = min(self.channels - 1, max(0, self.channel))
         self.stream = self.pyaudio.open(
             input=True, start=False,
@@ -284,9 +290,8 @@ def stream_callback(self, in_data, frame_count, time_info, status):
         data = np.frombuffer(in_data, dtype=np.int16)
         chunk_per_channel = int(len(data) / self.channels)
         data = np.reshape(data, (chunk_per_channel, self.channels))
-        chan_data = data[:, self.channel]
         # invoke callback
-        self.on_audio(chan_data.tobytes())
+        self.on_audio(data)
         return None, pyaudio.paContinue
 
     def start(self):
@@ -322,14 +327,24 @@ def __init__(self):
         self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True)
         self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True)
         self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10)
+        if enable_audio_info is True:
+            self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo,
+                                                  queue_size=1, latch=True)
+            self.pub_audio_raw_info = rospy.Publisher("audio_info_raw", AudioInfo,
+                                                      queue_size=1, latch=True)
         self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10)
         # init config
         self.config = None
         self.dyn_srv = Server(RespeakerConfig, self.on_config)
         # start
         self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
+        self.n_channel = self.respeaker_audio.channels
+
         self.speech_prefetch_bytes = int(
-            self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
+            1
+            * self.speech_prefetch
+            * self.respeaker_audio.rate
+            * self.respeaker_audio.bitdepth / 8.0)
         self.speech_prefetch_buffer = b""
         self.respeaker_audio.start()
         self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
@@ -337,6 +352,62 @@ def __init__(self):
         self.timer_led = None
         self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)
 
+        # processed audio for ASR
+        if enable_audio_info is True:
+            info_msg = AudioInfo(
+                channels=1,
+                sample_rate=self.respeaker_audio.rate,
+                sample_format='S16LE',
+                bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth,
+                coding_format='WAVE')
+            self.pub_audio_info.publish(info_msg)
+
+        if self.n_channel > 1:
+            # The respeaker has 4 microphones.
+            # Multiple microphones can be used for
+            # beam forming (strengthening the sound in a specific direction)
+            # and sound localization (the respeaker outputs the azimuth
+            # direction, but the multichannel data can also estimate
+            # the elevation direction), etc.
+
+            # Channel 0: processed audio for ASR
+            # Channel 1: mic1 raw data
+            # Channel 2: mic2 raw data
+            # Channel 3: mic3 raw data
+            # Channel 4: mic4 raw data
+            # Channel 5: merged playback
+            # For more detail, please see
+            # https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
+            # (self.n_channel - 2) = 4 channels are the raw microphone channels.
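+            # Each audio callback therefore receives a (chunk, 6) int16
+            # array: column 0 feeds `audio`, columns 1-4 are flattened
+            # into `audio_raw`, and column 5 becomes
+            # `audio_merged_playback` (see on_audio below).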
+            self.pub_audio_raw = rospy.Publisher("audio_raw", AudioData,
+                                                 queue_size=10)
+            self.pub_audio_merged_playback = rospy.Publisher(
+                "audio_merged_playback", AudioData,
+                queue_size=10)
+            if enable_audio_info is True:
+                info_raw_msg = AudioInfo(
+                    channels=self.n_channel - 2,
+                    sample_rate=self.respeaker_audio.rate,
+                    sample_format='S16LE',
+                    bitrate=(self.respeaker_audio.rate *
+                             self.respeaker_audio.bitdepth),
+                    coding_format='WAVE')
+                self.pub_audio_raw_info.publish(info_raw_msg)
+
+            self.speech_audio_raw_buffer = b""
+            self.speech_raw_prefetch_buffer = b""
+            self.pub_speech_audio_raw = rospy.Publisher(
+                "speech_audio_raw", AudioData, queue_size=10)
+            self.speech_raw_prefetch_bytes = int(
+                (self.n_channel - 2)
+                * self.speech_prefetch
+                * self.respeaker_audio.rate
+                * self.respeaker_audio.bitdepth / 8.0)
+
     def on_shutdown(self):
         try:
             self.respeaker.close()
@@ -374,14 +441,30 @@ def on_status_led(self, msg):
                                               oneshot=True)
 
     def on_audio(self, data):
-        self.pub_audio.publish(AudioData(data=data))
+        # take processed audio for ASR.
+        processed_data = data[:, 0].tobytes()
+        self.pub_audio.publish(AudioData(data=processed_data))
+        if self.n_channel > 1:
+            raw_audio_data = data[:, 1:5].reshape(-1).tobytes()
+            self.pub_audio_raw.publish(
+                AudioData(data=raw_audio_data))
+            self.pub_audio_merged_playback.publish(
+                AudioData(data=data[:, 5].tobytes()))
         if self.is_speeching:
             if len(self.speech_audio_buffer) == 0:
                 self.speech_audio_buffer = self.speech_prefetch_buffer
-            self.speech_audio_buffer += data
+                if self.n_channel > 1:
+                    self.speech_audio_raw_buffer = self.speech_raw_prefetch_buffer
+            self.speech_audio_buffer += processed_data
+            if self.n_channel > 1:
+                self.speech_audio_raw_buffer += raw_audio_data
         else:
-            self.speech_prefetch_buffer += data
+            self.speech_prefetch_buffer += processed_data
             self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]
+            if self.n_channel > 1:
+                self.speech_raw_prefetch_buffer += raw_audio_data
+                self.speech_raw_prefetch_buffer = self.speech_raw_prefetch_buffer[
+                    -self.speech_raw_prefetch_bytes:]
 
     def on_timer(self, event):
         stamp = event.current_real or rospy.Time.now()
@@ -421,13 +504,18 @@ def on_timer(self, event):
         elif self.is_speeching:
             buf = self.speech_audio_buffer
             self.speech_audio_buffer = b""
+            buf_raw = self.speech_audio_raw_buffer
+            self.speech_audio_raw_buffer = b""
             self.is_speeching = False
             duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
             duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
             rospy.loginfo("Speech detected for %.3f seconds" % duration)
             if self.speech_min_duration <= duration < self.speech_max_duration:
                 self.pub_speech_audio.publish(AudioData(data=buf))
+                if self.n_channel > 1:
+                    self.pub_speech_audio_raw.publish(
+                        AudioData(data=buf_raw))
 
 if __name__ == '__main__':
diff --git a/respeaker_ros/scripts/speech_to_text.py b/respeaker_ros/scripts/speech_to_text.py
index 439652ba8..a3a481919 100755
--- a/respeaker_ros/scripts/speech_to_text.py
+++ b/respeaker_ros/scripts/speech_to_text.py
@@ -2,6 +2,10 @@
 # -*- coding: utf-8 -*-
 # Author: Yuki Furuta
 
+from __future__ import division
+
+import sys
+
 import actionlib
 import rospy
 try:
@@ -9,8 +13,16 @@
 except ImportError as e:
     raise ImportError(str(e) + '\nplease try "pip install speechrecognition"')
 
+import numpy as np
 from actionlib_msgs.msg import GoalStatus, GoalStatusArray
 from audio_common_msgs.msg import AudioData
+enable_audio_info = True
+try:
+    from audio_common_msgs.msg import AudioInfo
+except Exception as e:
+    rospy.logwarn('audio_common_msgs/AudioInfo message does not exist.'
+                  ' The ~audio_info parameter cannot be used.')
+    enable_audio_info = False
 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
 
@@ -18,8 +30,32 @@
 class SpeechToText(object):
     def __init__(self):
         # format of input audio data
-        self.sample_rate = rospy.get_param("~sample_rate", 16000)
-        self.sample_width = rospy.get_param("~sample_width", 2)
+        audio_info_topic_name = rospy.get_param('~audio_info', '')
+        if len(audio_info_topic_name) > 0:
+            if enable_audio_info is False:
+                rospy.logerr(
+                    'audio_common_msgs/AudioInfo message does not exist.'
+                    ' Giving ~audio_info is not valid in your environment.')
+                sys.exit(1)
+            rospy.loginfo('Extract audio info params from {}'.format(
+                audio_info_topic_name))
+            audio_info_msg = rospy.wait_for_message(
+                audio_info_topic_name, AudioInfo)
+            self.sample_rate = audio_info_msg.sample_rate
+            self.sample_width = audio_info_msg.bitrate // self.sample_rate // 8
+            self.channels = audio_info_msg.channels
+        else:
+            self.sample_rate = rospy.get_param("~sample_rate", 16000)
+            self.sample_width = rospy.get_param("~sample_width", 2)
+            self.channels = rospy.get_param("~channels", 1)
+        if self.sample_width == 2:
+            self.dtype = 'int16'
+        elif self.sample_width == 4:
+            self.dtype = 'int32'
+        else:
+            raise NotImplementedError('sample_width {} is not supported'
+                                      .format(self.sample_width))
+        self.target_channel = rospy.get_param("~target_channel", 0)
         # language of STT service
         self.language = rospy.get_param("~language", "ja-JP")
         # ignore voice input while the robot is speaking
@@ -78,7 +114,13 @@ def audio_cb(self, msg):
         if self.is_canceling:
             rospy.loginfo("Speech is cancelled")
             return
-        data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
+
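+        # msg.data contains interleaved samples of all channels;
+        # keep only ~target_channel before passing it to the recognizer.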
+        data = SR.AudioData(
+            np.frombuffer(msg.data, dtype=self.dtype)[
+                self.target_channel::self.channels].tobytes(),
+            self.sample_rate, self.sample_width)
         try:
             rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
             result = self.recognizer.recognize_google(
diff --git a/respeaker_ros/test/sample_respeaker.test b/respeaker_ros/test/sample_respeaker.test
index a1ad1d143..3a9cb2c79 100644
--- a/respeaker_ros/test/sample_respeaker.test
+++ b/respeaker_ros/test/sample_respeaker.test
@@ -3,6 +3,7 @@
+
+
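
As a quick end-to-end check of the nodes touched by this diff, one can wait for a recognition result after launching `sample_respeaker.launch` (a sketch; the unremapped topic name and node name are assumptions and may need namespacing in your setup):

```python
import rospy
from speech_recognition_msgs.msg import SpeechRecognitionCandidates

rospy.init_node("stt_check")
# blocks until speech_to_text.py publishes a recognition result
msg = rospy.wait_for_message("speech_to_text", SpeechRecognitionCandidates)
print(msg.transcript)
```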