Merge pull request #267 from iory/texttospeech-cache

[google_cloud_texttospeech] Add cache option
jsk-ros-pkg · Sep 8, 2021 · 539e1c2 · 539e1c2
2 parents 674bb1b + 47872e8
commit 539e1c2
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 2 deletions.
diff --git a/3rdparty/google_cloud_texttospeech/README.md b/3rdparty/google_cloud_texttospeech/README.md
@@ -69,3 +69,10 @@ $ roseus
 (speak "欢迎来到 JSK" :lang "cmn-TW-Wavenet-A" :wait t)
 (speak "Willkommen bei JSK" :lang "de-DE-Wavenet-A" :wait t)
 ```
+
+### Tips
+
+By default, generated audio files are stored in `$ROS_HOME/google_cloud_texttospeech/cache` (`ROS_HOME` defaults to `$HOME/.ros`).
+If you want to change the cache directory, please set `GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_DIR` as an environment variable.
+
+If you don't want to cache the files, please set `GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED=false`.
diff --git a/3rdparty/google_cloud_texttospeech/bin/text2wave b/3rdparty/google_cloud_texttospeech/bin/text2wave
@@ -2,6 +2,10 @@
 
 import argparse
 from distutils.version import LooseVersion
+import hashlib
+import os
+import shutil
+import sys
 
 from google.cloud.texttospeech import TextToSpeechClient
 import pkg_resources
@@ -26,6 +30,11 @@ else:
     from google.cloud.texttospeech_v1.types import VoiceSelectionParams
 
 
+# Caching is enabled unless GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED says
+# otherwise.  The value normally arrives as a string (e.g. from a launch
+# file <env> tag), so compare it case-insensitively and ignore surrounding
+# whitespace: 'true' and '1' enable the cache, anything else disables it.
+cache_enabled = os.environ.get(
+    'GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED', 'true')
+cache_enabled = cache_enabled.strip().lower() in ('true', '1')
+
 # You can see which language is available here
 # https://cloud.google.com/text-to-speech/docs/voices
 voice_name_to_language_code = {
@@ -352,6 +361,47 @@ def determine_voice_name(voice_name):
     return name, language_code
 
 
+def get_cache_dir():
+    """Return the cache directory, creating it when it is missing.
+
+    The directory is read from the ``GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_DIR``
+    environment variable.  When that variable is unset, the default is
+    ``<ROS_HOME>/google_cloud_texttospeech/cache``, where ``ROS_HOME``
+    falls back to ``~/.ros`` if it is not set either.
+
+    Returns
+    -------
+    cache_dir : str
+        cache directory.
+
+    Raises
+    ------
+    OSError
+        if the directory does not exist and cannot be created.
+    """
+    ros_home = os.getenv('ROS_HOME', os.path.expanduser('~/.ros'))
+    pkg_ros_home = os.path.join(ros_home, 'google_cloud_texttospeech')
+    default_cache_dir = os.path.join(pkg_ros_home, 'cache')
+    cache_dir = os.environ.get(
+        'GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_DIR',
+        default_cache_dir)
+    # Create first and tolerate "already exists" instead of checking
+    # beforehand: another text2wave process may create the directory
+    # between an existence check and the makedirs call.
+    try:
+        os.makedirs(cache_dir)
+    except OSError:
+        if not os.path.exists(cache_dir):
+            raise
+    return cache_dir
+
+
+def checksum_md5(filename, blocksize=8192):
+    """Compute the MD5 hex digest of a file's contents.
+
+    The file is opened in binary mode and hashed ``blocksize`` bytes at a
+    time, so it is never loaded into memory as a whole.
+
+    Parameters
+    ----------
+    filename : str or pathlib.Path
+        path of the file to hash.
+    blocksize : int
+        number of bytes read per chunk (default: 8192, a multiple of
+        MD5's 64-byte block size).
+
+    Returns
+    -------
+    md5 : str
+        hexadecimal MD5 digest of the file contents.
+    """
+    digest = hashlib.md5()
+    with open(str(filename), 'rb') as stream:
+        while True:
+            chunk = stream.read(blocksize)
+            # An empty read marks the end of the file.
+            if chunk == b'':
+                break
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
 if __name__ == '__main__':
     speaking_rate = rospy.get_param('~speaking_rate', 1.0)
     parser = argparse.ArgumentParser(description='')
@@ -363,12 +413,26 @@ if __name__ == '__main__':
     # Instantiates a client
     client = TextToSpeechClient()
     with open(args.text, 'rb') as f:
-        synthesis_input = SynthesisInput(
-            text=f.readline())
+        speech_text = f.readline()
+    synthesis_input = SynthesisInput(
+        text=speech_text)
 
     name, language_code = determine_voice_name(
         args.evaluate.lstrip('(').rstrip(')'))
 
+    if cache_enabled:
+        cache_dir = get_cache_dir()
+        md5 = checksum_md5(args.text)
+        cache_filename = os.path.join(
+            cache_dir,
+            '--'.join([md5, language_code, name, str(speaking_rate)])
+            + '.mp3')
+        if os.path.exists(cache_filename):
+            print('[Text2Wave] Using cached sound file ({}) for {}'
+                  .format(cache_filename, speech_text.decode('utf-8')))
+            shutil.copy(cache_filename, args.output)
+            sys.exit(0)
+
     voice = VoiceSelectionParams(
         language_code=language_code,
         name=name,
@@ -389,3 +453,7 @@ if __name__ == '__main__':
     with open(args.output, 'wb') as out:
         # Write the response to the output file.
         out.write(response.audio_content)
+    if cache_enabled:
+        text_cache_filename = os.path.splitext(cache_filename)[0] + '.txt'
+        shutil.copy(args.text, text_cache_filename)
+        shutil.copy(args.output, cache_filename)
diff --git a/3rdparty/google_cloud_texttospeech/launch/google_cloud_texttospeech.launch b/3rdparty/google_cloud_texttospeech/launch/google_cloud_texttospeech.launch
@@ -16,6 +16,8 @@
        doc="English speaking speed (default: 100)" />
   <arg name="japanese_speed" default="100"
        doc="Japanese speaking speed (default: 100)" />
+  <arg name="cache" default="$(optenv GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED true)"
+       doc="Cache generated sound file (default: true)" />
 
   <node name="sound_play" if="$(arg use_english)"
         pkg="sound_play" type="soundplay_node.py"
@@ -27,6 +29,7 @@
     <env name="PATH" value="$(find google_cloud_texttospeech)/bin:$(env PATH)" />
     <env name="PYTHONIOENCODING" value="utf-8" />
     <env name="GOOGLE_APPLICATION_CREDENTIALS" value="$(arg credential)" />
+    <env name="GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED" value="$(arg cache)" />
     <rosparam subst_value="true">
       speaking_rate: $(arg english_speed)
     </rosparam>
@@ -42,6 +45,7 @@
     <env name="PATH" value="$(find google_cloud_texttospeech)/bin:$(env PATH)" />
     <env name="PYTHONIOENCODING" value="utf-8" />
     <env name="GOOGLE_APPLICATION_CREDENTIALS" value="$(arg credential)" />
+    <env name="GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED" value="$(arg cache)" />
     <rosparam subst_value="true">
       speaking_rate: $(arg japanese_speed)
     </rosparam>