Skip to content

Commit

Permalink
Merge pull request #267 from iory/texttospeech-cache
Browse files Browse the repository at this point in the history
[google_cloud_texttospeech] Add cache option
  • Loading branch information
k-okada committed Sep 8, 2021
2 parents 674bb1b + 47872e8 commit 539e1c2
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 2 deletions.
7 changes: 7 additions & 0 deletions 3rdparty/google_cloud_texttospeech/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,10 @@ $ roseus
(speak "欢迎来到 JSK" :lang "cmn-TW-Wavenet-A" :wait t)
(speak "Willkommen bei JSK" :lang "de-DE-Wavenet-A" :wait t)
```

### Tips

By default, generated audio files are cached in `$ROS_HOME/google_cloud_texttospeech/cache` (that is, `$HOME/.ros/google_cloud_texttospeech/cache` when `ROS_HOME` is not set).
To use a different cache directory, set the `GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_DIR` environment variable.

If you don't want to cache the files, please set `GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED=false`.
72 changes: 70 additions & 2 deletions 3rdparty/google_cloud_texttospeech/bin/text2wave
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import argparse
from distutils.version import LooseVersion
import hashlib
import os
import shutil
import sys

from google.cloud.texttospeech import TextToSpeechClient
import pkg_resources
Expand All @@ -26,6 +30,11 @@ else:
from google.cloud.texttospeech_v1.types import VoiceSelectionParams


def parse_cache_enabled(value):
    """Interpret the cache on/off setting.

    Parameters
    ----------
    value : str or None
        raw value of ``GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED``;
        ``None`` means the variable is not set.

    Returns
    -------
    enabled : bool
        ``True`` when the variable is unset or holds a truthy spelling
        (``true``, ``1``, ``yes``, ``on``; case-insensitive, surrounding
        whitespace ignored). Any other value, including the empty
        string, disables the cache.
    """
    if value is None:
        # Caching is on unless it is explicitly switched off.
        return True
    return str(value).strip().lower() in ('true', '1', 'yes', 'on')


# The value normally arrives as a string through the launch file's env tag,
# where spellings such as "True" are just as likely as "true".
cache_enabled = parse_cache_enabled(
    os.environ.get('GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED'))

# You can see which language is available here
# https://cloud.google.com/text-to-speech/docs/voices
voice_name_to_language_code = {
Expand Down Expand Up @@ -352,6 +361,47 @@ def determine_voice_name(voice_name):
return name, language_code


def get_cache_dir():
    """Return the cache directory, creating it when it is missing.

    The directory is read from the ``GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_DIR``
    environment variable. When that is not set, it defaults to
    ``<ROS_HOME>/google_cloud_texttospeech/cache``, where ``ROS_HOME``
    falls back to ``~/.ros``.

    Returns
    -------
    cache_dir : str
        cache directory.

    Raises
    ------
    OSError
        if the directory does not exist and cannot be created.
    """
    ros_home = os.getenv('ROS_HOME', os.path.expanduser('~/.ros'))
    pkg_ros_home = os.path.join(ros_home, 'google_cloud_texttospeech')
    default_cache_dir = os.path.join(pkg_ros_home, 'cache')
    cache_dir = os.environ.get(
        'GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_DIR',
        default_cache_dir)
    try:
        os.makedirs(cache_dir)
    except OSError:
        # Several text2wave processes may start at the same time; checking
        # for existence before creating would let the slower one crash.
        # Only propagate the error when the path really is not there.
        if not os.path.exists(cache_dir):
            raise
    return cache_dir


def checksum_md5(filename, blocksize=8192):
"""Calculate md5sum.
Parameters
----------
filename : str or pathlib.Path
input filename.
blocksize : int
MD5 has 128-byte digest blocks (default: 8192 is 128x64).
Returns
-------
md5 : str
calculated md5sum.
"""
filename = str(filename)
hash_factory = hashlib.md5()
with open(filename, 'rb') as f:
for chunk in iter(lambda: f.read(blocksize), b''):
hash_factory.update(chunk)
return hash_factory.hexdigest()


if __name__ == '__main__':
speaking_rate = rospy.get_param('~speaking_rate', 1.0)
parser = argparse.ArgumentParser(description='')
Expand All @@ -363,12 +413,26 @@ if __name__ == '__main__':
# Instantiates a client
client = TextToSpeechClient()
with open(args.text, 'rb') as f:
synthesis_input = SynthesisInput(
text=f.readline())
speech_text = f.readline()
synthesis_input = SynthesisInput(
text=speech_text)

name, language_code = determine_voice_name(
args.evaluate.lstrip('(').rstrip(')'))

if cache_enabled:
cache_dir = get_cache_dir()
md5 = checksum_md5(args.text)
cache_filename = os.path.join(
cache_dir,
'--'.join([md5, language_code, name, str(speaking_rate)])
+ '.mp3')
if os.path.exists(cache_filename):
print('[Text2Wave] Using cached sound file ({}) for {}'
.format(cache_filename, speech_text.decode('utf-8')))
shutil.copy(cache_filename, args.output)
sys.exit(0)

voice = VoiceSelectionParams(
language_code=language_code,
name=name,
Expand All @@ -389,3 +453,7 @@ if __name__ == '__main__':
with open(args.output, 'wb') as out:
# Write the response to the output file.
out.write(response.audio_content)
if cache_enabled:
text_cache_filename = os.path.splitext(cache_filename)[0] + '.txt'
shutil.copy(args.text, text_cache_filename)
shutil.copy(args.output, cache_filename)
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
doc="English speaking speed (default: 100)" />
<arg name="japanese_speed" default="100"
doc="Japanese speaking speed (default: 100)" />
<arg name="cache" default="$(optenv GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED true)"
doc="Cache generated sound file (default: true)" />

<node name="sound_play" if="$(arg use_english)"
pkg="sound_play" type="soundplay_node.py"
Expand All @@ -27,6 +29,7 @@
<env name="PATH" value="$(find google_cloud_texttospeech)/bin:$(env PATH)" />
<env name="PYTHONIOENCODING" value="utf-8" />
<env name="GOOGLE_APPLICATION_CREDENTIALS" value="$(arg credential)" />
<env name="GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED" value="$(arg cache)" />
<rosparam subst_value="true">
speaking_rate: $(arg english_speed)
</rosparam>
Expand All @@ -42,6 +45,7 @@
<env name="PATH" value="$(find google_cloud_texttospeech)/bin:$(env PATH)" />
<env name="PYTHONIOENCODING" value="utf-8" />
<env name="GOOGLE_APPLICATION_CREDENTIALS" value="$(arg credential)" />
<env name="GOOGLE_CLOUD_TEXTTOSPEECH_CACHE_ENABLED" value="$(arg cache)" />
<rosparam subst_value="true">
speaking_rate: $(arg japanese_speed)
</rosparam>
Expand Down

0 comments on commit 539e1c2

Please sign in to comment.