Add speech

ilesinge · Oct 16, 2022 · 29faf7a · 29faf7a
1 parent e9eb0e1
commit 29faf7a
Show file tree

Hide file tree

Showing 7 changed files with 404 additions and 66 deletions.
diff --git a/Pipfile b/Pipfile
@@ -11,6 +11,7 @@ requests = "*"
 "flask[async]" = "*"
 gunicorn = "*"
 typer = {extras = ["all"], version = "*"}
+google-cloud-texttospeech = "*"
 
 [dev-packages]
 pylint = "*"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/shabda/dj.py b/shabda/dj.py
@@ -14,8 +14,9 @@
 from termcolor import colored
 from shabda.display import print_error
 from shabda.client import Client
-from shabda.sampleset import SampleSet
+from shabda.sampleset import FREESOUND, SampleSet, TTS
 from shabda.sound import Sound
+from google.cloud import texttospeech
 
 
 class Dj:
@@ -33,7 +34,7 @@ def parse_definition(self, definition):
         for section in sections:
             parts = section.split(":")
             rawword = parts[0]
-            word = "".join(ch for ch in rawword if ch.isalnum())
+            word = "".join(ch for ch in rawword if ch.isalnum() or ch == "_")
             if len(word) == 0:
                 raise ValueError("A sample name is required")
             number = None
@@ -51,10 +52,73 @@ def parse_definition(self, definition):
             words[word] = number
         return words
 
-    def list(self, word, max_number=None, licenses=None):
+    def list(
+        self,
+        word,
+        max_number=None,
+        licenses=None,
+        gender=None,
+        language=None,
+        soundtype=None,
+    ):
         """List files for a sample name"""
-        sampleset = SampleSet(word)
-        return sampleset.list(max_number, licenses=licenses)
+        # Anything but the exact string "tts" selects the Freesound set.
+        stype = TTS if soundtype == "tts" else FREESOUND
+        return SampleSet(word, stype).list(
+            max_number,
+            licenses=licenses,
+            gender=gender,
+            language=language,
+        )
+
+    async def speak(self, word, language, gender):
+        """Synthesize `word` for one voice and store it in the TTS sample set.
+
+        Returns True once a sample for this language/gender is available.
+        """
+        sampleset = SampleSet(word, TTS)
+        # Check this voice only: another language/gender must not count.
+        if len(sampleset.list(gender=gender, language=language)) > 0:
+            return True
+        # Number files after every stored sound so voices never overwrite.
+        index = len(sampleset.list())
+        client = texttospeech.TextToSpeechClient()
+        synthesis_input = texttospeech.SynthesisInput(text=word.replace("_", " "))
+        # mini hack: en-GB female requests are pinned to a named voice
+        if language == "en-GB" and gender == "f":
+            voice = texttospeech.VoiceSelectionParams(
+                name="en-GB-Neural2-A",
+                language_code="en-GB",
+                ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
+            )
+        else:
+            if gender == "m":
+                ssml_gender = texttospeech.SsmlVoiceGender.MALE
+            else:
+                ssml_gender = texttospeech.SsmlVoiceGender.FEMALE
+            voice = texttospeech.VoiceSelectionParams(
+                language_code=language,
+                ssml_gender=ssml_gender,
+            )
+        audio_config = texttospeech.AudioConfig(
+            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
+        )
+        response = client.synthesize_speech(
+            input=synthesis_input, voice=voice, audio_config=audio_config
+        )
+        filepath = sampleset.dir() + "/" + word + "_" + str(index) + ".wav"
+        with open(filepath, "wb") as out:
+            out.write(response.audio_content)
+        sound = Sound(
+            speechsound={
+                "gender": gender,
+                "language": language,
+                "file": filepath,
+            }
+        )
+        sampleset.add(sound)
+        sampleset.saveconfig()
+        return True
 
     async def fetch(self, word, num, licenses):
         """Fetch a collection of samples"""

diff --git a/shabda/sampleset.py b/shabda/sampleset.py
@@ -5,17 +5,22 @@
 from glob import glob
 from shabda.sound import Sound
 
+FREESOUND = 1
+TTS = 2
+
 
 class SampleSet:
     """A set of sample files"""
 
     word = None
     master_id = None
     sounds = []
+    type = FREESOUND
 
-    def __init__(self, word):
+    def __init__(self, word, soundtype=FREESOUND):
         """Initialize the sample set"""
         self.word = word
+        self.type = soundtype
         directory = self.dir()
         if not os.path.exists(directory):
             os.makedirs(directory)
@@ -31,15 +36,22 @@ def __init__(self, word):
 
     def dir(self):
         """Return the directory for this sample set"""
-        return "samples/" + self.word
+        directory = "samples/" + self.word
+        if self.type == TTS:
+            directory = "speech_" + directory
+        return directory
 
-    def list(self, max_number=None, licenses=None):
+    def list(self, max_number=None, licenses=None, gender=None, language=None):
         """List sounds for a sample name"""
         # accept None as a max_number
 
         sounds = []
         for sound in self.sounds:
-            if licenses is None or sound["license"] in licenses:
+            if (
+                (licenses is None or sound["license"] in licenses)
+                and (gender is None or sound["gender"] == gender)
+                and (language is None or sound["language"] == language)
+            ):
                 sounds.append(Sound(configsound=sound))
         if max_number is not None:
             sounds = sounds[0:max_number]
@@ -55,6 +67,8 @@ def add(self, sound):
                 "username": sound.username,
                 "license": sound.licensename,
                 "file": sound.file,
+                "gender": sound.gender,
+                "language": sound.language,
             }
         )
 

diff --git a/shabda/sound.py b/shabda/sound.py
@@ -9,8 +9,10 @@ class Sound:
     url = None
     licensename = None
     file = None
+    language = None
+    gender = None
 
-    def __init__(self, freesound=None, configsound=None):
+    def __init__(self, freesound=None, configsound=None, speechsound=None):
         if freesound is not None:
             self.id = freesound.id
             self.username = freesound.username
@@ -22,6 +24,10 @@ def __init__(self, freesound=None, configsound=None):
             self.url = configsound["url"]
             self.licensename = configsound["license"]
             self.file = configsound["file"]
+        if speechsound is not None:
+            self.language = speechsound["language"]
+            self.gender = speechsound["gender"]
+            self.file = speechsound["file"]
 
     def _translate_license(self, licenseurl):
         """Translate a license URL into a license  name"""

diff --git a/shabda/web.py b/shabda/web.py
@@ -124,8 +124,76 @@ def remove_file(response):
     return send_file(tmpfile, as_attachment=True)
 
 
-@bp.route("/samples/<path:path>")
+@bp.route("/speech/<definition>")
+async def speech(definition):
+    """Generate speech samples for every word of a definition.
+
+    The voice comes from the "gender" and "language" query arguments;
+    the JSON reply carries an overall status and the cleaned definition.
+    """
+    gender = request.args.get("gender", "f")
+    language = request.args.get("language", "en-GB")
+
+    try:
+        words = dj.parse_definition(definition.replace(" ", "_"))
+    except ValueError as ex:
+        raise BadRequest(ex) from ex
+    results = await asyncio.gather(
+        *[speak_one(word, language, gender) for word in words]
+    )
+    # "ok" as soon as one word reports success, "empty" otherwise.
+    global_status = "ok" if any(status is True for status in results) else "empty"
+
+    return jsonify(
+        {
+            "status": global_status,
+            "definition": clean_definition(words),
+        }
+    )
+
+
+@bp.route("/speech/<definition>.json")
+async def speech_json(definition):
+    """Return a JSON list describing the speech samples of a definition.
+
+    Missing samples are synthesized first through `speech`; each entry
+    then gives a sample's file path as "url", its word as "bank" and its
+    position among that word's samples as "n". The voice is selected by
+    the "gender" and "language" query arguments.
+
+    Raises BadRequest when the definition cannot be parsed.
+    """
+    gender = request.args.get("gender", "f")
+    language = request.args.get("language", "en-GB")
+    definition = definition.replace(" ", "_")
+
+    # Generate whatever is not stored yet; the JSON reply of `speech` is unused.
+    await speech(definition)
+
+    try:
+        words = dj.parse_definition(definition)
+    except ValueError as ex:
+        raise BadRequest(ex) from ex
+    # Sample URLs stay relative (sound.file), so no absolute base URL is built.
+    reslist = []
+    for word in words:
+        samples = dj.list(word, gender=gender, language=language, soundtype="tts")
+        for sample_num, sound in enumerate(samples):
+            reslist.append(
+                {"url": sound.file, "type": "audio", "bank": word, "n": sample_num}
+            )
+
+    return jsonify(reslist)
+
+
+@bp.route("/speech/speech_samples/<path:path>")
 def serve_sample(path):
+    """Serve a file from the speech_samples directory"""
+    return send_from_directory("../speech_samples/", path, as_attachment=False)
+
+
+@bp.route("/samples/<path:path>")
+def serve_speech_sample(path):
+    # NOTE(review): despite the name, this serves "../samples/"; the
+    # speech_samples directory is served by serve_sample. Confirm intent.
     """Serve a sample"""
     return send_from_directory("../samples/", path, as_attachment=False)
 
@@ -160,6 +228,11 @@ def cors_after(response):
     return response
 
 
+async def speak_one(word, language, gender):
+    """Delegate one word to dj.speak and return its result"""
+    return await dj.speak(word=word, language=language, gender=gender)
+
+
 async def fetch_one(word, number, licenses):
     """Fetch a single sample set"""
     return await dj.fetch(word, number, licenses)

diff --git a/speech_samples/.gitignore b/speech_samples/.gitignore
@@ -0,0 +1 @@
+*/