idiap · eginhard · May 29, 2024 · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/README.md b/README.md
@@ -154,7 +154,7 @@ The following extras allow the installation of optional dependencies:
 |------|-------------|
 | `all` | All optional dependencies, except `dev` and `docs` |
 | `dev` | Development dependencies |
-| `dev` | Dependencies for building the documentation |
+| `docs` | Dependencies for building the documentation |
 | `notebooks` | Dependencies only used in notebooks |
 | `server` | Dependencies to run the TTS server |
 | `bn` | Bangla G2P |
@@ -270,11 +270,10 @@ You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tt
 and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
 
 ```python
-# TTS with on the fly voice conversion
+# TTS with fairseq models
 api = TTS("tts_models/deu/fairseq/vits")
-api.tts_with_vc_to_file(
+api.tts_to_file(
     "Wie sage ich auf Italienisch, dass ich dich liebe?",
-    speaker_wav="target/speaker.wav",
     file_path="output.wav"
 )
 ```

diff --git a/TTS/__init__.py b/TTS/__init__.py
@@ -0,0 +1,3 @@
+import importlib.metadata
+
+__version__ = importlib.metadata.version("coqui-tts")
diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
@@ -3,6 +3,8 @@
 import logging
 import re
 import subprocess
+import tempfile
+from pathlib import Path
 from typing import Optional
 
 from packaging.version import Version
@@ -50,7 +52,7 @@ def get_espeakng_version() -> str:
     _DEF_ESPEAK_VER = None
 
 
-def _espeak_exe(espeak_lib: str, args: list, *, sync: bool = False) -> list[bytes]:
+def _espeak_exe(espeak_lib: str, args: list) -> list[str]:
     """Run espeak with the given arguments."""
     cmd = [
         espeak_lib,
@@ -59,32 +61,18 @@ def _espeak_exe(espeak_lib: str, args: list, *, sync: bool = False) -> list[byte
         "1",  # UTF8 text encoding
     ]
     cmd.extend(args)
-    logger.debug("espeakng: executing %s", repr(cmd))
-
-    with subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    ) as p:
-        res = iter(p.stdout.readline, b"")
-        err = iter(p.stderr.readline, b"")
-        for line in err:
-            logger.warning("espeakng: %s", line.decode("utf-8").strip())
-        if not sync:
-            p.stdout.close()
-            if p.stderr:
-                p.stderr.close()
-            if p.stdin:
-                p.stdin.close()
-            return res
-        res2 = list(res)
-        p.stdout.close()
-        if p.stderr:
-            p.stderr.close()
-        if p.stdin:
-            p.stdin.close()
-        p.wait()
-    return res2
+    logger.debug("Executing: %s", repr(cmd))
+
+    p = subprocess.run(cmd, capture_output=True, encoding="utf8", check=True)
+    for line in p.stderr.strip().split("\n"):
+        if line.strip() != "":
+            logger.warning("%s: %s", espeak_lib, line.strip())
+    res = []
+    for line in p.stdout.strip().split("\n"):
+        if line.strip() != "":
+            logger.debug("%s: %s", espeak_lib, line.strip())
+            res.append(line.strip())
+    return res
 
 
 class ESpeak(BasePhonemizer):
@@ -198,12 +186,15 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False
         if tie:
             args.append("--tie=%s" % tie)
 
-        args.append(text)
+        tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8")
+        tmp.write(text)
+        tmp.close()
+        args.append("-f")
+        args.append(tmp.name)
+
         # compute phonemes
         phonemes = ""
-        for line in _espeak_exe(self.backend, args, sync=True):
-            logger.debug("line: %s", repr(line))
-            ph_decoded = line.decode("utf8").strip()
+        for line in _espeak_exe(self.backend, args):
             # espeak:
             #   version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
             # espeak-ng:
@@ -213,9 +204,10 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False
             #   "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
             # phonemize needs to remove the language flags of the returned text:
             #   "sɛʁtˈɛ̃ mˈo kɔm fˈʊtbɔːl ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
-            ph_decoded = re.sub(r"\(.+?\)", "", ph_decoded)
+            ph_decoded = re.sub(r"\(.+?\)", "", line)
 
             phonemes += ph_decoded.strip()
+        Path(tmp.name).unlink()
         return phonemes.replace("_", separator)
 
     def _phonemize(self, text: str, separator: str = "") -> str:
@@ -232,14 +224,12 @@ def supported_languages() -> dict[str, str]:
             return {}
         args = ["--voices"]
         langs = {}
-        for count, line in enumerate(_espeak_exe(_DEF_ESPEAK_LIB, args, sync=True)):
-            line = line.decode("utf8").strip()
+        for count, line in enumerate(_espeak_exe(_DEF_ESPEAK_LIB, args)):
             if count > 0:
                 cols = line.split()
                 lang_code = cols[1]
                 lang_name = cols[3]
                 langs[lang_code] = lang_name
-            logger.debug("line: %s", repr(line))
         return langs
 
     def version(self) -> str:

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -20,7 +20,7 @@
 autodoc_mock_imports = ["soundfile"]
 
 # -- Project information -----------------------------------------------------
-project = "TTS"
+project = "coqui-tts"
 copyright = "2021 Coqui GmbH, 2020 TTS authors"
 author = "Coqui GmbH"
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ include = ["TTS*"]
 
 [project]
 name = "coqui-tts"
-version = "0.24.0"
+version = "0.24.1"
 description = "Deep learning for Text to Speech."
 readme = "README.md"
 requires-python = ">=3.9, <3.13"
@@ -69,7 +69,7 @@ dependencies = [
     "gruut[de,es,fr]==2.2.3",
     # Tortoise
     "einops>=0.6.0",
-    "transformers>=4.33.0",
+    "transformers>=4.33.0,<4.41.0",
     # Bark
     "encodec>=0.1.1",
     # XTTS

diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py
@@ -116,6 +116,12 @@ def setUp(self):
         output = self.phonemizer.phonemize(text, separator="")
         self.assertEqual(output, gt)
 
+        # Non-ASCII (multi-byte UTF-8) characters
+        text = "źrebię"
+        gt = "ʑrˈɛbjɛ"
+        output = ESpeak("pl").phonemize(text, separator="")
+        self.assertEqual(output, gt)
+
     def test_name(self):
         self.assertEqual(self.phonemizer.name(), "espeak")