From d41686502e3003b6472ad769115dfd710059a87d Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Mon, 8 Apr 2024 12:06:45 +0200 Subject: [PATCH] feat(xtts): support hindi for sentence-splitting and fine-tuning The XTTS model itself already supports Hindi; it was just missing in these components. --- TTS/demos/xtts_ft_demo/xtts_demo.py | 2 ++ TTS/tts/layers/xtts/tokenizer.py | 7 ++++++- docs/source/models/xtts.md | 23 +++++++++++++++++++---- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 85168c641d..7ac38ed6ee 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -192,6 +192,7 @@ def read_logs(): "hu", "ko", "ja", + "hi", ], ) progress_data = gr.Label(label="Progress:") @@ -370,6 +371,7 @@ def train_model( "hu", "ko", "ja", + "hi", ], ) tts_text = gr.Textbox( diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1a3cc47aaf..6cbd374f06 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -11,6 +11,7 @@ from spacy.lang.ar import Arabic from spacy.lang.en import English from spacy.lang.es import Spanish +from spacy.lang.hi import Hindi from spacy.lang.ja import Japanese from spacy.lang.zh import Chinese from tokenizers import Tokenizer @@ -19,6 +20,7 @@ def get_spacy_lang(lang): + """Return Spacy language used for sentence splitting.""" if lang == "zh": return Chinese() elif lang == "ja": @@ -27,8 +29,10 @@ def get_spacy_lang(lang): return Arabic() elif lang == "es": return Spanish() + elif lang == "hi": + return Hindi() else: - # For most languages, Enlish does the job + # For most languages, English does the job return English() @@ -611,6 +615,7 @@ def __init__(self, vocab_file=None): "ja": 71, "hu": 224, "ko": 95, + "hi": 150, } @cached_property diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index de16674134..cc7c36b729 100644 ---
a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -14,16 +14,31 @@ There is no need for an excessive amount of training data that spans countless h ### Updates with v2 - Improved voice cloning. - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime. -- 2 new languages: Hungarian and Korean. - Across the board quality improvements. ### Code Current implementation only supports inference and GPT encoder training. ### Languages -As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko). - -Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out. +XTTS-v2 supports 17 languages: + +- Arabic (ar) +- Chinese (zh-cn) +- Czech (cs) +- Dutch (nl) +- English (en) +- French (fr) +- German (de) +- Hindi (hi) +- Hungarian (hu) +- Italian (it) +- Japanese (ja) +- Korean (ko) +- Polish (pl) +- Portuguese (pt) +- Russian (ru) +- Spanish (es) +- Turkish (tr) ### License This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml).