add general voice-cloning finetuning code (#281)
* add general voice-cloning code

* add copyright

* Refine the readme

Signed-off-by: hshen14 <haihao.shen@intel.com>

* Refine the readme

Signed-off-by: hshen14 <haihao.shen@intel.com>

* Refine the readme

Signed-off-by: hshen14 <haihao.shen@intel.com>

* enhance finetuning

Signed-off-by: Spycsh <sihan.chen@intel.com>

* refine README

---------

Signed-off-by: hshen14 <haihao.shen@intel.com>
Signed-off-by: Spycsh <sihan.chen@intel.com>
Co-authored-by: hshen14 <haihao.shen@intel.com>
Spycsh and hshen14 committed Sep 11, 2023
1 parent ba29b19 commit 1dac9c6
Showing 16 changed files with 277 additions and 33 deletions.
5 changes: 0 additions & 5 deletions intel_extension_for_transformers/neural_chat/config.py
@@ -349,11 +349,6 @@ class FinetuningArguments:
 class TTSDatasetArguments:
     audio_folder_path: Optional[str] = field(default=None, metadata={"help": "The path to the directory of audios."})
     text_folder_path: Optional[str] = field(default=None, metadata={"help": "The path to the directory of texts."})
-    gender: Optional[str] = field(default=None, metadata={"help": "Gender."})
-    language: Optional[str] = field(default="en", metadata={"help": "Language. \
-        Shoule be one of 'en', 'de', 'fr', 'es', 'pl', 'it', 'ro' \
-        'hu', 'cs', 'nl', 'fi', 'hr', 'sk', 'sl', 'et', 'lt', \
-        'en_accented'"})
 
 @dataclass
 class TTSModelArguments:
README.md (new file, 55 additions, in the tts_finetuning example directory)
@@ -0,0 +1,55 @@
# Voice Cloning by finetuning a Text-To-Speech (TTS) model

This example shows you how to clone an arbitrary person's voice by finetuning SpeechT5.

## Prepare data

Under this `tts_finetuning` example directory, make sure there are two directories, `audios/` and `texts/`: the former contains all the audio files, and the latter contains the text corresponding to each audio file.

The audio and text files should be named as follows (each audio file must have the same base name as its text file):

```
audios/
    <audio_name_0>.mp3
    <audio_name_1>.mp3
    <audio_name_2>.mp3
    ...
texts/
    <audio_name_0>.txt
    <audio_name_1>.txt
    <audio_name_2>.txt
    ...
```


You can use your own audio files and corresponding texts, or download them from the Internet. Here are the [audio samples](https://github.com/audio-samples/audio-samples.github.io/tree/master/samples/mp3/ted_speakers/FeiFeiLi) used in this example.

Then, prepare the texts for those audio files, either by listening and transcribing them manually or by running an audio speech recognition (ASR) model through the Intel Extension for Transformers ASR interface:

```
# Replace <xxxxx_sample-0> with your input audio name
python asr.py -i audios/<xxxxx_sample-0>.mp3 -m openai/whisper-tiny
```
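
Equivalently, you can call the ASR plugin directly from Python. Below is a minimal sketch based on `asr.py` from this commit; the audio path is a placeholder to replace with your own file:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition

# Transcribe one audio file (placeholder path; swap in your own sample)
asr = AudioSpeechRecognition(model_name_or_path="openai/whisper-tiny", device="cpu")
print(asr.audio2text("audios/<xxxxx_sample-0>.mp3"))
```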

For simplicity, in this example we have already generated the texts for the aforementioned audio samples under the `texts/` folder.

## Finetuning

After preparing the dataset, we can start finetuning. Run the following command; the finetuned model is saved as `finetuned_model.pt` by default.

```
python finetune.py
```

To customize the finetuning process, change the `TTSDatasetArguments` or `TTSModelArguments` values that `finetune.py` passes to the Intel Extension for Transformers TTS interface, as sketched below.
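
For example, a minimal sketch (the dataclass names come from `config.py` and `finetune.py` in this commit; the values are illustrative, not tuned recommendations):

```python
from intel_extension_for_transformers.neural_chat.config import (
    TTSDatasetArguments, TTSModelArguments, TTSFinetuningConfig)

# Point the dataset arguments at your own folders (illustrative paths)
data_args = TTSDatasetArguments(audio_folder_path="my_audios",
                                text_folder_path="my_texts")
# Train longer with a different learning rate (illustrative values)
model_args = TTSModelArguments(step=2000, warmup_step=250, learning_rate=2e-5)
finetuning_config = TTSFinetuningConfig(data_args, model_args)
```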

## Inference

Now that we have our finetuned model `finetuned_model.pt`, let us check its quality and performance. Run the following command:
```
python inference.py
```

You will then see the prompt `Write a sentence to let the talker speak:` in the console. Type your text and press ENTER to generate speech in the finetuned voice.
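
For instance, a session might look like this (the input sentence is illustrative; the output filename pattern comes from `inference.py`):

```
Write a sentence to let the talker speak:
hello and welcome to this voice cloning example
```

The generated speech is saved as `output_<DD_MM_YYYY_HH_MM_SS>.wav` in the current directory, where the placeholder is the generation timestamp.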
asr.py (new file, 30 additions)
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition
import argparse
import torch

# Transcribe one audio file with a Whisper-family ASR model
parser = argparse.ArgumentParser(
    prog='asr',
    description='Audio Speech Recognition')
parser.add_argument('-i', '--input_audio')
parser.add_argument('-m', '--model_name_or_path', default="openai/whisper-tiny")
# Default to GPU when available, otherwise fall back to CPU
parser.add_argument('-d', '--device', default="cuda" if torch.cuda.is_available() else "cpu")
args = parser.parse_args()

asr = AudioSpeechRecognition(model_name_or_path=args.model_name_or_path, device=args.device)
text = asr.audio2text(args.input_audio)
print(text)
finetune.py (new file, 34 additions)
@@ -0,0 +1,34 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.finetuning.tts_finetuning import TTSFinetuning
from intel_extension_for_transformers.neural_chat.config import TTSFinetuningConfig, TTSDatasetArguments, TTSModelArguments
import torch
import os

workdir = os.getcwd()

# Dataset: paired audio/text folders prepared as described in the README
data_args = TTSDatasetArguments(audio_folder_path=os.path.join(workdir, "audios"),
                                text_folder_path=os.path.join(workdir, "texts"))
# Training hyperparameters
model_args = TTSModelArguments(step=1000, warmup_step=125, learning_rate=1e-5)
finetuning_config = TTSFinetuningConfig(data_args, model_args)

tts_finetuner = TTSFinetuning(finetuning_config=finetuning_config)
finetuned_model = tts_finetuner.finetune()

# Save the whole finetuned model for later use by inference.py
torch.save(finetuned_model, "finetuned_model.pt")


inference.py (new file, 134 additions)
@@ -0,0 +1,134 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datetime import datetime

import torch
import soundfile as sf
from datasets import Dataset, Audio
from num2words import num2words
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5HifiGan

workdir = os.getcwd()

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the finetuned SpeechT5 model and the matching processor
model = torch.load("finetuned_model.pt", map_location=device)
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# x-vector speaker encoder used to extract the target speaker's embedding
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name)
)

def create_speaker_embedding(waveform):
    # Encode the reference waveform into a normalized x-vector
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

# Build the speaker embedding from one reference recording of the target voice
audio_dataset = Dataset.from_dict(
    {"audio": [os.path.join(workdir, "audios/samples_mp3_ted_speakers_FeiFeiLi_sample-0.mp3")]}
).cast_column("audio", Audio(sampling_rate=16000))
sembeddings = create_speaker_embedding(audio_dataset[0]["audio"]["array"])
speaker_embeddings = torch.tensor(sembeddings).unsqueeze(0)

# HiFi-GAN vocoder turns the generated spectrogram into a waveform
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

def correct_abbreviation(text):
    # Spell out all-uppercase words letter by letter so the TTS model
    # pronounces the abbreviation instead of reading it as one word
    correct_dict = {
        "A": "Eigh",
        "B": "bee",
        "C": "cee",
        "D": "dee",
        "E": "yee",
        "F": "ef",
        "G": "jee",
        "H": "aitch",
        "I": "I",
        "J": "jay",
        "K": "kay",
        "L": "el",
        "M": "em",
        "N": "en",
        "O": "o",
        "P": "pee",
        "Q": "cue",
        "R": "ar",
        "S": "ess",
        "T": "tee",
        "U": "u",
        "V": "vee",
        "W": "doubleliu",
        "X": "ex",
        "Y": "wy",
        "Z": "zed"
    }
    words = text.split()
    results = []
    for word in words:
        if word.isupper():
            for c in word:
                if c in correct_dict:
                    results.append(correct_dict[c])
                else:
                    results.append(c)
        else:
            results.append(word)
    return " ".join(results)

def correct_number(text):
    """Spell out numbers with num2words; years and other special cases are not handled yet."""
    words = text.split()
    results = []
    for word in words:
        if word.isdigit():
            try:
                word = num2words(word)
            except Exception as e:
                print(f"num2words failed on word: {word} with exception: {e}")
        else:
            # Also convert signed integers and floats, which isdigit() misses
            try:
                word = num2words(int(word))
            except ValueError:
                try:
                    word = num2words(float(word))
                except ValueError:
                    pass
        results.append(word)
    return " ".join(results)


# Interactive loop: read a sentence, synthesize it in the cloned voice,
# and write the result to a timestamped wav file
while True:
    try:
        text = input("Write a sentence to let the talker speak:\n")
        text = correct_abbreviation(text)
        text = correct_number(text)
        inputs = processor(text=text, return_tensors="pt")
        spectrogram = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device))
        with torch.no_grad():
            speech = vocoder(spectrogram)
        now = datetime.now()
        time_stamp = now.strftime("%d_%m_%Y_%H_%M_%S")
        sf.write(f"output_{time_stamp}.wav", speech.cpu().numpy(), samplerate=16000)
    except Exception as e:
        print(f"Caught exception: {e}")
        print("Restarting\n")
texts/ (10 new single-line transcript files, one per audio sample)
@@ -0,0 +1 @@
a cramp is no small danger on a swim
@@ -0,0 +1 @@
he said the same phrase thirty times
@@ -0,0 +1 @@
plus the bright rows without least
@@ -0,0 +1 @@
two plus seven is less than ten
@@ -0,0 +1 @@
the glow deepened in the eyes of the sweet girl
@@ -0,0 +1 @@
bring your problems to the wise chief
@@ -0,0 +1 @@
write a fond note to the friend you cherish
@@ -0,0 +1 @@
clothes and lodging are free to new men
@@ -0,0 +1 @@
we from an events take a bad turn
@@ -0,0 +1 @@
port is a strong line with a smoky taste
intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/finetuning/tts_finetuning.py
@@ -81,8 +81,7 @@ def __init__(self, finetuning_config: TTSFinetuningConfig):
         )
         self.audio_folder_path = self.dataset_args.audio_folder_path
         self.text_folder_path = self.dataset_args.text_folder_path
-        self.gender = self.dataset_args.gender
-        self.language = self.dataset_args.language
+
         self.step = self.model_args.step
         self.warmup_step = self.model_args.warmup_step
         self.learning_rate = self.model_args.learning_rate
@@ -91,28 +90,23 @@ def __init__(self, finetuning_config: TTSFinetuningConfig):
         self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.language_lst = ['en', 'de', 'fr', 'es', 'pl', 'it', 'ro',
-                             'hu', 'cs', 'nl', 'fi', 'hr', 'sk', 'sl', 'et', 'lt', 'en_accented']
 
     def _construct_text_list(self):
-        try:
-            text_paths = sorted(os.listdir(self.text_folder_path),
-                                key=lambda i: int(os.path.splitext(os.path.basename(i))[0]))
-        except ValueError as e:
-            raise(f"Please make sure that your texts under {self.text_folder_path} are named like 1.txt, 2.txt...")
+        audio_names = os.listdir(self.audio_folder_path)
         texts = []
-        for p in text_paths:
-            with open(os.path.join(self.text_folder_path, p)) as f:
-                texts.append(f.read())
+        for audio_name in audio_names:
+            if audio_name.split(".")[-1] in ["mp3", "wav"]:
+                text_name = f"{audio_name.split('.')[0]}.txt"
+                with open(os.path.join(self.text_folder_path, text_name)) as f:
+                    texts.append(f.read())
+            else:
+                raise Exception("Check your audio folder! Currently only mp3 or wav files are supported!")
         normalized_texts = [i.lower().replace(",","").replace(".", "") + "." for i in texts]
         return texts, normalized_texts
 
     def _construct_audio_list(self):
-        try:
-            audio_paths = sorted(os.listdir(self.audio_folder_path),
-                                 key=lambda i: int(os.path.splitext(os.path.basename(i))[0]))
-        except ValueError as e:
-            raise(f"Please make sure that your audios under {self.audio_folder_path} are named like 1.wav, 2.wav...")
-        audio_paths = [os.path.join(self.audio_folder_path, i) for i in audio_paths]
+        audio_names = os.listdir(self.audio_folder_path)
+        audio_paths = [os.path.join(self.audio_folder_path, i) for i in audio_names]
         return audio_paths
 
     def _construct_finetuning_dataset(self):
@@ -123,18 +117,10 @@ def _construct_finetuning_dataset(self):
         L = len(audio_paths)
         dataset = Dataset.from_dict({
             "audio_id": [f"id{i+1}" for i in range(L)],
-            "language": [self.language_lst.index(self.language) for i in range(L)],
             "audio": audio_paths,
             'raw_text': raw_texts,
-            'normalized_text': normalized_texts,
-            'gender': [self.gender for i in range(L)],
-            'speaker_id': ['10001' for i in range(L)],
-            "is_gold_transcript": [True for i in range(L)],
-            "accent": ["None" for i in range(L)]}).cast_column(
-                "audio", Audio(sampling_rate=16000)).cast_column(
-                    "language", ClassLabel(names=['en', 'de', 'fr', 'es', 'pl', 'it', 'ro',
-                                                  'hu', 'cs', 'nl', 'fi', 'hr', 'sk', 'sl', 'et', 'lt',
-                                                  'en_accented'], id=None))
+            'normalized_text': normalized_texts,}).cast_column(
+                "audio", Audio(sampling_rate=16000))
         return dataset
 
     def _construct_training_arguments(self):
