[NeuralChat] add optimized SadTalker to Video plugin in NeuralChat (#564)

Signed-off-by: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Co-authored-by: VincyZhang <wenxin.zhang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
3 people committed Nov 22, 2023
1 parent 9d90e1d commit 7f24c79
Showing 82 changed files with 7,619 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/script/formatScan/pylint.sh
@@ -39,7 +39,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \
--max-line-length=120 \
--extension-pkg-whitelist=numpy,nltk \
--ignored-classes=TensorProto,NodeProto \
--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py \
--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,cv2,PIL.Image \
--ignore-paths=/intel-extension-for-transformers/intel_extension_for_transformers/llm/runtime/graph/ \
/intel-extension-for-transformers/intel_extension_for_transformers >${log_dir}/pylint.json
exit_code=$?
5 changes: 4 additions & 1 deletion intel_extension_for_transformers/neural_chat/chatbot.py
@@ -105,7 +105,10 @@ def build_chatbot(config: PipelineConfig=None):
        elif plugin_name == "ner_int":
            from .pipeline.plugins.ner.ner_int import NamedEntityRecognitionINT
            plugins[plugin_name]['class'] = NamedEntityRecognitionINT
        else:
        elif plugin_name == "face_animation": # pragma: no cover
            from .pipeline.plugins.video.face_animation.sadtalker import SadTalker
            plugins[plugin_name]['class'] = SadTalker
        else: # pragma: no cover
            raise ValueError("NeuralChat Error: Unsupported plugin")
        print(f"create {plugin_name} plugin instance...")
        print("plugin parameters: ", plugin_value['args'])
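For reference, a minimal sketch of enabling this plugin from user code, assuming the global `plugins` registry (the same object `tts.py` imports below via `from intel_extension_for_transformers.neural_chat.plugins import plugins`) and assuming `PipelineConfig` accepts a `plugins` mapping:

```python
# Hedged sketch: enable face animation before building the chatbot. The
# PipelineConfig(plugins=...) wiring and exact arg names are assumptions
# inferred from the plugin registry pattern, not confirmed by this commit.
from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
from intel_extension_for_transformers.neural_chat.plugins import plugins

plugins.face_animation.enable = True               # register SadTalker
plugins.face_animation.args["device"] = "cpu"      # optimized Xeon path
plugins.face_animation.args["bf16"] = True         # bfloat16 inference
chatbot = build_chatbot(PipelineConfig(plugins=plugins))
```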
@@ -0,0 +1,34 @@
# Face Animation

We optimize SadTalker for Intel Xeon CPUs and integrate its face animation capability into NeuralChat's video plugin.

## Prepare Environment

```
conda install ffmpeg
pip install ffmpeg-python
pip install -r requirements.txt
```

## Prepare Models

```
bash download_models.sh checkpoints gfpgan/weights
```

## Usage

### Run the test script
```
python main.py
```

### Deploy it as a server

```
neuralchat_server start --config_file face_animation.yaml
```
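With the server running on port 9001 (per `face_animation.yaml`), a client can post a portrait image and a driving audio clip and receive the rendered video. The route and field names below are illustrative assumptions, not an API documented by this commit:

```python
# Hedged client sketch: the endpoint path and multipart field names are
# assumptions for illustration -- consult the NeuralChat server API for
# the actual face animation route.
import requests

url = "http://localhost:9001/v1/face_animation"  # assumed route
files = {
    "image": open("sample_img.jpg", "rb"),    # source portrait
    "audio": open("sample_audio.wav", "rb"),  # driving audio
}
response = requests.post(url, files=files)
with open("response.mp4", "wb") as f:
    f.write(response.content)  # assumed to return the video bytes
```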

## Acknowledgements

This plugin is mostly adapted from [SadTalker](https://github.com/OpenTalker/SadTalker). We thank the related authors for their great work!
@@ -0,0 +1,35 @@

#!/bin/bash
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

mkdir -p ./checkpoints

wget -nc https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/mapping_00109-model.pth.tar -O ./checkpoints/mapping_00109-model.pth.tar
wget -nc https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/mapping_00229-model.pth.tar -O ./checkpoints/mapping_00229-model.pth.tar
wget -nc https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/SadTalker_V0.0.2_256.safetensors -O ./checkpoints/SadTalker_V0.0.2_256.safetensors
wget -nc https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/SadTalker_V0.0.2_512.safetensors -O ./checkpoints/SadTalker_V0.0.2_512.safetensors


# wget -nc https://github.com/Winfredy/SadTalker/releases/download/v0.0.2/BFM_Fitting.zip -O ./checkpoints/BFM_Fitting.zip
# unzip -n ./checkpoints/BFM_Fitting.zip -d ./checkpoints/

mkdir -p ./gfpgan/weights
wget -c https://github.com/xinntao/facexlib/releases/download/v0.1.0/alignment_WFLW_4HG.pth -O ./gfpgan/weights/alignment_WFLW_4HG.pth
wget -c https://github.com/xinntao/facexlib/releases/download/v0.1.0/detection_Resnet50_Final.pth -O ./gfpgan/weights/detection_Resnet50_Final.pth
wget -nc https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth -O ./gfpgan/weights/GFPGANv1.4.pth
wget -nc https://github.com/xinntao/facexlib/releases/download/v0.2.2/parsing_parsenet.pth -O ./gfpgan/weights/parsing_parsenet.pth

@@ -0,0 +1,46 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is the parameter configuration file for NeuralChat Serving.

#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 9001

model_name_or_path: "facebook/opt-125m"
device: "cpu"

tts:
  enable: true
  args:
    device: "cpu"
    voice: "default"
    stream_mode: false
    output_audio_path: "./output_audio.wav"

face_animation:
  enable: true
  args:
    device: "cpu"
    bf16: true
    p_num: 4
    enhancer: "gfpgan"

# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune', 'faceanimation']
tasks_list: ['faceanimation']
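The `face_animation.args` block mirrors the `SadTalker` constructor arguments used by the test script shown next; as a rough sketch (the server-side wiring is not shown in this commit), the YAML plausibly reduces to:

```python
# Sketch only: the constructor signature comes from the test script below;
# how the server maps YAML args onto it is an assumption, and checkpoint_dir
# (absent from the YAML) is assumed to default to ./checkpoints.
from intel_extension_for_transformers.neural_chat.pipeline.plugins.video.face_animation.sadtalker import SadTalker

sadtalker = SadTalker(
    device="cpu",                      # face_animation.args.device
    checkpoint_dir="checkpoints",
    bf16=True,                         # bfloat16 inference on CPU
    p_num=4,                           # degree of parallelism (assumed meaning)
    enhancer="gfpgan",                 # GFPGAN face enhancement
    output_video_path="response.mp4",
)
```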
@@ -0,0 +1,39 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import torch

from intel_extension_for_transformers.neural_chat.pipeline.plugins.video.face_animation.sadtalker import SadTalker

# download a sample portrait image and a sample driving audio clip
sample_audio_url = "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/welcome.wav"
sample_img_url = "https://raw.githubusercontent.com/OpenTalker/SadTalker/main/examples/source_image/full_body_2.png"
source_image = "sample_img.jpg"
driven_audio = "sample_audio.wav"
img_data = requests.get(sample_img_url).content
with open(source_image, 'wb') as f:
    f.write(img_data)
audio_data = requests.get(sample_audio_url).content
with open(driven_audio, 'wb') as f:
    f.write(audio_data)

# run face animation with bf16 on CPU unless CUDA is available
output_video_path = "response.mp4"
checkpoint_dir = "checkpoints"
device = "cuda" if torch.cuda.is_available() else "cpu"
sadtalker = SadTalker(device=device, checkpoint_dir=checkpoint_dir, bf16=True, p_num=4, enhancer=None,
                      output_video_path=output_video_path)

sadtalker.convert(source_image=source_image, driven_audio=driven_audio)

28 changes: 28 additions & 0 deletions intel_extension_for_transformers/neural_chat/models/base_model.py
@@ -58,6 +58,7 @@ def __init__(self):
        self.model_name = ""
        self.asr = None
        self.tts = None
        self.face_animation = None
        self.audio_input_path = None
        self.audio_output_path = None
        self.retriever = None
@@ -287,6 +288,31 @@ def chat(self, query, config=None):
"""
return self.predict(query=query, config=config)

def face_animate(self, image_path, audio_path=None, text=None, voice=None) -> str: # pragma: no cover
# 1) if there is a driven audio, then image + audio
# 2) if there is no driven audio but there is a input text, then first TTS and then image + audio
if audio_path:
plugin_name = "face_animation"
if is_plugin_enabled(plugin_name):
plugin_instance = get_plugin_instance(plugin_name)
video_path = plugin_instance.convert(source_image=image_path, driven_audio=audio_path)
else:
raise Exception("Please specify the face_animation plugin!")
elif text:
plugin_name = "tts"
if is_plugin_enabled("tts"):
plugin_name = "tts"
elif is_plugin_enabled("tts_chinese"):
plugin_name = "tts_chinese"
else:
raise Exception("Please specify the TTS plugin!")
plugin_instance = get_plugin_instance(plugin_name)
audio_path = plugin_instance.text2speech(text, "tmp_audio.wav", voice=voice)
plugin_instance = get_plugin_instance("face_animation")
video_path = plugin_instance.convert(source_image=image_path, driven_audio=audio_path)
os.remove(audio_path)
return video_path
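A minimal usage sketch for this method, assuming `chatbot` was built via `build_chatbot` with the `tts` and `face_animation` plugins enabled:

```python
# Hypothetical call sketch: with no audio_path, the text is synthesized into
# tmp_audio.wav by the TTS plugin, then used to drive the face animation.
video_path = chatbot.face_animate(
    image_path="sample_img.jpg",
    text="Welcome to NeuralChat!",
    voice="default",
)
print(video_path)  # path of the generated talking-head video
```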

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """
        Get the default conversation template for the given model path.
@@ -354,6 +380,8 @@ def register_plugin_instance(self, plugin_name, instance):
            self.cache = instance
        if plugin_name == "safety_checker":
            self.safety_checker = instance
        if plugin_name == "face_animation": # pragma: no cover
            self.face_animation = instance


# A global registry for all model adapters
@@ -17,7 +17,7 @@

import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_dataset, Audio, Dataset
from datasets import Audio, Dataset
import time
import contextlib
from pydub import AudioSegment
@@ -48,7 +48,7 @@ def _audiosegment_to_librosawav(self, audiosegment):

        return fp_arr

    def _convert_audio_type(self, audio_path):
    def _convert_audio_type(self, audio_path): # pragma: no cover
        print("[ASR WARNING] Recommend to use mp3 or wav input audio type!")
        audio_file_name = audio_path.split(".")[0]
        AudioSegment.from_file(audio_path).export(f"{audio_file_name}.mp3", format="mp3")
@@ -25,6 +25,7 @@
import soundfile as sf
import numpy as np
import contextlib
from pydub import AudioSegment

from .utils.english_normalizer import EnglishNormalizer
from .utils.reduce_noise import NoiseReducer
@@ -36,7 +37,7 @@ class TextToSpeech():
    3) Customized voice (Original model + User's customized input voice embedding)
    """
    def __init__(self, output_audio_path="./response.wav", voice="default", stream_mode=False, device="cpu",
                 reduce_noise=False):
                 reduce_noise=True):
        """Make sure you export LD_PRELOAD=<path to libiomp5.so and libtcmalloc> beforehand."""
        # default setting
        self.device = device
@@ -65,8 +66,17 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_mode=False,
        elif os.path.exists('spk_embed_default.pt'): # for notebook
            self.default_speaker_embedding = torch.load('spk_embed_default.pt')
        else: # pragma: no cover
            print("Warning! Need to prepare speaker_embeddings, will use the backup embedding.")
            self.default_speaker_embedding = torch.zeros((1, 512))
            import subprocess
            try:
                p = subprocess.Popen(["wget",
                                      "https://github.com/intel/intel-extension-for-transformers/raw/main/"
                                      "intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/"
                                      "spk_embed_default.pt"])
                p.wait()
                self.default_speaker_embedding = torch.load('spk_embed_default.pt')
            except Exception as e:
                print("Warning! Need to prepare speaker_embeddings, will use the backup embedding.")
                self.default_speaker_embedding = torch.zeros((1, 512))

        # preload the demo model in case of time-consuming runtime loading
        self.demo_model = None
@@ -81,16 +91,33 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_mode=False,
        self.normalizer = EnglishNormalizer()
        self.noise_reducer = NoiseReducer() if reduce_noise else None

    def _audiosegment_to_librosawav(self, audiosegment):
        # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples
        # This way is faster than librosa.load or the HuggingFace Dataset wrapper
        channel_sounds = audiosegment.split_to_mono()[:1]  # only select the first channel
        samples = [s.get_array_of_samples() for s in channel_sounds]

        fp_arr = np.array(samples).T.astype(np.float32)
        fp_arr /= np.iinfo(samples[0].typecode).max
        fp_arr = fp_arr.reshape(-1)

        return fp_arr

    def create_speaker_embedding(self, driven_audio_path):
        """Create the speaker's embedding.
        driven_audio_path: the driven audio of that speaker
        """
        if self.speaker_model is None:
            raise Exception("Unable to create a speaker embedding! Please check the speaker model.")
        audio_dataset = Dataset.from_dict({"audio":
            [driven_audio_path]}).cast_column("audio", Audio(sampling_rate=16000))
        waveform = audio_dataset[0]["audio"]['array']
        try:
            waveform = AudioSegment.from_file(driven_audio_path).set_frame_rate(16000)
            waveform = self._audiosegment_to_librosawav(waveform)
        except Exception as e:
            print(f"[TTS] audiosegment to librosa wave fail: {e}")
            audio_dataset = Dataset.from_dict({"audio":
                [driven_audio_path]}).cast_column("audio", Audio(sampling_rate=16000))
            waveform = audio_dataset[0]["audio"]['array']
        with torch.no_grad():
            speaker_embeddings = self.speaker_model.encode_batch(torch.tensor(waveform).to("cpu"))
            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)  # [1,1,512]
@@ -191,7 +218,7 @@ def stream_text2speech(self, generator, output_audio_path, voice="default"):
    def post_llm_inference_actions(self, text_or_generator):
        from intel_extension_for_transformers.neural_chat.plugins import plugins
        self.voice = plugins.tts.args["voice"]
        if self.stream_mode:
        if self.stream_mode: # pragma: no cover
            def cache_words_into_sentences():
                buffered_texts = []
                hitted_ends = ['.', '!', '?', ';', ':']
@@ -16,9 +16,9 @@
# limitations under the License.

import torch
# pylint: disable=E1102
from torch.nn.functional import conv1d, conv2d
from typing import Union, Optional
import torch
from torch.types import Number
from scipy.io import wavfile
from pydub import AudioSegment
@@ -0,0 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,30 @@
# Face Animation

We optimize SadTalker for Intel Xeon CPUs and integrate its face animation capability into NeuralChat's video plugin.

## Prepare Environment

```
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/pipeline/plugins/video/face_animation
conda install ffmpeg
pip install ffmpeg-python
pip install -r requirements.txt
```

## Usage

```python
import torch

from sadtalker import SadTalker

source_image = "sample_img.jpg"    # path to your source portrait image
driven_audio = "sample_audio.wav"  # path to your driving audio clip

device = "cuda" if torch.cuda.is_available() else "cpu"
sadtalker = SadTalker(device=device)
# without enhancer
sadtalker.convert(source_image=source_image, driven_audio=driven_audio, output_video_path="./response.mp4",
                  bf16=True, result_dir="./results", p_num=4, enhancer=None)
# with the GFPGAN enhancer
sadtalker.convert(source_image=source_image, driven_audio=driven_audio, output_video_path="./response.mp4",
                  bf16=True, result_dir="./results", p_num=4, enhancer='gfpgan')
```
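Note: `enhancer='gfpgan'` runs GFPGAN face restoration on the rendered frames, which generally yields sharper facial detail at the cost of additional processing time.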

## Acknowledgements

This plugin is mostly adapted from [SadTalker](https://github.com/OpenTalker/SadTalker). We thank the related authors for their great work!
