
Commit

add talkingbot-pc code (#377)
* add talkingbot-pc code

* fix llm int4 in talkingbot

---------

Co-authored-by: VincyZhang <wenxin.zhang@intel.com>
Spycsh and VincyZhang committed Sep 26, 2023
1 parent 53fbeb4 commit be2a267
Showing 3 changed files with 247 additions and 4 deletions.
@@ -0,0 +1,198 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# EndToEnd TalkingBot on PC client (Windows)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> make sure you are running in a conda environment with Python 3.10\n",
"\n",
"[Intel® Extension for Transformers Neural Chat](https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/neural_chat) provides a lot of plugins to meet different users' scenarios. In this notebook we will show you how to create a TalkingBot on your local laptop with **Intel CPU** (no GPU needed).\n",
"\n",
"Behind the scene, a TalkingBot is composed of a pipeline of\n",
"1. recognize user's prompt audio and convert to text\n",
"2. text understanding and question answering by Large Language Models\n",
"2. convert answer text to speech\n",
"\n",
"This is a notebook to let you know how to create such a TalkingBot on PC."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Audio To Text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"Audio(r\"./sample.wav\", rate=16000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"asr = AudioSpeechRecognition(model_name_or_path=\"openai/whisper-tiny\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = asr.audio2text(r\"./sample.wav\")\n",
"print(text)"
]
},
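{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have several recordings, you can reuse the same `asr` instance in a loop. This is a minimal sketch; the file list below is a placeholder for your own 16 kHz WAV files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Placeholder list: replace with paths to your own recordings\n",
"wav_files = [r\"./sample.wav\"]\n",
"for wav in wav_files:\n",
"    # reuse the Whisper-based recognizer created above\n",
"    print(wav, \"->\", asr.audio2text(wav))"
]
},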
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LLM"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Directly load given int4 model to do inference\n",
"\n",
"Here for quick demo, we just use a given int4 model to generate text. If you want to convert your int4 model manually, please refer to next cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from intel_extension_for_transformers.llm.runtime.graph import Model\n",
"model = Model()\n",
"model.bin_file = r\"mpt_q4_0.bin\"\n",
"model.init_from_bin(\"mpt\", model.bin_file, max_new_tokens=32, seed=12)\n",
"prompt = text\n",
"output = model.generate(prompt)\n",
"print(output)"
]
},
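{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get a rough feel for CPU latency, you can time the generation step. A minimal sketch, assuming the int4 `model` above is already loaded:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"start = time.time()\n",
"output = model.generate(prompt)\n",
"print(f\"generated in {time.time() - start:.2f}s\")\n",
"print(output)"
]
},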
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert int4 model to do inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig\n",
"model_name = r\"THUDM/ChatGLM2-6B\"\n",
"woq_config = WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4\")\n",
"model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True, trust_remote_code=True)\n",
"prompt = text\n",
"output = model.generate(prompt, max_new_tokens=32)"
]
},
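{
"cell_type": "markdown",
"metadata": {},
"source": [
"`WeightOnlyQuantConfig` also lets you trade model size for accuracy via `weight_dtype`. Below is a hedged sketch of an int8 weight-only variant using the same API as above; supported dtypes may vary by release."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumption: weight_dtype=\"int8\" is accepted by this release of WeightOnlyQuantConfig\n",
"woq_config_int8 = WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int8\")\n",
"model_int8 = AutoModel.from_pretrained(model_name, quantization_config=woq_config_int8, use_llm_runtime=True, trust_remote_code=True)\n",
"output_int8 = model_int8.generate(prompt, max_new_tokens=32)\n",
"print(output_int8)"
]
},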
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Text To Speech"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tts = TextToSpeech()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_path = tts.text2speech(\"Hello there, I am your Talking Bot!\", \"output.wav\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"Audio(result_path, rate=16000)"
]
}
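,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## End-to-End Pipeline\n",
"\n",
"Putting the three stages together. This is a minimal sketch that reuses the `asr`, `model`, and `tts` objects created above (run the earlier cells first) and assumes `model.generate` returns plain text, as printed above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def talk(audio_path):\n",
"    # 1. speech -> text\n",
"    prompt = asr.audio2text(audio_path)\n",
"    # 2. text -> answer\n",
"    answer = model.generate(prompt)\n",
"    # 3. answer -> speech; returns the path of the generated wav\n",
"    return tts.text2speech(answer, \"e2e_output.wav\")\n",
"\n",
"Audio(talk(r\"./sample.wav\"), rate=16000)"
]
}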
],
"metadata": {
"kernelspec": {
"display_name": "itrex",
"language": "python",
"name": "itrex"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -46,10 +46,14 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
         self.output_audio_path = output_audio_path
         self.stream_mode = stream_mode
         self.spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
-        self.speaker_model = EncoderClassifier.from_hparams(
-            source=self.spk_model_name,
-            run_opts={"device": "cpu"},
-            savedir=os.path.join("/tmp", self.spk_model_name))
+        try:
+            self.speaker_model = EncoderClassifier.from_hparams(
+                source=self.spk_model_name,
+                run_opts={"device": "cpu"},
+                savedir=os.path.join("/tmp", self.spk_model_name))
+        except Exception:
+            print("[TTS Warning] Speaker model failed to load, so speaker embedding creation is disabled.")
+            self.speaker_model = None
         self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
         self.vocoder.eval()
         script_dir = os.path.dirname(os.path.abspath(__file__))
@@ -89,6 +93,8 @@ def create_speaker_embedding(self, driven_audio_path):
             driven_audio_path: the driven audio of that speaker
         """
+        if self.speaker_model is None:
+            raise Exception("Unable to create a speaker embedding! Please check the speaker model.")
         audio_dataset = Dataset.from_dict({"audio":
             [driven_audio_path]}).cast_column("audio", Audio(sampling_rate=16000))
         waveform = audio_dataset[0]["audio"]['array']
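For reference, a minimal sketch of how the new guard behaves, assuming the import path shown in the notebook above:

from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech

tts = TextToSpeech()  # if the speaker model fails to load, speaker_model is set to None
try:
    tts.create_speaker_embedding("./sample.wav")  # hypothetical driven audio path
except Exception as err:
    print(err)  # "Unable to create a speaker embedding! Please check the speaker model."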
39 changes: 39 additions & 0 deletions intel_extension_for_transformers/neural_chat/requirements_pc.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
transformers==4.32.1
peft
fschat
num2words
speechbrain
paddlepaddle
paddlespeech==1.4.1
shortuuid
gptcache
evaluate
pydub
python-multipart
PyPDF2
langchain
python-docx
scikit-learn
farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
chromadb
fastapi
pydantic
starlette
yacs
uvicorn
optimum
optimum-intel
sentence_transformers
unstructured
markdown
rouge_score
openpyxl
numpy==1.23.5
tiktoken==0.4.0
lm_eval
--extra-index-url https://download.pytorch.org/whl/cpu
torch
torchaudio
