[NeuralChat] Fix UT issues on Nvidia GPU (#829)

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
lvliang-intel committed Nov 30, 2023
1 parent 674e934 commit 464962e
Showing 25 changed files with 75 additions and 50 deletions.
3 changes: 2 additions & 1 deletion intel_extension_for_transformers/neural_chat/config.py
@@ -433,7 +433,8 @@ def __init__(self,
            WeightOnlyQuantConfig,
            BitsAndBytesConfig
        )
        self.optimization_config = optimization_config if optimization_config is not None else \
            MixedPrecisionConfig(dtype="float16" if self.device == "cuda" else "bfloat16")
        assert type(self.optimization_config) in [MixedPrecisionConfig, WeightOnlyQuantConfig, BitsAndBytesConfig], \
            f"Expect optimization_config to be an object of MixedPrecisionConfig, WeightOnlyQuantConfig" \
            f" or BitsAndBytesConfig, got {type(self.optimization_config)}."
@@ -22,10 +22,13 @@
import contextlib
from pydub import AudioSegment
import numpy as np
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class AudioSpeechRecognition():
"""Convert audio to text."""
def __init__(self, model_name_or_path="openai/whisper-small", bf16=False, language=None, device="cpu"):
if device == "auto":
device = get_device_type()
self.device = device
self.model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(self.device)
self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
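`get_device_type()` itself is not part of this diff. A plausible sketch of what it resolves, assuming it simply probes the available torch backends (this body is a guess, not the repository's implementation):

```python
import torch

def get_device_type() -> str:
    # Hypothetical resolution order: Nvidia GPU, then Intel XPU, else CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"
```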
@@ -29,6 +29,8 @@

from .utils.english_normalizer import EnglishNormalizer
from .utils.reduce_noise import NoiseReducer
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class TextToSpeech():
"""Convert text to speech with a driven speaker embedding
@@ -40,6 +42,8 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
                 reduce_noise=False):
        """Make sure you export LD_PRELOAD=<path to libiomp5.so and libtcmalloc> beforehand."""
        # default setting
        if device == "auto":
            device = get_device_type()
        self.device = device
        self.original_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -16,6 +16,7 @@
# limitations under the License.

import os
import torch
import unittest
from intel_extension_for_transformers.neural_chat.chatbot import build_chatbot, optimize_model
from intel_extension_for_transformers.neural_chat.config import (
@@ -91,7 +92,7 @@ def test_voice_chat(self):
        self.assertTrue(os.path.exists("./response.wav"))

    def test_quantization(self):
        config = MixedPrecisionConfig()
        config = MixedPrecisionConfig(dtype="float16" if torch.cuda.is_available() else "bfloat16")
        model = AutoModelForCausalLM.from_pretrained(
            "facebook/opt-125m",
            low_cpu_mem_usage=True,
@@ -18,7 +18,7 @@
import unittest
import os
from intel_extension_for_transformers.neural_chat import build_chatbot
from intel_extension_for_transformers.neural_chat import PipelineConfig, GenerationConfig
from intel_extension_for_transformers.neural_chat import PipelineConfig
from intel_extension_for_transformers.neural_chat import plugins

# All UT cases use 'facebook/opt-125m' to reduce test time.
@@ -24,9 +24,11 @@
from intel_extension_for_transformers.neural_chat.config import PipelineConfig
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class TestChatbotBuilder(unittest.TestCase):
    def setUp(self):
        self.device = get_device_type()
        return super().setUp()

    def tearDown(self) -> None:
@@ -42,7 +44,8 @@ def tearDown(self) -> None:

    def test_build_chatbot_with_AMP(self):
        config = PipelineConfig(model_name_or_path="facebook/opt-125m",
                                optimization_config = MixedPrecisionConfig())
                                optimization_config = MixedPrecisionConfig(
                                    dtype="float16" if torch.cuda.is_available() else "bfloat16"))
        chatbot = build_chatbot(config)
        self.assertIsNotNone(chatbot)
        response = chatbot.predict(query="Tell me about Intel Xeon Scalable Processors.")
@@ -54,6 +57,8 @@ def test_build_chatbot_with_AMP(self):
        self.assertIsNotNone(response)

    def test_build_chatbot_with_bitsandbytes_quant(self):
        if torch.cuda.is_available():
            os.system("pip install bitsandbytes")
        if is_bitsandbytes_available() and torch.cuda.is_available():
            config = PipelineConfig(
                model_name_or_path="facebook/opt-125m",
@@ -72,16 +77,17 @@ def test_build_chatbot_with_bitsandbytes_quant(self):
            self.assertIsNotNone(response)

    def test_build_chatbot_with_weight_only_quant(self):
        loading_config = LoadingModelConfig(use_llm_runtime=False)
        config = PipelineConfig(model_name_or_path="facebook/opt-125m",
                                optimization_config=WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="int4_fullrange"),
                                loading_config=loading_config
                                )
        chatbot = build_chatbot(config)
        self.assertIsNotNone(chatbot)
        response = chatbot.predict(query="Tell me about Intel Xeon Scalable Processors.")
        print(response)
        self.assertIsNotNone(response)
        if self.device == "cpu":
            loading_config = LoadingModelConfig(use_llm_runtime=False)
            config = PipelineConfig(model_name_or_path="facebook/opt-125m",
                                    optimization_config=WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="int4_fullrange"),
                                    loading_config=loading_config
                                    )
            chatbot = build_chatbot(config)
            self.assertIsNotNone(chatbot)
            response = chatbot.predict(query="Tell me about Intel Xeon Scalable Processors.")
            print(response)
            self.assertIsNotNone(response)

if __name__ == '__main__':
    unittest.main()
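
Outside the test harness, the AMP path exercised by `test_build_chatbot_with_AMP` reduces to a few lines; this sketch only restates the test flow above as user code:

```python
import torch
from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
from intel_extension_for_transformers.transformers import MixedPrecisionConfig

config = PipelineConfig(
    model_name_or_path="facebook/opt-125m",
    optimization_config=MixedPrecisionConfig(
        dtype="float16" if torch.cuda.is_available() else "bfloat16"))
chatbot = build_chatbot(config)
print(chatbot.predict(query="Tell me about Intel Xeon Scalable Processors."))
```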
@@ -17,13 +17,18 @@

import unittest
import re, os
import torch
from intel_extension_for_transformers.neural_chat import build_chatbot
from intel_extension_for_transformers.neural_chat.config import PipelineConfig
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class TestChatbotBuilder(unittest.TestCase):
    def setUp(self):
        self.device = get_device_type()
        if self.device != "cpu":
            self.skipTest("Skipping this test since LLM runtime optimization is for Intel CPU.")
        return super().setUp()

    def tearDown(self) -> None:
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 6000

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

retrieval:
    enable: true
@@ -24,14 +24,14 @@ host: 127.0.0.1
port: 7777

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"
plugin_as_service: true

asr:
    enable: true
    args:
        # support cpu, hpu, xpu, cuda
        device: "cpu"
        device: "auto"
        # support openai/whisper series
        model_name_or_path: "openai/whisper-small"
        # can only be set to true when the device is "cpu"
@@ -40,7 +40,7 @@ asr:
tts:
    enable: true
    args:
        device: "cpu"
        device: "auto"
        voice: "default"
        stream_mode: false
        output_audio_path: "./output_audio.wav"
@@ -27,7 +27,7 @@
class UnitTest(unittest.TestCase):
    def setUp(self) -> None:
        if not (is_bitsandbytes_available() and torch.cuda.is_available()):
            self.skipTest("Skipping this test on CPU.")
            self.skipTest("Only test this UT case on Nvidia GPU.")
        yaml_file_path = "/intel-extension-for-transformers/" + \
            "intel_extension_for_transformers/neural_chat/tests/ci/server/textchat_bits_and_bytes.yaml"
        if os.path.exists(yaml_file_path):
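The bitsandbytes path is CUDA-only, which is why the tests both install the wheel on GPU runners and gate on availability. A condensed sketch of the same guard, assuming `is_bitsandbytes_available` comes from `transformers.utils` (its import is not shown in this diff):

```python
import torch
from transformers.utils import is_bitsandbytes_available

def bitsandbytes_supported() -> bool:
    # bitsandbytes 4/8-bit kernels require an Nvidia GPU; skip everywhere else.
    return is_bitsandbytes_available() and torch.cuda.is_available()
```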
@@ -21,9 +21,13 @@
import os
import json
from intel_extension_for_transformers.neural_chat.server import TextChatClientExecutor

from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class UnitTest(unittest.TestCase):
    def setUp(self) -> None:
        device = get_device_type()
        if device != "cpu":
            self.skipTest("Only test this UT case on Intel CPU.")

        yaml_file_path = "/intel-extension-for-transformers/" + \
            "intel_extension_for_transformers/neural_chat/tests/ci/server/textchat_ipex_int8.yaml"
        if os.path.exists(yaml_file_path):
@@ -21,9 +21,13 @@
import os
import json
from intel_extension_for_transformers.neural_chat.server import TextChatClientExecutor

from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class UnitTest(unittest.TestCase):
    def setUp(self) -> None:
        device = get_device_type()
        if device != "cpu":
            self.skipTest("Only test this UT case on Intel CPU.")

        yaml_file_path = "/intel-extension-for-transformers/" + \
            "intel_extension_for_transformers/neural_chat/tests/ci/server/textchat_itrex_int4.yaml"
        if os.path.exists(yaml_file_path):
@@ -21,9 +21,13 @@
import os
import json
from intel_extension_for_transformers.neural_chat.server import TextChatClientExecutor

from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class UnitTest(unittest.TestCase):
    def setUp(self) -> None:
        device = get_device_type()
        if device != "cpu":
            self.skipTest("Only test this UT case on Intel CPU.")

        yaml_file_path = "/intel-extension-for-transformers/" + \
            "intel_extension_for_transformers/neural_chat/tests/ci/server/textchat_itrex_llm_runtime_int4.yaml"
        if os.path.exists(yaml_file_path):
@@ -21,9 +21,13 @@
import os
import json
from intel_extension_for_transformers.neural_chat.server import TextChatClientExecutor

from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

class UnitTest(unittest.TestCase):
    def setUp(self) -> None:
        device = get_device_type()
        if device != "cpu":
            self.skipTest("Only test this UT case on Intel CPU.")

        yaml_file_path = "/intel-extension-for-transformers/" + \
            "intel_extension_for_transformers/neural_chat/tests/ci/server/textchat_mix_precision.yaml"
        if os.path.exists(yaml_file_path):
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 7000

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune']
tasks_list: ['textchat']
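
With `device: "auto"` in the service YAMLs, the concrete device is chosen at server startup rather than pinned in the file. A hypothetical sketch of that resolution step (the server's actual config-loading code is not part of this diff):

```python
import yaml
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

with open("textchat.yaml") as f:
    server_config = yaml.safe_load(f)

# Resolve "auto" to a concrete backend before the service builds its chatbot.
if server_config.get("device") == "auto":
    server_config["device"] = get_device_type()
```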
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 6060

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

# bits and bytes
optimization:
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 7070

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

# ipex int8 optimization
optimization:
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 8080

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

# itrex int4 optimization
optimization:
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 9090

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

# itrex int4 llm runtime optimization
optimization:
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 5000

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

# itrex int4 optimization
optimization:
@@ -24,7 +24,7 @@ host: 127.0.0.1
port: 8000

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

retrieval:
enable: true
@@ -24,13 +24,13 @@ host: 127.0.0.1
port: 9000

model_name_or_path: "facebook/opt-125m"
device: "cpu"
device: "auto"

asr:
    enable: true
    args:
        # support cpu, hpu, xpu, cuda
        device: "cpu"
        device: "auto"
        # support openai/whisper series
        model_name_or_path: "openai/whisper-small"
        # can only be set to true when the device is "cpu"
@@ -39,7 +39,7 @@ asr:
tts:
    enable: true
    args:
        device: "cpu"
        device: "auto"
        voice: "default"
        stream_mode: true
        output_audio_path: "./output_audio.wav"
@@ -26,6 +26,7 @@
    TextGenerationFinetuningConfig,
)
from intel_extension_for_transformers.neural_chat.chatbot import finetune_model
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

json_data = \
"""
@@ -39,6 +40,7 @@
class TestFinetuning(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.device = get_device_type()
        with open(test_data_file, mode='w') as f:
            f.write(json_data)

@@ -56,7 +58,7 @@ def test_finetune_clm(self):
            max_steps=3,
            overwrite_output_dir=True
        )
        finetune_args = FinetuningArguments(device='cpu')
        finetune_args = FinetuningArguments(device=self.device)
        finetune_cfg = TextGenerationFinetuningConfig(
            model_args=model_args,
            data_args=data_args,
@@ -74,7 +76,7 @@ def test_finetune_clm_qlora(self):
            max_steps=3,
            overwrite_output_dir=True
        )
        finetune_args = FinetuningArguments(device='cpu', qlora=True)
        finetune_args = FinetuningArguments(device=self.device, qlora=True)
        finetune_cfg = TextGenerationFinetuningConfig(
            model_args=model_args,
            data_args=data_args,
@@ -92,7 +94,7 @@ def test_finetune_seq2seq(self):
            max_steps=3,
            overwrite_output_dir=True
        )
        finetune_args = FinetuningArguments(device='cpu')
        finetune_args = FinetuningArguments(device=self.device)
        finetune_cfg = TextGenerationFinetuningConfig(
            model_args=model_args,
            data_args=data_args,
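The finetuning tests now follow the same device-resolution convention. In user code the device can be resolved once and passed through; the import path for `FinetuningArguments` below is assumed from the surrounding test, not shown in this diff:

```python
from intel_extension_for_transformers.neural_chat.config import FinetuningArguments
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type

# 'cpu' on Intel hosts, 'cuda' on Nvidia GPU runners.
finetune_args = FinetuningArguments(device=get_device_type())
```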
