[NeuralChat] Configure TGI endpoint from YAML (#1321)
* update tgi endpoint

Signed-off-by: LetongHan <letong.han@intel.com>
letonghan committed Feb 29, 2024
1 parent ffa8f3c commit 525ea86
Showing 5 changed files with 83 additions and 45 deletions.
@@ -63,6 +63,8 @@ You can customize the configuration file 'tgi.yaml' to match your environment se
| model_name_or_path | "./neural-chat-7b-v3-1" |
| device | "cpu"/"gpu"/"hpu" |
| serving.framework | "tgi" |
| serving.framework.tgi_engine_params.endpoint | The endpoint of an existing TGI service. When `endpoint` is set, NeuralChat will not start a TGI service and the other parameters are ignored. |
| serving.framework.tgi_engine_params.port | 9876, the port on which NeuralChat starts the local TGI service. |
| serving.framework.tgi_engine_params.sharded | true (must be false on cpu) |
| serving.framework.tgi_engine_params.num_shard | 4 (not effective when sharded is false) |
| serving.framework.tgi_engine_params.habana_visible_devices | "0,1" (only on hpu) |
@@ -90,3 +92,8 @@ curl ${your_ip}:${your_port}/v1/tgi/generate \
```

Of course, you can also consume the service via Postman, plain HTTP requests, or other clients.
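For example, a minimal Python sketch using the `requests` package (the host and port are placeholders, and the body assumes TGI's standard `inputs`/`parameters` request schema used by the curl example):

```python
import requests

# Placeholder host/port: use the values your NeuralChat server listens on.
url = "http://127.0.0.1:8000/v1/tgi/generate"
payload = {"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}

response = requests.post(url, json=payload, timeout=60)
response.raise_for_status()
print(response.json())
```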

If NeuralChat is unable to reach your local TGI service, run the command below and then try again.
```bash
unset http_proxy
```
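If other traffic still needs the proxy, exempting local addresses usually suffices:

```bash
export no_proxy="localhost,127.0.0.1,0.0.0.0"
```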
@@ -29,6 +29,10 @@ device: "auto"
serving:
  framework: "tgi"
  tgi_engine_params:
    # When endpoint is set, NeuralChat will not start a TGI service
    # and the other parameters are ignored.
    endpoint: "http://0.0.0.0:9876/"
    port: "9876"
    # not supported on CPU
    sharded: true
    num_shard: 4
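With the endpoint configured, the server can be started against this file; a sketch assuming NeuralChat's `neuralchat_server` CLI:

```bash
neuralchat_server start --config_file tgi.yaml
```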
In the NeuralChat server's `init(self, config)`, the TGI launch logic is wrapped in an endpoint check, and the resolved endpoint is later handed to the API router:

```diff
@@ -132,53 +132,59 @@ def init(self, config):
         # TGI serving
         elif serving_framework == "tgi":
             tgi_params = serving.get("tgi_engine_params", None)
-            tgi_sharded = tgi_params.get('sharded', False)
-            tgi_num_shard = tgi_params.get('num_shard', 1)
-            tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
-            # construct tgi command
-            tgi_cmd = "docker run -p 9876:80 --name tgi_service -v ./data:/data"
-            if device == "cpu":
-                tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                # sharded is not supported on CPU
-                if tgi_sharded:
-                    tgi_sharded = False
-            elif device == "gpu":
-                tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                pass
-            elif device == "hpu":
-                create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
-                    cd tgi-gaudi && docker build -t tgi_gaudi ."
-                try:
-                    # create docker image first
-                    logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
-                    sys.stdout.flush()
-                    sys.stderr.flush()
-                    subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash") # nosec
-                    logger.info("creating tgi habana docker image...")
-                    time.sleep(200)
-                except Exception as e:
-                    raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
-                # add tgi_cmd
-                if tgi_sharded and tgi_num_shard > 1:
-                    tgi_cmd += "-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
-                tgi_cmd += f"--runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
-                    -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
-            else:
-                logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
-                raise Exception("Please specify device for tgi.")
-            tgi_cmd += f" --model-id {model_name_or_path}"
-            if tgi_sharded and tgi_num_shard > 1:
-                tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
-            # start tgi service
-            try:
-                logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
-                sys.stdout.flush()
-                sys.stderr.flush()
-                subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash") # nosec
-                logger.info("Building docker container...")
-                time.sleep(200)
-            except Exception as e:
-                raise RuntimeError(f"Error when building docker container: {e}")
+            tgi_endpoint = tgi_params.get('endpoint', None)
+            if tgi_endpoint:
+                logger.info(f"tgi endpoint already exists: {tgi_endpoint}")
+            # start a tgi service
+            else:
+                tgi_port = tgi_params.get('port', "9876")
+                tgi_sharded = tgi_params.get('sharded', False)
+                tgi_num_shard = tgi_params.get('num_shard', 1)
+                tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
+                # construct tgi command
+                tgi_cmd = f"docker run -p {tgi_port}:80 --name tgi_service -v ./data:/data"
+                if device == "cpu":
+                    tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    # sharded is not supported on CPU
+                    if tgi_sharded:
+                        tgi_sharded = False
+                elif device == "gpu":
+                    tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    pass
+                elif device == "hpu":
+                    create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
+                        cd tgi-gaudi && docker build -t tgi_gaudi ."
+                    try:
+                        # create docker image first
+                        logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                        sys.stdout.flush()
+                        sys.stderr.flush()
+                        subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash") # nosec
+                        logger.info("creating tgi habana docker image...")
+                        time.sleep(200)
+                    except Exception as e:
+                        raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
+                    # add tgi_cmd
+                    if tgi_sharded and tgi_num_shard > 1:
+                        tgi_cmd += " -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
+                    tgi_cmd += f" --runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
+                        -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
+                else:
+                    logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
+                    raise Exception("Please specify device for tgi.")
+                tgi_cmd += f" --model-id {model_name_or_path}"
+                if tgi_sharded and tgi_num_shard > 1:
+                    tgi_cmd += f" --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
+                # start tgi service
+                try:
+                    logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
+                    sys.stdout.flush()
+                    sys.stderr.flush()
+                    subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash") # nosec
+                    logger.info("Building docker container...")
+                    time.sleep(200)
+                except Exception as e:
+                    raise RuntimeError(f"Error when building docker container: {e}")
 
         # plugin as service
         if plugin_as_service:
@@ -317,7 +323,14 @@ def init(self, config):
         self.chatbot = build_chatbot(pipeline_config)
         # init api
         from .restful.api import setup_router
-        api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
+        if serving and serving.get("framework") == "tgi":
+            if tgi_endpoint:
+                endpoint = tgi_endpoint
+            else:
+                endpoint = f"http://0.0.0.0:{tgi_port}/"
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port, endpoint)
+        else:
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
         app.include_router(api_router)
         return True
```

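The launch logic above reduces to a simple pattern: reuse the user-supplied endpoint when one is configured, otherwise start the service and derive a local URL from the configured port. A standalone sketch of that pattern (the function and the `start_service` callable are illustrative, not the repository's API):

```python
from typing import Callable, Optional


def resolve_tgi_endpoint(tgi_params: dict, start_service: Callable[[str], None]) -> str:
    """Return the TGI endpoint to use, starting a local service only if needed."""
    endpoint: Optional[str] = tgi_params.get("endpoint")
    if endpoint:
        # An external TGI service is configured; the other params are ignored.
        return endpoint
    port = tgi_params.get("port", "9876")
    start_service(port)  # e.g. the `docker run` command assembled in the diff
    return f"http://0.0.0.0:{port}/"
```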
In the REST API layer (`restful/api.py`), `setup_router` gains an `endpoint` parameter that is forwarded to the `tgi` router:

```diff
@@ -50,7 +50,16 @@
     'tgi': tgi_router
 }
 
-def setup_router(api_list, chatbot=None, enable_llm=True, use_deepspeed=False, world_size=1, host="0.0.0.0", port=80):
+def setup_router(
+    api_list,
+    chatbot=None,
+    enable_llm=True,
+    use_deepspeed=False,
+    world_size=1,
+    host="0.0.0.0",
+    port=80,
+    endpoint=None
+):
     """Setup router for FastAPI
     Args:
@@ -69,6 +78,9 @@ def setup_router(api_list, chatbot=None, enable_llm=True, use_deepspeed=False, w
         if lower_api_name == "plugin_image2image":
             api_router.worker.start()
             logger.info("create main worker done...")
+        if endpoint and lower_api_name == "tgi":
+            api_router.set_tgi_endpoint(endpoint)
+            logger.info(f"set tgi endpoint: {endpoint}")
         _router.include_router(api_router)
     else:
         logger.error(f"NeuralChat has not supported such service yet: {api_name}")
```
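With the new keyword argument, a caller can thread an existing TGI endpoint through to the `tgi` router. A hypothetical invocation (the import path is inferred from the server's relative import; `chatbot` and `app` are assumed to exist already):

```python
from intel_extension_for_transformers.neural_chat.server.restful.api import setup_router

# Reuse an already-running TGI service rather than launching a new one.
api_router = setup_router(
    ["tgi"],
    chatbot=chatbot,  # built elsewhere, e.g. via build_chatbot(...)
    host="0.0.0.0",
    port=8000,
    endpoint="http://127.0.0.1:9876/",
)
app.include_router(api_router)  # `app` is your FastAPI instance
```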
And in the TGI router itself, `TextGenerationAPIRouter` stores a default endpoint and exposes a setter:

```diff
@@ -28,9 +28,11 @@ class TextGenerationAPIRouter(APIRouter):
 
     def __init__(self) -> None:
         super().__init__()
+        self.endpoint = "http://0.0.0.0:9876/"
         self.chatbot = None
 
+    def set_tgi_endpoint(self, endpoint):
+        self.endpoint = endpoint
+
     def set_chatbot(self, chatbot, use_deepspeed, world_size, host, port) -> None:
         self.chatbot = chatbot
         self.use_deepspeed = use_deepspeed
```
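Request handlers can then target `self.endpoint` directly. As a rough illustration only (not the repository's actual handler), forwarding a prompt to TGI's `/generate` route could look like this, assuming the `requests` package and an endpoint that ends with "/":

```python
import requests


def forward_generate(endpoint: str, prompt: str, max_new_tokens: int = 32) -> str:
    """Send a prompt to a TGI service and return the generated text."""
    response = requests.post(
        f"{endpoint}generate",
        json={"inputs": prompt, "parameters": {"max_new_tokens": max_new_tokens}},
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["generated_text"]


# e.g. forward_generate("http://0.0.0.0:9876/", "What is deep learning?")
```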
