Merge branch 'main' into more_flexible_tdt_durations
hainan-xv committed Jul 9, 2024
2 parents 15f4ee8 + 8898b76 commit faf2e7c
Showing 7 changed files with 376 additions and 70 deletions.
2 changes: 1 addition & 1 deletion nemo/deploy/nlp/__init__.py
@@ -15,7 +15,7 @@

use_query_llm = True
try:
from nemo.deploy.nlp.query_llm import NemoQueryLLM
from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch
except Exception:
use_query_llm = False

20 changes: 14 additions & 6 deletions nemo/deploy/nlp/megatronllm_deployable.py
@@ -15,6 +15,7 @@
import logging
from enum import IntEnum, auto
from pathlib import Path
from typing import List

import numpy as np
import torch
@@ -129,6 +130,12 @@ def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices:
nemo_checkpoint_filepath, trainer=trainer, return_config=True
)
# transformer_engine should always be true according to EricH, but GPT-2B model will fail if it is enabled
if not custom_config.transformer_engine:
LOGGER.warning(
"MegatronLLMDeployable expects model config transformer_engine=True, but this model has it =False. "
"Overriding it to =True, but this may break certain checkpoints converted on older Nemo versions. "
"If your model breaks, please try re-converting the checkpoint on the current Nemo version."
)
custom_config.transformer_engine = True
# using multi-gpu for tensor parallelism directly for now, could do pipeline parallel instead or a combination
custom_config.tensor_model_parallel_size = num_devices
@@ -233,9 +240,7 @@ def _length_params_from_triton_inputs(**inputs: np.ndarray):
length_params[length_param_field] = inputs.pop(length_param_field)[0][0]
return length_params

def generate(self, inputs: List[str], length_params: LengthParam, sampling_params: SamplingParam):
if torch.distributed.is_initialized():
distributed_rank = torch.distributed.get_rank()
if distributed_rank != 0:
@@ -245,13 +250,16 @@ def triton_infer_fn(self, **inputs: np.ndarray):
signal_value = ServerSync.SIGNAL.to_long_tensor()
torch.distributed.broadcast(signal_value, 0)

return self.model.generate(inputs=inputs, length_params=length_params, sampling_params=sampling_params)

@batch
def triton_infer_fn(self, **inputs: np.ndarray):
"""Triton server inference function that actually runs the model"""
input_strings = str_ndarray2list(inputs.pop("prompts"))
sampling_params = self._sampling_params_from_triton_inputs(**inputs)
length_params = self._length_params_from_triton_inputs(**inputs)

model_output = self.generate(input_strings, length_params, sampling_params)
'''
model_output['sentences'] will be a list of strings (one per prompt)
other fields will either be a list of lists (tokens, for example)
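The refactor above splits the actual generation call out of triton_infer_fn into a standalone generate() method that takes a list of prompt strings plus LengthParam and SamplingParam dicts. As a rough sketch of exercising that method directly (the deployable instance and every parameter value below are illustrative, not taken from this commit):

# Sketch only: "deployable" is assumed to be an already-loaded MegatronLLMDeployable instance.
length_params = {"min_length": 0, "max_length": 128}
sampling_params = {
    "use_greedy": True,
    "temperature": 1.0,
    "top_k": 1,
    "top_p": 0.0,
    "repetition_penalty": 1.0,
    "add_BOS": False,
    "all_probs": False,
    "compute_logprob": False,
    "end_strings": ["<|endoftext|>"],
}
output = deployable.generate(["hello, testing GPT inference"], length_params, sampling_params)
print(output["sentences"][0])  # 'sentences' holds one generated string per prompt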
100 changes: 88 additions & 12 deletions nemo/deploy/nlp/query_llm.py
@@ -30,23 +30,99 @@ def __init__(self, url, model_name):
self.url = url
self.model_name = model_name


class NemoQueryLLMPyTorch(NemoQueryLLMBase):
"""
Sends a query to Triton for LLM inference
Example:
from nemo.deploy.nlp import NemoQueryLLMPyTorch
nq = NemoQueryLLMPyTorch(url="localhost", model_name="GPT-2B")
prompts = ["hello, testing GPT inference", "another GPT inference test?"]
output = nq.query_llm(
prompts=prompts,
max_length=100,
top_k=1,
top_p=0.0,
temperature=0.0,
)
print("prompts: ", prompts)
"""

def __init__(self, url, model_name):
super().__init__(
url=url,
model_name=model_name,
)

# these arguments are explicitly defined in order to make it clear to user what they can pass
# names and optionality should exactly match the get_triton_input() results for MegatronGPTDeployable
def query_llm(
self,
prompts,
use_greedy: bool = None,
temperature: float = None,
top_k: int = None,
top_p: float = None,
repetition_penalty: float = None,
add_BOS: bool = None,
all_probs: bool = None,
compute_logprob: bool = None,
end_strings=None,
min_length: int = None,
max_length: int = None,
init_timeout=60.0,
):
"""
Query the Triton server synchronously and return a list of responses.
Args:
prompts (List(str)): list of sentences.
use_greedy (bool): use greedy sampling, effectively the same as top_k=1
temperature (float): A parameter of the softmax function, which is the last layer in the network.
top_k (int): limits us to a certain number (K) of the top tokens to consider.
top_p (float): limits us to the top tokens within a certain probability mass (p).
repetition_penalty (float): penalty applied to repeated sequences, 1.0 means no penalty.
add_BOS (bool): whether or not to add a BOS (beginning of sentence) token.
all_probs (bool): when using compute_logprob, returns probabilities for all tokens in vocabulary.
compute_logprob (bool): get back probabilities of all tokens in the sequence.
end_strings (List(str)): list of strings which will terminate generation when they appear in the output.
min_length (int): min generated tokens.
max_length (int): max generated tokens.
init_timeout (float): timeout for the connection.
"""
prompts = str_list2numpy(prompts)
inputs = {
"prompts": prompts,
}
if use_greedy is not None:
inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_)
if temperature is not None:
inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
if top_p is not None:
inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
if repetition_penalty is not None:
inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single)
if add_BOS is not None:
inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_)
if all_probs is not None:
inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_)
if compute_logprob is not None:
inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_)
if end_strings is not None:
inputs["end_strings"] = str_list2numpy(end_strings)
if min_length is not None:
inputs["min_length"] = np.full(prompts.shape, min_length, dtype=np.int_)
if max_length is not None:
inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_)

with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client:
result_dict = client.infer_batch(**inputs)
return result_dict


class NemoQueryLLM(NemoQueryLLMBase):
103 changes: 103 additions & 0 deletions scripts/deploy/nlp/deploy_inframework_triton.py
@@ -0,0 +1,103 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import sys

from nemo.deploy import DeployPyTriton

LOGGER = logging.getLogger("NeMo")

megatron_llm_supported = True
try:
from nemo.deploy.nlp import MegatronLLMDeployable
except Exception as e:
LOGGER.warning(f"Cannot import MegatronLLMDeployable, it will not be available. {type(e).__name__}: {e}")
megatron_llm_supported = False


def get_args(argv):
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=f"Deploy nemo models to Triton",
)
parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
parser.add_argument(
"-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
)
parser.add_argument(
"-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
)
parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment")
parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
args = parser.parse_args(argv)
return args


def get_nemo_deployable(args):
if args.nemo_checkpoint is None:
raise ValueError("In-Framework deployment requires a .nemo checkpoint")

return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus)


def nemo_deploy(argv):
args = get_args(argv)

if args.debug_mode:
loglevel = logging.DEBUG
else:
loglevel = logging.INFO

LOGGER.setLevel(loglevel)
LOGGER.info("Logging level set to {}".format(loglevel))
LOGGER.info(args)

if not megatron_llm_supported:
raise ValueError("MegatronLLMDeployable is not supported in this environment.")
triton_deployable = get_nemo_deployable(args)

try:
nm = DeployPyTriton(
model=triton_deployable,
triton_model_name=args.triton_model_name,
triton_model_version=args.triton_model_version,
max_batch_size=args.max_batch_size,
port=args.triton_port,
address=args.triton_http_address,
)

LOGGER.info("Triton deploy function will be called.")
nm.deploy()
except Exception as error:
LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
return

try:
LOGGER.info("Model serving on Triton is will be started.")
nm.serve()
except Exception as error:
LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
return

LOGGER.info("Model serving will be stopped.")
nm.stop()


if __name__ == '__main__':
nemo_deploy(sys.argv[1:])
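The script above is essentially a thin CLI wrapper: it builds a MegatronLLMDeployable from a .nemo checkpoint and hands it to DeployPyTriton. A minimal programmatic sketch of the same flow (the checkpoint path and model name are placeholders):

from nemo.deploy import DeployPyTriton
from nemo.deploy.nlp import MegatronLLMDeployable

deployable = MegatronLLMDeployable("/path/to/model.nemo", 1)  # placeholder checkpoint path, 1 GPU
nm = DeployPyTriton(
    model=deployable,
    triton_model_name="my_model",
    triton_model_version=1,
    max_batch_size=8,
    port=8000,
    address="0.0.0.0",
)
nm.deploy()
nm.serve()  # serves requests until the server is shut down
nm.stop()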
83 changes: 83 additions & 0 deletions scripts/deploy/nlp/query_inframework.py
@@ -0,0 +1,83 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import sys

from nemo.deploy.nlp.query_llm import NemoQueryLLMPyTorch


def get_args(argv):
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=f"Queries Triton server running an in-framework Nemo model",
)
parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server")
parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model")
prompt_group = parser.add_mutually_exclusive_group(required=True)
prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Prompt")
prompt_group.add_argument("-pf", "--prompt_file", required=False, type=str, help="File to read the prompt from")
parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length")
parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k")
parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p")
parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature")
parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server")

args = parser.parse_args(argv)
return args


def query_llm(
url,
model_name,
prompts,
max_output_len=128,
top_k=1,
top_p=0.0,
temperature=1.0,
init_timeout=60.0,
):
nemo_query = NemoQueryLLMPyTorch(url, model_name)
return nemo_query.query_llm(
prompts=prompts,
max_length=max_output_len,
top_k=top_k,
top_p=top_p,
temperature=temperature,
init_timeout=init_timeout,
)


def query(argv):
args = get_args(argv)

if args.prompt_file is not None:
with open(args.prompt_file, "r") as f:
args.prompt = f.read()

outputs = query_llm(
url=args.url,
model_name=args.model_name,
prompts=[args.prompt],
max_output_len=args.max_output_len,
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
init_timeout=args.init_timeout,
)
print(outputs["sentences"][0][0])


if __name__ == '__main__':
query(sys.argv[1:])
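The same query can also be issued without the CLI wrapper by using the new class directly; a hedged sketch (the URL and model name are placeholders and assume a server started with the deploy script above):

from nemo.deploy.nlp import NemoQueryLLMPyTorch

nq = NemoQueryLLMPyTorch(url="0.0.0.0", model_name="my_model")
result = nq.query_llm(
    prompts=["hello, testing GPT inference"],
    max_length=128,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    init_timeout=60.0,
)
print(result["sentences"][0][0])  # first response for the first prompt, as in the script above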
4 changes: 2 additions & 2 deletions tests/deploy/nemo_deploy.py
@@ -27,7 +27,7 @@
run_export_tests = True
try:
from nemo.deploy import DeployPyTriton
from nemo.deploy.nlp import NemoQueryLLM
from nemo.deploy.nlp import NemoQueryLLM, NemoQueryLLMPyTorch
from nemo.export import TensorRTLLM
except Exception as e:
run_export_tests = False
@@ -140,7 +140,7 @@ def run_in_framework_inference(
)
nm.deploy()
nm.run()
nq = NemoQueryLLM(url="localhost:8000", model_name=model_name)
nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name)

output_deployed = nq.query_llm(
prompts=prompt,
