Quality (#30)
* Ran ruff for quality

* Again + isort

* Add quality workflow on PR

* typo

* Let's not use extras while optimum-nvidia is not pip installable

* Valid syntax

* More quality

* Attempt to understand why ruff is not so happy right now

* Format

* Disable ruff verbose
mfuntowicz committed Dec 12, 2023
1 parent 77c5fa5 commit 4034bbf
Showing 34 changed files with 371 additions and 319 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/pr_quality.yml
@@ -0,0 +1,31 @@
name: Run code quality checks

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  check_code_quality:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install black ruff isort
      - name: Check quality
        run: |
          ruff check examples tests src scripts
          ruff format examples tests src scripts --check
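The same gate can be reproduced locally before opening a PR. Below is a minimal, hypothetical helper (not part of this commit) that assumes ruff is installed in the local environment and simply mirrors the two commands from the "Check quality" step:

# run_quality_checks.py -- hypothetical local mirror of the CI "Check quality" step.
# Assumes ruff is installed (python -m pip install ruff), as in the workflow above.
import subprocess
import sys

TARGETS = ["examples", "tests", "src", "scripts"]

CHECKS = [
    ["ruff", "check", *TARGETS],              # lint
    ["ruff", "format", *TARGETS, "--check"],  # verify formatting without rewriting files
]


def main() -> int:
    for cmd in CHECKS:
        print("$", " ".join(cmd))
        result = subprocess.run(cmd)
        if result.returncode != 0:
            return result.returncode
    return 0


if __name__ == "__main__":
    sys.exit(main())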
37 changes: 27 additions & 10 deletions examples/text-generation/llama.py
@@ -18,25 +18,33 @@
from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import setup_logging

-# Setup logging
+# Setup logging needs to happen before importing TRT ...
setup_logging(False)

+# ruff: disable=E402
from optimum.nvidia import TensorRTEngineBuilder, TensorRTForCausalLM
from optimum.nvidia.models.llama import LlamaWeightAdapter
-from optimum.nvidia.utils.cli import *
+from optimum.nvidia.utils.cli import (
+    postprocess_quantization_parameters,
+    register_common_model_topology_args,
+    register_optimization_profiles_args,
+    register_quantization_args,
+)


LOGGER = getLogger(__name__)


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = ArgumentParser("🤗 TensorRT-LLM Llama implementation")
    parser.add_argument("--hub-token", type=str, help="Hugging Face Hub Token to retrieve private weights.")
    register_common_model_topology_args(parser)
    register_optimization_profiles_args(parser)
    register_quantization_args(parser)  # Inject params.quantization_config
    register_triton_server_args(parser)

    parser.add_argument("model", type=str, help="The model's id or path to use.")
    parser.add_argument("output", type=Path, help="Path to store generated TensorRT engine.")
@@ -57,17 +65,22 @@
LOGGER.info(f"Exporting {args.model} to TensorRT-LLM engine at {args.output}")
if args.hub_token is not None:
from huggingface_hub import login
login(args.hub_token, )

login(
args.hub_token,
)

tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Define the target engine details
builder = TensorRTEngineBuilder.from_pretrained(args.model, adapter=LlamaWeightAdapter) \
.to(args.dtype) \
.shard(args.tensor_parallelism, args.pipeline_parallelism, args.world_size, args.gpus_per_node) \
.with_generation_profile(args.max_batch_size, args.max_prompt_length, args.max_new_tokens) \
builder = (
TensorRTEngineBuilder.from_pretrained(args.model, adapter=LlamaWeightAdapter)
.to(args.dtype)
.shard(args.tensor_parallelism, args.pipeline_parallelism, args.world_size, args.gpus_per_node)
.with_generation_profile(args.max_batch_size, args.max_prompt_length, args.max_new_tokens)
.with_sampling_strategy(args.max_beam_width)
)

# Check if we need to collect calibration samples
if args.has_quantization_step:
@@ -103,6 +116,10 @@
        max_new_tokens=args.max_new_tokens,
    )

-    print(tokenizer.decode(generated.squeeze().tolist(), ))
+    print(
+        tokenizer.decode(
+            generated.squeeze().tolist(),
+        )
+    )

    print(f"TRTLLM engines have been saved at {args.output}.")
27 changes: 27 additions & 0 deletions pyproject.toml
@@ -0,0 +1,27 @@
[tool.ruff]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "F402", "F823" ]
select = ["C", "E", "F", "I", "W"]
line-length = 119

# Ignore import violations in all `__init__.py` files.
[tool.ruff.per-file-ignores]
"__init__.py" = ["E402", "F401", "F403", "F811"]
"examples/*.py" = ["E402"]

[tool.ruff.isort]
lines-after-imports = 2
known-first-party = ["optimum"]

[tool.ruff.format]
# Like Black, use double quotes for strings.
quote-style = "double"

# Like Black, indent with spaces, rather than tabs.
indent-style = "space"

# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false

# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"
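
For context on the per-file ignores above: E402 is Ruff's "module level import not at top of file" rule, which the examples need relaxed because setup_logging(False) must run before the TensorRT-LLM imports (see llama.py earlier in this diff), while F401/F403 cover the re-export and star-import patterns that are intentional in __init__.py files. A small, hypothetical file (not part of the repository) showing what E402 flags:

# toy_example.py -- hypothetical, only to illustrate the E402 per-file ignore.
import sys  # fine: this import sits at the top of the module

# Setup work that intentionally runs before the next import
# (in the real examples this role is played by setup_logging(False)).
print("configuring logging before heavyweight imports", file=sys.stderr)

import json  # Ruff reports E402 here unless the file matches "examples/*.py"

print(json.dumps({"e402_suppressed": True}))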
24 changes: 16 additions & 8 deletions scripts/benchmark_pipelines.py
@@ -4,10 +4,10 @@
import numpy as np
import torch
from huggingface_hub import login
+from tqdm import trange
+from transformers import pipeline as raw_pipeline

from optimum.nvidia.pipelines import pipeline
-from transformers import pipeline as raw_pipeline
-from tqdm import trange


def get_transformers_pipeline(args: Namespace):
@@ -17,7 +17,8 @@ def get_transformers_pipeline(args: Namespace):
        model_kwargs={
            "device_map": "balanced",
            "max_memory": {0: "20GiB", "cpu": "64GiB"},
-        })
+        },
+    )


def get_trtllm_pipeline(args: Namespace):
Expand All @@ -31,17 +32,19 @@ def get_trtllm_pipeline(args: Namespace):
tp=args.tp,
pp=args.pp,
gpus_per_node=args.gpus_per_node,
world_size=args.world_size
world_size=args.world_size,
)


def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
tokens = ["I"] * length
tokens = " ".join(tokens)
if batch == 1:
return tokens
return [tokens] * batch

if __name__ == '__main__':

if __name__ == "__main__":
parser = ArgumentParser("Hugging Face Optimum-Nvidia Pipelines Benchmarking tool")
parser.add_argument("--token", type=str, help="Hugging Face Hub token to authenticate the request.")
parser.add_argument("--warmup", type=int, default=10, help="Number of warmup runs before collecting metrics.")
@@ -57,11 +60,16 @@ def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
parser.add_argument("--pp", type=int, default=1, help="Degree of pipeline parallelism to apply.")
parser.add_argument("--gpus-per-node", type=int, default=1, help="Number of GPUs per node.")
parser.add_argument("--world-size", type=int, help="Total number of GPUs over all the node.")
parser.add_argument("--time-to-first-token", action="store_true",
help="Indicate we will only generating a single token.")
parser.add_argument(
"--time-to-first-token", action="store_true", help="Indicate we will only generating a single token."
)
parser.add_argument("model", type=str, help="Model's id to use for the benchmark.")

args = parser.parse_args()
args.world_size = args.world_size or args.gpus_per_node

if not args.world_size:
args.world_size = args.gpus_per_node

if args.token:
login(args.token)
@@ -104,7 +112,7 @@ def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
f"(+/- {latencies.std().astype(np.uint64)})"
)
else:
num_tokens = (args.batch_size * args.output_length)
num_tokens = args.batch_size * args.output_length
tokens_per_sec = num_tokens / (latencies / 1e3)
print(
"Throughput: "
3 changes: 2 additions & 1 deletion setup.py
@@ -14,10 +14,11 @@
# limitations under the License.
import re
from distutils.core import setup

from setuptools import find_namespace_packages


-# Ensure we match the version set in optimum/neuron/version.py
+# Ensure we match the version set in optimum/nvidia/version.py
filepath = "src/optimum/nvidia/version.py"
try:
    with open(filepath) as version_file:
11 changes: 6 additions & 5 deletions src/optimum/nvidia/__init__.py
@@ -22,10 +22,11 @@
OPTIMUM_NVIDIA_CONFIG_FILE = f"{OPTIMUM_NVIDIA_CONFIG_FILENAME}.json"


-from .logging import DEFAULT_LOGGING_FMT, setup_logging
-from .version import __version__, VERSION

from .builder import TensorRTEngineBuilder
-from .runtime import TensorRTPreTrainedModel, TensorRTForCausalLM
+from .logging import DEFAULT_LOGGING_FMT, setup_logging
from .models import AutoModelForCausalLM
# from .pipelines import pipeline
+from .runtime import TensorRTForCausalLM, TensorRTPreTrainedModel
+from .version import VERSION, __version__


# from .pipelines import pipeline