Quality (#30)
* Ran ruff for quality

* Again + isort

* Add quality workflow on PR

* typo

* Let's not use extras while optimum-nvidia is not pip installable

* Valid syntax

* More quality

* Attempt to understand why ruff is not so happy right now

* Format

* Disable ruff verbose
mfuntowicz committed Dec 12, 2023
1 parent 77c5fa5 commit 4034bbf
Showing 34 changed files with 371 additions and 319 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/pr_quality.yml
@@ -0,0 +1,31 @@
name: Run code quality checks

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  check_code_quality:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install black ruff isort
      - name: Check quality
        run: |
          ruff check examples tests src scripts
          ruff format examples tests src scripts --check
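The same gate can be reproduced locally before opening a PR. Below is a minimal, hypothetical helper (not part of this commit) that assumes ruff is installed in the local environment and simply mirrors the two commands from the "Check quality" step:

# run_quality_checks.py -- hypothetical local mirror of the CI "Check quality" step.
# Assumes ruff is installed (python -m pip install ruff), as in the workflow above.
import subprocess
import sys

TARGETS = ["examples", "tests", "src", "scripts"]

CHECKS = [
    ["ruff", "check", *TARGETS],              # lint
    ["ruff", "format", *TARGETS, "--check"],  # verify formatting without rewriting files
]


def main() -> int:
    for cmd in CHECKS:
        print("$", " ".join(cmd))
        result = subprocess.run(cmd)
        if result.returncode != 0:
            return result.returncode
    return 0


if __name__ == "__main__":
    sys.exit(main())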
37 changes: 27 additions & 10 deletions examples/text-generation/llama.py
@@ -18,25 +18,33 @@
from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import setup_logging

-# Setup logging
+# Setup logging needs to happen before importing TRT ...
setup_logging(False)

+# ruff: disable=E402
from optimum.nvidia import TensorRTEngineBuilder, TensorRTForCausalLM
from optimum.nvidia.models.llama import LlamaWeightAdapter
-from optimum.nvidia.utils.cli import *
+from optimum.nvidia.utils.cli import (
+    postprocess_quantization_parameters,
+    register_common_model_topology_args,
+    register_optimization_profiles_args,
+    register_quantization_args,
+)


LOGGER = getLogger(__name__)


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = ArgumentParser("🤗 TensorRT-LLM Llama implementation")
    parser.add_argument("--hub-token", type=str, help="Hugging Face Hub Token to retrieve private weights.")
    register_common_model_topology_args(parser)
    register_optimization_profiles_args(parser)
    register_quantization_args(parser)  # Inject params.quantization_config
    register_triton_server_args(parser)

    parser.add_argument("model", type=str, help="The model's id or path to use.")
    parser.add_argument("output", type=Path, help="Path to store generated TensorRT engine.")
@@ -57,17 +65,22 @@
LOGGER.info(f"Exporting {args.model} to TensorRT-LLM engine at {args.output}")
if args.hub_token is not None:
from huggingface_hub import login
login(args.hub_token, )

login(
args.hub_token,
)

tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Define the target engine details
builder = TensorRTEngineBuilder.from_pretrained(args.model, adapter=LlamaWeightAdapter) \
.to(args.dtype) \
.shard(args.tensor_parallelism, args.pipeline_parallelism, args.world_size, args.gpus_per_node) \
.with_generation_profile(args.max_batch_size, args.max_prompt_length, args.max_new_tokens) \
builder = (
TensorRTEngineBuilder.from_pretrained(args.model, adapter=LlamaWeightAdapter)
.to(args.dtype)
.shard(args.tensor_parallelism, args.pipeline_parallelism, args.world_size, args.gpus_per_node)
.with_generation_profile(args.max_batch_size, args.max_prompt_length, args.max_new_tokens)
.with_sampling_strategy(args.max_beam_width)
)

# Check if we need to collect calibration samples
if args.has_quantization_step:
@@ -103,6 +116,10 @@
        max_new_tokens=args.max_new_tokens,
    )

-    print(tokenizer.decode(generated.squeeze().tolist(), ))
+    print(
+        tokenizer.decode(
+            generated.squeeze().tolist(),
+        )
+    )

    print(f"TRTLLM engines have been saved at {args.output}.")
27 changes: 27 additions & 0 deletions pyproject.toml
@@ -0,0 +1,27 @@
[tool.ruff]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "F402", "F823" ]
select = ["C", "E", "F", "I", "W"]
line-length = 119

# Ignore import violations in all `__init__.py` files.
[tool.ruff.per-file-ignores]
"__init__.py" = ["E402", "F401", "F403", "F811"]
"examples/*.py" = ["E402"]

[tool.ruff.isort]
lines-after-imports = 2
known-first-party = ["optimum"]

[tool.ruff.format]
# Like Black, use double quotes for strings.
quote-style = "double"

# Like Black, indent with spaces, rather than tabs.
indent-style = "space"

# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false

# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"
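
For context on the per-file ignores above: E402 is Ruff's "module level import not at top of file" rule, which the examples need relaxed because setup_logging(False) must run before the TensorRT-LLM imports (see llama.py earlier in this diff), while F401/F403 cover the re-export and star-import patterns that are intentional in __init__.py files. A small, hypothetical file (not part of the repository) showing what E402 flags:

# toy_example.py -- hypothetical, only to illustrate the E402 per-file ignore.
import sys  # fine: this import sits at the top of the module

# Setup work that intentionally runs before the next import
# (in the real examples this role is played by setup_logging(False)).
print("configuring logging before heavyweight imports", file=sys.stderr)

import json  # Ruff reports E402 here unless the file matches "examples/*.py"

print(json.dumps({"e402_suppressed": True}))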
24 changes: 16 additions & 8 deletions scripts/benchmark_pipelines.py
@@ -4,10 +4,10 @@
import numpy as np
import torch
from huggingface_hub import login
+from tqdm import trange
+from transformers import pipeline as raw_pipeline

from optimum.nvidia.pipelines import pipeline
-from transformers import pipeline as raw_pipeline
-from tqdm import trange


def get_transformers_pipeline(args: Namespace):
@@ -17,7 +17,8 @@ def get_transformers_pipeline(args: Namespace):
        model_kwargs={
            "device_map": "balanced",
            "max_memory": {0: "20GiB", "cpu": "64GiB"},
-        })
+        },
+    )


def get_trtllm_pipeline(args: Namespace):
Expand All @@ -31,17 +32,19 @@ def get_trtllm_pipeline(args: Namespace):
tp=args.tp,
pp=args.pp,
gpus_per_node=args.gpus_per_node,
world_size=args.world_size
world_size=args.world_size,
)


def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
tokens = ["I"] * length
tokens = " ".join(tokens)
if batch == 1:
return tokens
return [tokens] * batch

if __name__ == '__main__':

if __name__ == "__main__":
parser = ArgumentParser("Hugging Face Optimum-Nvidia Pipelines Benchmarking tool")
parser.add_argument("--token", type=str, help="Hugging Face Hub token to authenticate the request.")
parser.add_argument("--warmup", type=int, default=10, help="Number of warmup runs before collecting metrics.")
@@ -57,11 +60,16 @@ def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
parser.add_argument("--pp", type=int, default=1, help="Degree of pipeline parallelism to apply.")
parser.add_argument("--gpus-per-node", type=int, default=1, help="Number of GPUs per node.")
parser.add_argument("--world-size", type=int, help="Total number of GPUs over all the node.")
parser.add_argument("--time-to-first-token", action="store_true",
help="Indicate we will only generating a single token.")
parser.add_argument(
"--time-to-first-token", action="store_true", help="Indicate we will only generating a single token."
)
parser.add_argument("model", type=str, help="Model's id to use for the benchmark.")

args = parser.parse_args()
args.world_size = args.world_size or args.gpus_per_node

if not args.world_size:
args.world_size = args.gpus_per_node

if args.token:
login(args.token)
@@ -104,7 +112,7 @@ def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
f"(+/- {latencies.std().astype(np.uint64)})"
)
else:
num_tokens = (args.batch_size * args.output_length)
num_tokens = args.batch_size * args.output_length
tokens_per_sec = num_tokens / (latencies / 1e3)
print(
"Throughput: "
3 changes: 2 additions & 1 deletion setup.py
@@ -14,10 +14,11 @@
# limitations under the License.
import re
from distutils.core import setup

from setuptools import find_namespace_packages


-# Ensure we match the version set in optimum/neuron/version.py
+# Ensure we match the version set in optimum/nvidia/version.py
filepath = "src/optimum/nvidia/version.py"
try:
    with open(filepath) as version_file:
11 changes: 6 additions & 5 deletions src/optimum/nvidia/__init__.py
@@ -22,10 +22,11 @@
OPTIMUM_NVIDIA_CONFIG_FILE = f"{OPTIMUM_NVIDIA_CONFIG_FILENAME}.json"


-from .logging import DEFAULT_LOGGING_FMT, setup_logging
-from .version import __version__, VERSION

from .builder import TensorRTEngineBuilder
-from .runtime import TensorRTPreTrainedModel, TensorRTForCausalLM
+from .logging import DEFAULT_LOGGING_FMT, setup_logging
from .models import AutoModelForCausalLM
# from .pipelines import pipeline
+from .runtime import TensorRTForCausalLM, TensorRTPreTrainedModel
+from .version import VERSION, __version__


# from .pipelines import pipeline