torch-native pipeline parallelism for big models #2345

Merged
merged 39 commits on Feb 6, 2024
Changes from 19 commits
Commits (39)
e713e28
Broken version
muellerzr Jan 16, 2024
2767bb1
Timing I would expect
muellerzr Jan 16, 2024
06f04a9
Working version!
muellerzr Jan 16, 2024
9eef9dd
Use MethodType
muellerzr Jan 17, 2024
449eb8d
working test
muellerzr Jan 17, 2024
77f8e92
Tests
muellerzr Jan 17, 2024
df7779a
Use no split module classes explicitly
muellerzr Jan 17, 2024
e3f6b99
Put split_points in pipeline
muellerzr Jan 17, 2024
8792a8c
Store split points in hf_split_points
muellerzr Jan 17, 2024
7ca4bcc
fix case num_process=1
SunMarc Jan 17, 2024
dac1daa
Allow for dynamic batch padding (#2352)
muellerzr Jan 25, 2024
364c3b6
Rm literal
muellerzr Jan 25, 2024
6a8479b
Allow users to pass in max_memory
muellerzr Jan 25, 2024
303c9cc
Note about recursion
muellerzr Jan 25, 2024
d497e8a
Document, document, document
muellerzr Jan 25, 2024
06bbc5b
Right import check
muellerzr Jan 26, 2024
5e047da
Merge branch 'main' into pippy-integration-v2
muellerzr Jan 26, 2024
a5059e6
Fix bug, add tests to multigpu runners
muellerzr Jan 26, 2024
71346a1
Change default to None
muellerzr Jan 26, 2024
fe66b93
Start of docs
muellerzr Feb 5, 2024
d2af472
Try again?
muellerzr Feb 5, 2024
8dc6c6c
Try again x2
muellerzr Feb 5, 2024
4d0aeb2
Trailing comma
muellerzr Feb 5, 2024
309b71a
Move import
muellerzr Feb 5, 2024
9f561f1
Clean
muellerzr Feb 5, 2024
d5a6fda
typehint
muellerzr Feb 5, 2024
954a668
typo
muellerzr Feb 5, 2024
853f552
From code review
muellerzr Feb 5, 2024
1362e5c
Use num_chunks
muellerzr Feb 5, 2024
68bd89b
Update tests/test_utils.py
muellerzr Feb 5, 2024
181fbda
Bad copy/paste
muellerzr Feb 5, 2024
9157cf1
hf_split_points
muellerzr Feb 6, 2024
f2c6e08
Apply suggestions from code review
muellerzr Feb 6, 2024
9f20496
Year
muellerzr Feb 6, 2024
e1961d6
Nit
muellerzr Feb 6, 2024
8c72a5e
better title
muellerzr Feb 6, 2024
3eaa967
Rephrase
muellerzr Feb 6, 2024
31fcde4
Rephrase
muellerzr Feb 6, 2024
7c3d183
Try spacing maybe?
muellerzr Feb 6, 2024
155 changes: 155 additions & 0 deletions src/accelerate/inference.py
@@ -0,0 +1,155 @@
import math
from types import MethodType

from pippy.IR import Pipe, PipeSplitWrapper, annotate_split_points
from pippy.PipelineStage import PipelineStage

from .state import PartialState
from .utils import (
calculate_maximum_sizes,
convert_bytes,
ignorant_find_batch_size,
infer_auto_device_map,
is_pippy_available,
pad_input_tensors,
send_to_device,
)


def generate_device_map(model, num_processes: int = 1, no_split_module_classes=None, max_memory: dict = None):
"""
Calculates the device map for `model` with an offset for PiPPy
"""
if num_processes == 1:
return infer_auto_device_map(model, no_split_module_classes=no_split_module_classes, clean_result=False)
if max_memory is None:
model_size, shared = calculate_maximum_sizes(model)

# Split into `n` chunks for each GPU
memory = (model_size + shared[0]) / num_processes
memory = convert_bytes(memory)
value, ending = memory.split(" ")

# Add a chunk to deal with potential extra shared memory instances
memory = math.ceil(float(value)) * 1.1
memory = f"{memory} {ending}"
max_memory = {i: memory for i in range(num_processes)}
device_map = infer_auto_device_map(
model,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes,
clean_result=False,
)
return device_map


def find_pippy_batch_size(args, kwargs):
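"""
Scans `args` and then, if nothing was found, `kwargs` for the first value with a discoverable batch
dimension and returns its size, or `None` if no batch size can be found.
"""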
found_batch_size = None
for arg in args:
found_batch_size = ignorant_find_batch_size(arg)
if found_batch_size is not None:
break
if found_batch_size is None:
for kwarg in kwargs.values():
found_batch_size = ignorant_find_batch_size(kwarg)
if found_batch_size is not None:
break
return found_batch_size


def build_pipeline(model, split_points, args, kwargs, num_chunks) -> PipelineStage:
"""
Attaches the split points to the model based on `split_points` and generates a `PipelineStage`. Requires passing
in the `args` and `kwargs` the model expects, placed on the CPU.

Users can pass in a custom `num_chunks` as an optional hyperparameter. By default it will use
`AcceleratorState.num_processes`.
"""
# We need to annotate the split points in the model for PiPPy
state = PartialState()
annotate_split_points(model, {split_point: PipeSplitWrapper.SplitPoint.BEGINNING for split_point in split_points})
found_batch_size = find_pippy_batch_size(args, kwargs)
if found_batch_size != num_chunks:
args = pad_input_tensors(args, found_batch_size, num_chunks)
kwargs = pad_input_tensors(kwargs, found_batch_size, num_chunks)
pipe = Pipe.from_tracing(model, num_chunks=num_chunks, example_args=args, example_kwargs=kwargs)
stage = PipelineStage(pipe, state.local_process_index, device=state.device)

return stage


def pippy_forward(forward, *args, **kwargs):
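"""
Dispatches `forward` based on the current process: with a single process it is a plain call of `forward`;
otherwise the first local process feeds the (padded) inputs into the pipeline, intermediate processes run
their stage without inputs, and only the last process returns the pipeline output (every other rank returns `None`).
"""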
state = PartialState()
output = None

if state.num_processes == 1:
output = forward(*args, **kwargs)
elif state.is_local_main_process:
found_batch_size = find_pippy_batch_size(args, kwargs)
if found_batch_size is None:
raise ValueError("Could not find batch size from args or kwargs")
else:
if found_batch_size != state.num_processes:
args = pad_input_tensors(args, found_batch_size, state.num_processes)
kwargs = pad_input_tensors(kwargs, found_batch_size, state.num_processes)
forward(*args, **kwargs)
elif state.is_last_process:
output = forward()
else:
forward()
return output


def prepare_pippy(
model, split_points="auto", no_split_module_classes=None, example_args=(), example_kwargs={}, num_chunks=None
):
"""
Wraps `model` for pipeline parallelism

Args:
model (`torch.nn.Module`):
A model we want to split for pipeline-parallel inference
split_points (`str` or `List[str]`, defaults to 'auto'):
How to generate the split points and chunk the model across each GPU. 'auto' will find the best balanced
split given any model; alternatively, a list of layer names to split at can be passed in directly.
no_split_module_classes (`List[str]`):
A list of class names for layers we don't want to be split.
example_args (tuple of `torch.Tensor`):
The expected inputs for the model that uses positional (order-based) inputs. Using this format is
recommended whenever possible.
example_kwargs (dict of `torch.Tensor`):
The expected inputs for the model that uses dictionary-based inputs. This is a *highly* limiting structure
that requires the same keys to be present at *all* inference calls. Not recommended unless the prior
condition is true for all cases.
num_chunks (`int`):
The number of chunks (micro-batches) the pipeline will split the input into. By default one chunk is
assigned per GPU, but this can be tuned; in general `num_chunks` should be greater than the number of GPUs.
"""
if not is_pippy_available():
raise ImportError(
"`pippy` was not found to be installed on your system. Please "
"install using `pip install git+https://github.com/pytorch/PiPPy"
)
state = PartialState()
example_args = send_to_device(example_args, "cpu")
example_kwargs = send_to_device(example_kwargs, "cpu")
if num_chunks is None:
num_chunks = state.num_processes
if split_points == "auto":
device_map = generate_device_map(model, num_chunks, no_split_module_classes=no_split_module_classes)
split_points = []
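# The device map assigns module names to process indices; the first module placed on each
# subsequent process becomes a split point (illustratively, with two processes the first
# module mapped to process 1 starts the second pipeline stage).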
for i in range(1, num_chunks):
split_points.append(next(k for k, v in device_map.items() if v == i))
stage = build_pipeline(model, split_points, example_args, example_kwargs, num_chunks)
model._original_forward = model.forward
model._original_call = model.__call__
model.pippy_stage = stage
model.hf_split_points = split_points

def forward(*args, **kwargs):
return pippy_forward(stage.forward, *args, **kwargs)

# To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
# Note: creates an infinite recursion loop with `generate`
model_forward = MethodType(forward, model)
forward.__wrapped__ = model_forward
model.forward = forward
return model
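
For reference, a minimal usage sketch of the `prepare_pippy` API added above, adapted from the test script in this PR (the GPT-2 model, input shapes, and launch setup are illustrative assumptions, not part of the diff):

import torch
from transformers import GPT2Config, GPT2ForSequenceClassification

from accelerate import PartialState
from accelerate.inference import prepare_pippy

state = PartialState()
model = GPT2ForSequenceClassification(GPT2Config())
model.eval()

# Example inputs must live on the CPU; they are only used to trace and split the model.
example_input = torch.randint(0, 1024, (2, 512), dtype=torch.int64)
model = prepare_pippy(model, example_args=(example_input,), no_split_module_classes=model._no_split_modules)

# At inference time, move the real inputs to the local device. Only the last
# process receives the gathered output; every other rank gets `None`.
with torch.no_grad():
    output = model(example_input.to(state.device))

The sketch assumes a multi-process launch (e.g. via `accelerate launch`), with one process per GPU.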
1 change: 1 addition & 0 deletions src/accelerate/test_utils/__init__.py
@@ -13,6 +13,7 @@
require_multi_gpu,
require_multi_xpu,
require_non_cpu,
require_pippy,
require_single_device,
require_single_gpu,
require_single_xpu,
130 changes: 130 additions & 0 deletions src/accelerate/test_utils/scripts/external_deps/test_pippy.py
@@ -0,0 +1,130 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torchvision.models import resnet34
from transformers import (
BertConfig,
BertForMaskedLM,
GPT2Config,
GPT2ForSequenceClassification,
T5Config,
T5ForConditionalGeneration,
)

from accelerate import PartialState
from accelerate.inference import prepare_pippy
from accelerate.utils import DistributedType, send_to_device, set_seed


model_to_config = {
"t5": (T5ForConditionalGeneration, T5Config, 1024),
"bert": (BertForMaskedLM, BertConfig, 512),
"gpt2": (GPT2ForSequenceClassification, GPT2Config, 1024),
}


def get_model_and_data_for_text(model_name, device, num_processes: int = 2):
initializer, config, seq_len = model_to_config[model_name]
config_args = {}
# Eventually needed for batch inference tests on gpt-2 when bs != 1
# if model_name == "gpt2":
# config_args["pad_token_id"] = 0
model_config = config(**config_args)
model = initializer(model_config)
return model, torch.randint(
low=0,
high=model_config.vocab_size,
size=(num_processes, seq_len),
device=device,
dtype=torch.int64,
requires_grad=False,
)


def test_gpt2(batch_size: int = 2):
set_seed(42)
state = PartialState()
model, inputs = get_model_and_data_for_text("gpt2", "cpu", batch_size)
model = prepare_pippy(model, example_args=(inputs,), no_split_module_classes=model._no_split_modules)
# For inference, args need to be a tuple
inputs = inputs.to("cuda")
with torch.no_grad():
output = model(inputs)
# Zach: Check that we just grab the real outputs we need at the end
if not state.is_last_process:
assert output is None, "Output was generated on a process other than the last one!"
else:
assert output is not None, "Output was not generated on the last process!"


def test_t5(batch_size: int = 2):
set_seed(42)
state = PartialState()
model, inputs = get_model_and_data_for_text("t5", "cpu", batch_size)
example_inputs = {"input_ids": inputs, "decoder_input_ids": inputs}
model = prepare_pippy(
model,
no_split_module_classes=model._no_split_modules,
example_kwargs=example_inputs,
)
# For inference, args need to be a tuple
inputs = send_to_device(example_inputs, "cuda:0")
with torch.no_grad():
output = model(*inputs.values())
# Zach: Check that we just grab the real outputs we need at the end
if not state.is_last_process:
assert output is None, "Output was generated on a process other than the last one!"
else:
assert output is not None, "Output was not generated on the last process!"


def test_resnet(batch_size: int = 2):
set_seed(42)
state = PartialState()
model = resnet34()
input_tensor = torch.rand(batch_size, 3, 224, 224)
model = prepare_pippy(
model,
example_args=(input_tensor,),
)
inputs = send_to_device(input_tensor, "cuda:0")
with torch.no_grad():
output = model(inputs)
# Zach: Check that we just grab the real outputs we need at the end
if not state.is_last_process:
assert output is None, "Output was generated on a process other than the last one!"
else:
assert output is not None, "Output was not generated on the last process!"


if __name__ == "__main__":
state = PartialState()
state.print("Testing pippy integration...")
if state.distributed_type == DistributedType.MULTI_GPU:
state.print("Testing GPT2...")
test_gpt2()
# Issue: When modifying the tokenizer for batch GPT2 inference, there's an issue
# due to references
# NameError: cannot access free variable 'chunk_args_list' where it is not associated with a value in enclosing scope
# test_gpt2(3)
state.print("Testing T5...")
test_t5()
test_t5(1)
test_t5(3)
state.print("Testing CV model...")
test_resnet()
test_resnet(3)
else:
print("Less than two GPUs found, not running tests!")
8 changes: 8 additions & 0 deletions src/accelerate/test_utils/testing.py
@@ -40,6 +40,7 @@
is_mps_available,
is_npu_available,
is_pandas_available,
is_pippy_available,
is_tensorboard_available,
is_timm_available,
is_torch_version,
@@ -290,6 +291,13 @@ def require_pandas(test_case):
return unittest.skipUnless(is_pandas_available(), "test requires pandas")(test_case)


def require_pippy(test_case):
"""
Decorator marking a test that requires pippy to be installed. These tests are skipped when pippy isn't installed.
"""
return unittest.skipUnless(is_pippy_available(), "test requires pippy")(test_case)


_atleast_one_tracker_available = (
any([is_wandb_available(), is_tensorboard_available()]) and not is_comet_ml_available()
)
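
A minimal sketch of how the new `require_pippy` decorator is meant to be used in a test module (the test class and test body are illustrative):

import unittest

from accelerate.test_utils import require_pippy


class PippyIntegrationTester(unittest.TestCase):
    @require_pippy
    def test_pipeline_wrapping(self):
        # Skipped automatically when pippy is not installed.
        ...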
3 changes: 3 additions & 0 deletions src/accelerate/utils/__init__.py
@@ -70,6 +70,7 @@
is_npu_available,
is_pandas_available,
is_peft_available,
is_pippy_available,
is_rich_available,
is_sagemaker_available,
is_tensorboard_available,
@@ -126,12 +127,14 @@
gather_object,
get_data_structure,
honor_type,
ignorant_find_batch_size,
initialize_tensors,
is_namedtuple,
is_tensor_information,
is_torch_tensor,
listify,
pad_across_processes,
pad_input_tensors,
recursively_apply,
reduce,
send_to_device,
23 changes: 12 additions & 11 deletions src/accelerate/utils/imports.py
@@ -38,12 +38,13 @@
_torch_distributed_available = torch.distributed.is_available()


def _is_package_available(pkg_name):
def _is_package_available(pkg_name, metadata_name=None):
# Check that we're importing the actual library and not a same-named "pkg_name" directory by trying to grab its metadata
package_exists = importlib.util.find_spec(pkg_name) is not None
if package_exists:
try:
_ = importlib.metadata.metadata(pkg_name)
# Some libraries have different names in the metadata
_ = importlib.metadata.metadata(pkg_name if metadata_name is None else metadata_name)
return True
except importlib.metadata.PackageNotFoundError:
return False
@@ -73,15 +74,7 @@ def get_ccl_version():


def is_msamp_available():
package_exists = importlib.util.find_spec("msamp") is not None
if package_exists:
try:
# MS-AMP has a different metadata name
_ = importlib.metadata.metadata("ms-amp")
return True
except importlib.metadata.PackageNotFoundError:
return False
return False
return _is_package_available("msamp", "ms-amp")


def is_transformer_engine_available():
@@ -126,6 +119,14 @@ def is_deepspeed_available():
return _is_package_available("deepspeed")


def is_pippy_available():
package_exists = _is_package_available("pippy", "torchpippy")
if package_exists:
pippy_version = version.parse(importlib.metadata.version("torchpippy"))
return compare_versions(pippy_version, ">", "0.1.1")
return False


def is_bf16_available(ignore_tpu=False):
"Checks if bf16 is supported, optionally ignoring the TPU"
if is_tpu_available():