huggingface · muellerzr · Jan 25, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/src/accelerate/inference.py b/src/accelerate/inference.py
@@ -9,7 +9,9 @@
 from .utils import (
     calculate_maximum_sizes,
     convert_bytes,
+    ignorant_find_batch_size,
     infer_auto_device_map,
+    pad_input_tensors,
     send_to_device,
 )
 
@@ -42,6 +44,19 @@ def generate_device_map(model, num_processes: int = 1, no_split_module_classes=N
     return device_map
 
 
+def find_pippy_batch_size(args, kwargs):
+    """Return the first batch size found in `args`, then in `kwargs` values, or `None` if there is none."""
+    # Positional inputs are searched first. Returning on the first hit keeps a
+    # batch size found in `args` from being overwritten by the `kwargs` scan,
+    # which previously ran unconditionally and could even reset it to `None`.
+    for value in (*args, *kwargs.values()):
+        found_batch_size = ignorant_find_batch_size(value)
+        if found_batch_size is not None:
+            return found_batch_size
+    # Nothing in `args` or `kwargs` exposed a batch size.
+    return None
+
+
 def build_pipeline(model, split_points, args, kwargs) -> PipelineStage:
     """
     Attaches the split points to the model based on `self.device_map` and generates a `PipelineStage`. Requires passing
@@ -50,6 +65,10 @@ def build_pipeline(model, split_points, args, kwargs) -> PipelineStage:
     # We need to annotate the split points in the model for PiPPy
     state = PartialState()
     annotate_split_points(model, {split_point: PipeSplitWrapper.SplitPoint.BEGINNING for split_point in split_points})
+    found_batch_size = find_pippy_batch_size(args, kwargs)
+    if found_batch_size != state.num_processes:
+        args = pad_input_tensors(args, found_batch_size, state.num_processes)
+        kwargs = pad_input_tensors(kwargs, found_batch_size, state.num_processes)
     pipe = Pipe.from_tracing(model, num_chunks=state.num_processes, example_args=args, example_kwargs=kwargs)
     stage = PipelineStage(pipe, state.local_process_index, device=state.device)
 
@@ -59,9 +78,17 @@ def build_pipeline(model, split_points, args, kwargs) -> PipelineStage:
 def pippy_forward(forward, *args, **kwargs):
     state = PartialState()
     output = None
+
     if state.num_processes == 1:
         output = forward(*args, **kwargs)
     elif state.is_local_main_process:
+        found_batch_size = find_pippy_batch_size(args, kwargs)
+        if found_batch_size is None:
+            raise ValueError("Could not find batch size from args or kwargs")
+        else:
+            if found_batch_size != state.num_processes:
+                args = pad_input_tensors(args, found_batch_size, state.num_processes)
+                kwargs = pad_input_tensors(kwargs, found_batch_size, state.num_processes)
         forward(*args, **kwargs)
     elif state.is_last_process:
         output = forward()

diff --git a/src/accelerate/test_utils/scripts/external_deps/test_pippy.py b/src/accelerate/test_utils/scripts/external_deps/test_pippy.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
+from torchvision.models import resnet34
 from transformers import (
     BertConfig,
     BertForMaskedLM,
@@ -34,24 +35,28 @@
 }
 
 
-def get_model_and_data(model_name, device, num_processes: int = 2):
+def get_model_and_data_for_text(model_name, device, num_processes: int = 2):
     initializer, config, seq_len = model_to_config[model_name]
-    config = config()
-    model = initializer(config)
+    config_args = {}
+    # Eventually needed for batch inference tests on gpt-2 when bs != 1
+    # if model_name == "gpt2":
+    #     config_args["pad_token_id"] = 0
+    model_config = config(**config_args)
+    model = initializer(model_config)
     return model, torch.randint(
         low=0,
-        high=config.vocab_size,
+        high=model_config.vocab_size,
         size=(num_processes, seq_len),
         device=device,
         dtype=torch.int64,
         requires_grad=False,
     )
 
 
-def test_gpt2():
+def test_gpt2(batch_size: int = 2):
     set_seed(42)
     state = PartialState()
-    model, inputs = get_model_and_data("gpt2", "cpu", state.num_processes)
+    model, inputs = get_model_and_data_for_text("gpt2", "cpu", batch_size)
     model = prepare_pippy(model, example_args=(inputs,), no_split_module_classes=model._no_split_modules)
     # For inference args need to be a tuple
     inputs = inputs.to("cuda")
@@ -64,10 +69,10 @@ def test_gpt2():
         assert output is not None, "Output was not generated in the last process!"
 
 
-def test_t5():
+def test_t5(batch_size: int = 2):
     set_seed(42)
     state = PartialState()
-    model, inputs = get_model_and_data("t5", "cpu", state.num_processes)
+    model, inputs = get_model_and_data_for_text("t5", "cpu", batch_size)
     example_inputs = {"input_ids": inputs, "decoder_input_ids": inputs}
     model = prepare_pippy(
         model,
@@ -85,13 +90,41 @@ def test_t5():
         assert output is not None, "Output was not generated in the last process!"
 
 
+def test_resnet(batch_size: int = 2):
+    """Run a `resnet34` wrapped by `prepare_pippy` and check which process receives the output."""
+    set_seed(42)
+    state = PartialState()
+    model = resnet34()
+    # Random image-shaped batch: (batch_size, 3, 224, 224)
+    example = torch.rand(batch_size, 3, 224, 224)
+    model = prepare_pippy(model, example_args=(example,))
+    device_inputs = send_to_device(example, "cuda:0")
+    with torch.no_grad():
+        output = model(device_inputs)
+    # Only the last process is expected to hold the model output; every other
+    # process should get `None` back.
+    if state.is_last_process:
+        assert output is not None, "Output was not generated in the last process!"
+    else:
+        assert output is None, "Output was not generated on just the last process!"
+
+
 if __name__ == "__main__":
     state = PartialState()
     state.print("Testing pippy integration...")
     if state.distributed_type == DistributedType.MULTI_GPU:
         state.print("Testing GPT2...")
         test_gpt2()
+        # Known issue: modifying the tokenizer for batched GPT-2 inference currently fails
+        # (apparently due to references), so the batch-size-3 run below stays disabled:
+        # NameError: cannot access free variable 'chunk_args_list' where it is not associated with a value in enclosing scope
+        # test_gpt2(3)
         state.print("Testing T5...")
         test_t5()
+        test_t5(1)
+        test_t5(3)
+        state.print("Testing CV model...")
+        test_resnet()
+        test_resnet(3)
     else:
         print("Less than two GPUs found, not running tests!")
diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py
@@ -125,12 +125,14 @@
     gather_object,
     get_data_structure,
     honor_type,
+    ignorant_find_batch_size,
     initialize_tensors,
     is_namedtuple,
     is_tensor_information,
     is_torch_tensor,
     listify,
     pad_across_processes,
+    pad_input_tensors,
     recursively_apply,
     reduce,
     send_to_device,

diff --git a/src/accelerate/utils/operations.py b/src/accelerate/utils/operations.py
@@ -248,6 +248,23 @@ def find_batch_size(data):
     return data.shape[0]
 
 
+def ignorant_find_batch_size(data):
+    """
+    Lenient variant of [`utils.operations.find_batch_size`]: a `ValueError` or `TypeError` raised while
+    inspecting `data` is swallowed and `None` is returned instead.
+
+    Args:
+        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.
+
+    Returns:
+        `int` or `None`: The batch size, or `None` when `find_batch_size` raised one of the errors above.
+    """
+    try:
+        return find_batch_size(data)
+    except (ValueError, TypeError):
+        return None
+
+
 def listify(data):
     """
     Recursively finds tensors in a nested list/tuple/dictionary and converts them to a list of numbers.
@@ -591,6 +608,46 @@ def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
     )
 
 
+def pad_input_tensors(tensor, batch_size, num_processes, dim=0):
+    """
+    Zero-pads every tensor in `tensor` along `dim` so that `batch_size` grows to the next multiple of
+    `num_processes`. When `batch_size` is already a multiple, the padded size equals `batch_size`.
+
+    E.g.:
+      Tensor: ([3,4,4]) Num processes: 4 Expected result shape: ([4,4,4])
+
+    Args:
+        tensor (nested list/tuple/dictionary of `torch.Tensor`): The data to pad.
+        batch_size (`int`): The current size of the inputs along `dim`.
+        num_processes (`int`): The number of processes the padded batch must split evenly across.
+        dim (`int`, *optional*, defaults to 0): The dimension to pad.
+
+    Returns:
+        The result of `recursively_apply` with each tensor replaced by its copy padded with trailing zeros.
+    """
+
+    def _pad_input_tensors(tensor, batch_size, num_processes, dim=0):
+        # Rows missing to reach the next multiple of `num_processes`. The previous floor-division arithmetic
+        # returned non-multiples for some inputs (batch 7 on 3 processes -> 8, batch 6 on 2 processes -> 7).
+        to_pad = (-batch_size) % num_processes
+        old_size = tensor.shape
+        new_size = list(old_size)
+        # Grow the padded dimension itself; growing index 0 only matched the copy below when `dim == 0`.
+        new_size[dim] = batch_size + to_pad
+        new_tensor = tensor.new_zeros(tuple(new_size))
+        indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
+        new_tensor[indices] = tensor
+        return new_tensor
+
+    return recursively_apply(
+        _pad_input_tensors,
+        tensor,
+        error_on_other_type=True,
+        batch_size=batch_size,
+        num_processes=num_processes,
+        dim=dim,
+    )
+
+
 @verify_operation
 def reduce(tensor, reduction="mean", scale=1.0):
     """

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import os
 import pickle
 import tempfile
@@ -34,6 +33,7 @@
     find_device,
     listify,
     pad_across_processes,
+    pad_input_tensors,
     patch_environment,
     recursively_apply,
     save,
@@ -237,3 +237,68 @@ def test_pad_across_processes(self):
         with self.assertWarns(CannotPadNestedTensorWarning):
             nt2 = pad_across_processes(nt)
         self.assertIs(nt, nt2)
+
+    def test_slice_and_concatenate(self):
+        # NOTE: despite its name, this test checks the batch dimension produced by `pad_input_tensors`.
+        # First base case: 2 processes, batch size of 1 -> expect 2 items
+        num_processes = 2
+        batch_size = 1
+        batch = torch.rand(batch_size, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        assert result.shape == torch.Size([2, 4])
+
+        # Second base case: 2 processes, batch size of 3
+        num_processes = 2
+        batch_size = 3
+        batch = torch.rand(batch_size, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 4 items now
+        assert result.shape == torch.Size([4, 4])
+
+        # Third base case: 3 processes, batch size of 4
+        num_processes = 3
+        batch_size = 4
+        batch = torch.rand(batch_size, 4, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 6 items now
+        assert result.shape == torch.Size([6, 4, 4])
+
+        # Fourth base case: 4 processes, batch size of 3
+        num_processes = 4
+        batch_size = 3
+        batch = torch.rand(batch_size, 4, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 4 items now
+        assert result.shape == torch.Size([4, 4, 4])
+
+        # Fifth base case: 6 processes, batch size of 4
+        num_processes = 6
+        batch_size = 4
+        batch = torch.rand(batch_size, 4, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 6 items now
+        assert result.shape == torch.Size([6, 4, 4])
+
+        # Sixth base case: 6 processes, batch size of 1
+        num_processes = 6
+        batch_size = 1
+        batch = torch.rand(batch_size, 4, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 6 items now
+        assert result.shape == torch.Size([6, 4, 4])
+
+        # Seventh base case: 6 processes, batch size of 2
+        num_processes = 6
+        batch_size = 2
+        batch = torch.rand(batch_size, 4, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 6 items now
+        assert result.shape == torch.Size([6, 4, 4])
+
+        # Eighth base case: 6 processes, batch size of 61
+        num_processes = 6
+        batch_size = 61
+        batch = torch.rand(batch_size, 4, 4)
+        result = pad_input_tensors(batch, batch_size, num_processes)
+        # We should expect there to be 66 items now (next multiple of 6 above 61)
+        assert result.shape == torch.Size([66, 4, 4])