Remove CUDA stream pool
sublee authored and GitHub Enterprise committed Sep 25, 2019
1 parent 1916ce2 commit 146b686
Showing 2 changed files with 6 additions and 19 deletions.
torchgpipe/gpipe.py (1 addition, 14 deletions)

@@ -11,7 +11,6 @@
 from torchgpipe.batchnorm import DeferredBatchNorm
 from torchgpipe.microbatch import check, gather, scatter
 from torchgpipe.pipeline import Pipeline
-from torchgpipe.stream import AbstractStream, new_stream
 
 __all__ = ['GPipe']
 
@@ -229,8 +228,6 @@ def __init__(self,
         except BalanceError as exc:
             raise ValueError(recommend_torchgpipe_balancing(str(exc)))
 
-        self._copy_streams: List[List[AbstractStream]] = []
-
     def __len__(self) -> int:
         """Counts the length of the underlying sequential module."""
         return sum(len(p) for p in self.partitions)
@@ -285,13 +282,6 @@ def to(self, *args: Any, **kwargs: Any) -> 'GPipe':
 
         return super().to(*args, **kwargs)
 
-    def _ensure_copy_streams(self) -> List[List[AbstractStream]]:
-        if not self._copy_streams:
-            for device in self.devices:
-                self._copy_streams.append([new_stream(device) for _ in range(self.chunks)])
-
-        return self._copy_streams
-
     def forward(self, input: TensorOrTensors) -> TensorOrTensors:  # type: ignore
         """:class:`GPipe` is a fairly transparent module wrapper. It doesn't
         modify the input and output signature of the underlying module. But
@@ -315,9 +305,6 @@ def forward(self, input: TensorOrTensors) -> TensorOrTensors:  # type: ignore
             # Empty sequential module is not illegal.
             return input
 
-        # Prepare separate CUDA streams only for copy.
-        copy_streams = self._ensure_copy_streams()
-
         # Divide a mini-batch into micro-batches.
         batches = scatter(input, self.chunks)
 
@@ -330,7 +317,7 @@ def forward(self, input: TensorOrTensors) -> TensorOrTensors:  # type: ignore
         checkpoint_stop = 0
 
         # Run pipeline parallelism.
-        pipeline = Pipeline(batches, self.partitions, self.devices, copy_streams, checkpoint_stop)
+        pipeline = Pipeline(batches, self.partitions, self.devices, checkpoint_stop)
        pipeline.run()
 
         # Merge the micro-batches into one mini-batch.
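
Both files import from torchgpipe.stream, which this diff leans on. Below is a minimal sketch of what new_stream plausibly looks like, assuming the module's usual CPU/CUDA split; this is a hypothetical reconstruction, and the actual torchgpipe.stream module may differ in detail.

# Sketch of torchgpipe.stream.new_stream, assuming a stream is modeled
# as either a real torch.cuda.Stream or a CPU placeholder.
# Hypothetical reconstruction; the real module may differ.
from typing import Union

import torch


class CPUStreamType:
    """Placeholder for CPU devices, which have no CUDA streams."""


CPUStream = CPUStreamType()

# An "abstract" stream is either the CPU placeholder or a CUDA stream.
AbstractStream = Union[CPUStreamType, torch.cuda.Stream]


def new_stream(device: torch.device) -> AbstractStream:
    """Creates a new stream for the given device."""
    if device.type != 'cuda':
        # There is nothing to create for a CPU device.
        return CPUStream
    return torch.cuda.Stream(device)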
torchgpipe/pipeline.py (5 additions, 5 deletions)

@@ -10,7 +10,7 @@
 from torchgpipe.copy import Copy, Wait
 from torchgpipe.dependency import Fork, Join
 from torchgpipe.microbatch import Batch
-from torchgpipe.stream import AbstractStream, CPUStream, current_stream
+from torchgpipe.stream import AbstractStream, current_stream, new_stream
 from torchgpipe.worker import Task, spawn_workers
 
 __all__: List[str] = []
@@ -64,7 +64,6 @@ def __init__(self,
                  batches: List[Batch],
                  partitions: List[nn.Sequential],
                  devices: Optional[List[torch.device]] = None,
-                 copy_streams: Optional[List[List[AbstractStream]]] = None,
                  checkpoint_stop: int = 0,
                  ) -> None:
         self.batches = batches
@@ -74,9 +73,10 @@ def __init__(self,
             devices = [torch.device('cpu') for _ in partitions]
         self.devices = devices
 
-        if copy_streams is None:
-            copy_streams = [[CPUStream] * len(batches) for _ in partitions]
-        self.copy_streams = copy_streams
+        # NOTE(sublee): We don't need to manage a pool of CUDA streams because
+        # PyTorch already manages it.
+        # See https://github.com/pytorch/pytorch/pull/9938
+        self.copy_streams = [[new_stream(d) for _ in self.batches] for d in devices]
 
         self.checkpoint_stop = checkpoint_stop
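
The NOTE in the added block carries the rationale for the whole commit: it points at pytorch/pytorch#9938 and argues that PyTorch already manages a pool of CUDA streams, so Pipeline can simply request one copy stream per device and per micro-batch instead of GPipe caching them in _ensure_copy_streams. A small usage sketch of the resulting copy_streams grid follows; the names are illustrative, it assumes the new_stream sketch above, and running it requires at least two CUDA devices.

# Sketch: build the copy-stream grid the way the new constructor line does,
# then use one stream to overlap a device-to-device copy with compute.
# Hypothetical example, not torchgpipe's actual code.
import torch

devices = [torch.device('cuda', 0), torch.device('cuda', 1)]
chunks = 4  # number of micro-batches

# Mirrors: self.copy_streams = [[new_stream(d) for _ in self.batches] for d in devices]
copy_streams = [[torch.cuda.Stream(d) for _ in range(chunks)] for d in devices]

x = torch.ones(8, 8, device=devices[0])
copy_stream = copy_streams[1][0]  # copy stream on device 1 for micro-batch 0

# Issue the copy on the dedicated stream so it can overlap with work
# running on device 1's default stream.
with torch.cuda.stream(copy_stream):
    y = x.to(devices[1], non_blocking=True)

# The consumer stream must wait for the copy before reading y.
torch.cuda.current_stream(devices[1]).wait_stream(copy_stream)
z = y * 2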
