One fake input for ResNet-101 performance benchmark

sublee authored and GitHub Enterprise committed Aug 19, 2019
1 parent 49a8f5f commit eeadfdf
Showing 4 changed files with 23 additions and 42 deletions.
docs/benchmarks.rst: 10 changes (5 additions, 5 deletions)
@@ -10,11 +10,11 @@ ResNet-101 Performance Benchmark
========== =================== =======
Experiment Throughput Speedup
========== =================== =======
-naive-1    100.506 samples/sec 1.000x
-pipeline-1  73.925 samples/sec 0.736x
-pipeline-2 135.691 samples/sec 1.350x
-pipeline-4 230.216 samples/sec 2.291x
-pipeline-8 312.945 samples/sec 3.114x
+naive-1    100.922 samples/sec 1.000x
+pipeline-1  74.128 samples/sec 0.735x
+pipeline-2 136.929 samples/sec 1.357x
+pipeline-4 238.058 samples/sec 2.359x
+pipeline-8 328.563 samples/sec 3.256x
========== =================== =======

The code is reproducible on Tesla P40 GPUs, and the experiment details
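A note on the Speedup column: it is each configuration's throughput divided by the naive-1 baseline, so the updated figures can be checked directly. A quick sketch, using only numbers copied from the table above:

```python
# Speedup is throughput relative to the naive-1 baseline; values are copied
# from the updated table above.
baseline = 100.922  # naive-1, samples/sec

for name, throughput in [('pipeline-1', 74.128),
                         ('pipeline-2', 136.929),
                         ('pipeline-4', 238.058),
                         ('pipeline-8', 328.563)]:
    print('%-10s %.3fx' % (name, throughput / baseline))
# pipeline-1 0.735x
# pipeline-2 1.357x
# pipeline-4 2.359x
# pipeline-8 3.256x
```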
examples/resnet101_performance_benchmark/README.md: 16 changes (8 additions, 8 deletions)
@@ -6,25 +6,25 @@ reported in Figure 3(b) of the GPipe paper.
The benchmark cares only about training performance rather than the model's
accuracy. The batch size is adjusted to achieve higher throughput without any
large batch training tricks. This example also doesn't feed an actual dataset like
-ImageNet or CIFAR-100. Instead, a fake dataset with 50k 3×224×224 tensors is
-used to eliminate data loading overhead.
+ImageNet or CIFAR-100. Instead, fake 3×224×224 tensors over 10 labels are used
+to eliminate data loading overhead.

Every experiment setting is optimized for Tesla P40 GPUs.

## Result

Experiment | Throughput | Speed up
---------- | ------------------: | -------:
-naive-1    | 100.506 samples/sec | 1.000x
-pipeline-1 |  73.925 samples/sec | 0.736x
-pipeline-2 | 135.691 samples/sec | 1.350x
-pipeline-4 | 230.216 samples/sec | 2.291x
-pipeline-8 | 312.945 samples/sec | 3.114x
+naive-1    | 100.922 samples/sec | 1.000x
+pipeline-1 |  74.128 samples/sec | 0.735x
+pipeline-2 | 136.929 samples/sec | 1.357x
+pipeline-4 | 238.058 samples/sec | 2.359x
+pipeline-8 | 328.563 samples/sec | 3.256x

## Optimized Environment

- Python 3.6.7
-- PyTorch 1.1.0
+- PyTorch 1.2.0
- CUDA 9.0.176
- 8 Tesla P40 GPUs
- 8+ Intel E5-2650 v4 CPUs
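As an aside, a minimal sketch of the kind of fake batch the README now describes; `batch_size` here is a hypothetical placeholder, since the real runs pick it per experiment setting:

```python
import torch

batch_size = 128  # hypothetical; each experiment setting uses its own batch size

# One fake ImageNet-like batch: random 3x224x224 images plus random labels
# drawn from 10 classes, so no data loading or augmentation is involved.
fake_images = torch.rand(batch_size, 3, 224, 224)
fake_labels = torch.randint(10, (batch_size,))
```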
examples/resnet101_performance_benchmark/main.py: 37 changes (9 additions, 28 deletions)
@@ -1,6 +1,5 @@
"""ResNet-101 Performance Benchmark"""
import platform
-import random
import time
from typing import Any, Callable, Dict, List, Optional, Tuple, cast

@@ -9,7 +8,6 @@
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import SGD
-from torch.utils.data import DataLoader

from resnet import resnet101
from torchgpipe import GPipe
@@ -77,14 +75,6 @@ def pipeline8(model: nn.Module, devices: List[int]) -> Stuffs:
}


-class RandomDataset(torch.utils.data.Dataset):
-    def __len__(self) -> int:
-        return 50000
-
-    def __getitem__(self, i: int) -> Tuple[torch.Tensor, int]:
-        return torch.rand(3, 224, 224), random.randrange(10)


BASE_TIME: float = 0


@@ -170,17 +160,10 @@ def cli(ctx: click.Context,
    out_device = _devices[-1]

    # This experiment cares about only training performance, rather than
-    # accuracy. To eliminate any overhead due to data loading, we use a fake
-    # dataset with random 224x224 images over 10 labels.
-    dataset = RandomDataset()
-    loader = DataLoader(
-        dataset,
-        batch_size=batch_size,
-        shuffle=False,
-        num_workers=1,
-        pin_memory=True,
-        drop_last=False,
-    )
+    # accuracy. To eliminate any overhead due to data loading, we use fake
+    # random 224x224 images over 10 labels.
+    input = torch.rand(batch_size, 3, 224, 224, device=in_device)
+    target = torch.randint(10, (batch_size,), device=out_device)

# HEADER ======================================================================================

@@ -203,11 +186,9 @@ def run_epoch(epoch: int) -> Tuple[float, float]:
        tick = time.time()

        data_trained = 0
-        for i, (input, target) in enumerate(loader):
-            data_trained += len(input)
-
-            input = input.to(in_device, non_blocking=True)
-            target = target.to(out_device, non_blocking=True)
+        steps = 50000 // batch_size
+        for i in range(steps):
+            data_trained += batch_size

            output = model(input)
            loss = F.cross_entropy(output, target)
@@ -217,7 +198,7 @@
            optimizer.zero_grad()

            # 00:01:02 | 1/20 epoch (42%) | 200.000 samples/sec (estimated)
-            percent = i / len(loader) * 100
+            percent = i / steps * 100
            throughput = data_trained / (time.time()-tick)
            log('%d/%d epoch (%d%%) | %.3f samples/sec (estimated)'
                '' % (epoch+1, epochs, percent, throughput), clear=True, nl=False)
@@ -227,7 +208,7 @@

        # 00:02:03 | 1/20 epoch | 200.000 samples/sec, 123.456 sec/epoch
        elapsed_time = tock - tick
-        throughput = len(dataset) / elapsed_time
+        throughput = batch_size * steps / elapsed_time
        log('%d/%d epoch | %.3f samples/sec, %.3f sec/epoch'
            '' % (epoch+1, epochs, throughput, elapsed_time), clear=True)

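Taken together, the main.py changes above replace the DataLoader-driven epoch with a single fake batch that is reused for every step. A simplified, self-contained sketch of that pattern follows; it uses a tiny stand-in model on a single device instead of the GPipe pipelines, and `batch_size` is a hypothetical placeholder:

```python
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD


class TinyNet(nn.Module):
    """Stand-in for the real ResNet-101 pipeline; any 10-class classifier works."""

    def __init__(self) -> None:
        super().__init__()
        self.conv = nn.Conv2d(3, 8, kernel_size=3, stride=2)
        self.fc = nn.Linear(8, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.conv(x))
        x = F.adaptive_avg_pool2d(x, 1).flatten(1)
        return self.fc(x)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TinyNet().to(device)
optimizer = SGD(model.parameters(), lr=0.1)

batch_size = 32                  # hypothetical; the real runs tune this per setting
steps = 50000 // batch_size      # same epoch size as the removed 50k-sample dataset

# One fake input, created once and reused for every step (the point of this commit):
input = torch.rand(batch_size, 3, 224, 224, device=device)
target = torch.randint(10, (batch_size,), device=device)

tick = time.time()
for _ in range(steps):
    output = model(input)
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
elapsed = time.time() - tick

print('%.3f samples/sec, %.3f sec/epoch' % (batch_size * steps / elapsed, elapsed))
```

In the actual script the input is created on the first pipeline device and the target on the last, matching where the pipelined model consumes its input and produces its output.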
examples/resnet101_performance_benchmark/requirements.txt: 2 changes (1 addition, 1 deletion)
@@ -1,2 +1,2 @@
click==7.0
-torch==1.1.0
+torch==1.2.0
