<a href="https://colab.research.google.com/github/hmh10098/d2l/blob/main/12_1_to_3_ComputationalPerformance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def add_():
    """Return the source code of a two-argument ``add`` function as a string.

    The text starts with an empty line and ends with a newline so that it can
    be concatenated directly with other snippets.
    """
    source_lines = ['', 'def add(a, b):', '    return a + b', '']
    return '\n'.join(source_lines)

def fancy_func_():
    """Return the source code of ``fancy_func`` as a string.

    The generated function calls ``add`` three times, so ``add`` must exist
    in the namespace where this text is eventually executed.
    """
    source_lines = (
        '',
        'def fancy_func(a, b, c, d):',
        '    e = add(a, b)',
        '    f = add(c, d)',
        '    g = add(e, f)',
        '    return g',
        '',
    )
    return '\n'.join(source_lines)

def evoke_():
    """Return the complete demo program as one string.

    The program is the ``add`` source, then the ``fancy_func`` source, then a
    final statement that prints ``fancy_func(1, 2, 3, 4)``.
    """
    pieces = [add_(), fancy_func_(), 'print(fancy_func(1, 2, 3, 4))']
    return ''.join(pieces)

# Assemble the program text, show it, then compile it once and execute it.
# exec() is acceptable here because the source is built entirely from the
# string literals returned by the helper functions, not from external input.
prog = evoke_()
print(prog)
# '<string>' is the conventional filename for source that was not read from a
# file; it labels tracebacks raised by the generated code.
y = compile(prog, '<string>', 'exec')
exec(y)


def add(a, b):
    return a + b

def fancy_func(a, b, c, d):
    e = add(a, b)
    f = add(c, d)
    g = add(e, f)
    return g
print(fancy_func(1, 2, 3, 4))
10


In [2]:
!pip install d2l

Collecting d2l
  Downloading d2l-0.17.0-py3-none-any.whl (83 kB)
[?25l[K     |████                            | 10 kB 27.3 MB/s eta 0:00:01[K     |███████▉                        | 20 kB 18.6 MB/s eta 0:00:01[K     |███████████▉                    | 30 kB 11.2 MB/s eta 0:00:01[K     |███████████████▊                | 40 kB 9.1 MB/s eta 0:00:01[K     |███████████████████▊            | 51 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████▋        | 61 kB 5.8 MB/s eta 0:00:01[K     |███████████████████████████▋    | 71 kB 5.5 MB/s eta 0:00:01[K     |███████████████████████████████▌| 81 kB 6.2 MB/s eta 0:00:01[K     |████████████████████████████████| 83 kB 1.3 MB/s 
Installing collected packages: d2l
Successfully installed d2l-0.17.0


In [3]:
import torch
from torch import nn
from d2l import torch as d2l

# Factory for networks
def get_net():
    """Build a new, randomly initialized MLP.

    Returns:
        An ``nn.Sequential`` of Linear(512, 256) -> ReLU -> Linear(256, 128)
        -> ReLU -> Linear(128, 2).
    """
    layers = [
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 2),
    ]
    return nn.Sequential(*layers)

# One random input row with 512 features, passed through a freshly built
# network; the last expression displays the (1, 2) output.
x = torch.randn(size=(1, 512))
net = get_net()
net(x)

tensor([[-0.0728,  0.0168]], grad_fn=<AddmmBackward0>)

In [4]:
# Replace the eager network with its TorchScript-compiled version and run the
# same input through it.
net = torch.jit.script(net)
net(x)

tensor([[-0.0728,  0.0168]], grad_fn=<AddmmBackward0>)

In [5]:
class Benchmark:
    """Context manager that prints how long its ``with`` body took.

    Args:
        description: Label printed in front of the elapsed time.
            Defaults to ``'Done'``.
    """

    def __init__(self, description='Done'):
        self.description = description

    def __enter__(self):
        # A new timer is created on every entry, so one instance can be reused.
        self.timer = d2l.Timer()
        return self

    def __exit__(self, *args):
        # Returns None, so an exception raised inside the body still propagates.
        elapsed = self.timer.stop()
        print(f'{self.description}: {elapsed:.4f} sec')

In [6]:
# Baseline: time 1000 forward passes of a freshly built eager network.
net = get_net()
with Benchmark('Without torchscript'):
    for i in range(1000):
        net(x)

# Script that same network and time the identical loop for comparison.
net = torch.jit.script(net)
with Benchmark('With torchscript'):
    for i in range(1000):
        net(x)

Without torchscript: 0.1384 sec
With torchscript: 0.1092 sec


In [7]:
# Write the scripted network to the file 'my_mlp' in the working directory,
# then list it to confirm that it exists and see its size.
net.save('my_mlp')
!ls -lh my_mlp*

-rw-r--r-- 1 root root 651K Nov 22 10:25 my_mlp


# 12.2 Asynchronous Computation


In [8]:
import os
import subprocess
import numpy
import torch
from torch import nn
from d2l import torch as d2l

In [9]:
# Warmup for GPU computation
# One untimed matrix product on `device` before any measurement is taken.
device = d2l.try_gpu()
a = torch.randn(size=(1000, 1000), device=device)
b = torch.mm(a, a)

# Ten products of 1000x1000 Gaussian matrices with NumPy.
with d2l.Benchmark('numpy'):
    for _ in range(10):
        a = numpy.random.normal(size=(1000, 1000))
        b = numpy.dot(a, a)

# The same workload with torch on `device`. Nothing in this block waits for
# the device. NOTE(review): on a CUDA device this presumably measures only how
# long it takes to enqueue the work, not to finish it — confirm.
with d2l.Benchmark('torch'):
    for _ in range(10):
        a = torch.randn(size=(1000, 1000), device=device)
        b = torch.mm(a, a)

numpy: 1.1895 sec
torch: 0.0011 sec


In [10]:
# Same torch workload, but block on `device` before leaving the timed region
# so the timer stops only after the queued work has completed.
with d2l.Benchmark():
    for _ in range(10):
        a = torch.randn(size=(1000, 1000), device=device)
        b = torch.mm(a, a)
    torch.cuda.synchronize(device)

Done: 0.0212 sec


In [11]:
# Element-wise arithmetic on two (1, 2) tensors of ones stored on `device`;
# the final expression displays the result.
x = torch.ones((1, 2), device=device)
y = torch.ones((1, 2), device=device)
z = x * y + 2
z

tensor([[3., 3.]], device='cuda:0')

In [13]:
# Devices used by the benchmarks below, as reported by d2l.
# NOTE(review): presumably a list of every visible GPU — confirm the fallback when none exist.
devices = d2l.try_all_gpus()

def run(x, num_products=50):
    """Multiply ``x`` by itself repeatedly and return every product.

    Args:
        x: A square 2-D tensor (``x.mm(x)`` must be defined).
        num_products: How many times to compute ``x.mm(x)``. Defaults to 50,
            the workload size used throughout this notebook.

    Returns:
        A list with ``num_products`` entries, each the result of ``x.mm(x)``.
    """
    return [x.mm(x) for _ in range(num_products)]

# Two independent 4000x4000 uniform random matrices used as benchmark inputs.
# NOTE(review): both are allocated on devices[0], so despite its name x_gpu2
# does not live on a second device — confirm this is intended (e.g. for a
# single-GPU runtime).
x_gpu1 = torch.rand(size=(4000, 4000), device=devices[0])
x_gpu2 = torch.rand(size=(4000, 4000), device=devices[0])

In [16]:
# Untimed pass over both inputs, then wait for devices[0] to finish, so the
# measurements below start from an idle device.
run(x_gpu1)
run(x_gpu2)  # Warm-up all devices
torch.cuda.synchronize(devices[0])

# Time each workload on its own; the synchronize inside each block makes the
# timer wait for devices[0] to finish before stopping.
with d2l.Benchmark('GPU1 time'):
    run(x_gpu1)
    torch.cuda.synchronize(devices[0])

with d2l.Benchmark('GPU2 time'):
    run(x_gpu2)
    torch.cuda.synchronize(devices[0])

GPU1 time: 2.9992 sec
GPU2 time: 2.9944 sec


In [18]:
# Display the current value of the notebook-global `y`.
y

tensor([[1., 1.]], device='cuda:0')

In [20]:
def copy_to_cpu(x, non_block=False):
    """Move every tensor in ``x`` to the CPU and return them as a list.

    Args:
        x: Iterable of tensors.
        non_block: Forwarded as ``non_blocking`` to ``Tensor.to``.

    Returns:
        A list with one ``.to('cpu', ...)`` result per element of ``x``,
        in the same order.
    """
    on_host = []
    for tensor in x:
        on_host.append(tensor.to('cpu', non_blocking=non_block))
    return on_host

# Step 1: compute the products and wait until the device has finished them.
with d2l.Benchmark('Run on GPU1'):
    y = run(x_gpu1)
    torch.cuda.synchronize()

# Step 2: copy the finished results to the CPU using the default
# (non_block=False) transfers, timed separately from the computation.
with d2l.Benchmark('Copy to CPU'):
  y_cpu = copy_to_cpu(y)
  torch.cuda.synchronize()

Run on GPU1: 2.9984 sec
Copy to CPU: 2.0773 sec


In [23]:
# Compute and copy inside one timed region, this time requesting non-blocking
# copies (non_block=True); the single synchronize at the end waits for all
# outstanding work before the timer stops.
with d2l.Benchmark('Run on GPU1 and copy to CPU'):
  y = run(x_gpu1)
  y_cpu = copy_to_cpu(y, True)
  torch.cuda.synchronize()

Run on GPU1 and copy to CPU: 3.8908 sec
