# Imports

In [1]:
%matplotlib inline
import numpy as np
import math
import torch
from torch import nn
import time
from matplotlib import pyplot as plt

# Symbolic Programming

A simulation of a symbolic program (only a simulation, since we are still using the Python interpreter here).

In [2]:
def add_():
    """Return the source text of a two-argument ``add`` function.

    The text begins and ends with a newline so it can be concatenated with
    other generated snippets.
    """
    source_lines = ['', 'def add(a, b):', '    return a + b', '']
    return '\n'.join(source_lines)

def fancy_func_():
    """Return the source text of ``fancy_func``.

    The generated function adds its four arguments pairwise by calling an
    ``add`` function that must exist wherever the text is executed.
    """
    source = (
        '\n'
        'def fancy_func(a, b, c, d):\n'
        '    e = add(a, b)\n'
        '    f = add(c, d)\n'
        '    g = add(e, f)\n'
        '    return g\n'
    )
    return source

def evoke_():
    """Return the complete program text: both definitions plus a call that prints the result."""
    pieces = (add_(), fancy_func_(), 'print(fancy_func(1, 2, 3, 4))')
    return ''.join(pieces)

# Assemble the program text and show it before running it.
prog = evoke_()
print(prog)
# Compile the text in 'exec' mode (a sequence of statements) with an empty
# filename, then execute the resulting code object in the current namespace.
y = compile(prog, '', 'exec')
exec(y)


def add(a, b):
    return a + b

def fancy_func(a, b, c, d):
    e = add(a, b)
    f = add(c, d)
    g = add(e, f)
    return g
print(fancy_func(1, 2, 3, 4))
10


# Hybridization

Hybridization combines the advantages of both approaches: imperative programming, which makes the code very easy to read and debug, and symbolic programming, which offers fast execution and portability by avoiding the bottleneck of the Python interpreter.

## Hybridizing the Sequential Class

In [3]:
# Factory for networks
def get_net():
    """Build and return a fresh MLP mapping 512 features to 2 outputs.

    The layers are Linear(512, 256), ReLU, Linear(256, 128), ReLU and
    Linear(128, 2), wrapped in an ``nn.Sequential``.
    """
    layers = [
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 2),
    ]
    return nn.Sequential(*layers)

# One random sample with 512 features, matching the first Linear layer of get_net().
x = torch.randn(size=(1, 512))
net = get_net()
# The last expression is displayed as the cell output.
net(x)

tensor([[-0.0480, -0.2093]], grad_fn=<AddmmBackward0>)

In [4]:
# Compile the existing module with TorchScript and evaluate it on the same input.
net = torch.jit.script(net)
net(x)

tensor([[-0.0480, -0.2093]], grad_fn=<AddmmBackward0>)

### Benchmarking the performance improvement

While the scripted network appears to behave identically to the earlier network, we can benchmark its performance to show the improvement gained from hybridization.

In [5]:
class Timer:
    """Record multiple running times.

    The timer starts on construction. Each call to ``stop`` appends the
    number of seconds elapsed since the most recent ``start`` to ``times``.
    """
    def __init__(self):
        self.times = []
        self.start()

    def start(self):
        """Start (or restart) the timer."""
        # perf_counter is monotonic and uses the highest-resolution clock
        # available, so short intervals are not lost to wall-clock
        # granularity or system clock adjustments.
        self.tik = time.perf_counter()

    def stop(self):
        """Stop the timer, record the elapsed time in a list and return it."""
        elapsed = time.perf_counter() - self.tik
        self.times.append(elapsed)
        return elapsed

    def avg(self):
        """Return the average recorded time, or 0.0 if nothing was recorded."""
        if not self.times:
            return 0.0
        return sum(self.times) / len(self.times)

    def sum(self):
        """Return the sum of the recorded times."""
        # Inside a method body, `sum` resolves to the builtin, not this method.
        return sum(self.times)

    def cumsum(self):
        """Return the running total of the recorded times as a list."""
        return np.array(self.times).cumsum().tolist()

In [6]:
class Benchmark:
    """Context manager that prints how long its ``with`` body took.

    ``description`` is the label printed in front of the elapsed time.
    """
    def __init__(self, description='Done'):
        self.description = description

    def __enter__(self):
        # A new Timer starts counting as soon as it is constructed.
        self.timer = Timer()
        return self

    def __exit__(self, *args):
        label = self.description
        elapsed = self.timer.stop()
        print('{}: {:.4f} sec'.format(label, elapsed))

In [7]:
# Time 1000 forward passes of a freshly built eager network on `x`.
net = get_net()
with Benchmark('Without torchscript'):
    for i in range(1000): net(x)

# Script that same network and time the same 1000 forward passes.
net = torch.jit.script(net)
with Benchmark('With torchscript'):
    for i in range(1000): net(x)

Without torchscript: 0.0760 sec
With torchscript: 0.0700 sec


Here, we can see the difference in performance. As we create more complicated networks, the performance difference will matter more.

### Serialization

We can serialize (save) the model and its parameters to disk so that they can be used in a front-end-agnostic manner.

In [8]:
# Write the scripted module to the relative path 'my_mlp'.
net.save('my_mlp')

# Asynchronous Computation

## Asynchrony via backend

In [9]:
# Warmup for GPU computation
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
a = torch.randn(size=(1000, 1000), device=device)
b = torch.mm(a, a)

# Ten rounds of sampling a 1000x1000 matrix and multiplying it by itself with NumPy.
with Benchmark('numpy'):
    for _ in range(10):
        a = np.random.normal(size=(1000, 1000))
        b = np.dot(a, a)

# The same workload with torch on `device`. No synchronization call is made
# inside this block before the timer stops.
with Benchmark('torch'):
    for _ in range(10):
        a = torch.randn(size=(1000, 1000), device=device)
        b = torch.mm(a, a)

Using device: cuda
numpy: 0.5140 sec
torch: 0.0010 sec


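Clearly, operations performed with NumPy (on the CPU) are slower than the same operations performed with PyTorch on the GPU. Note, however, that the torch benchmark above contains no synchronization call, so its timer stops without waiting for the GPU; the next cell adds an explicit `torch.cuda.synchronize`.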

In [10]:
# Same workload as the 'torch' benchmark, but the timer only stops after the
# device has finished all queued work.
with Benchmark():
    for _ in range(10):
        a = torch.randn(size=(1000, 1000), device=device)
        b = torch.mm(a, a)
    # torch.cuda.synchronize only accepts CUDA devices; skip it when the
    # notebook fell back to the CPU so this cell still runs there.
    if device.type == 'cuda':
        torch.cuda.synchronize(device)

Done: 0.0020 sec


It's also clear that the advantage is not just because of the GPU. The PyTorch backend (written in C++) processes operations asynchronously, so the front end (the Python notebook) only waits for them to finish when it is asked to, as with `torch.cuda.synchronize` above.

In [11]:
# Two 1x2 tensors of ones on `device`; z is their elementwise product plus 2.
x = torch.ones((1, 2), device=device)
y = torch.ones((1, 2), device=device)
z = x * y + 2
# Displaying z as the cell output requires its actual values.
z

tensor([[3., 3.]], device='cuda:0')

Behind the scenes, the front end (Python in this instance) issues the first three statements without waiting for their results, and only waits for the backend (C++) when it has to output the value of z.

## Barriers and Blockers

In [12]:
# Each block multiplies `a` (left bound by an earlier cell) by itself and then
# converts the result in a different way before the timer stops.
# 1) Copy the result to a CPU tensor.
with Benchmark('torch conversion'):
    b = torch.mm(a, a)
    b = b.cpu()
    
# 2) Copy the result to the CPU and convert it to a NumPy array.
with Benchmark('numpy conversion'):
    b = torch.mm(a, a)
    b = b.cpu().numpy()

# 3) Reduce the result to a single Python number.
with Benchmark('scalar conversion'):
    b = torch.mm(a, a)
    b = b.sum().item()

torch conversion: 0.0020 sec
numpy conversion: 0.0010 sec
scalar conversion: 0.0200 sec


`.numpy()` and `.item()` conversions don't take advantage of asynchrony, because NumPy arrays and Python scalars need the actual values immediately. Thus, these conversions can slow down a program.

# Automatic Parallelism

## Parallel Computation on GPUs

In [13]:
# Discover the visible CUDA devices.
device_count = torch.cuda.device_count()
# Keep torch.device handles rather than human-readable names: the later cells
# pass devices[i] to torch.rand(device=...) and torch.cuda.synchronize(...),
# which need a device identifier such as 'cuda:0', not a product name.
devices = [torch.device(f'cuda:{i}') for i in range(device_count)]
for i in range(device_count):
    print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
# The multi-GPU examples only index devices[0] and devices[1], so any machine
# with two or more GPUs qualifies (not exactly two).
multiple_gpus = device_count >= 2
print(f"Multiple GPUs: {multiple_gpus}")

  Device 0: NVIDIA GeForce RTX 5070
Multiple GPUs: False


In [14]:
def run(x):
    """Multiply ``x`` by itself 50 times and return the 50 products as a list."""
    products = []
    for _ in range(50):
        products.append(x.mm(x))
    return products
# Allocate one 4000x4000 random matrix per device for the two-GPU examples.
# NOTE(review): assumes devices[0] and devices[1] are values accepted as a
# torch `device=` argument — confirm against the cell that fills `devices`.
if multiple_gpus:
    x_gpu1 = torch.rand(size=(4000, 4000), device=devices[0])
    x_gpu2 = torch.rand(size=(4000, 4000), device=devices[1])
else:
    print(f"You need at least 2 GPUs to run this example")

You need at least 2 GPUs to run this example


In [15]:
# Time the same workload on each GPU separately.
if multiple_gpus:    
    run(x_gpu1)
    run(x_gpu2)  # Warm-up all devices
    # Wait for both warm-up runs to finish so they do not leak into the timings.
    torch.cuda.synchronize(devices[0])
    torch.cuda.synchronize(devices[1])

    # Each timed block synchronizes its own device before the timer stops.
    with Benchmark('GPU1 time'):
        run(x_gpu1)
        torch.cuda.synchronize(devices[0])

    with Benchmark('GPU2 time'):
        run(x_gpu2)
        torch.cuda.synchronize(devices[1])
else:
    print(f"You need at least 2 GPUs to run this example")

You need at least 2 GPUs to run this example


In [16]:
# Issue both workloads back to back inside one timed block, with a single
# synchronize call (no device argument) before the timer stops.
if multiple_gpus:
    with Benchmark('GPU1 & GPU2'):
        run(x_gpu1)
        run(x_gpu2)
        torch.cuda.synchronize()
else:
    print(f"You need at least 2 GPUs to run this example")

You need at least 2 GPUs to run this example


## Parallel computation and communication

In [18]:
def copy_to_cpu(x, non_blocking=False):
    """Move every tensor in ``x`` to the CPU and return the results as a list.

    ``non_blocking`` is forwarded unchanged to each ``Tensor.to`` call.
    """
    cpu_copies = []
    for tensor in x:
        cpu_copies.append(tensor.to('cpu', non_blocking=non_blocking))
    return cpu_copies

# Fall back to the CPU when CUDA is unavailable. 'none' is not a valid torch
# device string, so the previous fallback raised instead of selecting a device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x_gpu1 = torch.rand(size=(4000, 4000), device=device)
run(x_gpu1)  # Warm-up
# Time the computation and the copy separately; each timer stops only after
# the device has finished its queued work.
with Benchmark('Run on GPU1'):
    y = run(x_gpu1)
    # Synchronizing is only meaningful (and only valid) on a CUDA device.
    if device.type == 'cuda':
        torch.cuda.synchronize()
with Benchmark('Copy to CPU'):
    y_cpu = copy_to_cpu(y)
    if device.type == 'cuda':
        torch.cuda.synchronize()

Run on GPU1: 0.6140 sec
Copy to CPU: 1.2468 sec


Clearly, copying the tensors to the CPU is expensive.

In [20]:
# Time computation and copy together. The copies are requested with
# non_blocking=True and a single synchronize precedes the timer stop.
with Benchmark('Run on GPU1 and copy to CPU'):
    y = run(x_gpu1)
    y_cpu = copy_to_cpu(y, True)
    # torch.cuda.synchronize() fails without CUDA; skip it on CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

Run on GPU1 and copy to CPU: 0.7340 sec
