# Basic experiments with GPU programming in Julia

This is roughly based on the introduction to the CuArrays package.

In [1]:
using BenchmarkTools
using Test
using Statistics

## Element-wise product experiments

Test performance of a simple (large) vector Hadamard product

In [2]:
# Problem size: 2^28 Float32 elements per vector, i.e. 1 GiB per vector
N=2^28
# Constant inputs, so every element of the product is exactly 6.0f0
x = fill(2.0f0, N)
y = fill(3.0f0, N)
;

### Single threaded on a CPU, using broadcasting:

In [3]:
# Check the result once, then time the broadcast product on the CPU.
@test all(x .* y .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the product itself
# rather than access to untyped global variables.
@benchmark $x .* $y

BenchmarkTools.Trial: 
  memory estimate:  1.00 GiB
  allocs estimate:  4
  --------------
  minimum time:     537.403 ms (0.03% GC)
  median time:      565.518 ms (4.75% GC)
  mean time:        565.881 ms (5.18% GC)
  maximum time:     607.860 ms (11.14% GC)
  --------------
  samples:          9
  evals/sample:     1

### Single threaded on a CPU, using loops:

In [4]:
"""
    serial_mul(x, y)

Return a new array, allocated with `similar(x)`, holding the element-wise product of
`x` and `y`. The product is computed by a single sequential loop.
"""
function serial_mul(x, y)
    out = similar(x)
    # eachindex over all three arrays yields indices valid for each of them
    @inbounds for idx in eachindex(x, y, out)
        out[idx] = x[idx] * y[idx]
    end
    return out
end

# Check the result once, then time the hand-written sequential loop.
@test all(serial_mul(x, y) .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark serial_mul($x, $y)

BenchmarkTools.Trial: 
  memory estimate:  1.00 GiB
  allocs estimate:  2
  --------------
  minimum time:     532.815 ms (0.03% GC)
  median time:      562.928 ms (4.81% GC)
  mean time:        563.425 ms (5.24% GC)
  maximum time:     607.163 ms (11.23% GC)
  --------------
  samples:          9
  evals/sample:     1

### Multi threaded on a CPU:

In [5]:
"""
    threaded_mul(x, y)

Return a new array, allocated with `similar(x)`, holding the element-wise product of
`x` and `y`. The index range is split across Julia threads with `Threads.@threads`;
each iteration writes a distinct element, so iterations are independent.
"""
function threaded_mul(x, y)
    out = similar(x)
    indices = eachindex(x, y, out)
    Threads.@threads for idx in indices
        @inbounds out[idx] = x[idx] * y[idx]
    end
    return out
end

# Check the result once, then time the multi-threaded loop.
@test all(threaded_mul(x, y) .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark threaded_mul($x, $y)

BenchmarkTools.Trial: 
  memory estimate:  1.00 GiB
  allocs estimate:  3
  --------------
  minimum time:     86.265 ms (12.86% GC)
  median time:      119.062 ms (35.40% GC)
  mean time:        121.156 ms (34.07% GC)
  maximum time:     190.826 ms (43.43% GC)
  --------------
  samples:          42
  evals/sample:     1

### GPU with broadcasting (multithreaded and gridded)

In [6]:
using CuArrays

# GPU counterparts of `x` and `y`: N-element arrays filled with the same constants
x_gpu = cufill(2.0f0, N)
y_gpu = cufill(3.0f0, N)
;

In [7]:
"""
    broadcast_gpu_mul(x, y)

Return the broadcast element-wise product `x .* y`. The computation is wrapped in
`CuArrays.@sync` so that the call does not return before the GPU work has finished,
which keeps timings meaningful.
"""
function broadcast_gpu_mul(x, y)
    CuArrays.@sync product = x .* y
    return product
end

# Check the result once, then time the broadcast product on the GPU.
@test all(broadcast_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark broadcast_gpu_mul($x_gpu, $y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  2.27 KiB
  allocs estimate:  62
  --------------
  minimum time:     7.666 ms (0.00% GC)
  median time:      8.759 ms (0.00% GC)
  mean time:        48.827 ms (37.97% GC)
  maximum time:     272.222 ms (47.15% GC)
  --------------
  samples:          105
  evals/sample:     1

### Single threaded on GPU (custom kernel)

In [8]:
using CUDAnative

In [9]:
"""
    serial_gpu_mul!(z, x, y)

Kernel body that writes `x[i] * y[i]` into `z[i]` for every `i` from 1 to `length(x)`.
It uses no thread or block indices, so all of the work is done sequentially.
Returns `nothing`.
"""
function serial_gpu_mul!(z, x, y)
    n = length(x)
    i = 1
    while i <= n
        @inbounds z[i] = x[i] * y[i]
        i += 1
    end
    return nothing
end

"""
    serial_gpu_mul(x, y)

Allocate an output with `similar(x)`, launch `serial_gpu_mul!` via `@cuda` with no
`threads`/`blocks` options, wait for it inside `CuArrays.@sync`, and return the output.
"""
function serial_gpu_mul(x, y)
    out = similar(x)
    CuArrays.@sync @cuda serial_gpu_mul!(out, x, y)
    return out
end

# Check the result once, then time the single-threaded GPU kernel.
@test all(serial_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark serial_gpu_mul($x_gpu, $y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  912 bytes
  allocs estimate:  34
  --------------
  minimum time:     12.004 s (0.00% GC)
  median time:      12.004 s (0.00% GC)
  mean time:        12.004 s (0.00% GC)
  maximum time:     12.004 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

### Multi threaded on GPU, single SM

In [10]:
"""
    threaded_gpu_mul!(z, x, y)

Kernel body that writes `x[i] * y[i]` into `z[i]`. Each thread starts at its own
`threadIdx().x` and advances by `blockDim().x` until it passes `length(x)`. The block
index is not used, so the work is only divided among the threads of one block.
Returns `nothing`.
"""
function threaded_gpu_mul!(z, x, y)
    n = length(x)
    hop = blockDim().x
    i = threadIdx().x
    while i <= n
        @inbounds z[i] = x[i] * y[i]
        i += hop
    end
    return nothing
end

"""
    threaded_gpu_mul(x, y)

Allocate an output with `similar(x)`, launch `threaded_gpu_mul!` with 1024 threads and
no `blocks` option, wait for it inside `CuArrays.@sync`, and return the output.
"""
function threaded_gpu_mul(x, y)
    out = similar(x)
    CuArrays.@sync @cuda threads=1024 threaded_gpu_mul!(out, x, y)
    return out
end

# Check the result once, then time the single-block GPU kernel.
@test all(threaded_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark threaded_gpu_mul($x_gpu, $y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  944 bytes
  allocs estimate:  36
  --------------
  minimum time:     87.682 ms (0.00% GC)
  median time:      89.418 ms (0.00% GC)
  mean time:        130.864 ms (15.73% GC)
  maximum time:     381.590 ms (40.54% GC)
  --------------
  samples:          39
  evals/sample:     1

### Multithreaded and gridded on GPU

In [11]:
"""
    grided_gpu_mul!(z, x, y)

Kernel body that writes `x[i] * y[i]` into `z[i]` using a grid-stride loop: each thread
starts at its global index (block offset plus `threadIdx().x`) and advances by the total
number of launched threads (`blockDim().x * gridDim().x`) until it passes `length(x)`.
Returns `nothing`.
"""
function grided_gpu_mul!(z, x, y)
    n = length(x)
    hop = blockDim().x * gridDim().x
    i = (blockIdx().x - 1)*blockDim().x + threadIdx().x
    while i <= n
        @inbounds z[i] = x[i] * y[i]
        i += hop
    end
    return nothing
end

"""
    grided_gpu_mul(x, y)

Allocate an output with `similar(x)` and launch `grided_gpu_mul!` with 64 threads per
block and enough blocks to give every element of `x` its own thread. The launch is
wrapped in `CuArrays.@sync` so the call returns only after the kernel has finished.
Returns the output array.
"""
function grided_gpu_mul(x, y)
    out = similar(x)
    threads_per_block = 64
    # Round up so a partial final block still covers the tail of the array
    nblocks = cld(length(x), threads_per_block)
    CuArrays.@sync begin
        @cuda threads=threads_per_block blocks=nblocks grided_gpu_mul!(out, x, y)
    end
    return out
end

grided_gpu_mul (generic function with 1 method)

In [12]:
# Check the result once, then time the multi-block GPU kernel.
@test all(grided_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark grided_gpu_mul($x_gpu, $y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  960 bytes
  allocs estimate:  36
  --------------
  minimum time:     7.398 ms (0.00% GC)
  median time:      8.396 ms (0.00% GC)
  mean time:        46.954 ms (35.94% GC)
  maximum time:     295.116 ms (49.44% GC)
  --------------
  samples:          111
  evals/sample:     1

### Multiple GPUs

In [13]:
# Number of GPU devices the work is spread over
numgpus = 10
# Elements per device, rounded up so the slabs together cover at least N elements
n = cld(N, numgpus)

"""
    devicefill(dev, value, shape)

Run `cufill(value, shape)` inside a `device!(dev)` scope and return the resulting array,
so that the array is created while `dev` is the selected device.
"""
function devicefill(dev, value, shape)
    return device!(() -> cufill(value, shape), dev)
end

# One n-element slab per device (device ids 0 … numgpus-1).
# z_gpus is the output; its initial contents are overwritten by multi_gpu_mul!.
x_gpus = map(dev -> devicefill(dev, 2.0f0, n), 0:(numgpus-1))
y_gpus = map(dev -> devicefill(dev, 3.0f0, n), 0:(numgpus-1))
z_gpus = map(dev -> devicefill(dev, 2.0f0, n), 0:(numgpus-1))
;

In [14]:
"""
    multi_gpu_mul!(z_gpus, x_gpus, y_gpus)

Compute `z_gpus[i] .= x_gpus[i] .* y_gpus[i]` for every slab `i`, running slab `i` on
device `i - 1`. All kernels are launched first and only then is each device
synchronized, so the devices can work concurrently. Returns `z_gpus`.

The number of devices used is the number of slabs passed in; the three collections must
have the same length.
"""
function multi_gpu_mul!(z_gpus, x_gpus, y_gpus)
    # Grid-stride element-wise product kernel
    function kernel!(z, x, y)
        index = threadIdx().x + (blockIdx().x - 1)*blockDim().x
        stride = blockDim().x * gridDim().x

        for i in index:stride:length(x)
            @inbounds z[i] = x[i] * y[i]
        end
        return nothing
    end

    # Derive the slab count from the arguments instead of the global `numgpus`;
    # eachindex also checks that the three collections have matching lengths.
    slabs = eachindex(z_gpus, x_gpus, y_gpus)

    # launch each kernel
    for i in slabs
        device!(i-1) do
            numthreads = 128
            # Round up so a partial final block still covers the tail of the slab
            numblocks = cld(length(x_gpus[i]), numthreads)
            @cuda threads=numthreads blocks=numblocks kernel!(z_gpus[i], x_gpus[i], y_gpus[i])
        end
    end
    # resynchronize each device (for timing purposes); the literal `1` is only a
    # placeholder expression for `@sync` to wrap
    for i in slabs
        device!(i-1) do
            CuArrays.@sync 1
        end
    end
    return z_gpus
end

# Copy every slab back to the host, concatenate them, and check the product.
@test all(vcat((Array.(multi_gpu_mul!(z_gpus, x_gpus, y_gpus)))...) .== 6.0f0)
# One-off timing of a single call
CuArrays.@time multi_gpu_mul!(z_gpus, x_gpus, y_gpus)
# Interpolate the globals with `$` so the benchmark times the call itself
# rather than access to untyped global variables.
@benchmark multi_gpu_mul!($z_gpus, $x_gpus, $y_gpus)

  0.001085 seconds (252 CPU allocations: 6.781 KiB)


BenchmarkTools.Trial: 
  memory estimate:  6.78 KiB
  allocs estimate:  252
  --------------
  minimum time:     838.764 μs (0.00% GC)
  median time:      977.276 μs (0.00% GC)
  mean time:        992.443 μs (0.35% GC)
  maximum time:     28.716 ms (60.22% GC)
  --------------
  samples:          4965
  evals/sample:     1

## Tensor contraction experiment

Calculate a simple tensor contraction, i.e.

$$z_{i,j,k} = \sum_{l} x_{l,i,j} y_{l, k}$$

In [15]:
using Random
# Fixed seed so the random inputs are reproducible between runs
rng = MersenneTwister(1842)

# t1 is indexed [l, i, j] and t2 is indexed [l, k]; the contraction sums over l (length 2000)
t1 = randn(rng, Float32, (2000, 600, 300))
t2 = randn(rng, Float32, (2000, 300))
;

Calculate the number of single precision floating point operations involved in this calculation:

In [16]:
# Each output element needs size(t1, 1) multiplications and size(t1, 1) - 1 additions;
# the number of output elements is the product of all non-contracted dimensions.
numspflops = prod(size(t1)[2:end]) * prod(size(t2)[2:end]) * (2 * size(t1, 1) - 1)

215946000000

### CPU single threaded approach

In [17]:
# Uninitialized output indexed [i, j, k]; it also serves as the reference for later checks
t3 = Array{Float32}(undef, size(t1, 2), size(t1, 3), size(t2, 2))

"""
    calc_t3_loop!(t3, t1, t2)

Overwrite `t3` with the contraction `t3[i,j,k] = sum over l of t1[l,i,j] * t2[l,k]`
using plain nested loops on a single thread. Returns `nothing`.
"""
function calc_t3_loop!(t3, t1, t2)
    ni, nj, nk = size(t3)
    nl = size(t1, 1)
    # i varies fastest, matching the column-major layout of t3
    for k in 1:nk, j in 1:nj, i in 1:ni
        # The first term initializes the element; the remaining terms accumulate in place
        @inbounds t3[i,j,k] = t1[1,i,j] * t2[1,k]
        for l in 2:nl
            @inbounds t3[i,j,k] += t1[l,i,j] * t2[l,k]
        end
    end
    return nothing
end

# Time the single-threaded loops; this run also fills `t3`, the reference result used later
trial = @benchmark calc_t3_loop!($t3, $t1, $t2)
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 2.19570752429342
trial = Trial(98.349 s)


BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     98.349 s (0.00% GC)
  median time:      98.349 s (0.00% GC)
  mean time:        98.349 s (0.00% GC)
  maximum time:     98.349 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

### CPU multi-threaded approach

In [18]:
# Separate uninitialized output so the result can be compared against `t3`
t3_2 = Array{Float32}(undef, size(t1, 2), size(t1, 3), size(t2, 2))

"""
    calc_t3_loop_threaded!(t3, t1, t2)

Overwrite `t3` with the contraction `t3[i,j,k] = sum over l of t1[l,i,j] * t2[l,k]`.
The outermost loop over `k` is split across Julia threads with `Threads.@threads`;
every `k` writes a disjoint slice of `t3`, so the iterations are independent.
"""
function calc_t3_loop_threaded!(t3, t1, t2)
    ni, nj, nk = size(t3)
    nl = size(t1, 1)
    Threads.@threads for k in 1:nk
        for j in 1:nj, i in 1:ni
            # The first term initializes the element; the remaining terms accumulate in place
            @inbounds t3[i,j,k] = t1[1,i,j] * t2[1,k]
            for l in 2:nl
                @inbounds t3[i,j,k] += t1[l,i,j] * t2[l,k]
            end
        end
    end
end

# Time the multi-threaded loops
trial = @benchmark calc_t3_loop_threaded!($t3_2, $t1, $t2)
# Confirm results match the single threaded results. Both sides are scalars, so use a
# plain `<` (as the other checks in this file do) rather than a broadcast `.<`.
@test maximum(abs.(t3_2 - t3)) < 2e-5*√(size(t1)[1])
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 66.07622605099742
trial = Trial(3.172 s)


BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  1
  --------------
  minimum time:     3.172 s (0.00% GC)
  median time:      3.268 s (0.00% GC)
  mean time:        3.268 s (0.00% GC)
  maximum time:     3.364 s (0.00% GC)
  --------------
  samples:          2
  evals/sample:     1

### Single GPU

In [19]:
# Device copies of the inputs and an uninitialized device output indexed [i, j, k]
t1cu = CuArray(t1)
t2cu = CuArray(t2)
t3cu = CuArray{Float32}(undef, size(t1, 2), size(t1, 3), size(t2, 2))

"""
    calc_t3_gpu!(t3, t1, t2)

Overwrite `t3` with the contraction `t3[i,j,k] = sum over l of t1[l,i,j] * t2[l,k]`
using a custom kernel. The kernel is launched with 4×4×4 threads per block and enough
blocks to cover `size(t3)`, and the launch is wrapped in `CuArrays.@sync` so the call
returns only after the kernel has finished.
"""
function calc_t3_gpu!(t3, t1, t2)
    # The x/y/z launch dimensions map to the i/j/k output indices; each dimension uses
    # a grid-stride loop (start at the global thread index, step by the total thread count).
    function kernel!(out, a, b)
        i0 = (blockIdx().x - 1)*blockDim().x + threadIdx().x
        j0 = (blockIdx().y - 1)*blockDim().y + threadIdx().y
        k0 = (blockIdx().z - 1)*blockDim().z + threadIdx().z

        di = blockDim().x * gridDim().x
        dj = blockDim().y * gridDim().y
        dk = blockDim().z * gridDim().z

        for k in k0:dk:size(out, 3), j in j0:dj:size(out, 2), i in i0:di:size(out, 1)
            # The first term initializes the element; the remaining terms accumulate in place
            @inbounds out[i,j,k] = a[1,i,j] * b[1,k]
            for l in 2:size(a, 1)
                @inbounds out[i,j,k] += a[l,i,j] * b[l,k]
            end
        end

        return nothing
    end

    blockshape = (4,4,4)
    # Round up per dimension so partial blocks cover the edges of t3
    gridshape = cld.(size(t3), blockshape)
    CuArrays.@sync begin
        @cuda threads=blockshape blocks=gridshape kernel!(t3, t1, t2)
    end
end

# Time the custom single-GPU kernel
trial = @benchmark calc_t3_gpu!($t3cu, $t1cu, $t2cu)
# Copy the device result back to the host before comparing against the CPU reference
@test maximum(abs.(Array(t3cu) - t3)) < 2e-5*√(size(t1)[1]) # confirm results match the single threaded results
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 188.65600106701163
trial = Trial(1.142 s)


BenchmarkTools.Trial: 
  memory estimate:  688 bytes
  allocs estimate:  17
  --------------
  minimum time:     1.142 s (0.00% GC)
  median time:      1.145 s (0.00% GC)
  mean time:        1.144 s (0.00% GC)
  maximum time:     1.146 s (0.00% GC)
  --------------
  samples:          5
  evals/sample:     1

### Multiple GPUs

In [20]:
"""
    allocgpuarrays(t1, t2, numgpus)

Split `t1` into `numgpus` slabs along its second dimension and, for slab `i` on device
`i - 1`, upload the slab, upload a full copy of `t2`, and allocate a matching
uninitialized output of size `(slab width, size(t1, 3), size(t2, 2))`.

Returns `(t1s, t2s, t3s)`, three vectors holding one device array per GPU. When
`numgpus` does not divide the dimension evenly, trailing slabs may be narrower or empty.
"""
function allocgpuarrays(t1, t2, numgpus)
    t1s = CuArray{Float32,3}[]
    t2s = CuArray{Float32,2}[]
    t3s = CuArray{Float32,3}[]
    slabsize = ceil(Int, size(t1, 2)/numgpus)
    for i in 1:numgpus
        # Columns owned by this device. The range is empty once earlier slabs have
        # covered the whole axis; taking its length (rather than end - start + 1)
        # keeps the output dimension from going negative in that case.
        slab = ((i-1)*slabsize + 1):min(i*slabsize, size(t1, 2))
        device!(i-1) do
            push!(t1s, CuArray(t1[:,slab,:]))
            push!(t2s, CuArray(t2))
            push!(t3s, CuArray{Float32}(undef, length(slab), size(t1, 3), size(t2, 2)))
        end
    end

    return t1s, t2s, t3s
end

# Per-device input slabs and outputs for the multi-GPU kernel run
t1s, t2s, t3s = allocgpuarrays(t1, t2, numgpus);

In [21]:
"""
    calc_t3_gpu_nosync!(t3, t1, t2)

Launch the custom contraction kernel computing
`t3[i,j,k] = sum over l of t1[l,i,j] * t2[l,k]` with 8×4×4 threads per block and enough
blocks to cover `size(t3)`. The launch is not wrapped in `CuArrays.@sync`, so the caller
is responsible for synchronizing before reading `t3`.
"""
function calc_t3_gpu_nosync!(t3, t1, t2)
    # The x/y/z launch dimensions map to the i/j/k output indices; each dimension uses
    # a grid-stride loop (start at the global thread index, step by the total thread count).
    function kernel!(out, a, b)
        i0 = (blockIdx().x - 1)*blockDim().x + threadIdx().x
        j0 = (blockIdx().y - 1)*blockDim().y + threadIdx().y
        k0 = (blockIdx().z - 1)*blockDim().z + threadIdx().z

        di = blockDim().x * gridDim().x
        dj = blockDim().y * gridDim().y
        dk = blockDim().z * gridDim().z

        for k in k0:dk:size(out, 3), j in j0:dj:size(out, 2), i in i0:di:size(out, 1)
            # The first term initializes the element; the remaining terms accumulate in place
            @inbounds out[i,j,k] = a[1,i,j] * b[1,k]
            for l in 2:size(a, 1)
                @inbounds out[i,j,k] += a[l,i,j] * b[l,k]
            end
        end

        return nothing
    end

    blockshape = (8,4,4)
    # Round up per dimension so partial blocks cover the edges of t3
    gridshape = cld.(size(t3), blockshape)
    @cuda threads=blockshape blocks=gridshape kernel!(t3, t1, t2)
end
;

In [22]:
"""
    calc_t3_gpus!(t3s, t1s, t2s)

Run `calc_t3_gpu_nosync!` on every slab, with slab `i` on device `i - 1`. All kernels
are launched first and only then is each device synchronized, so the devices can work
concurrently. Returns `nothing`; results are left in `t3s`.

The number of devices used is the number of slabs passed in; the three collections must
have the same length.
"""
function calc_t3_gpus!(t3s, t1s, t2s)
    # Derive the slab count from the arguments instead of the global `numgpus`;
    # eachindex also checks that the three collections have matching lengths.
    slabs = eachindex(t3s, t1s, t2s)

    for i in slabs
        device!(i-1) do
            calc_t3_gpu_nosync!(t3s[i], t1s[i], t2s[i])
        end
    end
    # resynchronize each device (for timing purposes); the literal `1` is only a
    # placeholder expression for `@sync` to wrap
    for i in slabs
        device!(i-1) do
            CuArrays.@sync 1
        end
    end
    return nothing
end

# Time the custom kernel spread over all GPUs
trial = @benchmark calc_t3_gpus!($t3s, $t1s, $t2s)
# Copy every slab to the host and stack them along the first dimension before comparing
@test maximum(abs.(vcat(map(Array,t3s)...) - t3)) < 2e-5*√(size(t1)[1]) # confirm results match the single threaded results
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 1423.0433677583312
trial = Trial(150.753 ms)


BenchmarkTools.Trial: 
  memory estimate:  7.56 KiB
  allocs estimate:  262
  --------------
  minimum time:     150.753 ms (0.00% GC)
  median time:      151.749 ms (0.00% GC)
  mean time:        151.752 ms (0.00% GC)
  maximum time:     152.874 ms (0.00% GC)
  --------------
  samples:          33
  evals/sample:     1

### TensorOperations library

In [23]:
import LinearAlgebra.BLAS
# Set the BLAS thread count to the value of Sys.CPU_THREADS
BLAS.set_num_threads(Sys.CPU_THREADS)

In [24]:
using TensorOperations

# Separate uninitialized output so the result can be compared against `t3`
t3_3 = Array{Float32}(undef, size(t1, 2), size(t1, 3), size(t2, 2))

"""
    calc_t3_library!(t3, t1, t2)

Overwrite `t3` with the contraction of `t1` and `t2` over their shared first index,
expressed with the `@tensor` macro from TensorOperations.
"""
function calc_t3_library!(t3, t1, t2)
    # `s` is the summed index; `a`, `b`, `c` label the three output dimensions
    @tensor t3[a,b,c] = t1[s,a,b] * t2[s,c]
end

# Time the TensorOperations implementation
trial = @benchmark calc_t3_library!($t3_3, $t1, $t2)
@test maximum(abs.(t3_3 - t3)) < 2e-5*√(size(t1)[1]) # confirm results match the single threaded results
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 791.9345819920223
trial = Trial(259.649 ms)


BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  2
  --------------
  minimum time:     259.649 ms (0.00% GC)
  median time:      272.682 ms (0.00% GC)
  mean time:        299.894 ms (0.00% GC)
  maximum time:     370.240 ms (0.00% GC)
  --------------
  samples:          17
  evals/sample:     1

### LinearAlgebra library with CPU

In [25]:
using LinearAlgebra

# Separate uninitialized output so the result can be compared against `t3`
t3_4 = Array{Float32}(undef, size(t1, 2), size(t1, 3), size(t2, 2))

"""
    calc_t3_matmul!(t3, t1, t2)

Compute the contraction `t3[i,j,k] = sum over l of t1[l,i,j] * t2[l,k]` as one matrix
product. `t1` is reshaped to an `(l, i*j)` matrix and lazily transposed, `t3` is reshaped
to an `(i*j, k)` matrix, and `mul!` writes the product into that reshaped `t3`, which is
returned.
"""
function calc_t3_matmul!(t3, t1, t2)
    nl, ni, nj = size(t1, 1), size(t1, 2), size(t1, 3)
    lhs = transpose(reshape(t1, nl, ni*nj))
    out = reshape(t3, size(t3, 1)*size(t3, 2), size(t3, 3))
    return mul!(out, lhs, t2)
end

# Time the matrix-multiplication formulation on the CPU
trial = @benchmark calc_t3_matmul!($t3_4, $t1, $t2)
@test maximum(abs.(t3_4 - t3)) < 2e-5*√(size(t1)[1]) # confirm results match the single threaded results
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 819.8135214851668
trial = Trial(252.848 ms)


BenchmarkTools.Trial: 
  memory estimate:  192 bytes
  allocs estimate:  4
  --------------
  minimum time:     252.848 ms (0.00% GC)
  median time:      263.409 ms (0.00% GC)
  mean time:        282.770 ms (0.00% GC)
  maximum time:     523.194 ms (0.00% GC)
  --------------
  samples:          18
  evals/sample:     1

### LinearAlgebra library with a single GPU

In [30]:
# Reshape t1 to an (l, i*j) matrix, transpose it, and materialize the transpose with
# `Array` on the host before uploading it
t1_2d_t_cu = CuArray(Array(transpose(reshape(t1, size(t1)[1], size(t1)[2]*size(t1)[3]))))
t2cu = CuArray(t2)
# Uninitialized (i*j, k) device output
t3_2d_cu = CuArray{Float32}(undef, size(t1)[2] * size(t1)[3], size(t2)[2])

"""
    calc_t3_matmul_gpu!(t3_2d, t1_2d_t, t2)

Write the matrix product `t1_2d_t * t2` into `t3_2d` with `LinearAlgebra.mul!`. The call
is wrapped in `CuArrays.@sync` so that it returns only after the GPU work has finished.
"""
function calc_t3_matmul_gpu!(t3_2d, t1_2d_t, t2)
    CuArrays.@sync LinearAlgebra.mul!(t3_2d, t1_2d_t, t2)
end

# Time the matrix-multiplication formulation on a single GPU
trial = @benchmark calc_t3_matmul_gpu!($t3_2d_cu, $t1_2d_t_cu, $t2cu)
# Copy the 2-D result to the host and restore the [i, j, k] shape before comparing
t3_5 = reshape(Array(t3_2d_cu), size(t1)[2], size(t1)[3], size(t2)[2])
@test maximum(abs.(t3_5 - t3)) < 2e-5*√(size(t1)[1]) # confirm results match the single threaded results
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 11540.25713847197
trial = Trial(17.653 ms)


BenchmarkTools.Trial: 
  memory estimate:  304 bytes
  allocs estimate:  6
  --------------
  minimum time:     17.653 ms (0.00% GC)
  median time:      18.712 ms (0.00% GC)
  mean time:        18.737 ms (0.00% GC)
  maximum time:     19.850 ms (0.00% GC)
  --------------
  samples:          267
  evals/sample:     1

### LinearAlgebra library with multiple GPUs

In [31]:
"""
    allocgpuarrays_2d(t1, t2, numgpus)

Reshape `t1` to an `(l, i*j)` matrix, transpose it, and split the rows of the result
into `numgpus` slabs. For slab `i` on device `i - 1`, upload the slab, upload a full copy
of `t2`, and allocate a matching uninitialized output of size
`(slab height, size(t2, 2))`.

Returns `(t1s_2d, t2s, t3s_2d)`, three vectors holding one device matrix per GPU. When
`numgpus` does not divide the row count evenly, trailing slabs may be shorter or empty.
"""
function allocgpuarrays_2d(t1, t2, numgpus)
    # Lazy transpose; the row slices taken below materialize each slab on the host
    t1_2d_t = transpose(reshape(t1, size(t1, 1), size(t1, 2)*size(t1, 3)))

    t1s_2d = CuArray{Float32,2}[]
    t2s = CuArray{Float32,2}[]
    t3s_2d = CuArray{Float32,2}[]

    nrows = size(t1_2d_t, 1)
    slabsize = ceil(Int, nrows/numgpus)

    for i in 1:numgpus
        # Rows owned by this device. The range is empty once earlier slabs have
        # covered every row; taking its length (rather than end - start + 1)
        # keeps the output dimension from going negative in that case.
        slab = ((i-1)*slabsize + 1):min(i*slabsize, nrows)
        device!(i-1) do
            push!(t1s_2d, CuArray(t1_2d_t[slab,:]))
            push!(t2s, CuArray(t2))
            push!(t3s_2d, CuArray{Float32}(undef, length(slab), size(t2, 2)))
        end
    end

    return t1s_2d, t2s, t3s_2d
end

# Per-device matrix slabs and outputs for the multi-GPU matrix product; rebinds `t2s`
t1s_2d_t, t2s, t3s_2d = allocgpuarrays_2d(t1, t2, numgpus);

In [32]:
"""
    calc_t3_matmul_gpus!(t3s_2d, t1s_2d_t, t2s)

For every slab `i`, write the matrix product `t1s_2d_t[i] * t2s[i]` into `t3s_2d[i]` with
`LinearAlgebra.mul!` while device `i - 1` is selected. All products are issued first and
only then is each device synchronized. Returns `nothing`; results are left in `t3s_2d`.

The number of devices used is the number of slabs passed in; the three collections must
have the same length.
"""
function calc_t3_matmul_gpus!(t3s_2d, t1s_2d_t, t2s)
    # Derive the slab count from the arguments instead of the global `numgpus`;
    # eachindex also checks that the three collections have matching lengths.
    slabs = eachindex(t3s_2d, t1s_2d_t, t2s)

    for i in slabs
        device!(i-1) do
            LinearAlgebra.mul!(t3s_2d[i], t1s_2d_t[i], t2s[i])
        end
    end
    # resynchronize each device (for timing purposes); the literal `1` is only a
    # placeholder expression for `@sync` to wrap
    for i in slabs
        device!(i-1) do
            CuArrays.@sync 1
        end
    end
    return nothing
end

# Time the matrix-multiplication formulation spread over all GPUs
trial = @benchmark calc_t3_matmul_gpus!($t3s_2d, $t1s_2d_t, $t2s)
# Copy every slab to the host, stack the rows, and restore the [i, j, k] shape before comparing
t3_6 = reshape(vcat(map(Array,t3s_2d)...), size(t1)[2], size(t1)[3], size(t2)[2])
@test maximum(abs.(t3_6 - t3)) < 2e-5*√(size(t1)[1]) # confirm results match the single threaded results
# BenchmarkTools reports times in nanoseconds, so flops / time is already in GFLOPS
print("GFLOPS (SP): $(numspflops/median(trial).time)\n")
@show trial

GFLOPS (SP): 90839.05163030619
trial = Trial(2.129 ms)


BenchmarkTools.Trial: 
  memory estimate:  5.53 KiB
  allocs estimate:  182
  --------------
  minimum time:     2.129 ms (0.00% GC)
  median time:      2.377 ms (0.00% GC)
  mean time:        2.397 ms (0.00% GC)
  maximum time:     7.744 ms (0.00% GC)
  --------------
  samples:          2074
  evals/sample:     1