# Basic experiments with GPU programming in Julia

This is roughly based on the CuArrays package introduction.

In [1]:
using BenchmarkTools
using Test

## Element-wise product experiments

Test performance of a simple (large) vector Hadamard product

In [2]:
# Problem size: 2^28 elements per vector.
N=2^28
# Float32 inputs chosen so that every element of the product is exactly 6.0f0.
x = fill(2.0f0, N)
y = fill(3.0f0, N)
;

### Single threaded on a CPU, using broadcasting:

In [3]:
# Check the result once, then time the allocating broadcast.
# NOTE(review): x and y are non-const globals and are not `$`-interpolated here.
@test all(x .* y .== 6.0f0)
@benchmark z = x .* y

BenchmarkTools.Trial: 
  memory estimate:  1.00 GiB
  allocs estimate:  4
  --------------
  minimum time:     581.840 ms (0.03% GC)
  median time:      662.675 ms (10.19% GC)
  mean time:        666.669 ms (9.23% GC)
  maximum time:     748.535 ms (16.65% GC)
  --------------
  samples:          8
  evals/sample:     1

### Single threaded on a CPU, using loops:

In [4]:
"""
    serial_mul(x, y)

Return a new array, allocated with `similar(x)`, that holds the element-wise
product of `x` and `y`, computed by one sequential loop.
"""
function serial_mul(x, y)
    product = similar(x)
    # Iterating `eachindex` over all three arrays throws when their indices
    # disagree, so the unchecked accesses below cannot run out of bounds.
    for idx in eachindex(x, y, product)
        @inbounds product[idx] = x[idx] * y[idx]
    end
    return product
end

# Check the result once, then time the explicit-loop version.
@test all(serial_mul(x, y) .== 6.0f0)
@benchmark z = serial_mul(x, y)

BenchmarkTools.Trial: 
  memory estimate:  1.00 GiB
  allocs estimate:  2
  --------------
  minimum time:     589.937 ms (1.79% GC)
  median time:      639.185 ms (8.84% GC)
  mean time:        637.046 ms (7.69% GC)
  maximum time:     670.135 ms (8.09% GC)
  --------------
  samples:          8
  evals/sample:     1

### Multi threaded on a CPU:

In [5]:
"""
    threaded_mul(x, y)

Return a new array, allocated with `similar(x)`, that holds the element-wise
product of `x` and `y`. The loop iterations are distributed over the available
Julia threads with `Threads.@threads`.
"""
function threaded_mul(x, y)
    product = similar(x)
    # Every iteration writes a distinct element, so the iterations are independent.
    Threads.@threads for idx in eachindex(x, y, product)
        @inbounds product[idx] = x[idx] * y[idx]
    end
    return product
end

# Check the result once, then time the multi-threaded loop.
@test all(threaded_mul(x, y) .== 6.0f0)
@benchmark z = threaded_mul(x, y)

BenchmarkTools.Trial: 
  memory estimate:  1.00 GiB
  allocs estimate:  3
  --------------
  minimum time:     112.238 ms (10.17% GC)
  median time:      154.644 ms (49.85% GC)
  mean time:        157.796 ms (48.00% GC)
  maximum time:     227.870 ms (59.72% GC)
  --------------
  samples:          32
  evals/sample:     1

### GPU with broadcasting (multithreaded and gridded)

In [6]:
using CuArrays

# GPU counterparts of x and y: same length N and the same Float32 fill values.
x_gpu = cufill(2.0f0, N)
y_gpu = cufill(3.0f0, N)
;

In [7]:
"""
    broadcast_gpu_mul(x, y)

Return the broadcast product `x .* y`. The broadcast is evaluated inside
`CuArrays.@sync` so that the function is suitable for timing.
"""
function broadcast_gpu_mul(x, y)
    CuArrays.@sync begin
        product = x .* y
    end
    return product
end

# Check the result once, then time the GPU broadcast.
@test all(broadcast_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
@benchmark z_gpu = broadcast_gpu_mul(x_gpu, y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  2.27 KiB
  allocs estimate:  62
  --------------
  minimum time:     7.397 ms (0.00% GC)
  median time:      8.365 ms (0.00% GC)
  mean time:        49.250 ms (38.95% GC)
  maximum time:     297.484 ms (52.90% GC)
  --------------
  samples:          105
  evals/sample:     1

### Single threaded on GPU (custom kernel)

In [8]:
using CUDAnative

In [9]:
# Kernel body: one sequential pass that stores x[i] * y[i] into z[i] for every
# i in 1:length(x). It uses no thread or block indices, so a single thread does
# all of the work. Returns `nothing`.
function serial_gpu_mul!(z, x, y)
    n = length(x)
    i = 1
    while i <= n
        @inbounds z[i] = x[i] * y[i]
        i += 1
    end
    return nothing
end

# Allocate the output with `similar(x)`, run `serial_gpu_mul!` through `@cuda`
# with the default launch configuration, and return the output. The launch is
# wrapped in `CuArrays.@sync` so the function is suitable for timing.
function serial_gpu_mul(x, y)
    out = similar(x)
    CuArrays.@sync @cuda serial_gpu_mul!(out, x, y)
    return out
end

# Check the result once, then time the single-threaded kernel.
@test all(serial_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
@benchmark z_gpu = serial_gpu_mul(x_gpu, y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  912 bytes
  allocs estimate:  34
  --------------
  minimum time:     11.971 s (0.00% GC)
  median time:      11.971 s (0.00% GC)
  mean time:        11.971 s (0.00% GC)
  maximum time:     11.971 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

### Multi threaded on GPU, single SM

In [10]:
# Kernel body: each thread starts at its own thread index and advances by the
# block width, storing x[i] * y[i] into z[i]. The block index is never
# consulted, so the kernel is meant for a single-block launch.
# Returns `nothing`.
function threaded_gpu_mul!(z, x, y)
    blockwidth = blockDim().x

    for i in threadIdx().x:blockwidth:length(x)
        @inbounds z[i] = x[i] * y[i]
    end
    return nothing
end

# Allocate the output with `similar(x)`, launch `threaded_gpu_mul!` with 1024
# threads (no `blocks` argument is given), and return the output. The launch is
# wrapped in `CuArrays.@sync` so the function is suitable for timing.
function threaded_gpu_mul(x, y)
    out = similar(x)
    CuArrays.@sync @cuda threads=1024 threaded_gpu_mul!(out, x, y)
    return out
end

# Check the result once, then time the 1024-thread launch.
@test all(threaded_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
@benchmark z_gpu = threaded_gpu_mul(x_gpu, y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  944 bytes
  allocs estimate:  36
  --------------
  minimum time:     88.018 ms (0.00% GC)
  median time:      89.355 ms (0.00% GC)
  mean time:        130.803 ms (15.96% GC)
  maximum time:     389.226 ms (41.50% GC)
  --------------
  samples:          39
  evals/sample:     1

### Multithreaded and gridded on GPU

In [11]:
# Kernel body: each thread starts at its global index (thread index offset by
# the preceding blocks) and advances by the total number of launched threads,
# storing x[i] * y[i] into z[i]. Returns `nothing`.
function grided_gpu_mul!(z, x, y)
    threads_per_block = blockDim().x
    start = threadIdx().x + (blockIdx().x - 1)*threads_per_block
    step = threads_per_block * gridDim().x

    for i in start:step:length(x)
        @inbounds z[i] = x[i] * y[i]
    end
    return nothing
end

# Allocate the output with `similar(x)` and launch `grided_gpu_mul!` with
# 64 threads per block and enough blocks to give every element its own thread.
# The launch is wrapped in `CuArrays.@sync` so the function is suitable for
# timing. Returns the output.
function grided_gpu_mul(x, y)
    out = similar(x)
    numthreads = 64
    # Integer ceiling division: number of blocks needed to cover length(x).
    numblocks = cld(length(x), numthreads)
    CuArrays.@sync @cuda threads=numthreads blocks=numblocks grided_gpu_mul!(out, x, y)
    return out
end

grided_gpu_mul (generic function with 1 method)

In [12]:
# Check the result once, then time the multi-block launch.
@test all(grided_gpu_mul(x_gpu, y_gpu) .== 6.0f0)
@benchmark z_gpu = grided_gpu_mul(x_gpu, y_gpu)

BenchmarkTools.Trial: 
  memory estimate:  960 bytes
  allocs estimate:  36
  --------------
  minimum time:     7.305 ms (0.00% GC)
  median time:      8.396 ms (0.00% GC)
  mean time:        49.701 ms (39.36% GC)
  maximum time:     301.306 ms (50.87% GC)
  --------------
  samples:          105
  evals/sample:     1

### Multiple GPUs

In [13]:
# Number of GPUs used below; devices are addressed as 0 through numgpus-1.
numgpus = 10
# Per-device vector length, rounded up, so numgpus*n can slightly exceed N.
n = ceil(Int, N/numgpus)

# Return `cufill(value, shape)` evaluated inside a `device!(dev)` call, i.e.
# with `dev` selected as the device for the duration of the allocation.
function devicefill(dev, value, shape)
    return device!(() -> cufill(value, shape), dev)
end

# One length-n vector per device (ids 0 through numgpus-1) for each operand.
x_gpus = [devicefill(dev, 2.0f0, n) for dev in 0:(numgpus-1)]
y_gpus = [devicefill(dev, 3.0f0, n) for dev in 0:(numgpus-1)]
# Output buffers; the initial 2.0f0 fill is overwritten by multi_gpu_mul!.
z_gpus = [devicefill(dev, 2.0f0, n) for dev in 0:(numgpus-1)]
;

In [14]:
"""
    multi_gpu_mul!(z_gpus, x_gpus, y_gpus)

Compute `z_gpus[i] .= x_gpus[i] .* y_gpus[i]` for every `i`, running the `i`-th
product on device `i - 1`. All kernels are launched first and the devices are
synchronized afterwards, so the work on different devices can overlap.

The number of devices is taken from the length of the argument vectors, which
must all have the same length; otherwise a `DimensionMismatch` is thrown.

Returns `z_gpus`.
"""
function multi_gpu_mul!(z_gpus, x_gpus, y_gpus)
    function kernel!(z, x, y)
        index = threadIdx().x + (blockIdx().x - 1)*blockDim().x
        stride = blockDim().x * gridDim().x

        for i in index:stride:length(x)
            @inbounds z[i] = x[i] * y[i]
        end
        return nothing
    end

    # Derive the device count from the data instead of a global variable, so
    # the function cannot index past the vectors it was actually given.
    ngpus = length(x_gpus)
    if length(y_gpus) != ngpus || length(z_gpus) != ngpus
        throw(DimensionMismatch(
            "expected one array per device in each argument, got lengths " *
            "$(length(z_gpus)), $(length(x_gpus)) and $(length(y_gpus))"))
    end

    # launch each kernel
    for i in 1:ngpus
        device!(i-1) do
            numthreads = 128
            numblocks = cld(length(x_gpus[i]), numthreads)
            @cuda threads=numthreads blocks=numblocks kernel!(z_gpus[i], x_gpus[i], y_gpus[i])
        end
    end
    # resynchronize each device (for timing purposes)
    for i in 1:ngpus
        device!(i-1) do
            CuArrays.@sync 1
        end
    end
    return z_gpus
end

# Copy every per-device result back to the host and check it, then time a
# single call and run the full benchmark.
@test all(vcat((Array.(multi_gpu_mul!(z_gpus, x_gpus, y_gpus)))...) .== 6.0f0)
CuArrays.@time multi_gpu_mul!(z_gpus, x_gpus, y_gpus)
@benchmark multi_gpu_mul!(z_gpus, x_gpus, y_gpus)

  0.001269 seconds (252 CPU allocations: 6.781 KiB)


BenchmarkTools.Trial: 
  memory estimate:  6.78 KiB
  allocs estimate:  252
  --------------
  minimum time:     888.906 μs (0.00% GC)
  median time:      1.003 ms (0.00% GC)
  mean time:        1.028 ms (0.40% GC)
  maximum time:     34.534 ms (57.17% GC)
  --------------
  samples:          4807
  evals/sample:     1

## Tensor contraction experiment

In [15]:
using Random
# Fixed seed so the random tensors are reproducible.
rng = MersenneTwister(1842)

# Both tensors have a first dimension of 200; the contraction below sums over it.
t1 = randn(rng, Float32, (200, 600, 300))
t2 = randn(rng, Float32, (200, 300))
;

### CPU single threaded approach

In [16]:
# Uninitialised output buffer of shape (size(t1,2), size(t1,3), size(t2,2)).
t3 = Array{Float32}(undef, size(t1)[2], size(t1)[3], size(t2)[2])

"""
    calc_t3_loop!(t3, t1, t2)

Overwrite `t3` with the contraction of `t1` and `t2` over their first
dimension, `t3[i,j,k] = Σ_l t1[l,i,j] * t2[l,k]`, using plain nested loops.

# Arguments
- `t3`: output, of size `(size(t1, 2), size(t1, 3), size(t2, 2))`.
- `t1`: three-dimensional input.
- `t2`: two-dimensional input with `size(t2, 1) == size(t1, 1)`.

# Throws
- `DimensionMismatch` if the sizes are inconsistent. The check is what makes
  the `@inbounds` accesses in the loop body safe.

Returns `nothing`. When the contracted dimension is empty, `t3` is filled with
zeros.
"""
function calc_t3_loop!(t3, t1, t2)
    ni, nj, nk = size(t3, 1), size(t3, 2), size(t3, 3)
    nl = size(t1, 1)

    if size(t2, 1) != nl
        throw(DimensionMismatch(
            "contracted dimensions differ: size(t1, 1) = $nl, size(t2, 1) = $(size(t2, 1))"))
    end
    if (size(t1, 2), size(t1, 3), size(t2, 2)) != (ni, nj, nk)
        throw(DimensionMismatch(
            "t3 has size $(size(t3)), expected $((size(t1, 2), size(t1, 3), size(t2, 2)))"))
    end

    # An empty sum is zero; without this guard the loop would read t1[1,i,j]
    # from an array that has no first row.
    if nl == 0
        fill!(t3, zero(eltype(t3)))
        return nothing
    end

    for k in 1:nk
        for j in 1:nj
            for i in 1:ni
                # Initialise with the first term, then accumulate the rest.
                @inbounds t3[i,j,k] = t1[1,i,j] * t2[1,k]

                for l in 2:nl
                    @inbounds t3[i,j,k] += t1[l,i,j] * t2[l,k]
                end
            end
        end
    end
    return nothing
end

# Time one call, then run the full benchmark.
# NOTE(review): the disabled test below refers to t3_2, which is only defined
# in the next cell.
CuArrays.@time calc_t3_loop!(t3, t1, t2)
#@test t3 ≈ t3_2 atol=1e-3
@benchmark calc_t3_loop!(t3, t1, t2)

  9.982802 seconds (45.22 k CPU allocations: 2.268 MiB)


BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     9.745 s (0.00% GC)
  median time:      9.745 s (0.00% GC)
  mean time:        9.745 s (0.00% GC)
  maximum time:     9.745 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

### CPU multi-threaded approach

In [17]:
# Separate uninitialised output buffer with the same shape as t3, so the
# threaded result can be compared against the single-threaded one.
t3_2 = Array{Float32}(undef, size(t1)[2], size(t1)[3], size(t2)[2])

"""
    calc_t3_loop_threaded!(t3, t1, t2)

Overwrite `t3` with the contraction of `t1` and `t2` over their first
dimension, `t3[i,j,k] = Σ_l t1[l,i,j] * t2[l,k]`. The outermost loop (over `k`)
is distributed over Julia threads with `Threads.@threads`.

# Arguments
- `t3`: output, of size `(size(t1, 2), size(t1, 3), size(t2, 2))`.
- `t1`: three-dimensional input.
- `t2`: two-dimensional input with `size(t2, 1) == size(t1, 1)`.

# Throws
- `DimensionMismatch` if the sizes are inconsistent. The check is what makes
  the `@inbounds` accesses in the loop body safe.

Returns `nothing`. When the contracted dimension is empty, `t3` is filled with
zeros.
"""
function calc_t3_loop_threaded!(t3, t1, t2)
    ni, nj, nk = size(t3, 1), size(t3, 2), size(t3, 3)
    nl = size(t1, 1)

    if size(t2, 1) != nl
        throw(DimensionMismatch(
            "contracted dimensions differ: size(t1, 1) = $nl, size(t2, 1) = $(size(t2, 1))"))
    end
    if (size(t1, 2), size(t1, 3), size(t2, 2)) != (ni, nj, nk)
        throw(DimensionMismatch(
            "t3 has size $(size(t3)), expected $((size(t1, 2), size(t1, 3), size(t2, 2)))"))
    end

    # An empty sum is zero; without this guard the loop would read t1[1,i,j]
    # from an array that has no first row.
    if nl == 0
        fill!(t3, zero(eltype(t3)))
        return nothing
    end

    # Each k owns a disjoint slice t3[:, :, k], so the iterations are independent.
    Threads.@threads for k in 1:nk
        for j in 1:nj
            for i in 1:ni
                # Initialise with the first term, then accumulate the rest.
                @inbounds t3[i,j,k] = t1[1,i,j] * t2[1,k]

                for l in 2:nl
                    @inbounds t3[i,j,k] += t1[l,i,j] * t2[l,k]
                end
            end
        end
    end
    return nothing
end

# Time one call, compare against the single-threaded result in t3, then run
# the full benchmark.
@time calc_t3_loop_threaded!(t3_2, t1, t2)
@test t3 ≈ t3_2 atol=1e-5
@benchmark calc_t3_loop_threaded!(t3_2, t1, t2)

  0.456859 seconds (77.76 k allocations: 3.943 MiB)


BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  1
  --------------
  minimum time:     313.803 ms (0.00% GC)
  median time:      321.278 ms (0.00% GC)
  mean time:        347.443 ms (0.00% GC)
  maximum time:     657.635 ms (0.00% GC)
  --------------
  samples:          15
  evals/sample:     1

### Single GPU

In [18]:
# GPU arrays built from the host tensors, plus an uninitialised GPU output
# buffer with the same shape as t3.
t1cu = CuArray(t1)
t2cu = CuArray(t2)
t3cu = CuArray{Float32}(undef, size(t1)[2], size(t1)[3], size(t2)[2])

# Overwrite `t3` with the contraction t3[i,j,k] = Σ_l t1[l,i,j] * t2[l,k] on
# the GPU. A three-dimensional launch maps the x/y/z thread coordinates to the
# i/j/k indices of `t3`. The launch is wrapped in `CuArrays.@sync` so the
# function is suitable for timing.
function calc_t3_gpu!(t3, t1, t2)
    function kernel!(t3, t1, t2)
        # Extents are read once up front rather than in every loop header.
        ni = size(t3)[1]
        nj = size(t3)[2]
        nk = size(t3)[3]
        nl = size(t1)[1]

        # First element handled by this thread along each axis ...
        i0 = threadIdx().x + (blockIdx().x - 1)*blockDim().x
        j0 = threadIdx().y + (blockIdx().y - 1)*blockDim().y
        k0 = threadIdx().z + (blockIdx().z - 1)*blockDim().z

        # ... and the distance to the next one (total threads along that axis).
        di = blockDim().x * gridDim().x
        dj = blockDim().y * gridDim().y
        dk = blockDim().z * gridDim().z

        for k in k0:dk:nk
            for j in j0:dj:nj
                for i in i0:di:ni
                    @inbounds t3[i,j,k] = t1[1,i,j] * t2[1,k]

                    for l in 2:nl
                        @inbounds t3[i,j,k] += t1[l,i,j] * t2[l,k]
                    end
                end
            end
        end

        return nothing
    end

    numthreads = (4,4,4)
    # Enough blocks along each axis to cover `t3` with one thread per element.
    numblocks = cld.(size(t3), numthreads)
    CuArrays.@sync begin
        @cuda threads=numthreads blocks=numblocks kernel!(t3, t1, t2)
    end
end

# Time one call, then run the full benchmark.
# NOTE(review): the comparison against the CPU result is disabled, so this cell
# does not verify the GPU output.
CuArrays.@time calc_t3_gpu!(t3cu, t1cu, t2cu)
#@test t3 ≈ Array(t3cu) atol=1e-3
@benchmark calc_t3_gpu!(t3cu, t1cu, t2cu)

  0.667886 seconds (1.21 M CPU allocations: 58.813 MiB, 2.51% gc time)


BenchmarkTools.Trial: 
  memory estimate:  688 bytes
  allocs estimate:  17
  --------------
  minimum time:     107.233 ms (0.00% GC)
  median time:      108.115 ms (0.00% GC)
  mean time:        108.183 ms (0.00% GC)
  maximum time:     108.637 ms (0.00% GC)
  --------------
  samples:          47
  evals/sample:     1

### Multiple GPUs

In [19]:
"""
    allocgpuarrays(t1, t2, t3, numgpus)

Split `t1` along its second dimension into `numgpus` contiguous slabs and
allocate the per-device arrays needed to contract each slab with `t2`.

For device `i - 1` (`i` in `1:numgpus`) this creates, inside `device!(i - 1)`:
- a `CuArray` copy of the `i`-th slab `t1[:, indexstart:indexend, :]`,
- a `CuArray` copy of all of `t2`,
- an uninitialised `CuArray{Float32}` output of size
  `(slab width, size(t1, 3), size(t2, 2))`.

Slab widths differ by at most one column. When `numgpus` divides `size(t1, 2)`
every slab has width `size(t1, 2) ÷ numgpus`.

`t3` is not used; it is kept so that existing callers keep working.

Returns the tuple `(t1s, t2s, t3s)` of per-device array vectors. Throws an
`ArgumentError` if `numgpus < 1`.
"""
function allocgpuarrays(t1, t2, t3, numgpus)
    numgpus >= 1 || throw(ArgumentError("numgpus must be at least 1, got $numgpus"))

    t1s = CuArray{Float32,3}[]
    t2s = CuArray{Float32,2}[]
    t3s = CuArray{Float32,3}[]

    # Balanced partition: the first `extra` slabs get one additional column.
    # A ceil-based slab width can make trailing slabs start beyond the end of
    # the array, which yields a negative output extent.
    base, extra = divrem(size(t1, 2), numgpus)
    for i in 1:numgpus
        indexstart = (i-1)*base + min(i-1, extra) + 1
        indexend = i*base + min(i, extra)
        device!(i-1) do
            push!(t1s, CuArray(t1[:,indexstart:indexend,:]))
            push!(t2s, CuArray(t2))
            push!(t3s, CuArray{Float32}(undef, indexend - indexstart + 1, size(t1)[3], size(t2)[2]))
        end
    end

    return t1s, t2s, t3s
end

# Per-device operand and output arrays, one entry per GPU.
t1s, t2s, t3s = allocgpuarrays(t1, t2, t3, numgpus);

In [20]:
# Launch the contraction t3[i,j,k] = Σ_l t1[l,i,j] * t2[l,k] on the GPU
# without wrapping the launch in `CuArrays.@sync`. The caller synchronizes
# afterwards (see `calc_t3_gpus!`).
#
# The block count is derived from `numthreads`, so the two cannot drift apart.
# Dividing by a hard-coded (4,4,4) while launching (8,4,4) threads per block
# launched twice as many blocks along x as were needed to cover `t3`.
function calc_t3_gpu_nosync!(t3, t1, t2)
    function kernel!(t3, t1, t2)
        xindex = threadIdx().x + (blockIdx().x - 1)*blockDim().x
        yindex = threadIdx().y + (blockIdx().y - 1)*blockDim().y
        zindex = threadIdx().z + (blockIdx().z - 1)*blockDim().z

        xstride = blockDim().x * gridDim().x
        ystride = blockDim().y * gridDim().y
        zstride = blockDim().z * gridDim().z

        # The x/y/z thread coordinates map to the i/j/k indices of t3.
        for k in zindex:zstride:size(t3)[3]
            for j in yindex:ystride:size(t3)[2]
                for i in xindex:xstride:size(t3)[1]
                    # Initialise with the first term, then accumulate the rest.
                    @inbounds t3[i,j,k] = t1[1,i,j] * t2[1,k]

                    for l in 2:size(t1)[1]
                        @inbounds t3[i,j,k] += t1[l,i,j] * t2[l,k]
                    end
                end
            end
        end

        return nothing
    end

    numthreads = (8,4,4)
    # Enough blocks along each axis to cover `t3` with one thread per element.
    numblocks = cld.(size(t3), numthreads)
    @cuda threads=numthreads blocks=numblocks kernel!(t3, t1, t2)
end

# Run `calc_t3_gpu_nosync!(t3s[i], t1s[i], t2s[i])` on device `i - 1` for every
# `i`, then synchronize every device. All launches are issued before any
# synchronization so the devices can work concurrently.
#
# The number of devices is taken from the length of the argument vectors, which
# must all have the same length; otherwise a `DimensionMismatch` is thrown.
# Returns `nothing`.
function calc_t3_gpus!(t3s, t1s, t2s)
    # Derive the device count from the data instead of a global variable.
    ngpus = length(t1s)
    if length(t2s) != ngpus || length(t3s) != ngpus
        throw(DimensionMismatch(
            "expected one array per device in each argument, got lengths " *
            "$(length(t3s)), $(length(t1s)) and $(length(t2s))"))
    end

    for i in 1:ngpus
        device!(i-1) do
            calc_t3_gpu_nosync!(t3s[i], t1s[i], t2s[i])
        end
    end
    # resynchronize each device (for timing purposes)
    for i in 1:ngpus
        device!(i-1) do
            CuArrays.@sync 1
        end
    end
    return nothing
end

# Benchmark the multi-GPU contraction.
# NOTE(review): no correctness check of t3s against the CPU result appears in this cell.
@benchmark calc_t3_gpus!(t3s, t1s, t2s)

BenchmarkTools.Trial: 
  memory estimate:  7.56 KiB
  allocs estimate:  262
  --------------
  minimum time:     12.400 ms (0.00% GC)
  median time:      12.557 ms (0.00% GC)
  mean time:        12.691 ms (0.00% GC)
  maximum time:     18.283 ms (0.00% GC)
  --------------
  samples:          394
  evals/sample:     1