In [17]:
using CUDA
using Test
using BenchmarkTools

In [18]:
CUDA.versioninfo()

CUDA toolkit 11.5, artifact installation
NVIDIA driver 497.29.0, for CUDA 11.5
CUDA driver 11.5

Libraries: 
- CUBLAS: 11.7.4
- CURAND: 10.2.7
- CUFFT: 10.6.0
- CUSOLVER: 11.3.2
- CUSPARSE: 11.7.0
- CUPTI: 16.0.0
- NVML: 11.0.0+495.53
- CUDNN: 8.30.1 (for CUDA 11.5.0)
- CUTENSOR: 1.3.3 (for CUDA 11.4.0)

Toolchain:
- Julia: 1.7.0
- LLVM: 12.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80

1 device:
  0: NVIDIA GeForce GTX 1050 (sm_61, 944.301 MiB / 2.000 GiB available)


In [20]:
[CUDA.capability(dev) for dev in CUDA.devices()]

1-element Vector{VersionNumber}:
 v"6.1.0"

## CUDA Simple

In [21]:
x = cu(rand(5,3))
"""
    f(x)

Evaluate the quadratic polynomial `3x^2 + 2x + 1` at `x`.
"""
function f(x)
    return 3 * x^2 + 2 * x + 1
end
f(5)

86

## Evaluating CPU Performance for Baseline

In [38]:
# Problem size shared by the CPU and GPU experiments below: 2^20 elements per vector.
N = 2^20
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0

# In-place broadcast: adds x to y element-wise and stores the result back into y.
y .+= x   

# 2 + 1 is exactly representable in Float32, so an exact comparison is fine here.
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y .== 3.0f0)

In [39]:
"""
    sequential_add!(y, x)

Add every element of `x` to the corresponding element of `y`, in place, using a
single serial loop. Returns `nothing`.

The loop indices come from `eachindex(y, x)`, so both arrays must share the same
indices; that is what makes skipping the bounds checks acceptable.
"""
function sequential_add!(y, x)
    for idx in eachindex(y, x)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return
end

# Reset y to 2 so the check below reflects only this call.
fill!(y, 2)
sequential_add!(y, x)
# x was filled with 1.0 above, so every element of y must now equal 3.
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y .== 3.0f0)

In [40]:
"""
    parallel_add!(y, x)

Add every element of `x` to the corresponding element of `y`, in place, with the
loop iterations distributed by `Threads.@threads`. Returns `nothing`.

Each iteration touches only its own index of `y`, so iterations are independent.
"""
function parallel_add!(y, x)
    Threads.@threads for idx in eachindex(y, x)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return
end

# Reset y to 2 so the check below reflects only this call.
fill!(y, 2)
parallel_add!(y, x)
# x was filled with 1.0 above, so every element of y must now equal 3.
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y .== 3.0f0)

In [41]:
@btime sequential_add!($y, $x)

  335.400 μs (0 allocations: 0 bytes)


In [42]:
@btime parallel_add!($y, $x)

  377.800 μs (6 allocations: 544 bytes)


## Now, Evaluating GPU

In [45]:
x_d = CUDA.fill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
y_d = CUDA.fill(2.0f0, N)  # a vector stored on the GPU filled with 2.0

# Same in-place broadcast as the CPU baseline, now applied to the GPU vectors.
y_d .+= x_d
# Convert y_d to a regular Array before the element-wise comparison.
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(Array(y_d) .== 3.0f0)

In [46]:
"""
    add_broadcast!(y, x)

Add `x` to `y` in place with a fused broadcast and return `nothing`.

The broadcast is wrapped in `CUDA.@sync` so that, when this function is timed, the
measurement includes the GPU work itself and not just the launch.
"""
function add_broadcast!(y, x)
    CUDA.@sync begin
        y .+= x
    end
    return nothing
end

add_broadcast! (generic function with 1 method)

In [47]:
@btime add_broadcast!($y_d, $x_d)

  166.100 μs (23 allocations: 1.61 KiB)


In [48]:
"""
    gpu_add1!(y, x)

Kernel body that adds `x[i]` to `y[i]` for every `i` in `1:length(y)` and returns
`nothing`.

The whole range is walked by a single loop; nothing here splits the work between
threads.
"""
function gpu_add1!(y, x)
    n = length(y)
    for i in 1:n
        # Bounds checks are off: x is assumed to be at least as long as y.
        @inbounds y[i] = y[i] + x[i]
    end
    return
end

# Reset y_d to 2, launch the kernel, then verify that every element became 3.
fill!(y_d, 2)
# No threads/blocks arguments are given here, unlike the gpu_add3! launch further down.
# NOTE(review): presumably this runs the kernel on a single GPU thread — confirm
# against the CUDA.jl `@cuda` documentation.
@cuda gpu_add1!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(Array(y_d) .== 3.0f0)

In [49]:
"""
    bench_gpu1!(y, x)

Launch `gpu_add1!(y, x)` with `@cuda` (no explicit thread or block count) inside
`CUDA.@sync`, so a benchmark of this function waits for the kernel to finish.

Returns the value of the `@cuda` launch (the notebook output shows a
`CUDA.HostKernel`).
"""
function bench_gpu1!(y, x)
    launched = CUDA.@sync begin
        @cuda gpu_add1!(y, x)
    end
    return launched
end

bench_gpu1! (generic function with 1 method)

In [50]:
@btime bench_gpu1!($y_d, $x_d)

  152.745 ms (17 allocations: 1.00 KiB)


CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuContext(0x00000000028e3a80, instance 5ab268c59aa3a7a2), CuModule(Ptr{Nothing} @0x00000000021b93f0, CuContext(0x00000000028e3a80, instance 5ab268c59aa3a7a2)), CuFunction(Ptr{Nothing} @0x00000000075c35d0, CuModule(Ptr{Nothing} @0x00000000021b93f0, CuContext(0x00000000028e3a80, instance 5ab268c59aa3a7a2))), CUDA.KernelState(Ptr{Nothing} @0x00000002037f0000))

In [51]:
"""
    gpu_add3!(y, x)

Kernel that adds `x` to `y` in place using a grid-stride loop; returns `nothing`.

Each thread starts at its global 1-based thread index and then advances by the
total number of launched threads (`blockDim().x * gridDim().x`), so any launch
configuration covers all of `1:length(y)`.
"""
function gpu_add3!(y, x)
    # Global 1-based index of this thread across all blocks.
    first_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Total number of threads in the launch.
    step_i = blockDim().x * gridDim().x
    for i in first_i:step_i:length(y)
        # Bounds checks are off: x is assumed to be at least as long as y.
        @inbounds y[i] = y[i] + x[i]
    end
    return nothing
end

# Number of 256-thread blocks, rounded up so that blocks * 256 >= N.
numblocks = ceil(Int, N/256)

# Reset y_d to 2, launch with an explicit thread/block configuration, then verify.
fill!(y_d, 2)
@cuda threads=256 blocks=numblocks gpu_add3!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(Array(y_d) .== 3.0f0)

In [52]:
"""
    bench_gpu3!(y, x; threads=256)

Launch `gpu_add3!(y, x)` with `threads` threads per block and enough blocks to give
every element of `y` its own thread, inside `CUDA.@sync` so that a benchmark of
this function waits for the kernel to finish.

# Keywords
- `threads::Integer = 256`: threads per block. The default reproduces the original
  hard-coded launch configuration.

# Returns
The value of the `@cuda` launch (the notebook output shows a `CUDA.HostKernel`).
"""
function bench_gpu3!(y, x; threads::Integer=256)
    # Integer ceiling division avoids a round-trip through floating point. The
    # clamp to 1 keeps the launch valid for an empty `y`: the old formula gave
    # 0 blocks there, and a zero-block grid is expected to be rejected at launch.
    # With one block the kernel's loop range is simply empty.
    numblocks = max(1, cld(length(y), threads))
    CUDA.@sync begin
        @cuda threads=threads blocks=numblocks gpu_add3!(y, x)
    end
end

bench_gpu3! (generic function with 1 method)

In [53]:
@btime bench_gpu3!($y_d, $x_d)

  196.200 μs (3 allocations: 256 bytes)


CUDA.HostKernel{typeof(gpu_add3!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add3!, CuContext(0x00000000028e3a80, instance 5ab268c59aa3a7a2), CuModule(Ptr{Nothing} @0x000000000247ce50, CuContext(0x00000000028e3a80, instance 5ab268c59aa3a7a2)), CuFunction(Ptr{Nothing} @0x0000000008ca6e90, CuModule(Ptr{Nothing} @0x000000000247ce50, CuContext(0x00000000028e3a80, instance 5ab268c59aa3a7a2))), CUDA.KernelState(Ptr{Nothing} @0x00000002037f0000))