In [1]:
#Libraries
using Gen
using PyPlot
using CUDA
#using Flux
using Test
using BenchmarkTools;

In [36]:
Threads.nthreads()
has_cuda_gpu()
CUDA.device()
CUDA.maxthreads(CUDA.HostKernel)

MethodError: MethodError: no method matching maxthreads(::Type{CUDA.HostKernel})
Closest candidates are:
  maxthreads(!Matched::CUDA.HostKernel) at C:\Users\jbere\.julia\packages\CUDA\5t6R9\src\compiler\execution.jl:264

In [3]:
N = 2^20
x = fill(1.0f0, N)
y = fill(2.0f0, N)

y .+= x

1048576-element Array{Float32,1}:
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 ⋮
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0

In [4]:
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [9]:
function sequential_add!(y, x)
    for i in eachindex(y, x)
        @inbounds y[i] += x[i]
    end
    return nothing
end

fill!(y, 2)
sequential_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [10]:
function parallel_add!(y, x)
    Threads.@threads for i in eachindex(y, x)
        @inbounds y[i] += x[i]
    end
    return nothing
end

fill!(y, 2)
parallel_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [11]:
@btime sequential_add!($y, $x)

  253.199 μs (0 allocations: 0 bytes)


In [12]:
@btime parallel_add!($y, $x)

  177.000 μs (23 allocations: 3.34 KiB)


In [14]:
#GPUS
x_d = CUDA.fill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
y_d = CUDA.fill(2.0f0, N)  # a vector stored on the GPU filled with 2.0
y_d .+= x_d
@test all(Array(y_d) .== 3.0f0)

│   caller = ip:0x0
└ @ Core :-1


[32m[1mTest Passed[22m[39m

In [19]:
function add_broadcast!(y, x)
    CUDA.@sync y .+= x
    return
end

@btime add_broadcast!($y_d, $x_d)

  197.200 μs (69 allocations: 1.88 KiB)


In [17]:
function gpu_add1!(y, x)
    for i = 1:length(y)
        @inbounds y[i] += x[i]
    end
    return nothing
end

fill!(y_d, 2)
@cuda gpu_add1!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [18]:
function bench_gpu1!(y, x)
    CUDA.@sync begin
        @cuda gpu_add1!(y, x)
    end
end

@btime bench_gpu1!($y_d, $x_d)

  146.722 ms (47 allocations: 1.23 KiB)


In [20]:
function gpu_add2!(y, x)
    index = threadIdx().x    # this example only requires linear indexing, so just use `x`
    stride = blockDim().x
    for i = index:stride:length(y)
        @inbounds y[i] += x[i]
    end
    return nothing
end

fill!(y_d, 2)
@cuda threads=256 gpu_add2!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [22]:
function bench_gpu2!(y, x)
    CUDA.@sync begin
        @cuda threads=256 gpu_add2!(y, x)
    end
end

@btime bench_gpu2!($y_d, $x_d)

  1.322 ms (47 allocations: 1.23 KiB)


In [23]:
function gpu_add3!(y, x)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    for i = index:stride:length(y)
        @inbounds y[i] += x[i]
    end
    return
end

numblocks = ceil(Int, N/256)

fill!(y_d, 2)
@cuda threads=256 blocks=numblocks gpu_add3!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

function bench_gpu3!(y, x)
    numblocks = ceil(Int, length(y)/256)
    CUDA.@sync begin
        @cuda threads=256 blocks=numblocks gpu_add3!(y, x)
    end
end

@btime bench_gpu3!($y_d, $x_d)

  177.900 μs (52 allocations: 1.31 KiB)


In [24]:
function gpu_add2_print!(y, x)
    index = threadIdx().x    # this example only requires linear indexing, so just use `x`
    stride = blockDim().x
    @cuprintln("thread $index, block $stride")
    for i = index:stride:length(y)
        @inbounds y[i] += x[i]
    end
    return nothing
end

@cuda threads=16 gpu_add2_print!(y_d, x_d)
synchronize()

thread 1, block 16
thread 2, block 16
thread 3, block 16
thread 4, block 16
thread 5, block 16
thread 6, block 16
thread 7, block 16
thread 8, block 16
thread 9, block 16
thread 10, block 16
thread 11, block 16
thread 12, block 16
thread 13, block 16
thread 14, block 16
thread 15, block 16
thread 16, block 16
