In [None]:
using CuArrays
using CUDAnative
using CUDAdrv
using BenchmarkTools
using Test

In [None]:
N = 2^20
x = fill(3.0f0, N)  # a vector filled with 3.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0 (Float32)

y .*= x             # multiply each element of y in place by the corresponding element of x
x_d = cufill(3.0f0, N)  # a vector stored on the GPU filled with 3.0 (Float32)
y_d = cufill(2.0f0, N)  # a vector stored on the GPU filled with 2.0 (Float32)

In [None]:
"""
    sequential_add!(y, x)

Overwrite every element of `y` with `y[i] * x[i]`, one index after another on the
calling thread. Despite the `add` in the name, the operation is a multiplication.

The indices come from `eachindex(y, x)`, so both arrays must share the same indices.
Returns `nothing`; `y` is modified in place and `x` is only read.
"""
function sequential_add!(y, x)
    shared = eachindex(y, x)
    # `shared` is valid for both arrays, which is what makes `@inbounds` safe here.
    @inbounds for k in shared
        y[k] = y[k] * x[k]
    end
    return nothing
end

# Reset y to 2 so that one call against x (filled with 3) must leave 6 everywhere.
fill!(y, 2)
sequential_add!(y, x)
@test all(y .== 6.0f0)

In [None]:
"""
    parallel_add!(y, x)

Overwrite every element of `y` with `y[i] * x[i]`, with the index range split across
Julia threads by `Threads.@threads`. Despite the `add` in the name, the operation is
a multiplication.

Each iteration reads and writes only its own index, so iterations are independent.
Returns `nothing`; `y` is modified in place and `x` is only read.
"""
function parallel_add!(y, x)
    idxs = eachindex(y, x)
    Threads.@threads for k in idxs
        @inbounds y[k] = y[k] * x[k]
    end
    return nothing
end

# Reset y to 2 so that one call against x (filled with 3) must leave 6 everywhere.
fill!(y, 2)
parallel_add!(y, x)
@test all(y .== 6.0f0)

In [None]:
# Time the single-threaded CPU version; `$` interpolates the globals into the benchmark.
# Every benchmark evaluation multiplies y by x again, so y's contents are not meaningful afterwards.
@btime sequential_add!($y, $x)

In [None]:
# Time the multi-threaded CPU version; `$` interpolates the globals into the benchmark.
# Every benchmark evaluation multiplies y by x again, so y's contents are not meaningful afterwards.
@btime parallel_add!($y, $x)

In [None]:
x_d = cufill(3.0f0, N)  # a vector stored on the GPU filled with 3.0 (Float32)
y_d = cufill(2.0f0, N)  # a vector stored on the GPU filled with 2.0 (Float32)

In [None]:
"""
    add_broadcast!(y, x)

Multiply `y` element-wise by `x` in place with a single fused broadcast, wrapped in
`CuArrays.@sync` so the call returns only after the work has finished. Despite the
`add` in the name, the operation is a multiplication.

Returns `nothing`.
"""
function add_broadcast!(y, x)
    CuArrays.@sync begin
        y .= y .* x
    end
    return nothing
end

# Interpolate the global arrays with `$`, as the CPU benchmarks do, so the measurement
# does not include the cost of looking up untyped globals.
@btime add_broadcast!($y_d, $x_d)

In [None]:
"""
    gpu_add1!(y, x)

Kernel body that overwrites `y[i]` with `y[i] * x[i]` for every `i` in `1:length(y)`.
Despite the `add` in the name, the operation is a multiplication.

The function contains no thread indexing, so any thread that executes it walks the
whole array on its own. `x` must have at least `length(y)` elements because the
accesses are marked `@inbounds`. Returns `nothing`.
"""
function gpu_add1!(y, x)
    n = length(y)
    i = 1
    while i <= n
        @inbounds y[i] *= x[i]
        i += 1
    end
    return nothing
end

# Reset y_d to 2, launch the kernel with the default launch configuration, then
# convert the result to a host Array and check that every element is 6.
fill!(y_d, 2)
@cuda gpu_add1!(y_d, x_d)
@test all(Array(y_d) .== 6.0f0)

In [None]:
"""
    bench_gpu1!(y, x)

Launch `gpu_add1!(y, x)` with `@cuda` and wait for it inside `CuArrays.@sync`, so that
timing this function measures the kernel rather than only the launch.
"""
function bench_gpu1!(y, x)
    return CuArrays.@sync @cuda gpu_add1!(y, x)
end

# Interpolate the global arrays with `$`, as the CPU benchmarks do, so the measurement
# does not include the cost of looking up untyped globals.
@btime bench_gpu1!($y_d, $x_d)

In [None]:
"""
    gpu_add2!(y, x)

CUDA kernel that overwrites `y[i]` with `y[i] * x[i]`, sharing the index range among
the threads of a block: each thread starts at its own `threadIdx().x` and advances by
`blockDim().x`.

Like the other `*_add!` functions in this file, the operation is a multiplication
despite the name; the checks that follow the launches expect `2 * 3 == 6`.
Only `threadIdx` and `blockDim` are consulted, so the block index is ignored.
`x` must have at least `length(y)` elements because the accesses are `@inbounds`.
Returns `nothing`.
"""
function gpu_add2!(y, x)
    index = threadIdx().x    # this example only requires linear indexing, so just use `x`
    stride = blockDim().x
    for i = index:stride:length(y)
        # Multiply (not add) so the result matches the sibling kernels and the 6.0f0 checks.
        @inbounds y[i] *= x[i]
    end
    return nothing
end

# Reset y_d to 2, launch the kernel with 256 threads, then convert the result to a
# host Array and check that every element equals 6.0f0.
fill!(y_d, 2)
@cuda threads=256 gpu_add2!(y_d, x_d)
@test all(Array(y_d) .== 6.0f0)

In [None]:
"""
    gpu_mult3!(y, x)

CUDA kernel that overwrites `y[i]` with `y[i] * x[i]`. Each thread begins at its own
`threadIdx().x` and steps forward by `blockDim().x` until it passes `length(y)`.

Only `threadIdx` and `blockDim` are consulted, so the block index is ignored.
`x` must have at least `length(y)` elements because the accesses are `@inbounds`.
Returns `nothing`.
"""
function gpu_mult3!(y, x)
    step = blockDim().x
    i = threadIdx().x        # linear indexing is enough here, so only `x` is used
    n = length(y)
    while i <= n
        @inbounds y[i] *= x[i]
        i += step
    end
    return nothing
end

# Reset y_d to 2, launch the kernel defined in this cell (gpu_mult3!) with 256 threads,
# then convert the result to a host Array and check that every element is 2 * 3 == 6.
fill!(y_d, 2)
@cuda threads=256 gpu_mult3!(y_d, x_d)
@test all(Array(y_d) .== 6.0f0)

In [None]:
"""
    gpu_mult!(z, x, y)

CUDA kernel that stores the element-wise product `x[i] * y[i]` in `z[i]`.

Each thread starts at its own `threadIdx().x` and advances by `blockDim().x`. The `x`
component is used because a one-dimensional launch such as `@cuda threads=256 ...`
lays its threads out along `x`; with the `z` component every thread would see index 1
and stride 1 and repeat the whole loop.

Only the first `min(length(z), length(x), length(y))` elements are processed, so the
`@inbounds` accesses stay inside all three arrays even when their lengths differ.
Returns `nothing`.
"""
function gpu_mult!(z, x, y)
    index = threadIdx().x
    stride = blockDim().x

    # Clamp to the shortest array: the accesses below skip bounds checks.
    n = min(length(z), length(x), length(y))
    for i = index:stride:n
        @inbounds z[i] = x[i] * y[i]
    end
    return nothing
end

In [None]:
"""
    cudamatmul(x, y)

Compute the matrix product of `x` (n×k) and `y` (k×m) one output entry at a time:
for each `(i, j)` the `gpu_mult!` kernel writes the k element-wise products of row `i`
of `x` and column `j` of `y` into a scratch buffer, which is then summed into `z[i, j]`.

Returns the n×m result created with `cu(zeros(n, m))`.

Throws `DimensionMismatch` when `size(x, 2) != size(y, 1)`.

This is a demonstration of launching kernels, not an efficient multiply: every output
entry costs a kernel launch, two slice copies and a reduction.
"""
function cudamatmul(x, y)
    inner = size(x, 2)
    inner == size(y, 1) || throw(DimensionMismatch(
        "x has $(inner) columns but y has $(size(y, 1)) rows"))

    z = cu(zeros(size(x, 1), size(y, 2)))
    # One slot per term of a dot product, i.e. the shared inner dimension — not the
    # number of rows of `x`. A shorter buffer is overrun by the kernel's unchecked writes.
    tz = cu(zeros(inner))

    for i = 1:size(x, 1)
        for j = 1:size(y, 2)
            @cuda gpu_mult!(tz, x[i, 1:end], y[1:end, j])
            z[i, j] = sum(tz)
        end
    end
    return z
end

In [None]:
# Operands for the matrix-product experiments: a 300×400 matrix and a 400×1 column,
# both passed through `cu`.
a = cu(rand(300, 400))
b = cu(rand(400,1))
# Time the kernel-per-entry product and print the result.
@time c = cudamatmul(a,b)
@show c
# Disabled scratch experiments calling gpu_mult! directly on one row and one column:
# c = cu(zeros(300,1))
# c = a[1,:]
# @time @cuda threads=256 gpu_mult!(c, a[1,:], b[:,1])

In [None]:
"""
    bench_gpu2!(y, x)

Launch `gpu_add2!(y, x)` with 256 threads and wait for it inside `CuArrays.@sync`, so
that timing this function measures the kernel rather than only the launch.
"""
function bench_gpu2!(y, x)
    return CuArrays.@sync @cuda threads=256 gpu_add2!(y, x)
end

# Interpolate the global arrays with `$`, as the CPU benchmarks do, so the measurement
# does not include the cost of looking up untyped globals.
@btime bench_gpu2!($y_d, $x_d)

In [None]:
"""
    multiply_large!(a, b)

Compute the product `a * b` and wait for it to finish, for benchmarking purposes.

The product is evaluated as an ordinary array multiplication inside `CuArrays.@sync`.
`@cuda` is not used: it launches a kernel function call, and `*` on whole arrays is
not a kernel. Despite the trailing `!`, neither argument is modified and the product
is discarded. Returns `nothing`.
"""
function multiply_large!(a, b)
    CuArrays.@sync a * b
    return nothing
end

In [None]:
# c = cu(zeros(300,1))
# The function is defined as `multiply_large!` (with the bang); interpolate the global
# operands with `$` so the lookup of untyped globals is not part of the measurement.
@btime multiply_large!($a, $b)
# @show c

In [None]:
# Host-side operands with the same shapes as `a` and `b`, used for a CPU baseline.
acpu = rand(300, 400)
bbpu = rand(400,1)

In [None]:
# CPU baseline: time the matrix product of the host arrays.
@time acpu*bbpu

In [None]:
# GPU work is queued asynchronously, so synchronize inside the timed expression;
# otherwise `@time` may stop before the product has actually been computed.
@time CuArrays.@sync a*b