In [36]:
using CUDA
using Ipaper
using BenchmarkTools

In [54]:
# Function to compute moving averages using CUDA
function gpu_moving_average(data, window_size)
    n = length(data)
    result = CUDA.zeros(Float32, n - window_size + 1)

    # CUDA kernel to calculate moving average
    function kernel(data, result, window_size, n)
        i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
        if i <= n - window_size + 1
            sum = 0.0f0
            for j = 1:window_size
                sum += data[i + j - 1]
            end
            result[i] = sum / window_size
        end
        nothing
    end
    
    # Launching the kernel
    @cuda threads=1024 blocks=ceil(Int, (n - window_size + 1) / 256) kernel(data, result, window_size, n)
    return result
end

# Function to compute moving averages on CPU
function cpu_moving_average(data, window_size)
    n = length(data)
    result = zeros(Float32, n - window_size + 1)
    # Compute moving average
    @inbounds for i in 1:(n - window_size + 1)
        sum = 0.0f0
        for j = 1:window_size
            sum += data[i + j - 1]
        end
        result[i] = sum / window_size
        # result[i] = sum(data[i:(i + window_size - 1)]) / window_size
    end
    return result
end

cpu_moving_average (generic function with 1 method)

In [56]:
# Example usage
data = CUDA.rand(Float32, 1000000)  # Large array of random floats
# obj_size(data)
window_size = 10
@btime averages_gpu = gpu_moving_average($data, window_size);
# println(averages)  # Print the computed moving averages

  11.100 μs (21 allocations: 768 bytes)


In [57]:
x = Array(data);
obj_size(x);
@btime averages_cpu = cpu_moving_average(x, window_size);

Array(averages_gpu) == averages_cpu # check result

Vector{Float32} | (1000000,) | [34m[1m[4m3.81 Mb[24m[22m[39m
  3.600 ms (2 allocations: 3.81 MiB)


false

In [59]:
3.594*1000 / 11.100 

323.7837837837838

In [53]:
## GPU info
# List properties of all CUDA devices
for device in CUDA.devices()
    @show CUDA.name(device)  # Device name
    @show CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)  # Max threads per block
    @show CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)  # Number of multiprocessors
    @show CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE)  # Warp size
end

CUDA.name(device) = "Quadro P4000"
CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) = 1024
CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) = 14
CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE) = 32
