In [17]:
!pip install pycuda

import pycuda.driver as cuda
import pycuda.autoinit

device = cuda.Device(0)
print('S20200010072, S20200010047')
print('\n')
print("Device name: ", device.name())
print("Warp size: ", device.warp_size)
print("Compute capability: ", "%d.%d" % device.compute_capability())
print("Max GPU memory size: ", device.total_memory())
print("Max block dimensions: ", device.max_block_dim_x, device.max_block_dim_y, device.max_block_dim_z)
print("Max grid dimensions: ", device.max_grid_dim_x, device.max_grid_dim_y, device.max_grid_dim_z)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
S20200010072, S20200010047


Device name:  Tesla T4
Warp size:  32
Compute capability:  7.5
Max GPU memory size:  15835398144
Max block dimensions:  1024 1024 64
Max grid dimensions:  2147483647 65535 65535


In [3]:
import numpy as np
from numba import cuda
import time

@cuda.jit
def prefixSum(data):
    tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if tid >= data.shape[0]:
        return

    for stride in range(1, cuda.blockDim.x):
        index = 2 * stride * (tid + 1) - 1
        if index < data.shape[0]:
            data[index] += data[index - stride]
        cuda.syncthreads()

    for stride in range(cuda.blockDim.x // 2, 0, -1):
        cuda.syncthreads()
        index = 2 * stride * (tid + 1) - 1
        if index + stride < data.shape[0]:
            data[index + stride] += data[index]

def runPrefixSum(T, B, N):
    data = np.random.rand(N).astype(np.float32)

    d_data = cuda.to_device(data)

    start = time.time()

    prefixSum[B, T](d_data)

    end = time.time()

    h_data = d_data.copy_to_host()

    print("T: {}, B: {}, N: {}, Time: {:.6f}s".format(T, B, N, end - start))


print('S20200010072,S20200010047')
T = int(input('Enter T Value: '))
B = int(input('Enter B Value: '))
N = 1000000
print('Time taken is')
runPrefixSum(T,B,N)
print('\n')

# Example usage
runPrefixSum(256, 128, 1000000)
runPrefixSum(512, 64, 1000000)
runPrefixSum(1024, 32, 1000000)


S20200010072,S20200010047
Enter T Value: 256
Enter B Value: 128
Time taken is
T: 256, B: 128, N: 1000000, Time: 0.236923s


T: 256, B: 128, N: 1000000, Time: 0.000129s
T: 512, B: 64, N: 1000000, Time: 0.000686s
T: 1024, B: 32, N: 1000000, Time: 0.000469s


