In [35]:
import numpy as np
from numba import cuda
import math
import time

In [45]:
@cuda.jit(device=True)
def get_bit(input, bit_pos):
  input = input >> bit_pos
  return input & 1

@cuda.jit(device=True)
def get_bitplane(input, bit_pos):
  mask = 1 << bit_pos
  return (input & mask) >> bit_pos

@cuda.jit(device=True)
def adaptive_quantize(input):
  max_val = 15

  if input > max_val:
    # div = torch.pow(torch.ceil(torch.log2(input/max_val)), 2)
    div = 2**(math.ceil(math.log2(input/max_val)))
  else:
    div = 1

  outval = np.int32(round(input/div) * div)
  return outval

def adaptive_quantize_(input):
  max_val = 15

  if input > max_val:
    # div = torch.pow(torch.ceil(torch.log2(input/max_val)), 2)
    div = 2**(math.ceil(math.log2(input/max_val)))
  else:
    div = 1

  outval = np.int32(round(input/div) * div)
  return outval

@cuda.jit(device=True)
def dot_acim(input, weight, in_bw, w_bw, quant=True):
  assert len(input.shape) == 1
  assert len(weight.shape) == 1
  assert input.shape[0] == weight.shape[0]

  out = 0
  for in_bw_ in range(in_bw):
    in_vec_bp = get_bitplane(input, in_bw_)
    for w_bw_ in range(w_bw):
      w_vec_bp = get_bitplane(weight, w_bw_)
      out_value = np.dot(in_vec_bp, w_vec_bp, dtype=np.int32)
      if quant:
        dq_out_value = adaptive_quantize(out_value)
      else:
        dq_out_value = out_value

      if in_bw_ == (in_bw-1): # negative input bit
        dq_out_value = -dq_out_value
      
      if w_bw_ == (w_bw-1): # negative weight bit
        dq_out_value = -dq_out_value
      
      out += (dq_out_value << (in_bw_ + w_bw_))

  return out

@cuda.jit
def gemm_acim(input, weight, out, in_bw, w_bw, quant=True):
  assert len(input.shape) == 2
  assert len(weight.shape) == 2
  assert input.shape[1] == weight.shape[0]

  # out = np.zeros(input.shape[0], weight.shape[1], dtype=torch.int32)
  row = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
  col = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
  if row < input.shape[0] and col < weight.shape[1]:
    out[row, col] += dot_acim(input[row, :], weight[:, col], in_bw, w_bw, quant)
  

@cuda.jit
def call_get_bit(input, output, bit_pos):
  row = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
  if row < input.shape[0]:
    output[row] = get_bit(input[row], bit_pos)

@cuda.jit
def call_get_bitplane(input, output, bit_pos):
  row = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
  if row < input.shape[0]:
    output[row] = get_bitplane(input[row], bit_pos)
  # output[:] = get_bitplane(input, bit_pos)

@cuda.jit
def call_adap_quantize(input, output):
  row = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
  if row < input.shape[0]:
    output[row] = adaptive_quantize(input[row])

In [46]:
N = 16
x = np.arange(N, dtype=np.int32)
y = np.arange(N, dtype=np.int32)
out = np.zeros(N, dtype=np.int32)

# Allocate device memory and copy data to the device
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_out = cuda.device_array_like(out)

# Define the number of threads per block and number of blocks per grid
threads_per_block = 16 
blocks_per_grid = 1

# Launch the kernel
call_get_bit[blocks_per_grid, threads_per_block](d_x, d_out, 1)

# Copy the result back to the host
out = d_out.copy_to_host()

print("x:", x)
print("y:", y)
print("out:", out)

x: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
y: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
out: [0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1]




In [47]:
N = 16
x = np.arange(N, dtype=np.int32)
y = np.arange(N, dtype=np.int32)
out = np.zeros(N, dtype=np.int32)

# Allocate device memory and copy data to the device
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_out = cuda.device_array_like(out)

# Define the number of threads per block and number of blocks per grid
threads_per_block = 16 
blocks_per_grid = 1

# Launch the kernel
call_get_bitplane[blocks_per_grid, threads_per_block](d_x, d_out, 1)

# Copy the result back to the host
out = d_out.copy_to_host()

print("x:", x)
print("y:", y)
print("out:", out)

x: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
y: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
out: [0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1]




In [49]:
N = 1000
x = np.arange(N, dtype=np.int32)
y = np.arange(N, dtype=np.int32)
out = np.zeros(N, dtype=np.int32)

# Allocate device memory and copy data to the device
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_out = cuda.device_array_like(out)

# Define the number of threads per block and number of blocks per grid
threads_per_block = 128 
blocks_per_grid = math.ceil(N/threads_per_block)

# Copy the result back to the host
out = d_out.copy_to_host()

# print("x:", x)
# print("y:", y)
# print("out:", out)

# Measure time for native function
start_time = time.time()
call_adap_quantize[blocks_per_grid, threads_per_block](d_x, d_out)
native_time = time.time() - start_time
print(f"Native function time: {native_time:.6f} seconds")

# Measure time for Numba JIT function
start_time = time.time()
for i in range(N):
  adaptive_quantize_(x[i])
numba_time = time.time() - start_time
print(f"Numba JIT function time: {numba_time:.6f} seconds")


Native function time: 0.000223 seconds
Numba JIT function time: 0.005694 seconds


In [15]:
M = 16
N = 16
K = 16
x = np.arange(M*K, dtype=np.int32).reshape(M, K)
y = np.arange(K*N, dtype=np.int32).reshape(K, N)
out = np.zeros(M*N, dtype=np.int32).reshape(M, N)

# Allocate device memory and copy data to the device
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_out = cuda.device_array_like(out)

# Define the number of threads per block and number of blocks per grid
threads_per_block = (32, 32) 
blocks_per_grid = (math.ceil(M/32), math.ceil(N/32))

# Launch the kernel
gemm_acim[blocks_per_grid, threads_per_block](d_x, d_y, d_out, 4, 4, True)

# Copy the result back to the host
out = d_out.copy_to_host()

print("x:", x)
print("y:", y)
print("out:", out)


TypingError: Failed in cuda mode pipeline (step: nopython frontend)
[1m[1mFailed in cuda mode pipeline (step: nopython frontend)
[1m[1mFailed in cuda mode pipeline (step: nopython frontend)
[1m[1mNo implementation of function Function(<built-in function and_>) found for signature:
 
 >>> and_(array(int32, 1d, C), int64)
 
There are 4 candidate implementations:
[1m      - Of which 2 did not match due to:
      Operator Overload in function 'and_': File: unknown: Line unknown.
        With argument(s): '(array(int32, 1d, C), int64)':[0m
[1m       No match for registered cases:
        * (bool, bool) -> bool
        * (int64, int64) -> int64
        * (int64, uint64) -> int64
        * (uint64, int64) -> int64
        * (uint64, uint64) -> uint64[0m
[1m      - Of which 2 did not match due to:
      Overload of function 'and_': File: numba/experimental/jitclass/overloads.py: Line 0.
        With argument(s): '(array(int32, 1d, C), int64)':[0m
[1m       No match.[0m
[0m
[0m[1mDuring: typing of intrinsic-call at /tmp/ipykernel_2665584/2902190974.py (9)[0m
[1m
File "../../../tmp/ipykernel_2665584/2902190974.py", line 9:[0m
[1m<source missing, REPL/exec in use?>[0m

[0m[1mDuring: resolving callee type: type(CUDADispatcher(<function get_bitplane at 0x7f22a5fd4310>))[0m
[0m[1mDuring: typing of call at /tmp/ipykernel_2665584/2902190974.py (32)
[0m
[1m
File "../../../tmp/ipykernel_2665584/2902190974.py", line 32:[0m
[1m<source missing, REPL/exec in use?>[0m

[0m[1mDuring: resolving callee type: type(CUDADispatcher(<function dot_acim at 0x7f22a5fd6b00>))[0m
[0m[1mDuring: typing of call at /tmp/ipykernel_2665584/2902190974.py (61)
[0m
[1m
File "../../../tmp/ipykernel_2665584/2902190974.py", line 61:[0m
[1m<source missing, REPL/exec in use?>[0m


In [None]:
# Define the CUDA kernel
@cuda.jit(device=True)
def mul(x, y):
    return x * y

@cuda.jit
def mm_kernel(x, y, out):
    idx = cuda.grid(1)
    row = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
    col = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
    if row < x.shape[0] and col < y.shape[1]:
        tmp = 0
        for i in range(x.shape[1]):
            # tmp += x[row, i] * y[i, col]
            tmp += mul(x[row, i], y[i, col])
        out[row, col] = tmp

# Initialize data
N = 16
x = np.arange(N, dtype=np.int32).reshape(4,4)
y = np.arange(N, dtype=np.int32).reshape(4,4)
out = np.zeros(N, dtype=np.int32).reshape(4,4)

# Allocate device memory and copy data to the device
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_out = cuda.device_array_like(out)

# Define the number of threads per block and number of blocks per grid
threads_per_block = (4,4)
blocks_per_grid = (1,1)

# Launch the kernel
mm_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)

# Copy the result back to the host
out = d_out.copy_to_host()

print("x:", x)
print("y:", y)
print("out:", out)


x: [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
y: [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
out: [[ 56  62  68  74]
 [152 174 196 218]
 [248 286 324 362]
 [344 398 452 506]]




In [None]:
np.matmul(x, y)

array([[ 56,  62,  68,  74],
       [152, 174, 196, 218],
       [248, 286, 324, 362],
       [344, 398, 452, 506]], dtype=int32)