This workbook runs the same code several times, and times the result
 - interpreted python
 - compiled cuda python 1 CPU core
 - compiled cuda python all CPU cores
 - compiled cuda python GPU, vectorize
 - compiled cuda python GPU, guvectorize, or njit (note: as written in this notebook, neither decorator selects a GPU target)
 The current vector size uses 12 GB of memory in total.
 
 The interpreted python is run on a smaller sample, in order to reduce run times
 The vectorize version can only process 1/10 of the data, as everything needs to be stored on the GPU.
 My GPU only has 2 GB of memory, so vectorize does not work for this sample data.

In [1]:
# Number of elements in each full-size benchmark vector (one billion).
vec_size = 10 ** 9

In [2]:
import numpy as np
import resource
from timeit import default_timer as timer

def pow(a, b, c):
  """Element-wise power in plain Python: store ``a[i] ** b[i]`` in ``c[i]``.

  Visits ``a.size`` indices one at a time. The result is written into ``c``
  in place and nothing is returned. Note that this definition shadows the
  built-in ``pow`` for the rest of the session.
  """
  count = a.size
  idx = 0
  while idx < count:
    c[idx] = a[idx] ** b[idx]
    idx += 1

def main():
  """Time the interpreted ``pow`` loop on a reduced sample and extrapolate.

  Only ``vec_size / 50`` elements are processed; the measured wall-clock time
  is multiplied by 50 before being printed. The second printed value is the
  ``ru_maxrss`` field reported by ``resource.getrusage`` for this process.
  """
  sample_factor = 50  # fraction of vec_size actually run, and timing scale-up
  n_items = int(vec_size / sample_factor)

  # NOTE: a and b are bound to the same array, so the loop computes a ** a.
  samples = np.array(np.random.sample(n_items), dtype=np.float32)
  a = b = samples
  c = np.zeros(n_items, dtype=np.float32)

  t0 = timer()
  pow(a, b, c)
  elapsed = timer() - t0

  print(elapsed * sample_factor)
  print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)


# Run the benchmark when this cell is executed as the main module.
if __name__ == "__main__":
  main()

463.83404335001615
291968


In [3]:
from numba import cuda
# Both module-level handles are kept; each comes from its own query.
my_gpu = cuda.get_current_device()
gpu = cuda.get_current_device()


def _print_device_properties(device):
  """Print the device name, then one ``label = value`` line per attribute."""
  print("name = %s" % device.name)
  # (format string, attribute name) pairs, printed in this order.
  rows = (
    ("maxThreadsPerBlock = %s", "MAX_THREADS_PER_BLOCK"),
    ("maxBlockDimX = %s", "MAX_BLOCK_DIM_X"),
    ("maxBlockDimY = %s", "MAX_BLOCK_DIM_Y"),
    ("maxBlockDimZ = %s", "MAX_BLOCK_DIM_Z"),
    ("maxGridDimX = %s", "MAX_GRID_DIM_X"),
    ("maxGridDimY = %s", "MAX_GRID_DIM_Y"),
    ("maxGridDimZ = %s", "MAX_GRID_DIM_Z"),
    ("maxSharedMemoryPerBlock = %s", "MAX_SHARED_MEMORY_PER_BLOCK"),
    ("asyncEngineCount = %s", "ASYNC_ENGINE_COUNT"),
    ("canMapHostMemory = %s", "CAN_MAP_HOST_MEMORY"),
    ("multiProcessorCount = %s", "MULTIPROCESSOR_COUNT"),
    ("warpSize = %s", "WARP_SIZE"),
    ("unifiedAddressing = %s", "UNIFIED_ADDRESSING"),
    ("pciBusID = %s", "PCI_BUS_ID"),
    ("pciDeviceID = %s", "PCI_DEVICE_ID"),
  )
  for template, attribute in rows:
    print(template % str(getattr(device, attribute)))


_print_device_properties(gpu)

name = b'Quadro P620'
maxThreadsPerBlock = 1024
maxBlockDimX = 1024
maxBlockDimY = 1024
maxBlockDimZ = 64
maxGridDimX = 2147483647
maxGridDimY = 65535
maxGridDimZ = 65535
maxSharedMemoryPerBlock = 49152
asyncEngineCount = 2
canMapHostMemory = 1
multiProcessorCount = 4
warpSize = 32
unifiedAddressing = 1
pciBusID = 1
pciDeviceID = 0


In [4]:
import numpy as np
from timeit import default_timer as timer
from numba import vectorize

@vectorize(['float32(float32, float32)'], target='parallel')
def powp(a, b):
  """Return ``a ** b`` for one pair of float32 scalars (target='parallel')."""
  powered = a ** b
  return powered

@vectorize(['float32(float32, float32)'], target='cpu')
def powc(a, b):
  """Return ``a ** b`` for one pair of float32 scalars (target='cpu')."""
  powered = a ** b
  return powered


def main():
  """Time the ``vectorize`` kernels ``powp`` and ``powc`` on the full vector.

  Prints, in order: the wall-clock time of ``powp`` (target='parallel'), the
  wall-clock time of ``powc`` (target='cpu'), and the ``ru_maxrss`` field
  reported by ``resource.getrusage`` for this process.
  """
  # NOTE: a and b are bound to the same array, so each kernel computes a ** a.
  a = b = np.array(np.random.sample(vec_size), dtype=np.float32)

  # No output buffer is pre-allocated here: the kernels are called without an
  # ``out`` argument and their return value is what gets bound to ``c``, so a
  # separate ``np.zeros`` array would be discarded without ever being read.
  start = timer()
  c = powp(a, b)
  duration = timer() - start

  print(duration)

  start = timer()
  c = powc(a, b)
  duration = timer() - start

  print(duration)
  usage = resource.getrusage(resource.RUSAGE_SELF)
  print(usage.ru_maxrss)

if __name__ == '__main__':
  main()

3.653465008999774
16.907634236999
11945516


In [5]:
import numpy as np
from timeit import default_timer as timer
from numba import vectorize,guvectorize,jit,njit
from numba import cuda,float32

@vectorize(['float32(float32, float32)'], target='cuda')
def powv(a, b):
  """Return ``a ** b`` for one pair of float32 scalars (target='cuda')."""
  powered = a ** b
  return powered

def main():
  """Time the ``powv`` kernel (target='cuda') on a reduced sample and extrapolate.

  Only ``vec_size / 10`` elements are processed; the measured wall-clock time
  is multiplied by 10 before being printed. The second printed value is the
  ``ru_maxrss`` field reported by ``resource.getrusage`` for this process.
  """
  vec_div = 10  # fraction of vec_size actually run, and timing scale-up
  vec_all = int(vec_size/vec_div)

  # NOTE: a and b are bound to the same array, so the kernel computes a ** a.
  a = b = np.array(np.random.sample(vec_all), dtype=np.float32)

  # No output buffer is pre-allocated here: the kernel is called without an
  # ``out`` argument and its return value is what gets bound to ``c``, so a
  # separate ``np.zeros`` array would be discarded without ever being read.
  start = timer()
  c = powv(a, b)
  duration = (timer() - start)*vec_div

  print(duration)
  usage = resource.getrusage(resource.RUSAGE_SELF)
  print(usage.ru_maxrss)

if __name__ == '__main__':
  main()

4.3160974599959445
11945516


In [6]:
import numpy as np
from timeit import default_timer as timer
from numba import vectorize,guvectorize,jit,njit
from numba import cuda,float32

@guvectorize([(float32[:], float32[:], float32[:])], '(n),(n)->(n)')
def powg(x, y, res):
    """Write ``x[k] ** y[k]`` into ``res[k]`` for every index of ``x``.

    All three arguments are 1-D float32 arrays sharing the length ``n`` of the
    layout string; no ``target`` argument is passed to the decorator.
    """
    length = x.shape[0]
    for k in range(length):
        res[k] = x[k] ** y[k]

# Disabled alternative decorator with parallel and fastmath enabled:
#@njit([(float32[:], float32[:], float32[:])],parallel = True,fastmath = True)
@njit([(float32[:], float32[:], float32[:])])
def pown(x, y, res):
    """Write ``x[k] ** y[k]`` into ``res[k]`` for every index of ``x``.

    All three arguments are declared as 1-D float32 arrays; the result is
    stored in ``res`` in place.
    """
    length = x.shape[0]
    for k in range(length):
        res[k] = x[k] ** y[k]

def main():
  """Time ``powg`` (guvectorize) and ``pown`` (njit) on the full vector.

  Both kernels write into the same pre-allocated output array. Prints the
  wall-clock time of each call, then the ``ru_maxrss`` field reported by
  ``resource.getrusage`` for this process.
  """
  # NOTE: a and b are bound to the same array, so each kernel computes a ** a.
  samples = np.array(np.random.sample(vec_size), dtype=np.float32)
  a = b = samples
  c = np.zeros(vec_size, dtype=np.float32)

  t0 = timer()
  powg(a, b, c)
  print(timer() - t0)

  t0 = timer()
  pown(a, b, c)
  print(timer() - t0)

  print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)


# Run the benchmark when this cell is executed as the main module.
if __name__ == "__main__":
  main()

16.74205530099971
16.387648851999984
11966080
