In [1]:
import os
import time
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
from mpi4py import MPI
from numba import cuda

from manapy.ast import Variable
from manapy.base.base import Struct
from manapy.cuda.utils import VarClass
from manapy.ddm import Domain
from manapy.partitions import MeshPartition
from manapy.solvers.advec.tools_utils import initialisation_gaussian_2d
###############
# test_time
###############

def test_time(iter, fun):
  #fun()
  start_time = timer()
  for _ in range(iter):
    fun()
  end_time = timer()
  elapsed_time = (end_time - start_time) / iter
  print(f"{elapsed_time * 1000:.5f} ms")
  #print(f"{elapsed_time * 1000000:.5f} micros")

###############
# init
###############
def init(dim, mesh_path):
  """Partition the mesh, build the domain and return the initialised fields.

  Args:
    dim: dimension forwarded to ``MeshPartition`` and ``Domain``.
    mesh_path: path of the mesh file forwarded to ``MeshPartition``.

  Returns:
    The tuple ``(domain, ne, u, v, w, P)``: the ``Domain`` and five
    ``Variable`` objects created on it.
  """
  def _single_precision_conf():
    # The partitioner and the domain each receive their own configuration object.
    return Struct(backend="numba", signature=True, cache=True, float_precision="single")

  MeshPartition(mesh_path, dim=dim, conf=_single_precision_conf(), periodic=[0,0,0])
  domain = Domain(dim=dim, conf=_single_precision_conf())

  # Five variables on the same domain, created in the order ne, u, v, w, P.
  ne, u, v, w, P = (Variable(domain=domain) for _ in range(5))

  # Fill the cell values of ne, u, v and P; 2.0 is the value passed as Pinit.
  mesh_cells = domain.cells
  initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, mesh_cells.center, 2.0)

  # Constant face values (2, 0, 0) for (u, v, w); every face array is set
  # before any interpolation runs.
  velocity = (u, v, w)
  for component, face_value in zip(velocity, (2., 0., 0.)):
    component.face[:] = face_value
  for component in velocity:
    component.interpolate_facetocell()

  return (domain, ne, u, v, w, P)

In [2]:
# Problem dimension and mesh location. The mesh path defaults to the original
# development location; set the MANAPY_MESH_FILE environment variable to run
# the notebook against a mesh stored elsewhere.
dim = 3
mesh_file = os.environ.get(
    "MANAPY_MESH_FILE",
    "/home/aben-ham/Desktop/work/stage/manapy/mesh/3D/cube.msh",
)
domain, ne, u, v, w, P = init(dim=dim, mesh_path=mesh_file)

Reading gmsh file ...
Saving partition files ...
Number of Cells: 4573
Number of Vertices: 1140
Local domain contruction ...
---------------------------------
---------------------------------
set att: BCdirichlet
set att: BCneumann
set att: BCneumannNH
set att: cell
set att: dirichletfaces
set att: face
set att: ghost
set att: gradcellx
set att: gradcelly
set att: gradcellz
set att: gradfacex
set att: gradfacey
set att: gradfacez
set att: gradhalocellx
set att: gradhalocelly
set att: gradhalocellz
set att: halo
set att: haloghost
set att: halotosend
set att: neumannNHfaces
set att: neumannfaces
set att: node
set att: psi
set att: psihalo
compile kernel_facetocell to cuda => signature=void(float32[:], float32[:], int32[:,:], int32)
compile kernel_celltoface to cuda => signature=void(float32[:], float32[:], float32[:], float32[:], int32[:,:], int32[:], int32[:], int32[:], int32[:])
compile kernel_define_halosend to cuda => signature=void(float32[:], float32[:], int32[:])
compile kernel_

In [3]:
#PETScKrylovSolver
from manapy.solvers.ls import PETScKrylovSolver

# Put this code in the PETScKrylovSolver init function to bypass the import of petsc4py
"""
    # try_imports(['import petsc4py',],
    #             'cannot import petsc4py solver!')
    
    # from petsc4py import PETSc as petsc
    
    #self.petsc = petsc
    self.ksp   = None
    
    self.converged_reasons = {}                                                                                                                                                                             
    # for key, val in six.iteritems(petsc.KSP.ConvergedReason.__dict__):                                                                                                                                 
    #     if isinstance(val, int):                                                                                                                                                                       
    #         self.converged_reasons[val] = key    
"""

# Solver configuration, then the linear solver built for variable P on `domain`.
# `L` is reused by the following cells to reach the matrix buffers
# (L._data, L._row, L._col) and the face lists.
conf = Struct(reuse_mtx=True, scheme='diamond', verbose=False, precision="single")
L = PETScKrylovSolver(domain=domain, var=P, conf=conf)

SetUp the Linear system ...


In [4]:
# args list
# NOTE: the order of the argument list is important (the list is passed positionally with *args)

# Hand the mesh containers, the variable and the solver to VarClass.
# NOTE(review): presumably this prepares their array attributes for transfer
# to the device -- confirm against manapy.cuda.utils.VarClass.
VarClass.convert_to_var_class([
    domain.nodes,
    domain.faces,
    domain.cells,
    domain.halos,
    ne,
    domain,
    ne.domain._cells,
    L,
    L.var
])

# Positional arguments shared by the CPU and GPU triplet functions.
# Later cells address entries by index, so do not reorder:
#   0-29  : mesh connectivity, geometry, interpolation weights and sizes
#   30-32 : L._data, L._row, L._col (triplet values, row and column indices)
#   33-35 : matrixinnerfaces, halofaces, dirichletfaces (face lists)
args = [
  L.domain.faces.cellid,
  L.domain.faces.nodeid,
  L.domain.nodes.vertex,
  L.domain.faces.halofid,
  L.domain.halos.halosext,
  L.domain.nodes.oldname,
  L.domain.cells.volume,
  L.domain.nodes.cellid,
  L.domain.cells.center,
  L.domain.halos.centvol,
  L.domain.nodes.halonid,
  L.domain.nodes.periodicid,
  L.domain.nodes.ghostcenter,
  L.domain.nodes.haloghostcenter,
  L.domain.faces.airDiamond,
  L.domain.nodes.lambda_x,
  L.domain.nodes.lambda_y,
  L.domain.nodes.lambda_z,
  L.domain.nodes.number,
  L.domain.nodes.R_x,
  L.domain.nodes.R_y,
  L.domain.nodes.R_z,
  L.domain.faces.param1,
  L.domain.faces.param2,
  L.domain.faces.param3,
  L.domain.faces.param4,
  L.domain.cells.shift,
  L.localsize,
  L.domain.cells.loctoglob,
  L.var.BCdirichlet,
  # index 30, 31, 32: output buffers
  L._data,
  L._row,
  L._col,
  # index 33, 34, 35: face lists
  L.matrixinnerfaces,
  L.domain.halofaces,
  L.var.dirichletfaces,
]



---------------------------------
---------------------------------
can't get attr _nbnodes => _nbnodes
can't get attr nbnodes => _nbnodes
can't set attr for R_x => can't set attribute
can't set attr for R_y => can't set attribute
can't set attr for R_z => can't set attribute
set att: _R_x
set att: _R_y
set att: _R_z
set att: _cellid
set att: _ghostcenter
set att: _ghostfaceinfo
set att: _ghostid
set att: _haloghostcenter
set att: _haloghostfaceinfo
set att: _haloghostid
set att: _halonid
set att: _lambda_x
set att: _lambda_y
set att: _lambda_z
set att: _loctoglob
set att: _name
set att: _number
set att: _oldname
set att: _periodicid
set att: _vertex
can't set attr for cellid => can't set attribute
can't set attr for ghostcenter => can't set attribute
can't set attr for ghostfaceinfo => can't set attribute
can't set attr for ghostid => can't set attribute
can't set attr for haloghostcenter => can't set attribute
can't set attr for haloghostfaceinfo => can't set attribute
can't set attr

In [5]:
import numpy as np
import math
from numba import cuda
from manapy.cuda.utils import (
    VarClass,
    GPU_Backend
)
from manapy.cuda.manapy.util_kernels import (
  kernel_assign,
  device_search_element
)

def get_kernel_get_triplet_3d():
  """Build the launcher of the CUDA kernel that fills the 3D matrix triplets.

  The factory allocates a one-element device counter, compiles the
  ``device_search_element`` helper and the ``kernel_get_triplet_3d`` kernel
  through ``GPU_Backend``, and returns a host function that launches the
  kernel.

  The kernel writes (row, column, value) triplets into ``irn_loc``,
  ``jcn_loc`` and ``a_loc``. Every triplet claims its slot with an atomic
  increment of the shared counter, so the position of a given triplet in the
  output arrays is not deterministic from one launch to the next.

  Returns:
    ``result(*args)``: host launcher taking the 36 kernel arguments in
    signature order (everything except the trailing counter ``s_cmpt``).
  """

  # Device-side write cursor shared by all threads; reset before each launch.
  d_cmpt = cuda.device_array(shape=(1), dtype='uint64')
  search_element = GPU_Backend.compile_kernel(device_search_element, device = True)

  def kernel_get_triplet_3d(
    cellfid:'int32[:,:]',
    nodeidf:'int32[:,:]',
    vertexn:'float[:,:]',
    halofid:'int32[:]',
    haloext:'int32[:,:]',
    oldnamen:'uint32[:]',
    volume:'float[:]',
    cellnid:'int32[:,:]',
    centerc:'float[:,:]',
    centerh:'float[:,:]',
    halonid:'int32[:,:]',
    periodicnid:'int32[:,:]',
    centergn:'float[:,:,:]',
    halocentergn:'float[:,:,:]',
    airDiamond:'float[:]',
    lambda_x:'float[:]',
    lambda_y:'float[:]',
    lambda_z:'float[:]',
    number:'uint32[:]',
    R_x:'float[:]',
    R_y:'float[:]',
    R_z:'float[:]',
    param1:'float[:]',
    param2:'float[:]',
    param3:'float[:]',
    param4:'float[:]',
    shift:'float[:,:]',
    nbelements:'int32',
    loctoglob:'int32[:]',
    BCdirichlet:'uint32[:]',
    a_loc:'float[:]',
    irn_loc:'int32[:]',
    jcn_loc:'int32[:]',
    matrixinnerfaces:'uint32[:]',
    halofaces:'uint32[:]',
    dirichletfaces:'uint32[:]',
    s_cmpt: 'uint64[:]'
    ):

    # Grid-stride iteration: each thread handles faces start, start + stride, ...
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    # Per-thread scratch: the four node coefficients and the four node ids of a face.
    parameters = cuda.local.array(4, param1.dtype)
    nodes = cuda.local.array(4, nodeidf.dtype)

    # ------------------------------------------------------------------
    # Inner faces: contributions to both the left and the right cell rows.
    # ------------------------------------------------------------------
    for idx in range(start, len(matrixinnerfaces), stride):
      i = matrixinnerfaces[idx]

      c_left = cellfid[i][0]
      c_leftglob  = loctoglob[c_left]

      # The fourth slot repeats the third node unless the last column of
      # nodeidf equals 4, in which case the face's own fourth node is used.
      nodes[0] = nodeidf[i][0]
      nodes[1] = nodeidf[i][1]
      nodes[2] = nodeidf[i][2]
      nodes[3] = nodeidf[i][2]
      if nodeidf[i][-1] == 4:
        nodes[3] = nodeidf[i][3]

      # Coefficient applied to the node at the same position in `nodes`.
      parameters[0] = param1[i]
      parameters[1] = param2[i]
      parameters[2] = -1. * param1[i]
      parameters[3] = -1. * param2[i]

      c_right = cellfid[i][1]
      c_rightglob = loctoglob[c_right]

      # (left, left) entry
      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_leftglob
      jcn_loc[cmpt] = c_leftglob
      value = -1 * param3[i] / volume[c_left]
      a_loc[cmpt] = value

      cmptparam = 0

      for nod in nodes:
        # Nodes whose old name is found in BCdirichlet contribute nothing.
        if search_element(BCdirichlet, oldnamen[nod]) == 0:
          # Local cells around the node
          for j in range(cellnid[nod][-1]):
            center = centerc[cellnid[nod][j]]
            xdiff = center[0] - vertexn[nod][0]
            ydiff = center[1] - vertexn[nod][1]
            zdiff = center[2] - vertexn[nod][2]
            alpha = (1. + lambda_x[nod]*xdiff + \
                      lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                  lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
            value = alpha / volume[c_left] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_leftglob
            jcn_loc[cmpt] = loctoglob[cellnid[nod][j]]
            a_loc[cmpt] = value

            #right cell-----------------------------------
            value = -1. * alpha / volume[c_right] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_rightglob
            jcn_loc[cmpt] = loctoglob[cellnid[nod][j]]
            a_loc[cmpt] = value

          # Halo cells around the node (column index read from haloext)
          for j in range(halonid[nod][-1]):
            center = centerh[halonid[nod][j]]
            xdiff = center[0] - vertexn[nod][0]
            ydiff = center[1] - vertexn[nod][1]
            zdiff = center[2] - vertexn[nod][2]
            alpha = (1. + lambda_x[nod]*xdiff + \
                      lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                  lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
            value = alpha / volume[c_left] * parameters[cmptparam]

            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_leftglob
            jcn_loc[cmpt] = haloext[halonid[nod][j]][0]
            a_loc[cmpt] = value

            #right cell-----------------------------------
            value = -1. * alpha / volume[c_right] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_rightglob
            jcn_loc[cmpt] = haloext[halonid[nod][j]][0]
            a_loc[cmpt] = value

          # Periodic cells around the node
          for j in range(periodicnid[nod][-1]):
            pcell = periodicnid[nod][j]
            # The shifted centre is held in scalars. Assigning into `center`
            # here would write through a view of a centerc/centerh row, i.e.
            # modify a shared input array from several threads at once.
            # When vertexn[nod][3] matches none of the codes below the
            # unshifted cell centre is used.
            cx = centerc[pcell][0]
            cy = centerc[pcell][1]
            cz = centerc[pcell][2]
            if vertexn[nod][3] == 11 or vertexn[nod][3] == 22:
              cx = cx + shift[pcell][0]
            if vertexn[nod][3] == 33 or vertexn[nod][3] == 44:
              cy = cy + shift[pcell][1]
            if vertexn[nod][3] == 55 or vertexn[nod][3] == 66:
              cz = cz + shift[pcell][2]

            xdiff = cx - vertexn[nod][0]
            ydiff = cy - vertexn[nod][1]
            zdiff = cz - vertexn[nod][2]
            alpha = (1. + lambda_x[nod]*xdiff + \
                      lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                  lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
            value =  alpha / volume[c_left] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_leftglob
            jcn_loc[cmpt] = loctoglob[pcell]
            a_loc[cmpt] = value
            #right cell-----------------------------------
            value =  -1. * alpha / volume[c_right] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_rightglob
            jcn_loc[cmpt] = loctoglob[pcell]
            a_loc[cmpt] = value

          # Ghost centres attached to the node; entries whose last component is -1 are skipped
          for j in range(len(centergn[nod])):
            if centergn[nod][j][-1] != -1:
              center = centergn[nod][j][0:3]
              xdiff = center[0] - vertexn[nod][0]
              ydiff = center[1] - vertexn[nod][1]
              zdiff = center[2] - vertexn[nod][2]
              alpha = (1. + lambda_x[nod]*xdiff + \
                        lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                    lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
              value = alpha / volume[c_left] * parameters[cmptparam]

              # Component 3 holds the local cell index used for the column.
              index = np.int32(centergn[nod][j][3])
              cmpt = cuda.atomic.add(s_cmpt, 0, 1)
              irn_loc[cmpt] = c_leftglob
              jcn_loc[cmpt] = loctoglob[index]
              a_loc[cmpt] = value

              #right cell-----------------------------------
              value = -1. * alpha / volume[c_right] * parameters[cmptparam]
              cmpt = cuda.atomic.add(s_cmpt, 0, 1)
              irn_loc[cmpt] = c_rightglob
              jcn_loc[cmpt] = loctoglob[index]
              a_loc[cmpt] = value

          # Halo ghost centres attached to the node (column index read from haloext)
          for j in range(len(halocentergn[nod])):
            if halocentergn[nod][j][-1] != -1:
              center = halocentergn[nod][j][0:3]
              xdiff = center[0] - vertexn[nod][0]
              ydiff = center[1] - vertexn[nod][1]
              zdiff = center[2] - vertexn[nod][2]
              alpha = (1. + lambda_x[nod]*xdiff + \
                        lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                    lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
              value = alpha / volume[c_left] * parameters[cmptparam]
              index = np.int32(halocentergn[nod][j][3])
              cmpt = cuda.atomic.add(s_cmpt, 0, 1)
              irn_loc[cmpt] = c_leftglob
              jcn_loc[cmpt] = haloext[index][0]
              a_loc[cmpt] = value

              #right cell-----------------------------------
              value = -1. * alpha / volume[c_right] * parameters[cmptparam]
              cmpt = cuda.atomic.add(s_cmpt, 0, 1)
              irn_loc[cmpt] = c_rightglob
              jcn_loc[cmpt] = haloext[index][0]
              a_loc[cmpt] = value

        # Advance to the coefficient of the next node, skipped or not.
        cmptparam = cmptparam +1

      # Closing entries, emitted once per face after the node loop (the same
      # layout as the halo-face loop below). They were previously indented
      # inside the node loop and therefore written four times per face.
      # NOTE(review): confirm against the CPU reference get_triplet_3d and the
      # size precomputed for a_loc / irn_loc / jcn_loc.
      # (left, right) entry
      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_leftglob
      jcn_loc[cmpt] = c_rightglob
      value = param3[i] / volume[c_left]
      a_loc[cmpt] = value

      # right cell------------------------------------------------------
      # (right, left) entry
      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_rightglob
      jcn_loc[cmpt] = c_leftglob
      value = param3[i] / volume[c_right]
      a_loc[cmpt] = value

      # (right, right) entry
      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_rightglob
      jcn_loc[cmpt] = c_rightglob
      value = -1. * param3[i] / volume[c_right]
      a_loc[cmpt] = value

    # ------------------------------------------------------------------
    # Halo faces: only the left cell row is filled; the neighbour column
    # index is read from haloext.
    # ------------------------------------------------------------------
    for idx in range(start, len(halofaces), stride):
      i = halofaces[idx]

      c_left = cellfid[i][0]
      c_leftglob  = loctoglob[c_left]

      nodes[0] = nodeidf[i][0]
      nodes[1] = nodeidf[i][1]
      nodes[2] = nodeidf[i][2]
      nodes[3]  = nodeidf[i][2]
      if nodeidf[i][-1] == 4:
        nodes[3] = nodeidf[i][3]

      parameters[0] = param1[i]
      parameters[1] = param2[i]
      parameters[2] = -1. * param1[i]
      parameters[3] = -1. * param2[i]

      c_rightglob = haloext[halofid[i]][0]

      cmptparam = 0
      for nod in nodes:
        if search_element(BCdirichlet, oldnamen[nod]) == 0:
          # Local cells around the node
          for j in range(cellnid[nod][-1]):
            center = centerc[cellnid[nod][j]]
            xdiff = center[0] - vertexn[nod][0]
            ydiff = center[1] - vertexn[nod][1]
            zdiff = center[2] - vertexn[nod][2]
            alpha = (1. + lambda_x[nod]*xdiff + \
                      lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                  lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
            value = alpha / volume[c_left] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_leftglob
            jcn_loc[cmpt] = loctoglob[cellnid[nod][j]]
            a_loc[cmpt] = value

          # Halo cells around the node
          for j in range(halonid[nod][-1]):
            center = centerh[halonid[nod][j]]
            xdiff = center[0] - vertexn[nod][0]
            ydiff = center[1] - vertexn[nod][1]
            zdiff = center[2] - vertexn[nod][2]
            alpha = (1. + lambda_x[nod]*xdiff + \
                      lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                  lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
            value = alpha / volume[c_left] * parameters[cmptparam]
            cmpt = cuda.atomic.add(s_cmpt, 0, 1)
            irn_loc[cmpt] = c_leftglob
            jcn_loc[cmpt] = haloext[halonid[nod][j]][0]
            a_loc[cmpt] = value

          # Ghost centres attached to the node
          for j in range(len(centergn[nod])):
            if centergn[nod][j][-1] != -1:
              center = centergn[nod][j][0:3]
              xdiff = center[0] - vertexn[nod][0]
              ydiff = center[1] - vertexn[nod][1]
              zdiff = center[2] - vertexn[nod][2]
              alpha = (1. + lambda_x[nod]*xdiff + \
                        lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                    lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
              value = alpha / volume[c_left] * parameters[cmptparam]

              index = np.int32(centergn[nod][j][3])
              cmpt = cuda.atomic.add(s_cmpt, 0, 1)
              irn_loc[cmpt] = c_leftglob
              jcn_loc[cmpt] = loctoglob[index]
              a_loc[cmpt] = value

          # Halo ghost centres attached to the node
          for j in range(len(halocentergn[nod])):
            if halocentergn[nod][j][-1] != -1:
              center = halocentergn[nod][j][0:3]
              xdiff = center[0] - vertexn[nod][0]
              ydiff = center[1] - vertexn[nod][1]
              zdiff = center[2] - vertexn[nod][2]
              alpha = (1. + lambda_x[nod]*xdiff + \
                        lambda_y[nod]*ydiff + lambda_z[nod]*zdiff)/ (number[nod] + lambda_x[nod]*R_x[nod] + \
                                                                    lambda_y[nod]*R_y[nod] + lambda_z[nod]*R_z[nod])
              value = alpha / volume[c_left] * parameters[cmptparam]
              index = np.int32(halocentergn[nod][j][3])
              cmpt = cuda.atomic.add(s_cmpt, 0, 1)
              irn_loc[cmpt] = c_leftglob
              jcn_loc[cmpt] = haloext[index][0]
              a_loc[cmpt] = value

        cmptparam = cmptparam +1

      # (left, left) entry
      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_leftglob
      jcn_loc[cmpt] = c_leftglob
      value = -1 * param3[i] / volume[c_left]
      a_loc[cmpt] = value

      # (left, halo neighbour) entry
      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_leftglob
      jcn_loc[cmpt] = c_rightglob
      value = param3[i] / volume[c_left]
      a_loc[cmpt] = value

    # ------------------------------------------------------------------
    # Dirichlet faces: two identical (left, left) entries per face.
    # ------------------------------------------------------------------
    for idx in range(start, len(dirichletfaces), stride):
      i = dirichletfaces[idx]

      c_left = cellfid[i][0]
      c_leftglob  = loctoglob[c_left]

      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_leftglob
      jcn_loc[cmpt] = c_leftglob
      value = -1 * param3[i] / volume[c_left]
      a_loc[cmpt] = value

      cmpt = cuda.atomic.add(s_cmpt, 0, 1)
      irn_loc[cmpt] = c_leftglob
      jcn_loc[cmpt] = c_leftglob
      value = -1. * param3[i] / volume[c_left]
      a_loc[cmpt] = value


  kernel_get_triplet_3d = GPU_Backend.compile_kernel(kernel_get_triplet_3d)
  kernel_assign_int64 = cuda.jit('void(uint64[:], uint64)')(kernel_assign)

  def result(*args):
    """Launch the triplet kernel on the given positional arguments.

    ``args`` holds the 36 kernel arguments in signature order; entries 33,
    34 and 35 are the inner, halo and Dirichlet face lists, and the longest
    of them determines the launch configuration.
    """
    VarClass.debug(kernel_get_triplet_3d, args)
    args = [VarClass.to_device(arg) for arg in args]
    # Reset the shared write cursor before the kernel starts claiming slots.
    kernel_assign_int64[1, 1](d_cmpt, 0) #cmpt
    cuda.synchronize()
    # matrixinnerfaces halofaces dirichletfaces
    size = max(len(args[33]), len(args[34]), len(args[35]))
    nb_blocks, nb_threads = GPU_Backend.get_gpu_prams(size)
    kernel_get_triplet_3d[nb_blocks, nb_threads](*args, d_cmpt)
    cuda.synchronize()

  return result



In [6]:
# Shape of L._data, the buffer passed to the triplet functions as a_loc (args[30]).
L._data.shape

(1641832,)

In [7]:
from manapy.ast.functions3d import get_triplet_3d as cpu_function
from manapy.cuda.manapy.ast.cuda_functions3d import get_kernel_get_triplet_3d as gpu_function

# Compile the CPU reference through the domain backend and build the GPU
# launcher from the library implementation imported above.
numba_fun = domain.backend.compile(cpu_function, echo=True, signature=True)
cuda_fun = gpu_function()
# Alternative: use the kernel factory defined earlier in this notebook.
#cuda_fun = get_kernel_get_triplet_3d()

compile get_triplet_3d to cpu => signature=(int32[:,:], int32[:,:], float32[:,:], int32[:], int32[:,:], int32[:], float32[:], int32[:,:], float32[:,:], float32[:,:], int32[:,:], int32[:,:], float32[:,:,:], float32[:,:,:], float32[:], float32[:], float32[:], float32[:], int32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:,:], int32, int32[:], int32[:], float32[:], int32[:], int32[:], int32[:], int32[:], int32[:])
compile device_search_element to cuda => signature=int32(int32[:], int32)
compile kernel_get_triplet_3d to cuda => signature=void(int32[:,:], int32[:,:], float32[:,:], int32[:], int32[:,:], int32[:], float32[:], int32[:,:], float32[:,:], float32[:,:], int32[:,:], int32[:,:], float32[:,:,:], float32[:,:,:], float32[:], float32[:], float32[:], float32[:], int32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:,:], int32, int32[:], int32[:], float32[:], int32[:], int32[:], int32[:], 

In [13]:
# Run the CPU version once outside the timed loop, then time repeated calls.
numba_fun(*args)
%timeit numba_fun(*args)

36.6 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Run the GPU version once outside the timed loop, then time repeated calls.
cuda_fun(*args)
%timeit cuda_fun(*args)

7.25 ms ± 61.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# verify
# args[30], args[31], args[32] are L._data, L._row and L._col. Each triplet is
# collapsed into value + row + col and the results are sorted, so the check
# does not depend on the order in which triplets were written.
# NOTE(review): VarClass.to_device(...) presumably returns the device buffer
# tied to each host array, i.e. what the GPU run wrote -- confirm.
# This is a coarse fingerprint (distinct triplets can share the same sum) and
# it is only compared to one decimal place.

gpu_res = VarClass.to_device(args[30]).copy_to_host() + VarClass.to_device(args[31]).copy_to_host() + VarClass.to_device(args[32]).copy_to_host()
cpu_res = args[30] + args[31] + args[32]
print(np.sum(cpu_res), " ==? ", np.sum(gpu_res))
gpu_res.sort()
cpu_res.sort()

np.testing.assert_almost_equal(gpu_res, cpu_res, decimal=1)


7042259336.000943  ==?  7042259336.05096
