In [1]:
from mpi4py import MPI
from manapy.ast import Variable
from manapy.base.base import Struct
from manapy.ddm import Domain
from manapy.partitions import MeshPartition
from manapy.solvers.advec.tools_utils import initialisation_gaussian_2d
import numpy as np
import time
from numba import cuda

import matplotlib.pyplot as plt

from timeit import default_timer as timer
from manapy.cuda.utils import VarClass
###############
# test_time
###############

def test_time(iter, fun):
  #fun()
  start_time = timer()
  for _ in range(iter):
    fun()
  end_time = timer()
  elapsed_time = (end_time - start_time) / iter
  print(f"{elapsed_time * 1000:.5f} ms")
  #print(f"{elapsed_time * 1000000:.5f} micros")

###############
# init
###############
def init(dim, mesh_path):
  """Partition the mesh, build the domain and create the notebook variables.

  Steps, in order:
    1. run ``MeshPartition`` on ``mesh_path``;
    2. build a ``Domain`` of dimension ``dim``;
    3. create five ``Variable`` objects (``ne``, ``u``, ``v``, ``w``, ``P``);
    4. fill the cell values through ``initialisation_gaussian_2d``;
    5. set the face values of ``u``, ``v``, ``w`` to 2, 0 and 0 and call
       ``interpolate_facetocell`` on each of them.

  Parameters
  ----------
  dim : int
      Dimension forwarded to ``MeshPartition`` and ``Domain``.
  mesh_path : str
      Path of the mesh file handed to ``MeshPartition``.

  Returns
  -------
  tuple
      ``(domain, ne, u, v, w, P)``.
  """
  def make_conf():
    # Each consumer receives its own, freshly built configuration object.
    return Struct(backend="numba", signature=True, cache=True, float_precision="single")

  MeshPartition(mesh_path, dim=dim, conf=make_conf(), periodic=[0,0,0])
  domain = Domain(dim=dim, conf=make_conf())

  # Variables are created one after another in the order ne, u, v, w, P.
  ne, u, v, w, P = (Variable(domain=domain) for _ in range(5))

  p_init = 2.0
  mesh_cells = domain.cells
  initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, mesh_cells.center, p_init)

  # (variable, constant face value) pairs; all face values are assigned
  # before any interpolation is run.
  face_values = ((u, 2.), (v, 0.), (w, 0.))
  for component, value in face_values:
    component.face[:] = value
  for component, _ in face_values:
    component.interpolate_facetocell()

  return (domain, ne, u, v, w, P)

In [15]:
import os

dim = 3
# Mesh location: set the MANAPY_MESH_FILE environment variable to run this
# notebook on another machine; when it is unset the original path is used.
mesh_file = os.environ.get(
    "MANAPY_MESH_FILE",
    "/home/aben-ham/Desktop/work/stage/manapy/mesh/3D/cube.msh",
)
# Partition the mesh and build the domain plus the ne/u/v/w/P variables.
domain, ne, u, v, w, P = init(dim=dim, mesh_path=mesh_file)

Reading gmsh file ...
Saving partition files ...
Number of Cells: 4573
Number of Vertices: 1140
Local domain contruction ...
---------------------------------
---------------------------------
set att: BCdirichlet
set att: BCneumann
set att: BCneumannNH
set att: cell
set att: dirichletfaces
set att: face
set att: ghost
set att: gradcellx
set att: gradcelly
set att: gradcellz
set att: gradfacex
set att: gradfacey
set att: gradfacez
set att: gradhalocellx
set att: gradhalocelly
set att: gradhalocellz
set att: halo
set att: haloghost
set att: halotosend
set att: neumannNHfaces
set att: neumannfaces
set att: node
set att: psi
set att: psihalo
---------------------------------
---------------------------------
set att: BCdirichlet
set att: BCneumann
set att: BCneumannNH
set att: cell
set att: dirichletfaces
set att: face
set att: ghost
set att: gradcellx
set att: gradcelly
set att: gradcellz
set att: gradfacex
set att: gradfacey
set att: gradfacez
set att: gradhalocellx
set att: gradhalocel

In [16]:
# Build the PETScKrylovSolver object used to assemble the kernel arguments.
from manapy.solvers.ls import PETScKrylovSolver

# To bypass the petsc4py import, put the snippet below into the
# PETScKrylovSolver __init__ function. The triple-quoted string is a bare
# expression: it is kept only as a note and has no effect on execution.
"""
    # try_imports(['import petsc4py',],
    #             'cannot import petsc4py solver!')
    
    # from petsc4py import PETSc as petsc
    
    #self.petsc = petsc
    self.ksp   = None
    
    self.converged_reasons = {}                                                                                                                                                                             
    # for key, val in six.iteritems(petsc.KSP.ConvergedReason.__dict__):                                                                                                                                 
    #     if isinstance(val, int):                                                                                                                                                                       
    #         self.converged_reasons[val] = key    
"""

# Solver options handed to PETScKrylovSolver; single precision matches the
# float_precision="single" configuration used when the domain was built.
conf = Struct(reuse_mtx=True, scheme='diamond', verbose=False, precision="single")
# Solver bound to the domain and to the variable P.
L = PETScKrylovSolver(domain=domain, var=P, conf=conf)

SetUp the Linear system ...


In [17]:
# Argument list for the CPU and CUDA kernels.
# NOTE: the order of the entries in `args` is important; they are passed
# positionally (`*args`) to the compiled functions.

L.update_ghost_values()
# NOTE(review): BCdirichlet is overwritten with the nodes' `oldname` array
# here -- presumably a placeholder so that the kernel argument has a usable
# array; confirm that this is intended.
L.var.BCdirichlet = L.domain.nodes.oldname

# Convert the listed objects with VarClass.convert_to_var_class
# (presumably wraps their array attributes as VarClass so that they can be
# handed to the CUDA kernel -- TODO confirm against manapy.cuda.utils).
VarClass.convert_to_var_class([
    domain.nodes,
    domain.faces,
    domain.cells,
    domain.halos,
    ne,
    domain,
    ne.domain._cells,
    L,
    L.var
])

# Positional kernel arguments; do not reorder.
args = [
  # arrays of the solver variable
  L.var.cell,
  L.var.ghost,
  L.var.halo,
  L.var.node,
  # arrays of the domain (faces, cells, halos, nodes)
  L.domain.faces.cellid,
  L.domain.faces.nodeid,
  L.domain.faces.ghostcenter,
  L.domain.faces.halofid,
  L.domain.cells.center,
  L.domain.halos.centvol,
  L.domain.nodes.oldname,
  L.domain.faces.airDiamond,
  L.domain.faces.f_1,
  L.domain.faces.f_2,
  L.domain.faces.f_3,
  L.domain.faces.f_4,
  L.domain.faces.normal,
  L.domain.cells.shift,
  L.domain.Pbordnode,
  L.domain.Pbordface,
  # face-gradient arrays of the solver variable
  L.var.gradfacex,
  L.var.gradfacey,
  L.var.gradfacez,
  L.var.BCdirichlet,
  # face index arrays
  L.domain.innerfaces,
  L.domain.halofaces,
  L.var.neumannfaces,
  L.var.dirichletfaces,
  L.domain.periodicboundaryfaces,
]



---------------------------------
---------------------------------
can't get attr _nbnodes => _nbnodes
can't get attr nbnodes => _nbnodes
can't set attr for R_x => can't set attribute
can't set attr for R_y => can't set attribute
can't set attr for R_z => can't set attribute
set att: _R_x
set att: _R_y
set att: _R_z
set att: _cellid
set att: _ghostcenter
set att: _ghostfaceinfo
set att: _ghostid
set att: _haloghostcenter
set att: _haloghostfaceinfo
set att: _haloghostid
set att: _halonid
set att: _lambda_x
set att: _lambda_y
set att: _lambda_z
set att: _loctoglob
set att: _name
set att: _number
set att: _oldname
set att: _periodicid
set att: _vertex
can't set attr for cellid => can't set attribute
can't set attr for ghostcenter => can't set attribute
can't set attr for ghostfaceinfo => can't set attribute
can't set attr for ghostid => can't set attribute
can't set attr for haloghostcenter => can't set attribute
can't set attr for haloghostfaceinfo => can't set attribute
can't set attr

In [18]:
# Display L.var.gradfacex (the last expression of a cell is echoed by the
# notebook). Uncomment one of the lines below to inspect the other arrays.
L.var.gradfacex
#L.var.gradfacey
#L.var.gradfacez

VarClass([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [6]:
# CPU implementation and the factory that returns the matching CUDA kernel.
from manapy.ast.functions3d import compute_P_gradient_3d_diamond as cpu_function
from manapy.cuda.manapy.ast.cuda_functions3d import get_kernel_compute_P_gradient_3d_diamond as gpu_function

# Compile the CPU function through the domain backend with an explicit
# signature (echo=True presumably prints the compiled signature -- confirm).
numba_fun = domain.backend.compile(cpu_function, echo=True, signature=True)
# Calling the factory yields the CUDA kernel callable.
cuda_fun = gpu_function()
#cuda_fun = get_kernel_get_triplet_3d()

compile compute_P_gradient_3d_diamond to cpu => signature=(float32[:], float32[:], float32[:], float32[:], int32[:,:], int32[:,:], float32[:,:], int32[:], float32[:,:], float32[:,:], int32[:], float32[:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], float32[:], float32[:], float32[:], float32[:], float32[:], int32[:], int32[:], int32[:], int32[:], int32[:], int32[:])
compile device_search_element to cuda => signature=int32(int32[:], int32)
compile kernel_compute_P_gradient_3d_diamond to cuda => signature=void(float32[:], float32[:], float32[:], float32[:], int32[:,:], int32[:,:], float32[:,:], int32[:], float32[:,:], float32[:,:], int32[:], float32[:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], float32[:], float32[:], float32[:], float32[:], float32[:], int32[:], int32[:], int32[:], int32[:], int32[:], int32[:])


In [7]:
# Call the CPU function once before timing (presumably a warm-up so that
# one-off costs are not measured -- confirm).
numba_fun(*args)
# Benchmark repeated calls with IPython's %timeit magic.
%timeit numba_fun(*args)

11.7 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# Call the CUDA kernel once before timing (presumably a warm-up so that
# one-off costs are not measured -- confirm).
cuda_fun(*args)
# Benchmark repeated calls with IPython's %timeit magic.
%timeit cuda_fun(*args)

CUDADispatcher(<function get_kernel_compute_P_gradient_3d_diamond.<locals>.kernel_compute_P_gradient_3d_diamond at 0x7117787e7dc0>) is called
1 => VarClass<float32, (4573,)>
2 => VarClass<float32, (9874,)>
3 => VarClass<float32, (0,)>
4 => VarClass<float32, (1140,)>
5 => VarClass<int32, (9874, 2)>
6 => VarClass<int32, (9874, 4)>
7 => VarClass<float32, (9874, 4)>
8 => VarClass<int32, (9874,)>
9 => VarClass<float32, (4573, 3)>
10 => VarClass<float32, (2, 2)>
11 => VarClass<int32, (1140,)>
12 => VarClass<float32, (9874,)>
13 => VarClass<float32, (9874, 3)>
14 => VarClass<float32, (9874, 3)>
15 => VarClass<float32, (9874, 3)>
16 => VarClass<float32, (9874, 3)>
17 => VarClass<float32, (9874, 3)>
18 => VarClass<float32, (4573, 3)>
19 => VarClass<float32, (1140,)>
20 => VarClass<float32, (9874,)>
21 => VarClass<float32, (9874,)>
22 => VarClass<float32, (9874,)>
23 => VarClass<float32, (9874,)>
24 => VarClass<int32, (1140,)>
25 => VarClass<int32, (8418,)>
26 => VarClass<int32, (0,)>
27 => VarC



8.64 ms ± 31.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# verify

def verify(a, decimal, plot):
  """Check that ``VarClass.to_device(a)`` holds the same values as ``a``.

  Parameters
  ----------
  a : array-like
      Reference data; it is also the input given to ``VarClass.to_device``.
  decimal : int
      Precision forwarded to ``np.testing.assert_almost_equal``.
  plot : bool
      When truthy, both arrays are drawn on one matplotlib figure (labelled
      "cpu" and "gpu") before the comparison is made.

  Raises
  ------
  AssertionError
      If the two arrays differ at the requested precision.
  """
  b = VarClass.to_device(a)
  # Plain truthiness test instead of `plot == True`, so any truthy flag works.
  if plot:
    plt.plot(a, label="cpu")
    plt.plot(b, label="gpu")
    plt.legend()
    plt.show()
  np.testing.assert_almost_equal(a, b, decimal=decimal)


# Run the check on every kernel argument in turn; the printed position is
# 1-based (i + 1). The first failing argument raises and stops the loop.
for i, arg in enumerate(args):
  print(f"{i + 1} => test arg {arg}")
  verify(arg, decimal=2, plot=False)


1 => test arg [ 0.11446792 -0.53894246 -0.55417454 ... -0.8523767   0.9210803
  0.95768285]
2 => test arg [0. 0. 0. ... 0. 0. 0.]
3 => test arg []
4 => test arg [0. 0. 0. ... 0. 0. 0.]
5 => test arg [[4197   -1]
 [4243   -1]
 [4215   -1]
 ...
 [1361 2321]
 [1342 1361]
 [2321 3600]]
6 => test arg [[ 116   40   39    3]
 [ 117   49   48    3]
 [ 118   76   75    3]
 ...
 [1139  830 1083    3]
 [1139  920 1083    3]
 [1139 1024 1083    3]]
7 => test arg [[-0.02165063  0.02165063  0.45        0.02165063]
 [-0.02165063  0.5473564   0.02309198  0.02165063]
 [-0.02165052  0.9783493   0.5500002   0.02165052]
 ...
 [-1.         -1.         -1.         -1.        ]
 [-1.         -1.         -1.         -1.        ]
 [-1.         -1.         -1.         -1.        ]]
8 => test arg [0 0 0 ... 0 0 0]
9 => test arg [[0.44276604 0.43170595 0.5221834 ]
 [0.7694712  0.24118435 0.90514314]
 [0.7770873  0.31323838 0.8029697 ]
 ...
 [0.92618835 0.5116935  0.9457144 ]
 [0.03945985 0.9010808  0.98169875]
 [