In [1]:
from mpi4py import MPI
from manapy.ast import Variable
from manapy.base.base import Struct
from manapy.ddm import Domain
from manapy.partitions import MeshPartition
from manapy.solvers.advec.tools_utils import initialisation_gaussian_2d
import numpy as np
import time
from numba import cuda

import matplotlib.pyplot as plt

from timeit import default_timer as timer
from manapy.cuda.utils import (VarClass, GPU_Backend)
###############
# test_time
###############

def test_time(iter, fun):
  #fun()
  start_time = timer()
  for _ in range(iter):
    fun()
  end_time = timer()
  elapsed_time = (end_time - start_time) / iter
  print(f"{elapsed_time * 1000:.5f} ms")
  #print(f"{elapsed_time * 1000000:.5f} micros")

###############
# init
###############
def init(dim, mesh_path):
  """Partition the mesh, build the domain and create the initial fields.

  Sets ``GPU_Backend.float_precision`` to ``'float32'`` (a class-level
  setting, so it stays in effect after this call), partitions ``mesh_path``
  with non-periodic flags ``[0, 0, 0]``, builds a ``Domain`` and creates five
  ``Variable`` objects on it. Face values of ``u`` are set to 2 and those of
  ``v`` and ``w`` to 0, after which ``interpolate_facetocell()`` is called on
  each of the three.

  NOTE(review): the initial condition comes from a helper named
  ``initialisation_gaussian_2d`` whatever ``dim`` is -- confirm that this is
  intended when a 3D mesh is used.

  Args:
    dim: Dimension forwarded to ``MeshPartition`` and ``Domain``.
    mesh_path: Path of the mesh file forwarded to ``MeshPartition``.

  Returns:
    The tuple ``(domain, ne, u, v, w, P, running_conf)``.
  """
  GPU_Backend.float_precision = 'float32'
  conf = Struct(backend="numba", signature=True, cache=True, float_precision="single")
  MeshPartition(mesh_path, dim=dim, conf=conf, periodic=[0,0,0])

  domain = Domain(dim=dim, conf=conf)
  # The generator is consumed left to right: ne, u, v, w, then P.
  ne, u, v, w, P = (Variable(domain=domain) for _ in range(5))

  # Last positional argument is the initial value handed to the helper.
  initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, domain.cells.center, 2.0)

  velocity = (u, v, w)
  # Fill all face values first, then interpolate each component.
  for component, face_value in zip(velocity, (2., 0., 0.)):
    component.face[:] = face_value
  for component in velocity:
    component.interpolate_facetocell()

  return (domain, ne, u, v, w, P, conf)

In [2]:
# Build the domain and fields for a 3D mesh, then create the advection solver.
from manapy.solvers.advec import AdvectionSolver

dim = 3
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this file exists. The commented-out lines below are alternative meshes.
mesh_file = "/home/aben-ham/Desktop/work/stage/manapy/mesh/3D/cube.msh"
#mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/manapy/mesh/2D/carre.msh"
#mesh_file = "/home/ayoub.hamou/mesh/square.msh"
#mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/gpu_accelerator/functions/square.msh"
domain, ne, u, v, w, P, conf = init(dim=dim, mesh_path=mesh_file)

# NOTE(review): only (u, v) are passed as velocity although dim = 3 and `w` was
# returned by init(); later cells read `S.w`, so the solver presumably provides
# its own w component -- confirm.
S = AdvectionSolver(ne, vel=(u, v), conf=conf)

Reading gmsh file ...
Saving partition files ...
Number of Cells: 4573
Number of Vertices: 1140
Local domain contruction ...


In [3]:
# Objects handed to VarClass.convert_to_var_class.
# NOTE: the order of the entries in this list is important.
# NOTE(review): the conversion presumably rewrites attributes of each listed
# object in place (the cell output logs "set att: ..." lines) -- confirm
# against manapy.cuda.utils.

VarClass.convert_to_var_class([
    domain.nodes,
    domain.faces,
    domain.cells,
    domain.halos,
    domain,
    ne,
    S,
    u,
    v,
    S.w,
    ne.domain._cells,

])

# Positional argument list for the convective-scheme kernels: later cells call
# `numba_fun(*args)` and `cuda_fun(*args)`, so the order of the 29 entries must
# match the kernel signature. Index 19 (S.domain.faces.cellid) is also read
# directly by a later cell.
args = [
  S.var.convective,
  S.var.cell,
  S.var.ghost,
  S.var.halo,
  S.u.face,
  S.v.face,
  S.w.face,
  S.var.gradcellx,
  S.var.gradcelly,
  S.var.gradcellz,
  S.var.gradhalocellx,
  S.var.gradhalocelly,
  S.var.gradhalocellz,
  S.var.psi,
  S.var.psihalo,
  S.domain.cells.center,
  S.domain.faces.center,
  S.domain.halos.centvol,
  S.domain.faces.ghostcenter,
  S.domain.faces.cellid,
  S.domain.faces.normal,
  S.domain.faces.halofid,
  S.domain.faces.name,
  S.domain.innerfaces,
  S.domain.halofaces,
  S.domain.boundaryfaces,
  S.domain.periodicboundaryfaces,
  S.domain.cells.shift,
  S.order,
]

# Print the Python type of every kernel argument, one per line, as a quick
# check of what is about to be passed to the kernels.
for arg in args:
  print(type(arg))

---------------------------------
---------------------------------
can't get attr _nbnodes => _nbnodes
can't get attr nbnodes => _nbnodes
can't set attr for R_x => can't set attribute
can't set attr for R_y => can't set attribute
can't set attr for R_z => can't set attribute
set att: _R_x
set att: _R_y
set att: _R_z
set att: _cellid
set att: _ghostcenter
set att: _ghostfaceinfo
set att: _ghostid
set att: _haloghostcenter
set att: _haloghostfaceinfo
set att: _haloghostid
set att: _halonid
set att: _lambda_x
set att: _lambda_y
set att: _lambda_z
set att: _loctoglob
set att: _name
set att: _number
set att: _oldname
set att: _periodicid
set att: _vertex
can't set attr for cellid => can't set attribute
can't set attr for ghostcenter => can't set attribute
can't set attr for ghostfaceinfo => can't set attribute
can't set attr for ghostid => can't set attribute
can't set attr for haloghostcenter => can't set attribute
can't set attr for haloghostfaceinfo => can't set attribute
can't set attr

In [4]:
def has_unique_values(arr):
  """Report whether every element of ``arr`` occurs exactly once.

  Prints ``<number of distinct values> <total number of elements>`` and
  returns whether the two counts are equal.

  ``np.unique`` flattens its input, so the total is taken with ``np.size``
  (all elements) rather than ``len`` (first axis only); with ``len`` an
  array with more than one dimension is compared against its row count and
  the answer is wrong. One-dimensional input gives the same result as with
  ``len``.

  Args:
    arr: Array-like of any shape.

  Returns:
    bool: ``True`` if no value is repeated (also for empty input),
    ``False`` otherwise.
  """
  unique_len = len(np.unique(arr))
  origin_len = int(np.size(arr))
  print(unique_len, origin_len)
  return unique_len == origin_len

# args[19] is S.domain.faces.cellid (see the `args` list); take its first
# column and check whether any value in it is repeated.
# NOTE(review): presumably column 0 holds one cell index per face -- confirm.
test_arr = args[19][:, 0]
has_unique_values(test_arr)

4308 9874


False

In [5]:
# CPU implementation and the factory for its CUDA counterpart, aliased so the
# two lines below read symmetrically.
from manapy.solvers.advec.fvm_utils import explicitscheme_convective_3d as cpu_function
from manapy.cuda.manapy.solvers.advec.cuda_fvm_utils import get_kernel_explicitscheme_convective_3d as gpu_function

# Compile the CPU function through the domain's backend; with echo=True the
# compiled signature is printed (see the cell output).
numba_fun = domain.backend.compile(cpu_function, echo=True, signature=True)
# Calling the factory yields the callable used as the GPU kernel below.
cuda_fun = gpu_function()

compile explicitscheme_convective_3d to cpu => signature=(float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], int32[:,:], float32[:,:], int32[:], int32[:], int32[:], int32[:], int32[:], int32[:], float32[:,:], int32)
compile device_compute_upwind_flux to cuda => signature=void(float32, float32, float32, float32, float32, float32[:], float32[:])
compile kernel_explicitscheme_convective_3d to cuda => signature=void(float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:], float32[:,:], float32[:,:], float32[:,:], float32[:,:], int32[:,:], float32[:,:], int32[:], int32[:], int32[:], int32[:], int32[:], int32[:], float32[:,:], int32)
compile kernel_assign to cuda => signature=void(float32[

In [6]:
# Call once before timing -- presumably so that one-off costs of the first
# call stay out of the measurement -- then time the CPU kernel.
numba_fun(*args)
%timeit numba_fun(*args)

464 µs ± 32.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
# Call once before timing -- presumably so that one-off costs of the first
# call stay out of the measurement -- then time the GPU kernel.
# NOTE(review): CUDA kernel launches are normally asynchronous; unless the
# `cuda_fun` wrapper synchronizes internally, this may time only the launch
# and not the kernel itself. Confirm, or add `cuda.synchronize()` to the
# timed statement, before comparing with the CPU figure.
cuda_fun(*args)
%timeit cuda_fun(*args)

CUDADispatcher(<function get_kernel_explicitscheme_convective_3d.<locals>.kernel_explicitscheme_convective_3d at 0x771ca5027ca0>) is called
VarClass<float32, (4573,)>
VarClass<float32, (4573,)>
VarClass<float32, (9874,)>
VarClass<float32, (0,)>
VarClass<float32, (9874,)>
VarClass<float32, (9874,)>
VarClass<float32, (9874,)>
VarClass<float32, (4573,)>
VarClass<float32, (4573,)>
VarClass<float32, (4573,)>
VarClass<float32, (0,)>
VarClass<float32, (0,)>
VarClass<float32, (0,)>
VarClass<float32, (4573,)>
VarClass<float32, (0,)>
VarClass<float32, (4573, 3)>
VarClass<float32, (9874, 3)>
VarClass<float32, (2, 2)>
VarClass<float32, (9874, 4)>
VarClass<int32, (9874, 2)>
VarClass<float32, (9874, 3)>
VarClass<int32, (9874,)>
VarClass<int32, (9874,)>
VarClass<int32, (8418,)>
VarClass<int32, (0,)>
VarClass<int32, (1456,)>
VarClass<int32, (0,)>
VarClass<float32, (4573, 3)>
<class 'numpy.int32'>




480 µs ± 7.75 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
# verify

def plot(a):
  """Plot ``a`` next to its device round-trip copy on one pyplot figure.

  ``a`` is sent through ``VarClass.to_device`` and read back with
  ``copy_to_host()``; the original is drawn with the label "cpu" and the
  round-trip copy with the label "gpu", followed by a legend and
  ``plt.show()``.

  Args:
    a: Array to plot; it must be accepted by ``VarClass.to_device``.
  """
  round_trip = VarClass.to_device(a).copy_to_host()
  # Draw order is kept: host data first, device copy second.
  for series, tag in ((a, "cpu"), (round_trip, "gpu")):
    plt.plot(series, label=tag)
  plt.legend()
  plt.show()

def verify(a, decimal):
  """Assert that ``a`` and ``VarClass.to_device(a)`` agree to ``decimal`` places.

  NOTE(review): unlike ``plot``, the device object is compared without an
  explicit ``copy_to_host()`` -- confirm that ``np.testing`` can read it.

  Args:
    a: Value to check; it must be accepted by ``VarClass.to_device``.
    decimal: Precision forwarded to ``np.testing.assert_almost_equal``.

  Raises:
    AssertionError: If the two values differ beyond the requested precision.
  """
  np.testing.assert_almost_equal(a, VarClass.to_device(a), decimal=decimal)


# Check every kernel argument against its VarClass.to_device counterpart to
# 2 decimal places. The first mismatch raises AssertionError and stops the
# loop. The label is 1-based, and the f-string prints the argument's own
# (NumPy-abbreviated) text representation.
for i, arg in enumerate(args):
  print(f"{i + 1} => test arg {arg}")
  verify(arg, decimal=2)

1 => test arg [-4.6566129e-10 -8.1490725e-10  9.3132257e-10 ...  9.3132257e-10
  0.0000000e+00 -5.9263143e-03]
2 => test arg [1. 1. 1. ... 1. 1. 1.]
3 => test arg [0. 0. 0. ... 0. 0. 0.]
4 => test arg []
5 => test arg [2. 2. 2. ... 2. 2. 2.]
6 => test arg [0. 0. 0. ... 0. 0. 0.]
7 => test arg [0. 0. 0. ... 0. 0. 0.]
8 => test arg [0. 0. 0. ... 0. 0. 0.]
9 => test arg [0. 0. 0. ... 0. 0. 0.]
10 => test arg [0. 0. 0. ... 0. 0. 0.]
11 => test arg []
12 => test arg []
13 => test arg []
14 => test arg [0. 0. 0. ... 0. 0. 0.]
15 => test arg []
16 => test arg [[0.44276604 0.43170595 0.5221834 ]
 [0.7694712  0.24118435 0.90514314]
 [0.7770873  0.31323838 0.8029697 ]
 ...
 [0.92618835 0.5116935  0.9457144 ]
 [0.03945985 0.9010808  0.98169875]
 [0.02115859 0.8817337  0.96096134]]
17 => test arg [[0.         0.02886751 0.45000002]
 [0.         0.54647523 0.03078931]
 [0.         0.97113246 0.55      ]
 ...
 [0.72506887 0.71881676 0.87177306]
 [0.68698937 0.72433424 0.8917232 ]
 [0.77217513 0.7366