In [1]:
from mpi4py import MPI
from manapy.ast import Variable
from manapy.base.base import Struct
from manapy.ddm import Domain
from manapy.partitions import MeshPartition
from manapy.solvers.advec.tools_utils import initialisation_gaussian_2d
import numpy as np
import time
from numba import cuda

import matplotlib.pyplot as plt

from timeit import default_timer as timer

###############
# test_time
###############

def test_time(iter, fun):
  #fun()
  start_time = timer()
  for _ in range(iter):
    fun()
  end_time = timer()
  elapsed_time = (end_time - start_time) / iter
  print(f"{elapsed_time * 1000:.5f} ms")
  #print(f"{elapsed_time * 1000000:.5f} micros")

###############
# init
###############
def init(dim, mesh_path):
  """Build the domain and the fields used by the rest of the notebook.

  Runs ``MeshPartition`` on ``mesh_path``, constructs a ``Domain``, creates
  five ``Variable`` objects on it, fills their cell values through
  ``initialisation_gaussian_2d`` and finally sets constant face values on the
  three velocity components before calling ``interpolate_facetocell()`` on
  each of them.

  Args:
    dim: Dimension passed to both ``MeshPartition`` and ``Domain``.
    mesh_path: Path of the mesh file handed to ``MeshPartition``.

  Returns:
    The tuple ``(domain, ne, u, v, w, P)``.
  """
  # Each consumer gets its own, identically configured Struct.
  partition_conf = Struct(backend="numba", signature=True, cache=True, float_precision="single")
  MeshPartition(mesh_path, dim=dim, conf=partition_conf, periodic=[0,0,0])

  domain_conf = Struct(backend="numba", signature=True, cache=True, float_precision="single")
  domain = Domain(dim=dim, conf=domain_conf)

  # Creation order is ne, u, v, w, P.
  ne, u, v, w, P = (Variable(domain=domain) for _ in range(5))

  initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, domain.cells.center, 2.0)

  # Constant face values first (all three components), interpolation second.
  for component, face_value in ((u, 2.), (v, 0.), (w, 0.)):
    component.face[:] = face_value
  for component in (u, v, w):
    component.interpolate_facetocell()

  return (domain, ne, u, v, w, P)

In [3]:
# Problem dimension and mesh file handed to init() below.
dim = 2
# NOTE(review): hard-coded absolute, machine-specific path; point it at a
# local copy of the mesh before running this notebook elsewhere.
mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/manapy/mesh/2D/carre.msh"
#mesh_file = "/home/ayoub.hamou/mesh/square.msh"
#mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/gpu_accelerator/functions/square.msh"
# Build the domain and the fields; these names are used by all later cells.
domain, ne, u, v, w, P = init(dim=dim, mesh_path=mesh_file)

Reading gmsh file ...
Saving partition files ...
Number of Cells: 105826
Number of Vertices: 53314
Local domain contruction ...


In [None]:
# Display the dtype of the cell values of ne.
ne.cell.dtype

In [None]:
from numba import cuda

def create_var(v):
  """Pair a host array with a copy of it on the CUDA device.

  Args:
    v: Array handed to ``cuda.to_device``.

  Returns:
    ``(v, device_array)``: the object that was passed in, unchanged, followed
    by the value returned by ``cuda.to_device(v)``.
  """
  device_array = cuda.to_device(v)
  return (v, device_array)

# Build one (host, device) pair per argument of the gradient routines.
# host_* is the very object stored on `ne` / its domain (create_var returns
# its argument unchanged); d_* is what cuda.to_device returned when this cell
# ran, so later changes to a host array are not reflected in its d_* twin.

# Field values.
host_w_c, d_w_c = create_var(ne.cell) #:'float[:]'
host_w_ghost, d_w_ghost = create_var(ne.ghost)
host_w_halo, d_w_halo = create_var(ne.halo)
host_w_haloghost, d_w_haloghost = create_var(ne.haloghost)
# Cell centres and index tables.
host_centerc, d_centerc = create_var(ne.domain.cells.center)
host_cellnid, d_cellnid = create_var(ne.domain.cells.cellnid)
host_ghostnid, d_ghostnid = create_var(ne.domain.cells.ghostnid)
host_haloghostnid, d_haloghostnid = create_var(ne.domain.cells.haloghostnid)
host_halonid, d_halonid = create_var(ne.domain.cells.halonid)
host_nodecid, d_nodecid = create_var(ne.domain.cells.nodeid)
host_periodicn, d_periodicn = create_var(ne.domain.cells.periodicnid)
host_periodic, d_periodic = create_var(ne.domain.nodes.periodicid)
# Remaining geometry read by the gradient routines.
host_centergf, d_centergf = create_var(ne.domain.faces.ghostcenter)
host_halocenterg, d_halocenterg = create_var(ne.domain.cells.haloghostcenter)
host_vertexn, d_vertexn = create_var(ne.domain.nodes.vertex)
host_centerh, d_centerh = create_var(ne.domain.halos.centvol)
host_shift, d_shift = create_var(ne.domain.cells.shift)
# Output buffers: the gradient routines write their results into these.
host_w_x, d_w_x = create_var(ne.gradcellx)
host_w_y, d_w_y = create_var(ne.gradcelly)
host_w_z, d_w_z = create_var(ne.gradcellz)

In [None]:
# Length of the host cell array, i.e. the number of cells the CPU gradient loops over.
len(host_w_c)

In [None]:
# Copy the device x-gradient buffer back to the host and plot it on the same
# axes as the host buffer.
test_d_w_x = d_w_x.copy_to_host()
plt.plot(host_w_x)
plt.plot(test_d_w_x)
plt.show()

In [None]:
#The original function from functions2d.py

from numpy import  int32, float32, float16, uint32
import numpy as np

def cell_gradient_2d(w_c:'float16[:]', w_ghost:'float16[:]', w_halo:'float16[:]', w_haloghost:'float16[:]',
                     centerc:'float16[:,:]', cellnid:'int32[:,:]', ghostnid:'int32[:,:]', haloghostnid:'int32[:,:]', halonid:'int32[:,:]',
                     nodecid:'uint32[:,:]', periodicn:'int32[:,:]', periodic:'int32[:,:]', centergf:'float16[:,:]',
                     halocenterg:'float16[:,:]', vertexn:'float16[:,:]', centerh:'float16[:,:]', shift:'float16[:,:]',
                     w_x:'float16[:]', w_y:'float16[:]', w_z:'float16[:]'):
    """Compute the 2D gradient of a cell field, one cell at a time.

    For each cell ``i`` the routine accumulates, over every neighbour, the
    sums of ``dx*dx``, ``dy*dy``, ``dx*dy``, ``dx*dw`` and ``dy*dw`` (offset of
    the neighbour centre and of the neighbour value from cell ``i``) and then
    solves the resulting 2x2 linear system with Cramer's rule.

    Neighbours are visited in this order: cells listed in ``cellnid``, ghosts
    listed in ``ghostnid``, cells reached through the nodes of ``nodecid``
    whose code ``vertexn[node, 3]`` is 11/22 (``shift[:, 0]`` added to x) or
    33/44 (``shift[:, 1]`` added to y), halos listed in ``halonid`` and halo
    ghosts listed in ``haloghostnid``.  In every index table the last column
    of a row holds the number of valid entries of that row.

    Args:
        w_c, w_ghost, w_halo, w_haloghost: Field values indexed by cell,
            ghost, halo and halo-ghost id respectively.
        centerc, centergf, centerh, halocenterg: Centre coordinates matching
            the four value arrays; columns 0 and 1 are read as x and y.
        cellnid, ghostnid, halonid, haloghostnid: Per-cell neighbour tables.
        nodecid: Per-cell node table.
        periodicn: Not read by this function.
        periodic: Per-node table of cells.
        vertexn: Node table; only column 3 is read.
        shift: Per-cell offsets applied to cells reached through ``periodic``.
        w_x, w_y, w_z: Output arrays written in place; ``w_z`` is set to 0.

    Returns:
        None.
    """
    # float64 staging buffer: centres copied into it are promoted before use.
    scratch = np.zeros(3)

    for i in range(len(w_c)):
        # Quantities of cell i that every neighbour contribution needs.
        x_i = centerc[i, 0]
        y_i = centerc[i, 1]
        w_i = w_c[i]

        s_xx = 0.
        s_yy = 0.
        s_xy = 0.
        s_xw = 0.
        s_yw = 0.

        # Neighbouring cells.
        for j in range(cellnid[i, -1]):
            nb = cellnid[i, j]
            dx = centerc[nb, 0] - x_i
            dy = centerc[nb, 1] - y_i
            dw = w_c[nb] - w_i
            s_xx += dx * dx
            s_yy += dy * dy
            s_xy += dx * dy
            s_xw += dx * dw
            s_yw += dy * dw

        # Neighbouring ghosts.
        for j in range(ghostnid[i, -1]):
            nb = ghostnid[i, j]
            dx = centergf[nb, 0] - x_i
            dy = centergf[nb, 1] - y_i
            dw = w_ghost[nb] - w_i
            s_xx += dx * dx
            s_yy += dy * dy
            s_xy += dx * dy
            s_xw += dx * dw
            s_yw += dy * dw

        # Cells reached through flagged nodes; their centre is shifted first.
        for k in range(nodecid[i, -1]):
            nod = nodecid[i, k]
            code = vertexn[nod, 3]
            along_x = code == 11 or code == 22
            along_y = code == 33 or code == 44
            if along_x or along_y:
                for j in range(periodic[nod, -1]):
                    nb = np.int32(periodic[nod, j])
                    scratch[:] = centerc[nb, 0:3]
                    if along_x:
                        dx = scratch[0] + shift[nb, 0] - x_i
                        dy = scratch[1] - y_i
                    else:
                        dx = scratch[0] - x_i
                        dy = scratch[1] + shift[nb, 1] - y_i
                    dw = w_c[nb] - w_i
                    s_xx += dx * dx
                    s_yy += dy * dy
                    s_xy += dx * dy
                    s_xw += dx * dw
                    s_yw += dy * dw

        # Neighbouring halos.
        for j in range(halonid[i, -1]):
            nb = halonid[i, j]
            dx = centerh[nb, 0] - x_i
            dy = centerh[nb, 1] - y_i
            dw = w_halo[nb] - w_i
            s_xx += dx * dx
            s_yy += dy * dy
            s_xy += dx * dy
            s_xw += dx * dw
            s_yw += dy * dw

        # Neighbouring halo ghosts.
        for j in range(haloghostnid[i, -1]):
            nb = haloghostnid[i, j]
            scratch[:] = halocenterg[nb]
            dx = scratch[0] - x_i
            dy = scratch[1] - y_i
            dw = w_haloghost[nb] - w_i
            s_xx += dx * dx
            s_yy += dy * dy
            s_xy += dx * dy
            s_xw += dx * dw
            s_yw += dy * dw

        # Cramer's rule on the accumulated 2x2 system.
        det = s_xx * s_yy - s_xy * s_xy
        w_x[i] = (s_yy * s_xw - s_xy * s_yw) / det
        w_y[i] = (s_xx * s_yw - s_xy * s_xw) / det
        w_z[i] = 0.



In [None]:
# Compile the CPU version with numba.jit (see backend.py).
# Using parallel=True yields poor results, so it is left disabled.
import numba

# No signature is given here, so the string annotations of cell_gradient_2d
# are not used for typing; numba specialises on the arguments of the first call.
numba_cell_gradient_2d = numba.jit(cell_gradient_2d, nopython=True, fastmath=True, parallel=False, cache=True)
def cpu_numba_cell_gradient_2d():
  """Run the numba-compiled CPU gradient on the module-level host_* arrays.

  Takes no arguments and returns nothing; the results are written into
  ``host_w_x``, ``host_w_y`` and ``host_w_z``.
  """
  # Same positional order as the parameters of cell_gradient_2d.
  host_args = (
    host_w_c, host_w_ghost, host_w_halo, host_w_haloghost,
    host_centerc,
    host_cellnid, host_ghostnid, host_haloghostnid, host_halonid,
    host_nodecid, host_periodicn, host_periodic,
    host_centergf, host_halocenterg, host_vertexn, host_centerh, host_shift,
    host_w_x, host_w_y, host_w_z,
  )
  numba_cell_gradient_2d(*host_args)


In [None]:
# Time taken by cpu_numba_cell_gradient_2d.

# Re-run the initialisation with the same arguments init() used, so the host
# cell values are in their initial state before timing.
initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, domain.cells.center, 2.0)

# IPython magic: repeats the call and reports timing statistics.
%timeit cpu_numba_cell_gradient_2d()

In [None]:
def kernel_cell_gradient_2d(w_c:'float[:]', w_ghost:'float[:]', w_halo:'float[:]', w_haloghost:'float[:]',
                     centerc:'float[:,:]', cellnid:'int32[:,:]', ghostnid:'int32[:,:]', haloghostnid:'int32[:,:]', halonid:'int32[:,:]',
                     nodecid:'uint32[:,:]', periodicn:'int32[:,:]', periodic:'int32[:,:]', centergf:'float[:,:]',
                     halocenterg:'float[:,:]', vertexn:'float[:,:]', centerh:'float[:,:]', shift:'float[:,:]',
                     w_x:'float[:]', w_y:'float[:]', w_z:'float[:]', lim:'int32'):
  """CUDA kernel computing the 2D gradient of a cell field.

  Each thread starts at its global index ``cuda.grid(1)`` and advances by
  ``cuda.gridsize(1)`` until ``lim`` is reached, so any launch configuration
  covers cells ``0 .. lim-1``.  For every cell it accumulates the sums of
  ``dx*dx``, ``dy*dy``, ``dx*dy``, ``dx*dw`` and ``dy*dw`` over all its
  neighbours and solves the resulting 2x2 system with Cramer's rule.

  Neighbours are visited in this order: cells (``cellnid``), ghosts
  (``ghostnid``), cells reached through the nodes of ``nodecid`` whose code
  ``vertexn[node, 3]`` is 11/22 (``shift[:, 0]`` added to x) or 33/44
  (``shift[:, 1]`` added to y), halos (``halonid``) and halo ghosts
  (``haloghostnid``).  The last column of each index-table row holds the
  number of valid entries of that row.

  Args:
    w_c, w_ghost, w_halo, w_haloghost: Field values per cell / ghost / halo /
      halo ghost.
    centerc, centergf, centerh, halocenterg: Matching centre coordinates;
      columns 0 and 1 are read as x and y.
    cellnid, ghostnid, halonid, haloghostnid: Per-cell neighbour tables.
    nodecid: Per-cell node table.
    periodicn: Not read by this kernel.
    periodic: Per-node table of cells.
    vertexn: Node table; only column 3 is read.
    shift: Per-cell offsets applied to cells reached through ``periodic``.
    w_x, w_y, w_z: Output arrays written in place; ``w_z`` is set to 0.
    lim: Exclusive upper bound of the cell indices to process.
  """
  first = cuda.grid(1)
  step = cuda.gridsize(1)

  for i in range(first, lim, step):
    # Quantities of cell i that every neighbour contribution needs.
    x_i = centerc[i, 0]
    y_i = centerc[i, 1]
    w_i = w_c[i]

    s_xx = 0.
    s_yy = 0.
    s_xy = 0.
    s_xw = 0.
    s_yw = 0.

    # Neighbouring cells.
    for j in range(cellnid[i, -1]):
      nb = cellnid[i, j]
      dx = centerc[nb, 0] - x_i
      dy = centerc[nb, 1] - y_i
      dw = w_c[nb] - w_i
      s_xx += dx * dx
      s_yy += dy * dy
      s_xy += dx * dy
      s_xw += dx * dw
      s_yw += dy * dw

    # Neighbouring ghosts.
    for j in range(ghostnid[i, -1]):
      nb = ghostnid[i, j]
      dx = centergf[nb, 0] - x_i
      dy = centergf[nb, 1] - y_i
      dw = w_ghost[nb] - w_i
      s_xx += dx * dx
      s_yy += dy * dy
      s_xy += dx * dy
      s_xw += dx * dw
      s_yw += dy * dw

    # Cells reached through flagged nodes; their centre is shifted first.
    for k in range(nodecid[i, -1]):
      nod = nodecid[i, k]
      code = vertexn[nod, 3]
      along_x = code == 11 or code == 22
      along_y = code == 33 or code == 44
      if along_x or along_y:
        for j in range(periodic[nod, -1]):
          nb = np.int32(periodic[nod, j])
          if along_x:
            dx = centerc[nb, 0] + shift[nb, 0] - x_i
            dy = centerc[nb, 1] - y_i
          else:
            dx = centerc[nb, 0] - x_i
            dy = centerc[nb, 1] + shift[nb, 1] - y_i
          dw = w_c[nb] - w_i
          s_xx += dx * dx
          s_yy += dy * dy
          s_xy += dx * dy
          s_xw += dx * dw
          s_yw += dy * dw

    # Neighbouring halos.
    for j in range(halonid[i, -1]):
      nb = halonid[i, j]
      dx = centerh[nb, 0] - x_i
      dy = centerh[nb, 1] - y_i
      dw = w_halo[nb] - w_i
      s_xx += dx * dx
      s_yy += dy * dy
      s_xy += dx * dy
      s_xw += dx * dw
      s_yw += dy * dw

    # Neighbouring halo ghosts.
    for j in range(haloghostnid[i, -1]):
      nb = haloghostnid[i, j]
      dx = halocenterg[nb, 0] - x_i
      dy = halocenterg[nb, 1] - y_i
      dw = w_haloghost[nb] - w_i
      s_xx += dx * dx
      s_yy += dy * dy
      s_xy += dx * dy
      s_xw += dx * dw
      s_yw += dy * dw

    # Cramer's rule on the accumulated 2x2 system.
    det = s_xx * s_yy - s_xy * s_xy
    w_x[i] = (s_yy * s_xw - s_xy * s_yw) / det
    w_y[i] = (s_xx * s_yw - s_xy * s_xw) / det
    w_z[i] = 0.

import inspect
import hashlib
import imp
import re
from numba import cuda

def get_arg_types(func, float_precision, int_precision):
    """Build type strings from the string annotations of ``func``.

    Every parameter annotation is rewritten as follows:

    * a bare ``float`` token becomes ``float32``; annotations that already
      carry a width (``float32``, ``float64``, ...) are left untouched,
    * ``uint32`` becomes ``int_precision``.

    Args:
        func: Function whose parameters are all annotated with strings.
        float_precision: Accepted for interface compatibility but not used;
            the float width is fixed to ``float32``.
        int_precision: Replacement text for ``uint32``.

    Returns:
        Tuple of the rewritten annotation strings, in parameter order.
    """
    # Look the signature up once instead of once per parameter.
    parameters = inspect.signature(func).parameters

    arg_types = []
    for parameter in parameters.values():
        arg_type = parameter.annotation
        # Word-boundary match: a plain str.replace would turn an already sized
        # annotation such as "float32[:]" into "float3232[:]".
        arg_type = re.sub(r"\bfloat\b", "float32", arg_type)
        arg_type = arg_type.replace("uint32", int_precision)
        arg_types.append(arg_type)
    return tuple(arg_types)

  
# Turn the kernel's string annotations into one signature string of the form
# "(type, type, ...)".
signature = get_arg_types(kernel_cell_gradient_2d, domain.float_precision, domain.int_precision)
signature = '(' + ', '.join(signature) + ')'
# Already a str at this point; the conversion is a no-op.
signature = str(signature)
print(signature)
# Compile the kernel for that explicit signature.
func_gpu = cuda.jit(signature, fastmath=True, device=False,cache=True)(kernel_cell_gradient_2d)



#domain.backend.compile()
#cls._define_halosend = backend.compile(kernel_cell_gradient_2d, signature=True)


def cuda_numba_cell_gradient_2d():
  """Launch the CUDA gradient kernel on the module-level d_* arrays and wait.

  The launch uses 32 threads per block and ``len(d_w_c) // 32 + 1`` blocks.
  ``cuda.synchronize()`` is called before returning, so a caller timing this
  function measures the kernel execution and not only the launch.
  """
  threads_per_block = 32
  n_cells = len(d_w_c)
  blocks_per_grid = n_cells // threads_per_block + 1

  # Same positional order as the parameters of kernel_cell_gradient_2d;
  # the cell count is appended as its final `lim` argument.
  device_args = (
    d_w_c, d_w_ghost, d_w_halo, d_w_haloghost,
    d_centerc,
    d_cellnid, d_ghostnid, d_haloghostnid, d_halonid,
    d_nodecid, d_periodicn, d_periodic,
    d_centergf, d_halocenterg, d_vertexn, d_centerh, d_shift,
    d_w_x, d_w_y, d_w_z,
  )
  func_gpu[blocks_per_grid, threads_per_block](*device_args, n_cells)
  cuda.synchronize()

# Print the mean time of one GPU launch, averaged over 700 calls.
test_time(700, cuda_numba_cell_gradient_2d)

In [None]:
# Length of the device cell array; this is the `lim` passed to the kernel.
len(d_w_c)

In [None]:
def verify(a, b):
  """Plot a host array against a device array and assert they agree.

  Args:
    a: Host array holding the reference values.
    b: Device array; copied back with ``copy_to_host()`` before comparing.

  Raises:
    AssertionError: If the two arrays are not equal to 2 decimals according
      to ``np.testing.assert_almost_equal``.
  """
  b = b.copy_to_host()

  plt.plot(a, label="a (host)")
  plt.plot(b, label="b (copied from device)")
  plt.legend()
  # Show the figure before asserting: when the assertion came first, a failing
  # comparison raised ahead of plt.show() and could hide the plot exactly when
  # it is most useful.
  plt.show()

  np.testing.assert_almost_equal(a, b, decimal=2)

# Compare the x-gradient written by the CPU run (host_w_x) with the one
# written by the GPU kernel (d_w_x); both runs must have been executed first.
verify(host_w_x, d_w_x)