In [2]:
from mpi4py import MPI
from manapy.ast import Variable
from manapy.base.base import Struct
from manapy.ddm import Domain
from manapy.partitions import MeshPartition
from manapy.solvers.advec.tools_utils import initialisation_gaussian_2d
import numpy as np
import time
from numba import cuda

import matplotlib.pyplot as plt

from timeit import default_timer as timer

###############
# test_time
###############

def test_time(iter, fun):
  #fun()
  start_time = timer()
  for _ in range(iter):
    fun()
  end_time = timer()
  elapsed_time = (end_time - start_time) / iter
  print(f"{elapsed_time * 1000:.5f} ms")
  #print(f"{elapsed_time * 1000000:.5f} micros")

###############
# init
###############
def init(dim, mesh_path):
  running_conf = Struct(backend="numba", signature=True, cache=True, float_precision="single")
  MeshPartition(mesh_path, dim=dim, conf=running_conf, periodic=[0,0,0])

  running_conf = Struct(backend="numba", signature=True, cache =True, float_precision="single")
  domain = Domain(dim=dim, conf=running_conf)
  ne = Variable(domain=domain)
  u  = Variable(domain=domain)
  v  = Variable(domain=domain)
  w  = Variable(domain=domain)
  
  P = Variable(domain=domain)
  Pinit = 2.0
  cells = domain.cells
  initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, cells.center, Pinit)

  u.face[:] = 2.
  v.face[:] = 0.
  w.face[:] = 0.
  
  u.interpolate_facetocell()
  v.interpolate_facetocell()
  w.interpolate_facetocell()
  return (domain, ne, u, v, w, P)

In [3]:
dim = 2
mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/manapy/mesh/2D/carre.msh"
#mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/manapy/mesh/2D/carre_hybrid.msh"
#mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/gpu_accelerator/functions/square_larger.msh"
#mesh_file = "/home/aben-ham/Desktop/work/stage/my_manapy/gpu_accelerator/functions/square.msh"
#mesh_file = "/home/ayoub.hamou/mesh/square.msh"
#mesh_file = "/home/ayoub.hamou/mesh/square_larger.msh"
domain, ne, u, v, w, P = init(dim=dim, mesh_path=mesh_file)

Reading gmsh file ...
Saving partition files ...
Number of Cells: 105826
Number of Vertices: 53314
dfgdf
Local domain contruction ...


In [4]:
from manapy.solvers.ls import PETScKrylovSolver

# put this code to PETScKrylovSolver init function so bypass the import of petsc4py
"""
    # try_imports(['import petsc4py',],
    #             'cannot import petsc4py solver!')
    
    # from petsc4py import PETSc as petsc
    
    #self.petsc = petsc
    self.ksp   = None
    
    self.converged_reasons = {}                                                                                                                                                                             
    # for key, val in six.iteritems(petsc.KSP.ConvergedReason.__dict__):                                                                                                                                 
    #     if isinstance(val, int):                                                                                                                                                                       
    #         self.converged_reasons[val] = key    
"""

conf = Struct(reuse_mtx=True, scheme='diamond', verbose=False, precision="single")
L = PETScKrylovSolver(domain=domain, var=P, conf=conf)

SetUp the Linear system ...


In [5]:
from numba import cuda

def create_var(v):
  return (v, cuda.to_device(v))

host_nodeidf, d_nodeidf = create_var(L.domain.faces.nodeid)
host_halofid, d_halofid = create_var(L.domain.faces.halofid)
host_cellnid, d_cellnid = create_var(L.domain.nodes.cellid)
host_halonid, d_halonid = create_var(L.domain.nodes.halonid)
host_centergn, d_centergn = create_var(L.domain.nodes.periodicid)
host_halocentergn, d_halocentergn = create_var(L.domain.nodes.ghostcenter)
host_haloghostcenter, d_haloghostcenter = create_var(L.domain.nodes.haloghostcenter)
host_oldnamen, d_oldnamen = create_var(L.domain.nodes.oldname)
host_BCdirichlet, d_BCdirichlet = create_var(L.var.BCdirichlet)
host_matrixinnerfaces, d_matrixinnerfaces = create_var(L.matrixinnerfaces)
host_halofaces, d_halofaces = create_var(L.domain.halofaces)
host_dirichletfaces, d_dirichletfaces = create_var(L.var.dirichletfaces)


In [6]:
#The original function from functions2d.py

from numpy import  int32, float32, uint32
import numpy as np


def compute_2dmatrix_size(nodeidf:'int32[:,:]', halofid:'int32[:]', cellnid:'int32[:,:]',  halonid:'int32[:,:]', periodicnid:'int32[:,:]', 
                        centergn:'float[:,:,:]', halocentergn:'float[:,:,:]', oldnamen:'uint32[:]', BCdirichlet:'uint32[:]', 
                        matrixinnerfaces:'uint32[:]', halofaces:'uint32[:]', 
                        dirichletfaces:'uint32[:]'):                                                                                                                                                                       
    
    def search_element(a:'int32[:]', target_value:'int32'):
        find = 0
        for val in a:
            if val == target_value:
                find = 1
                break
        return find
    cmpt = 0
    for i in matrixinnerfaces:
        cmpt = cmpt + 1
        
        for nod in nodeidf[i][:nodeidf[i][-1]]:
            if search_element(BCdirichlet, oldnamen[nod]) == 0:# and search_element(BCneumannNH, oldnamen[nod]) == 0:
            # if vertexn[nod][3] not in BCdirichlet:
                for j in range(cellnid[nod][-1]):
                    
                    cmpt = cmpt + 1
                    #right cell-----------------------------------                                                                                              
                    cmpt = cmpt + 1
                
                for j in range(len(centergn[nod])):
                    if centergn[nod][j][-1] != -1:
                       
                        cmpt = cmpt + 1
                        #right cell-----------------------------------                                                                                              
                        cmpt = cmpt + 1
                        
                for j in range(len(halocentergn[nod])):
                    if halocentergn[nod][j][-1] != -1:
                        
                        cmpt = cmpt + 1
                        #right cell-----------------------------------                                                                                              
                        cmpt = cmpt + 1
                    
                for j in range(periodicnid[nod][-1]):
                    cmpt = cmpt + 1
                    #right cell-----------------------------------                                                                                              
                    cmpt = cmpt + 1
                
                for j in range(halonid[nod][-1]):
                   
                    cmpt = cmpt + 1
                    #right cell-----------------------------------                                                                                              
                    cmpt = cmpt + 1
        
        cmpt = cmpt + 1
        # right cell------------------------------------------------------
        cmpt = cmpt + 1
        cmpt = cmpt + 1
            
    # elif namef[i] == 10:
    for i in halofaces:
        cmpt = cmpt + 1
        
        cmpt = cmpt + 1
        cmpt = cmpt + 1
        
        for nod in nodeidf[i][:nodeidf[i][-1]]:
            if search_element(BCdirichlet, oldnamen[nod]) == 0:  
                for j in range(cellnid[nod][-1]):
                    cmpt = cmpt + 1
                    
                for j in range(len(centergn[nod])):
                    if centergn[nod][j][-1] != -1:
                        cmpt = cmpt + 1
                        
                for j in range(len(halocentergn[nod])):
                    if halocentergn[nod][j][-1] != -1:
                        cmpt = cmpt + 1

                for j in range(halonid[nod][-1]):
                    cmpt = cmpt + 1
                
    for i in dirichletfaces:
        cmpt = cmpt + 1
        cmpt = cmpt + 1
        
    return cmpt



In [7]:
#? using numba jit -> backend.py
#! Using parallel=True yields poor results.
import numba

numba_compute_2dmatrix_size = numba.jit(compute_2dmatrix_size, nopython=True, fastmath=True, parallel=False, cache=True)
def cpu_compute_2dmatrix_size():
  return numba_compute_2dmatrix_size(
    host_nodeidf,
    host_halofid,
    host_cellnid,
    host_halonid,
    host_centergn,
    host_halocentergn,
    host_haloghostcenter,
    host_oldnamen,
    host_BCdirichlet,
    host_matrixinnerfaces,
    host_halofaces,
    host_dirichletfaces,
  )


In [66]:
#time taken by cpu_compute_2dmatrix_size

initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, domain.cells.center, 2.0)

%timeit cpu_compute_2dmatrix_size()

10.9 ms ± 83.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
@cuda.jit(device=True)
def search_element(a:'int32[:]', target_value:'int32'):
    for i in range(a.shape[0]):
        if a[i] == target_value:
            return True
    return False

@cuda.jit()
def kernel_host_matrixinnerfaces(nodeidf:'int32[:,:]', halofid:'int32[:]', cellnid:'int32[:,:]',  halonid:'int32[:,:]', periodicnid:'int32[:,:]', 
                        centergn:'float[:,:,:]', halocentergn:'float[:,:,:]', oldnamen:'uint32[:]', BCdirichlet:'uint32[:]', 
                        matrixinnerfaces:'uint32[:]', halofaces:'uint32[:]', 
                        dirichletfaces:'uint32[:]', cmpt):                                                                                                                                                                       
    
    idx = cuda.grid(1)

    if idx < matrixinnerfaces.shape[0]:
        i = matrixinnerfaces[idx]
        
        cuda.atomic.add(cmpt, 0, 1)
        
        for nod in nodeidf[i][:nodeidf[i][-1]]:
            if search_element(BCdirichlet, oldnamen[nod]) == False:# and search_element(BCneumannNH, oldnamen[nod]) == False:
            # if vertexn[nod][3] not in BCdirichlet:
                for j in range(cellnid[nod][-1]):
                    
                    #cuda.atomic.add(cmpt, 0, 1)
                    #right cell-----------------------------------                                                                                              
                    cuda.atomic.add(cmpt, 0, 2)
                
                for j in range(len(centergn[nod])):
                    if centergn[nod][j][-1] != -1:
                       
                        #cuda.atomic.add(cmpt, 0, 1)
                        #right cell-----------------------------------                                                                                              
                        cuda.atomic.add(cmpt, 0, 2)
                        
                for j in range(len(halocentergn[nod])):
                    if halocentergn[nod][j][-1] != -1:
                        
                        #cuda.atomic.add(cmpt, 0, 1)
                        #right cell-----------------------------------                                                                                              
                        cuda.atomic.add(cmpt, 0, 2)
                    
                for j in range(periodicnid[nod][-1]):
                    #cuda.atomic.add(cmpt, 0, 1)
                    #right cell-----------------------------------                                                                                              
                    cuda.atomic.add(cmpt, 0, 2)
                
                for j in range(halonid[nod][-1]):
                   
                    #cuda.atomic.add(cmpt, 0, 1)
                    #right cell-----------------------------------                                                                                              
                    cuda.atomic.add(cmpt, 0, 2)
        
        #cuda.atomic.add(cmpt, 0, 1)
        # right cell------------------------------------------------------
        #cuda.atomic.add(cmpt, 0, 1)
        cuda.atomic.add(cmpt, 0, 3)
            
    # elif namef[i] == 10:
    if idx < halofaces.shape[0]:
        i = halofaces[idx]
        #cuda.atomic.add(cmpt, 0, 1)
        
        #cuda.atomic.add(cmpt, 0, 1)
        cuda.atomic.add(cmpt, 0, 3)
        
        for nod in nodeidf[i][:nodeidf[i][-1]]:
            if search_element(BCdirichlet, oldnamen[nod]) == False:  
                for j in range(cellnid[nod][-1]):
                    cuda.atomic.add(cmpt, 0, 1)
                    
                for j in range(len(centergn[nod])):
                    if centergn[nod][j][-1] != -1:
                        cuda.atomic.add(cmpt, 0, 1)
                        
                for j in range(len(halocentergn[nod])):
                    if halocentergn[nod][j][-1] != -1:
                        cuda.atomic.add(cmpt, 0, 1)

                for j in range(halonid[nod][-1]):
                    cuda.atomic.add(cmpt, 0, 1)
                
    if i < dirichletfaces.shape[0]:
        #i = dirichletfaces[idx]
        #cuda.atomic.add(cmpt, 0, 1)
        cuda.atomic.add(cmpt, 0, 2)
        




In [7]:
import numba

d_cmpt = cuda.to_device(np.array([0]).astype(int))

nb_element = max(host_matrixinnerfaces.shape[0], host_halofaces.shape[0] ,host_dirichletfaces.shape[0])
nb_threads = 32
nb_blocks = nb_element // nb_threads + 1
print(nb_blocks)

@cuda.jit
def kernel_zero(cmpt):
  cmpt[0] = 0

def cuda_compute_2dmatrix_size():
  kernel_zero[1, 1](d_cmpt)
  cuda.synchronize()
  kernel_host_matrixinnerfaces[nb_blocks, nb_threads](
    d_nodeidf,
    d_halofid,
    d_cellnid,
    d_halonid,
    d_centergn,
    d_halocentergn,
    d_haloghostcenter,
    d_oldnamen,
    d_BCdirichlet,
    d_matrixinnerfaces,
    d_halofaces,
    d_dirichletfaces,
    d_cmpt,
  )
  cuda.synchronize()

cuda_compute_2dmatrix_size()
test_time(70, cuda_compute_2dmatrix_size)

4949




1.36585 ms


In [65]:
# verify

cpu_result = cpu_compute_2dmatrix_size()
gpu_result = d_cmpt.copy_to_host()[0]
print(result, gpu_result)

4496676 4496676
