In [1]:
from mpi4py import MPI
from manapy.ast import Variable
from manapy.base.base import Struct
from manapy.ddm import Domain
from manapy.partitions import MeshPartition
from manapy.solvers.advec.tools_utils import initialisation_gaussian_2d
import numpy as np
import time
from numba import cuda
from manapy.solvers.advec import AdvectionSolver

import matplotlib.pyplot as plt

from timeit import default_timer as timer

###############
# test_time
###############

def test_time(iter, fun):
  #fun()
  start_time = timer()
  for _ in range(iter):
    fun()
  end_time = timer()
  elapsed_time = (end_time - start_time) / iter
  print(f"{elapsed_time * 1000:.5f} ms")
  #print(f"{elapsed_time * 1000000:.5f} micros")

###############
# init
###############
def init(dim, mesh_path):
  """Partition the mesh and build the fields used by the advection test.

  Parameters
  ----------
  dim : int
    Dimension forwarded to `MeshPartition` and `Domain`.
  mesh_path : str
    Mesh file handed to `MeshPartition`.

  Returns
  -------
  tuple
    `(domain, ne, u, v, w, P, running_conf)`.
  """
  running_conf = Struct(
    backend="numba",
    signature=True,
    cache=True,
    float_precision="single",
  )

  # Partition first, then build the domain.
  # NOTE(review): presumably Domain reads what MeshPartition wrote - confirm.
  MeshPartition(mesh_path, dim=dim, conf=running_conf, periodic=[0,0,0])
  domain = Domain(dim=dim, conf=running_conf)

  # Five fields on the same domain, created in the order ne, u, v, w, P.
  ne, u, v, w, P = (Variable(domain=domain) for _ in range(5))

  mesh_cells = domain.cells
  initialisation_gaussian_2d(ne.cell, u.cell, v.cell, P.cell, mesh_cells.center, 2.0)

  # Constant face values for the velocity components ...
  for field, face_value in ((u, 2.), (v, 2.), (w, 1.)):
    field.face[:] = face_value

  # ... then propagated from the faces to the cells.
  for field in (u, v, w):
    field.interpolate_facetocell()

  return (domain, ne, u, v, w, P, running_conf)

In [2]:
import os

dim = 2

# Mesh location. Set the MANAPY_MESH_FILE environment variable to run the
# notebook on another machine; otherwise the path below is used.
# Other meshes that have been used with this notebook:
#   /home/aben-ham/Desktop/work/stage/my_manapy/manapy/mesh/2D/carre.msh
#   /home/ayoub.hamou/mesh/square.msh
mesh_file = os.environ.get(
  "MANAPY_MESH_FILE",
  "/home/aben-ham/Desktop/work/stage/my_manapy/gpu_accelerator/functions/square.msh",
)
# Fail early with a clear message instead of deep inside the mesh reader.
if not os.path.isfile(mesh_file):
  raise FileNotFoundError(
    f"Mesh file not found: {mesh_file!r} (set MANAPY_MESH_FILE to override)")

domain, ne, u, v, w, P, conf = init(dim=dim, mesh_path=mesh_file)

S = AdvectionSolver(ne, vel=(u, v), conf=conf)

Reading gmsh file ...
Saving partition files ...
Number of Cells: 258030
Number of Vertices: 129684
Local domain contruction ...


In [3]:
from numba import cuda

from manapy.gpu_fun.utils.utils_kernels import (
  VarClass
)



# Positional argument lists, filled by create_var: host values and their
# device-side counterparts, in the same order.
host_args = []
d_args = []

def create_var(v):
  """Register `v` as the next argument, on the host and on the device.

  Scalars (Python `int`/`float` and any NumPy integer or floating scalar)
  are kept as-is on both sides; anything else is copied to the GPU with
  `cuda.to_device`.

  Parameters
  ----------
  v : scalar or array-like
    Value to register.

  Returns
  -------
  tuple
    `(host_value, device_value)`. Both are also appended to the
    module-level `host_args` and `d_args` lists.
  """
  # np.integer / np.floating cover every NumPy scalar width (int64,
  # float16, ...), so no scalar is sent to the device by mistake.
  if isinstance(v, (int, float, np.integer, np.floating)):
    res = (v, v)
  else:
    res = (v, cuda.to_device(v))
  host_args.append(res[0])
  d_args.append(res[1])
  return res


# Put uniform unit values in the three velocity components and in the face
# normals before the arguments are captured.
S.u.cell[:] = S.v.cell[:] = S.w.cell[:] = 1
S.domain.faces.normal[:] = 1

# Register the arguments in the positional order of `time_step`:
# u, v, w, cfl, normal, mesure, volume, faceid, dim.
# Each call appends one entry to host_args and one to d_args.
host_u,      d_u      = create_var(S.u.cell)
host_v,      d_v      = create_var(S.v.cell)
host_w,      d_w      = create_var(S.w.cell)
host_cfl,    d_cfl    = create_var(np.float32(S.cfl))
host_normal, d_normal = create_var(S.domain.faces.normal)
host_mesure, d_mesure = create_var(S.domain.faces.mesure)
host_volume, d_volume = create_var(S.domain.cells.volume)
host_faceid, d_faceid = create_var(S.domain.cells.faceid)
host_dim,    d_dim    = create_var(S.dim)

# Host arguments wrapped one by one in VarClass.
hd_args = list(map(VarClass, host_args))



### CPU Function

In [4]:
#The original function from functions2d.py

from numpy import  int32, float32, float16, uint32
import numpy as np

def time_step(
    u:'float[:]',
    v:'float[:]',
    w:'float[:]',
    cfl:'float',
    normal:'float[:,:]',
    mesure:'float[:]',
    volume:'float[:]',
    faceid:'int32[:,:]',
    dim:'int32'
):
    """Return the smallest per-cell time step, capped at 1e6.

    For each row `i` of `faceid`, the last entry of the row is read as the
    number of faces and the leading entries as face indices into `normal`.
    The cell's candidate is `cfl * volume[i] / lam`, where `lam` is the sum
    over those faces of `|u[i]*n[0] + v[i]*n[1] + w[i]*n[2]|`.

    `mesure` and `dim` are part of the signature but are not read here.
    """
    # Scratch buffer holding the current face normal.
    face_normal = np.zeros(3)
    smallest_dt = 1e6

    for cell in range(len(faceid)):
        wave_speed = 0.0
        nb_faces = faceid[cell][-1]

        for k in range(nb_faces):
            face_normal[:] = normal[faceid[cell][k]][:]
            wave_speed += np.fabs(
                u[cell]*face_normal[0] + v[cell]*face_normal[1] + w[cell]*face_normal[2]
            )

        smallest_dt = min(smallest_dt, cfl * volume[cell]/wave_speed)

    return smallest_dt

import numba

# Compile the reference implementation for the CPU through the domain's backend.
# NOTE(review): echo=True is presumably what prints the "compile ... to cpu"
# line shown in the output - confirm against the backend.
numba_time_step = domain.backend.compile(
  time_step, signature=True, target_device='cpu', echo=True)

# Run once on the host arguments and show the resulting time step.
print(numba_time_step(*host_args))
#%timeit numba_time_step(*host_args)

compile time_step to cpu => signature=(float32[:], float32[:], float32[:], float32, float32[:,:], float32[:], float32[:], int32[:,:], int32)
1.0363966869893677e-07


### CUDA Function

In [7]:
import numpy as np
from numba import cuda
from manapy.gpu_fun.utils.utils_kernels import (
  compile_kernel,
  get_gpu_prams,
  device_compute_upwind_flux,
  kernel_assign,
  to_device,
  debug,
  GPU_Backend
)

def get_kernel_time_step():
  """Build the GPU counterpart of `time_step`.

  Returns
  -------
  callable
    `result(*args)`, taking the same positional arguments as `time_step`
    and returning the value read back from the one-element device buffer.
  """
  # One-element device buffer that receives the minimum found by the kernel.
  d_shared_dt = cuda.device_array(shape=1, dtype=GPU_Backend.float_precision)

  def kernel_time_step(
    u:'float[:]',
    v:'float[:]',
    w:'float[:]',
    cfl:'float',
    normal:'float[:,:]',
    mesure:'float[:]',
    volume:'float[:]',
    faceid:'int32[:,:]',
    dim:'int32',
    shared_dt : 'float[:]'
    ):
    # Each thread starts at its global index and advances by the grid size.
    first_cell = cuda.grid(1)
    step = cuda.gridsize(1)

    for cell in range(first_cell, len(faceid), step):
      lam = 0.0
      # Last entry of the row is the number of faces of this cell.
      nb_faces = faceid[cell][-1]

      for k in range(nb_faces):
        face_normal = normal[faceid[cell][k]]
        lam += abs(u[cell]*face_normal[0] + v[cell]*face_normal[1] + w[cell]*face_normal[2])

      # Fold this cell's candidate into the running minimum.
      cuda.atomic.min(shared_dt, 0, cfl * volume[cell] / lam)

  kernel_time_step = GPU_Backend.compile_kernel(kernel_time_step)
  kernel_assign_float = GPU_Backend.compile_kernel(kernel_assign)

  def result(*args):
    VarClass.debug(kernel_time_step, args)
    device_args = [to_device(arg) for arg in args]

    # Reset the accumulator to 1e6, the same starting value as on the CPU.
    kernel_assign_float[1, 1](d_shared_dt, 1e6)
    cuda.synchronize()

    # Launch size is derived from faceid (positional argument 7).
    nb_cells = len(device_args[7])
    nb_blocks, nb_threads = GPU_Backend.get_gpu_prams(nb_cells)
    kernel_time_step[nb_blocks, nb_threads](*device_args, d_shared_dt)
    return d_shared_dt.copy_to_host()[0]

  return result

# Build the GPU wrapper (this compiles both kernels), then run it once on the
# VarClass-wrapped host arguments and show the result.
kernel_time_step = get_kernel_time_step()
print(kernel_time_step(*hd_args))

#%timeit kernel_time_step(*d_args)

compile kernel_time_step to cuda => signature=void(float32[:], float32[:], float32[:], float32, float32[:,:], float32[:], float32[:], int32[:,:], int32, float32[:])
compile kernel_assign to cuda => signature=void(float32[:], float32)
CUDADispatcher(<function get_kernel_time_step.<locals>.kernel_time_step at 0x7a8c80aa9940>) is called
VarClass<float32, (258030,)>
VarClass<float32, (258030,)>
VarClass<float32, (258030,)>
<class 'numpy.float32'>
VarClass<float32, (387713, 3)>
VarClass<float32, (387713,)>
VarClass<float32, (258030,)>
VarClass<int32, (258030, 4)>
<class 'numpy.int32'>
1.0363967e-07


In [6]:
#! precision float
# Compare the CPU reference with the GPU kernel. In the run shown, the GPU
# value comes back as float32 while the CPU value is a double, so they are
# compared with a relative tolerance rather than for exact equality.
cpu_res = numba_time_step(*host_args)
gpu_res = kernel_time_step(*d_args)
print(cpu_res, gpu_res, abs(cpu_res - gpu_res))

# Make the notebook fail loudly if the two implementations drift apart.
# Purely relative check: the values are of order 1e-7, so atol stays at 0.
np.testing.assert_allclose(gpu_res, cpu_res, rtol=1e-5, atol=0)

1.0363966869893677e-07 1.0363967e-07 2.3684757902786306e-15
