<a href="https://colab.research.google.com/github/hassanfv/hfv_GPU/blob/main/smoothing_h_on_gpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [284]:
%%writefile testx.cu
#include <iostream>
#include <ctime>
#include <fstream>
using namespace std;

// Number of particles: the length of every array allocated in main(), the
// thread-index bound in smoothing_h, and the trip count of its inner loop.
const int N = 50000;
// Centre of the accepted neighbour-count window used by smoothing_h.
const int Nngb = 64;
// smoothing_h keeps iterating while its neighbour count k is below Ndown ...
const int Ndown = Nngb - 5;
// ... or above Nup.
const int Nup = Nngb + 5;
// Per-iteration step: h_new changes by coeff * 2 * hprevious[i].
const float coeff = 0.001;

/*
 * smoothing_h -- iteratively adjusts one search radius per particle.
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total;
 * thread i handles particle i and threads with i >= N do nothing.
 * No shared memory is used and there is no inter-thread synchronisation.
 *
 * x, y, z    : particle coordinates, N elements each (read only).
 * hprevious  : starting value per particle, N elements (read only).
 * hres       : output, N elements; receives 0.5 * final radius.
 *
 * Starting from h_new = 2.1 * hprevious[i], the thread counts how many of the
 * N points (itself included, since its own distance is 0) lie within h_new and
 * moves h_new by coeff * 2 * hprevious[i] up or down until the count k is
 * inside [Ndown, Nup]. After more than 100 passes it gives up and uses the
 * largest radius seen so far.
 */
__global__ void smoothing_h(float *x, float *y, float *z, float *hres, float *hprevious){
  int i = threadIdx.x + blockIdx.x * blockDim.x;

  // Surplus threads (launch size need not equal N) have nothing to do.
  if(i >= N){
    return;
  }

  // Thread-invariant inputs: read them from global memory once instead of
  // on every pass of the O(N) neighbour loop.
  const float xi = x[i];
  const float yi = y[i];
  const float zi = z[i];
  const float h_prev = hprevious[i];

  float h_new = 2.1f * h_prev;
  float h_tmp = h_new;  // largest radius tried so far (fallback result)
  int N_iter = 0;
  int k = 0;            // 0 < Ndown, so the first pass always runs

  while((k < Ndown) || (k > Nup)){

    // Brute-force count of points within the current radius.
    k = 0;
    for(int j = 0; j < N; j++){
      const float dx = x[j] - xi;
      const float dy = y[j] - yi;
      const float dz = z[j] - zi;
      const float rr = sqrtf(dx*dx + dy*dy + dz*dz);

      if(rr <= h_new){
        k++;
      }
    }

    // Too few neighbours: grow the radius; too many: shrink it.
    if(k < Ndown){
      h_new = h_new + coeff * 2.0f * h_prev;
    }

    if(k > Nup){
      h_new = h_new - coeff * 2.0f * h_prev;
    }

    if(h_new > h_tmp){
      h_tmp = h_new;
    }

    // Bail out of non-converging (e.g. oscillating) searches.
    N_iter++;
    if(N_iter > 100){
      h_new = h_tmp;
      break;
    }
  }

  // Float literal keeps the final scaling in single precision.
  hres[i] = 0.5f * h_new;
}


// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t err, const char *what){
  if(err != cudaSuccess){
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << endl;
    return false;
  }
  return true;
}

/*
 * Reads up to N whitespace-separated rows "x y z h" from data.csv into
 * managed memory, runs smoothing_h on them, prints the first ten
 * (hprevious, hres) pairs and writes all rows to data_from_cpp.csv.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main(){

  // Managed allocations: the same pointers are valid on host and device.
  float *x = NULL, *y = NULL, *z = NULL, *hres = NULL, *hprevious = NULL;
  const size_t bytes = N * sizeof(float);

  if(!cudaOk(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)") ||
     !cudaOk(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)") ||
     !cudaOk(cudaMallocManaged(&z, bytes), "cudaMallocManaged(z)") ||
     !cudaOk(cudaMallocManaged(&hres, bytes), "cudaMallocManaged(hres)") ||
     !cudaOk(cudaMallocManaged(&hprevious, bytes), "cudaMallocManaged(hprevious)")){
    return 1;
  }

  // Defaults for rows the input file does not provide.
  for(int i = 0; i < N; i++){
    x[i] = 0.0f;
    y[i] = 0.0f;
    z[i] = 0.0f;
    hprevious[i] = 0.0f;
    hres[i] = 10.1f; // placeholder, overwritten by the kernel
  }

  // Reading the file containing x, y, z, and h. Rows are stored directly in
  // the managed arrays; reading stops at the first row that fails to parse so
  // that stale values are never copied into the remaining rows.
  ifstream infile("data.csv");
  if(infile.is_open()){
    float xt, yt, zt, ht;
    int nRead = 0;
    while(nRead < N && (infile >> xt >> yt >> zt >> ht)){
      x[nRead] = xt;
      y[nRead] = yt;
      z[nRead] = zt;
      hprevious[nRead] = ht;
      nRead++;
    }
    if(nRead < N){
      cerr << "data.csv provided only " << nRead << " of " << N
           << " rows; the remaining rows stay zero." << endl;
    }
  }else{
    cerr << "Unable to open data.csv; continuing with all-zero input." << endl;
  }

  // Launching the kernel on GPU
  int blockSize = 256; // number of threads in a block
  int gridSize = (N + blockSize - 1) / blockSize; // Number of blocks in a grid

  smoothing_h<<<gridSize, blockSize>>>(x, y, z, hres, hprevious);

  // A launch reports configuration errors only through cudaGetLastError();
  // execution errors surface at the synchronisation that follows.
  if(!cudaOk(cudaGetLastError(), "smoothing_h launch")){
    return 1;
  }

  // Wait for the GPU to finish before accessing the Host
  if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")){
    return 1;
  }

  // visual inspection
  for(int i = 0; i < 10; i++){
    cout << hprevious[i] << "  " << hres[i] << endl;
  }

  // Output to a file ('\n' instead of endl avoids a flush per row)
  ofstream outfile("data_from_cpp.csv");
  if(outfile.is_open()){
    for(int i = 0; i < N; i++){
      outfile << x[i] << "," << y[i] << "," << z[i] << "," << hprevious[i] << "," << hres[i] << '\n';
    }
  }else cout << "Unable to open file !";

  // Free memory
  cudaOk(cudaFree(x), "cudaFree(x)");
  cudaOk(cudaFree(y), "cudaFree(y)");
  cudaOk(cudaFree(z), "cudaFree(z)");
  cudaOk(cudaFree(hres), "cudaFree(hres)");
  cudaOk(cudaFree(hprevious), "cudaFree(hprevious)");

  return 0;
}

Overwriting testx.cu


In [285]:
%%shell
nvcc testx.cu -o testx



In [286]:
%%shell
./testx

0.0596094  0.0621726
0.0269534  0.0269534
0.0637682  0.0654261
0.167706  0.175588
0.0249028  0.0253261
0.0186392  0.0188815
0.0118953  0.0118953
0.0225291  0.0225516
0.0684425  0.0709065
0.0210561  0.0210561




In [287]:
%%shell
nvprof ./testx

==5881== NVPROF is profiling process 5881, command: ./testx
0.0596094  0.0621726
0.0269534  0.0269534
0.0637682  0.0654261
0.167706  0.175588
0.0249028  0.0253261
0.0186392  0.0188815
0.0118953  0.0118953
0.0225291  0.0225516
0.0684425  0.0709065
0.0210561  0.0210561
==5881== Profiling application: ./testx
==5881== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  892.78ms         1  892.78ms  892.78ms  892.78ms  smoothing_h(float*, float*, float*, float*, float*)
      API calls:   79.63%  892.80ms         1  892.80ms  892.80ms  892.80ms  cudaDeviceSynchronize
                   20.30%  227.62ms         5  45.524ms  4.7370us  227.57ms  cudaMallocManaged
                    0.03%  344.84us         1  344.84us  344.84us  344.84us  cuDeviceTotalMem
                    0.02%  213.10us         5  42.619us  6.2910us  135.56us  cudaFree
                    0.01%  158.97us       101  1.5730us     123ns  67.762us  cuD



In [347]:
%%writefile testx.cu
#include <iostream>
#include <ctime>
#include <fstream>
using namespace std;

// Number of particles: the length of every host and device array allocated
// in main(), the thread-index bound in smoothing_h, and the trip count of
// its inner loop.
const int N = 70000;
// Centre of the accepted neighbour-count window used by smoothing_h.
const int Nngb = 64;
// smoothing_h keeps iterating while its neighbour count k is below Ndown ...
const int Ndown = Nngb - 5;
// ... or above Nup.
const int Nup = Nngb + 5;
// Per-iteration step: h_new changes by coeff * 2 * hprevious[i].
const float coeff = 0.001;

/*
 * smoothing_h -- iteratively adjusts one search radius per particle.
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total;
 * thread i handles particle i and threads with i >= N do nothing.
 * No shared memory is used and there is no inter-thread synchronisation.
 *
 * x, y, z    : particle coordinates, N elements each (read only).
 * hprevious  : starting value per particle, N elements (read only).
 * hres       : output, N elements; receives 0.5 * final radius.
 *
 * Starting from h_new = 2.1 * hprevious[i], the thread counts how many of the
 * N points (itself included, since its own distance is 0) lie within h_new and
 * moves h_new by coeff * 2 * hprevious[i] up or down until the count k is
 * inside [Ndown, Nup]. After more than 100 passes it gives up and uses the
 * largest radius seen so far.
 */
__global__ void smoothing_h(float *x, float *y, float *z, float *hres, float *hprevious){

  int i = threadIdx.x + blockIdx.x * blockDim.x;

  // Surplus threads (launch size need not equal N) have nothing to do.
  if(i >= N){
    return;
  }

  // Thread-invariant inputs: read them from global memory once instead of
  // on every pass of the O(N) neighbour loop.
  const float xi = x[i];
  const float yi = y[i];
  const float zi = z[i];
  const float h_prev = hprevious[i];

  float h_new = 2.1f * h_prev; // TODO: change the factor to 2.0 in the real application
  float h_tmp = h_new;  // largest radius tried so far (fallback result)
  int N_iter = 0;
  int k = 0;            // 0 < Ndown, so the first pass always runs

  while((k < Ndown) || (k > Nup)){

    // Brute-force count of points within the current radius.
    k = 0;
    for(int j = 0; j < N; j++){
      const float dx = x[j] - xi;
      const float dy = y[j] - yi;
      const float dz = z[j] - zi;
      const float rr = sqrtf(dx*dx + dy*dy + dz*dz);

      if(rr <= h_new){
        k++;
      }
    }

    // Too few neighbours: grow the radius; too many: shrink it.
    if(k < Ndown){
      h_new = h_new + coeff * 2.0f * h_prev;
    }

    if(k > Nup){
      h_new = h_new - coeff * 2.0f * h_prev;
    }

    if(h_new > h_tmp){
      h_tmp = h_new;
    }

    // Bail out of non-converging (e.g. oscillating) searches.
    N_iter++;
    if(N_iter > 100){
      h_new = h_tmp;
      break;
    }
  }

  // Float literal keeps the final scaling in single precision.
  hres[i] = 0.5f * h_new;
}


// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t err, const char *what){
  if(err != cudaSuccess){
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << endl;
    return false;
  }
  return true;
}

/*
 * Reads up to N whitespace-separated rows "x y z h" from data.csv, copies
 * them to the device, runs smoothing_h, copies the result back, writes all
 * rows to data_from_cpp.csv and prints the first ten (hprevious, hres) pairs.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main(void){

  const size_t bytes = N * sizeof(float);

  // Host arrays; "()" value-initialises them to zero so rows the input file
  // does not provide have a defined value.
  float *x = new float[N]();
  float *y = new float[N]();
  float *z = new float[N]();
  float *hprevious = new float[N]();
  float *hres = new float[N];

  // Initialize hres on the Host
  for(int i = 0; i < N; i++){
    hres[i] = 10.1f; // placeholder, overwritten by the kernel result
  }

  // Reading the file containing x, y, z, and h. Rows go straight into the
  // host arrays; reading stops at the first row that fails to parse so that
  // stale values are never copied into the remaining rows.
  ifstream infile("data.csv");
  if(infile.is_open()){
    float xt, yt, zt, ht;
    int nRead = 0;
    while(nRead < N && (infile >> xt >> yt >> zt >> ht)){
      x[nRead] = xt;
      y[nRead] = yt;
      z[nRead] = zt;
      hprevious[nRead] = ht;
      nRead++;
    }
    if(nRead < N){
      cerr << "data.csv provided only " << nRead << " of " << N
           << " rows; the remaining rows stay zero." << endl;
    }
  }else{
    cerr << "Unable to open data.csv; continuing with all-zero input." << endl;
  }

  // Device buffers mirroring the host arrays.
  float *d_x = NULL, *d_y = NULL, *d_z = NULL, *d_hres = NULL, *d_hprevious = NULL;
  if(!cudaOk(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)") ||
     !cudaOk(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)") ||
     !cudaOk(cudaMalloc(&d_z, bytes), "cudaMalloc(d_z)") ||
     !cudaOk(cudaMalloc(&d_hres, bytes), "cudaMalloc(d_hres)") ||
     !cudaOk(cudaMalloc(&d_hprevious, bytes), "cudaMalloc(d_hprevious)")){
    return 1;
  }

  // Copy from Host to Device.
  if(!cudaOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x)") ||
     !cudaOk(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y)") ||
     !cudaOk(cudaMemcpy(d_z, z, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_z)") ||
     !cudaOk(cudaMemcpy(d_hres, hres, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_hres)") ||
     !cudaOk(cudaMemcpy(d_hprevious, hprevious, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_hprevious)")){
    return 1;
  }

  // Launching the kernel on GPU
  int blockSize = 256; // number of threads in a block
  int gridSize = (N + blockSize - 1) / blockSize; // Number of blocks in a grid

  smoothing_h<<<gridSize, blockSize>>>(d_x, d_y, d_z, d_hres, d_hprevious);

  // A launch reports configuration errors only through cudaGetLastError();
  // execution errors surface at the synchronisation that follows.
  if(!cudaOk(cudaGetLastError(), "smoothing_h launch")){
    return 1;
  }

  // Wait for the GPU to finish before accessing the Host
  if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")){
    return 1;
  }

  // Fetch the result BEFORE anything reads hres on the host; otherwise the
  // output file would contain the 10.1 placeholder instead of the kernel output.
  if(!cudaOk(cudaMemcpy(hres, d_hres, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hres)")){
    return 1;
  }

  // Output to a file ('\n' instead of endl avoids a flush per row)
  ofstream outfile("data_from_cpp.csv");
  if(outfile.is_open()){
    for(int i = 0; i < N; i++){
      outfile << x[i] << "," << y[i] << "," << z[i] << "," << hprevious[i] << "," << hres[i] << '\n';
    }
  }else cout << "Unable to open file !";

  // visual inspection
  for(int i = 0; i < 10; i++){
    cout << hprevious[i] << "  " << hres[i] << endl;
  }

  // Free memory
  cudaOk(cudaFree(d_x), "cudaFree(d_x)");
  cudaOk(cudaFree(d_y), "cudaFree(d_y)");
  cudaOk(cudaFree(d_z), "cudaFree(d_z)");
  cudaOk(cudaFree(d_hres), "cudaFree(d_hres)");
  cudaOk(cudaFree(d_hprevious), "cudaFree(d_hprevious)");

  delete[] x;
  delete[] y;
  delete[] z;
  delete[] hres;
  delete[] hprevious;

  return 0;
}

Overwriting testx.cu


In [348]:
%%shell
nvcc testx.cu -o testx



In [349]:
%%shell
./testx

0.0531791  0.0531791
0.0632421  0.0632421
0.0568285  0.0568285
0.0792391  0.0792391
0.0511843  0.0511843
0.0617926  0.0617926
0.0581889  0.0581889
0.0575924  0.0575924
0.0602205  0.0602205
0.0623432  0.0623432




In [350]:
%%shell
nvprof ./testx

==7020== NVPROF is profiling process 7020, command: ./testx
0.0531791  0.0531791
0.0632421  0.0632421
0.0568285  0.0568285
0.0792391  0.0792391
0.0511843  0.0511843
0.0617926  0.0617926
0.0581889  0.0581889
0.0575924  0.0575924
0.0602205  0.0602205
0.0623432  0.0623432
==7020== Profiling application: ./testx
==7020== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.74%  59.057ms         1  59.057ms  59.057ms  59.057ms  smoothing_h(float*, float*, float*, float*, float*)
                    0.22%  130.97us         5  26.194us  25.568us  26.815us  [CUDA memcpy HtoD]
                    0.04%  23.584us         1  23.584us  23.584us  23.584us  [CUDA memcpy DtoH]
      API calls:   81.22%  262.87ms         5  52.574ms  3.3460us  262.85ms  cudaMalloc
                   18.25%  59.069ms         1  59.069ms  59.069ms  59.069ms  cudaDeviceSynchronize
                    0.19%  611.77us         6  101.96us  91.478us  138.72



In [329]:
%%shell
nvidia-smi

Thu Nov 17 21:52:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    14W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

