<a href="https://colab.research.google.com/github/hfathie/hfvSPH_on_GPU/blob/main/div_curlV_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
%%writefile test.cu
#include <iostream> // include iostream, fstream, cmath, string, vector, sstream.
#include <fstream>
#include <cmath>
#include <cstdlib>
#include <string>
#include <vector>
#include <sstream>
#include "myCppSPHLibs.h"
using namespace std;

// Number of rows read from the input CSV; also the length of every host and
// device array and the element count handed to the div_curlVel kernel.
constexpr int N = 131504;
// Truncated value of pi that is forwarded to the div_curlVel kernel.
constexpr float my_pi = 3.141592f;

// Reads N particles from "Hydra_130k.csv", uploads their per-particle arrays
// to the GPU, runs the div_curlVel kernel once, and prints the last ten
// (divV, curlV) pairs as a visual check.
//
// Returns EXIT_FAILURE (after printing a diagnostic) when the input file is
// missing, has fewer than N rows, or contains a row with fewer than 11 fields.
// Any failing CUDA runtime call terminates the process with EXIT_FAILURE.
int main(){

  // Column layout expected in every CSV row:
  // 0  1  2  3   4   5    6   7  8  9  10
  // x, y, z, vx, vy, vz, rho, P, c, h, m.
  const size_t nCols = 11;

  // Reading Hydra file.
  string fname = "Hydra_130k.csv";

  vector<vector<string>> content;
  vector<string> row;
  string line, word;

  fstream file (fname, ios::in);
  if(!file.is_open())
  {
    // Nothing can be computed without the input; continuing would index an
    // empty vector below.
    cout<<"Could not open the file\n";
    return EXIT_FAILURE;
  }

  // Split every line on commas and keep the raw text fields.
  while(getline(file, line))
  {
    row.clear();

    stringstream str(line);

    while(getline(str, word, ','))
      row.push_back(word);
    content.push_back(row);
  }

  if(content.size() < static_cast<size_t>(N))
  {
    cerr << "Expected at least " << N << " rows in " << fname
         << " but found " << content.size() << "\n";
    return EXIT_FAILURE;
  }

  // Host-side arrays; vectors release their storage automatically on every
  // return path.
  vector<float> x(N), y(N), z(N);
  vector<float> vx(N), vy(N), vz(N);
  vector<float> rho(N), P(N), c(N), h(N), mass(N);
  // Outputs start at 110.0f, the same placeholder the kernel results
  // overwrite (presumably chosen so untouched entries stand out).
  vector<float> divV(N, 110.0f), curlV(N, 110.0f);

  for(int i=0; i<N; i++){

    const vector<string> &fields = content[i];
    if(fields.size() < nCols)
    {
      cerr << "Row " << i << " of " << fname << " has " << fields.size()
           << " fields; expected " << nCols << "\n";
      return EXIT_FAILURE;
    }

    x[i] = stof(fields[0]);
    y[i] = stof(fields[1]);
    z[i] = stof(fields[2]);

    vx[i] = stof(fields[3]);
    vy[i] = stof(fields[4]);
    vz[i] = stof(fields[5]);

    rho[i] = stof(fields[6]);
    P[i] = stof(fields[7]);
    c[i] = stof(fields[8]);
    h[i] = stof(fields[9]);
    mass[i] = stof(fields[10]);
  }

  // Abort with a readable message as soon as any CUDA runtime call fails.
  // Device allocations still alive at that point are reclaimed when the
  // process exits.
  auto cudaCheck = [](cudaError_t status, const char *what){
    if(status != cudaSuccess)
    {
      cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << "\n";
      exit(EXIT_FAILURE);
    }
  };

  const size_t nBytes = N*sizeof(float);

  // Allocate a device buffer of N floats and fill it from the host vector.
  auto toDevice = [&](const vector<float> &host, const char *name) -> float* {
    float *dev = nullptr;
    cudaCheck(cudaMalloc(&dev, nBytes), name);
    cudaCheck(cudaMemcpy(dev, host.data(), nBytes, cudaMemcpyHostToDevice), name);
    return dev;
  };

  // Copy from Host to Device.
  float *d_x = toDevice(x, "x");
  float *d_y = toDevice(y, "y");
  float *d_z = toDevice(z, "z");

  float *d_vx = toDevice(vx, "vx");
  float *d_vy = toDevice(vy, "vy");
  float *d_vz = toDevice(vz, "vz");

  float *d_rho = toDevice(rho, "rho");
  // d_P and d_c are uploaded but not passed to div_curlVel below.
  float *d_P = toDevice(P, "P");
  float *d_c = toDevice(c, "c");
  float *d_h = toDevice(h, "h");
  float *d_mass = toDevice(mass, "mass");
  float *d_divV = toDevice(divV, "divV");
  float *d_curlV = toDevice(curlV, "curlV");


  // Launching the kernel on GPU
  int blockSize = 256; // number of threads in a block
  int gridSize = (N + blockSize - 1) / blockSize; // Number of blocks in a grid

  div_curlVel<<<gridSize, blockSize>>>(d_divV, d_curlV, d_x, d_y, d_z, d_vx, d_vy, d_vz,
                                       d_rho, d_mass, d_h, my_pi, N);
  // A kernel launch returns no status itself; query it explicitly.
  cudaCheck(cudaGetLastError(), "div_curlVel launch");

  // Wait for the GPU to finish before accessing the Host
  cudaCheck(cudaDeviceSynchronize(), "div_curlVel execution");

  // Copy from Device to Host.
  cudaCheck(cudaMemcpy(divV.data(), d_divV, nBytes, cudaMemcpyDeviceToHost), "divV download");
  cudaCheck(cudaMemcpy(curlV.data(), d_curlV, nBytes, cudaMemcpyDeviceToHost), "curlV download");

  // visual check.
  for(int i = N-10; i < N; i++){
    cout << divV[i] << " " << curlV[i] << endl;
  }

  // Free memory.
  cudaCheck(cudaFree(d_x), "free x");
  cudaCheck(cudaFree(d_y), "free y");
  cudaCheck(cudaFree(d_z), "free z");

  cudaCheck(cudaFree(d_vx), "free vx");
  cudaCheck(cudaFree(d_vy), "free vy");
  cudaCheck(cudaFree(d_vz), "free vz");

  cudaCheck(cudaFree(d_rho), "free rho");
  cudaCheck(cudaFree(d_P), "free P");
  cudaCheck(cudaFree(d_c), "free c");
  cudaCheck(cudaFree(d_h), "free h");
  cudaCheck(cudaFree(d_mass), "free mass");
  cudaCheck(cudaFree(d_divV), "free divV");
  cudaCheck(cudaFree(d_curlV), "free curlV");

  return EXIT_SUCCESS;
}

Overwriting test.cu


In [10]:
%%shell
nvcc test.cu -o test



In [11]:
%%shell
./test

0.949959 0.127185
0.626051 0.0280715
0.662572 0.274751
0.577422 0.346793
0.716749 0.0292192
0.187944 0.0923823
0.557782 0.14172
1.06012 0.0902458
0.101683 0.0994686
0.0386354 0.0324852




In [12]:
%%shell
nvprof ./test

==476== NVPROF is profiling process 476, command: ./test
0.949959 0.127185
0.626051 0.0280715
0.662572 0.274751
0.577422 0.346793
0.716749 0.0292192
0.187944 0.0923823
0.557782 0.14172
1.06012 0.0902458
0.101683 0.0994686
0.0386354 0.0324852
==476== Profiling application: ./test
==476== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.81%  346.97ms         1  346.97ms  346.97ms  346.97ms  div_curlVel(float*, float*, float*, float*, float*, float*, float*, float*, float*, float*, float*, float, int)
                    0.17%  590.58us        13  45.429us  44.991us  46.911us  [CUDA memcpy HtoD]
                    0.02%  83.486us         2  41.743us  41.727us  41.759us  [CUDA memcpy DtoH]
      API calls:   59.10%  347.01ms         1  347.01ms  347.01ms  347.01ms  cudaDeviceSynchronize
                   40.26%  236.39ms        13  18.184ms  2.4820us  236.00ms  cudaMalloc
                    0.42%  2.4695ms        1

