<a href="https://colab.research.google.com/github/hfathie/hfvSPH_on_GPU/blob/main/smoothing_h.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
%%writefile test.cu
#include <iostream> // iostream, fstream, cmath, string, vector, sstream.
#include <fstream>
#include <cmath>
#include <string>
#include <vector>
#include <sstream>
#include "myCppSPHLibs.h"
using namespace std;

// Number of particles processed; main() reads this many rows from the CSV
// and sizes every host/device buffer with it.
constexpr int N = 131504;

// Reference neighbour count from which the Ndown/Nup bounds are derived.
// NOTE(review): presumably the target number of SPH neighbours — confirm
// against the smoothing_h kernel.
constexpr int Nngb = 64;

// Lower and upper bounds (Nngb -/+ 5) passed to the smoothing_h kernel.
constexpr int Ndown = Nngb - 5;
constexpr int Nup = Nngb + 5;

// Coefficient passed to the smoothing_h kernel as its last argument.
// Written with the 'f' suffix so the literal is a float, not a double.
constexpr float coeff = 0.001f;


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise, so calls can
// be chained with && and stop at the first failure.
static bool cudaOk(cudaError_t status, const char *what){
  if(status == cudaSuccess)
    return true;
  cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << "\n";
  return false;
}

// Reads particle data from "Hydra_130k.csv", runs the smoothing_h kernel once
// on the GPU and prints the last ten (hprevious, hres) pairs.
//
// Returns 0 on success and 1 when the input file is missing or malformed or
// when any CUDA call fails.
int main(){

  // ---- Read the Hydra CSV file into a table of string fields. ----
  string fname = "Hydra_130k.csv";

  vector<vector<string>> content;
  vector<string> row;
  string line, word;

  fstream file (fname, ios::in);
  if(!file.is_open()){
    // Without the data every later content[i][j] access would be out of range.
    cerr << "Could not open the file\n";
    return 1;
  }

  while(getline(file, line)){
    row.clear();

    stringstream str(line);

    while(getline(str, word, ','))
      row.push_back(word);
    content.push_back(row);
  }

  if(content.size() < static_cast<size_t>(N)){
    cerr << "Expected at least " << N << " rows in " << fname
         << " but found " << content.size() << "\n";
    return 1;
  }

  // ---- Host buffers (std::vector releases them on every exit path). ----
  vector<float> x(N), y(N), z(N);
  vector<float> hres(N, 10.1f); // 10.1 is just a placeholder value.
  vector<float> hprevious(N);

  // Column layout of each CSV row:
  // 0  1  2  3   4   5    6   7  8  9  10
  // x, y, z, vx, vy, vz, rho, P, c, h, m.
  const size_t minColumns = 10; // columns 0..9 are read below

  // Fill x, y, z (columns 0-2) and hprevious (column 9) from the CSV.
  for(int i = 0; i < N; i++){
    const vector<string> &fields = content[i];
    if(fields.size() < minColumns){
      cerr << "Row " << i << " of " << fname << " has only "
           << fields.size() << " columns\n";
      return 1;
    }
    try{
      x[i] = stof(fields[0]);
      y[i] = stof(fields[1]);
      z[i] = stof(fields[2]);
      hprevious[i] = stof(fields[9]);
    }catch(...){
      // stof throws when a field is not a representable number.
      cerr << "Row " << i << " of " << fname
           << " contains a non-numeric value\n";
      return 1;
    }
  }

  // ---- Device buffers. ----
  // Initialised to nullptr so the cleanup below is safe even if an
  // allocation fails part-way (cudaFree(nullptr) is a no-op).
  float *d_x = nullptr, *d_y = nullptr, *d_z = nullptr;
  float *d_hres = nullptr, *d_hprevious = nullptr;
  const size_t bytes = N * sizeof(float);

  bool ok =
      cudaOk(cudaMalloc(&d_x, bytes), "cudaMalloc d_x") &&
      cudaOk(cudaMalloc(&d_y, bytes), "cudaMalloc d_y") &&
      cudaOk(cudaMalloc(&d_z, bytes), "cudaMalloc d_z") &&
      cudaOk(cudaMalloc(&d_hres, bytes), "cudaMalloc d_hres") &&
      cudaOk(cudaMalloc(&d_hprevious, bytes), "cudaMalloc d_hprevious");

  // Copy from Host to Device.
  ok = ok &&
      cudaOk(cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice), "copy x to device") &&
      cudaOk(cudaMemcpy(d_y, y.data(), bytes, cudaMemcpyHostToDevice), "copy y to device") &&
      cudaOk(cudaMemcpy(d_z, z.data(), bytes, cudaMemcpyHostToDevice), "copy z to device") &&
      cudaOk(cudaMemcpy(d_hres, hres.data(), bytes, cudaMemcpyHostToDevice), "copy hres to device") &&
      cudaOk(cudaMemcpy(d_hprevious, hprevious.data(), bytes, cudaMemcpyHostToDevice), "copy hprevious to device");

  if(ok){
    // Launching the kernel on GPU
    int blockSize = 256; // number of threads in a block
    int gridSize = (N + blockSize - 1) / blockSize; // Number of blocks in a grid (ceil-div)

    smoothing_h<<<gridSize, blockSize>>>(d_x, d_y, d_z, d_hres, d_hprevious,
                                         N, Ndown, Nup, coeff);

    // A launch does not return a status: configuration errors show up in
    // cudaGetLastError(), execution errors at the next synchronising call.
    ok = cudaOk(cudaGetLastError(), "smoothing_h launch") &&
         cudaOk(cudaDeviceSynchronize(), "smoothing_h execution") &&
         cudaOk(cudaMemcpy(hres.data(), d_hres, bytes, cudaMemcpyDeviceToHost), "copy hres to host");
  }

  if(ok){
    // visual inspection of the last ten particles
    for(int i = N-10; i < N; i++){
      cout << hprevious[i] << "  " << hres[i] << endl;
    }
  }

  // Free device memory on both the success and the failure path.
  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(d_z);
  cudaFree(d_hres);
  cudaFree(d_hprevious);

  return ok ? 0 : 1;
}

Overwriting test.cu


In [15]:
%%shell
nvcc test.cu -o test



In [16]:
%%shell
./test

0.0491951  0.0491951
0.0541674  0.0541674
0.0506397  0.0506397
0.0476966  0.0476966
0.0563585  0.0563585
0.0607407  0.0607407
0.0579519  0.0579519
0.0439055  0.0439055
0.0553372  0.0553372
0.0540047  0.0540047




In [17]:
%%shell
nvprof ./test

==395== NVPROF is profiling process 395, command: ./test
0.0491951  0.0491951
0.0541674  0.0541674
0.0506397  0.0506397
0.0476966  0.0476966
0.0563585  0.0563585
0.0607407  0.0607407
0.0579519  0.0579519
0.0439055  0.0439055
0.0553372  0.0553372
0.0540047  0.0540047
==395== Profiling application: ./test
==395== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.87%  205.60ms         1  205.60ms  205.60ms  205.60ms  smoothing_h(float*, float*, float*, float*, float*, int, int, int, float)
                    0.11%  226.68us         5  45.336us  44.831us  45.920us  [CUDA memcpy HtoD]
                    0.02%  42.015us         1  42.015us  42.015us  42.015us  [CUDA memcpy DtoH]
      API calls:   58.69%  294.83ms         5  58.966ms  3.0670us  294.73ms  cudaMalloc
                   40.93%  205.63ms         1  205.63ms  205.63ms  205.63ms  cudaDeviceSynchronize
                    0.20%  1.0046ms         6  167.43us  

