<a href="https://colab.research.google.com/github/hfathie/hfvSPH_on_GPU/blob/main/acc_g.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
%%writefile test.cu
#include <iostream>
#include <fstream>
#include <cmath>
#include <string>
#include <vector>
#include <sstream>
#include "myCppSPHLibs.h"
using namespace std;

const int N = 131504;
const float G = 1.0f;


int main(){

  // Reading Hydra file.
  string fname = "Hydra_130k.csv";

  vector<vector<string>> content;
  vector<string> row;
  string line, word;
  
  fstream file (fname, ios::in);
  if(file.is_open())
  {
  while(getline(file, line))
  {
  row.clear();
  
  stringstream str(line);
  
  while(getline(str, word, ','))
  row.push_back(word);
  content.push_back(row);
  }
  }
  else
  cout<<"Could not open the file\n";
 
  // creating x, y, z arrays in Shared Memorty containing random values between 0 and 1.0
  float *x, *d_x, *y, *d_y, *z, *d_z, *eps, *d_eps, *accx, *accy, *accz, *d_accx, *d_accy, *d_accz, *mass, *d_mass;
  x = new float[N];
  y = new float[N];
  z = new float[N];

  accx = new float[N];
  accy = new float[N];
  accz = new float[N];

  eps = new float[N];
  mass = new float[N];

  cudaMalloc(&d_x, N*sizeof(float));
  cudaMalloc(&d_y, N*sizeof(float));
  cudaMalloc(&d_z, N*sizeof(float));

  cudaMalloc(&d_accx, N*sizeof(float));
  cudaMalloc(&d_accy, N*sizeof(float));
  cudaMalloc(&d_accz, N*sizeof(float));

  cudaMalloc(&d_eps, N*sizeof(float));
  cudaMalloc(&d_mass, N*sizeof(float));

  // Initialize x, y, and z on the Host.
  // 0  1  2  3   4   5    6   7  8  9  10
  // x, y, z, vx, vy, vz, rho, P, c, h, m.

  for(int i=0; i<N; i++){

    x[i] = stof(content[i][0]);
    y[i] = stof(content[i][1]);
    z[i] = stof(content[i][2]);

    mass[i] = stof(content[i][10]);
    eps[i] = 0.0001f;
  }

  // Copy from Host to Device.
  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_z, z, N*sizeof(float), cudaMemcpyHostToDevice);

  cudaMemcpy(d_accx, accx, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_accy, accy, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_accz, accz, N*sizeof(float), cudaMemcpyHostToDevice);

  cudaMemcpy(d_eps, eps, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_mass, mass, N*sizeof(float), cudaMemcpyHostToDevice);


  // Launching the kernel on GPU
  int blockSize = 256; // number of threads in a block
  int gridSize = (N + blockSize - 1) / blockSize; // Number of blocks in a grid

  acc_g<<<gridSize, blockSize>>>(d_x, d_y, d_z, d_eps, d_accx, d_accy, d_accz,
                                 d_mass, G, N);

  // Wait for the GPU to finish before accessing the Host
  cudaDeviceSynchronize();

  // Copy from Device to Host.
  cudaMemcpy(accx, d_accx, N*sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(accy, d_accy, N*sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(accz, d_accz, N*sizeof(float), cudaMemcpyDeviceToHost);

  // visual inspection
  for(int i = N-10; i < N; i++){
    cout << accx[i] << ' ' << accy[i] << ' ' << accz[i] << endl;
  }

  // Free memory
  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(d_z);

  cudaFree(d_accx);
  cudaFree(d_accy);
  cudaFree(d_accz);

  cudaFree(d_eps);
  cudaFree(d_mass);

  delete[] x;
  delete[] y;
  delete[] z;

  delete[] accx;
  delete[] accy;
  delete[] accz;

  delete[] eps;
  delete[] mass;

}

Overwriting test.cu


In [23]:
%%shell
nvcc test.cu -o test



In [24]:
%%shell
./test

0.407932 -0.230976 1.40708
0.560065 0.433586 0.464642
0.255245 0.402763 -0.770751
-0.56541 0.0508879 0.818156
-0.319113 0.37269 0.960332
-0.439767 0.293 -0.760744
0.165745 0.907049 -0.326253
-0.97706 -0.616284 0.133909
-0.106655 -0.217472 -1.06187
-0.345128 1.01841 -0.0682514




In [25]:
%%shell
nvprof ./test

==573== NVPROF is profiling process 573, command: ./test
0.407932 -0.230976 1.40708
0.560065 0.433586 0.464642
0.255245 0.402763 -0.770751
-0.56541 0.0508879 0.818156
-0.319113 0.37269 0.960332
-0.439767 0.293 -0.760744
0.165745 0.907049 -0.326253
-0.97706 -0.616284 0.133909
-0.106655 -0.217472 -1.06187
-0.345128 1.01841 -0.0682514
==573== Profiling application: ./test
==573== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.89%  461.63ms         1  461.63ms  461.63ms  461.63ms  acc_g(float*, float*, float*, float*, float*, float*, float*, float*, float, int)
                    0.08%  383.61us         8  47.951us  47.519us  48.383us  [CUDA memcpy HtoD]
                    0.03%  124.00us         3  41.332us  41.215us  41.439us  [CUDA memcpy DtoH]
      API calls:   66.63%  461.67ms         1  461.67ms  461.67ms  461.67ms  cudaDeviceSynchronize
                   32.84%  227.56ms         8  28.444ms  2.6310us  227

