<a href="https://colab.research.google.com/github/hfathie/qso/blob/master/getDensity_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
%%writefile test.cu
#include <iostream>
#include <fstream>
#include <cmath>
#include <string>
#include <vector>
#include <sstream>
using namespace std;

const int N = 131504;
const float my_pi = 3.141592f;

__global__ void getDensity(float *x, float *y, float *z, float *mass,
                           float *rho, float *h){

  int i = threadIdx.x + blockIdx.x +blockDim.x;

  if(i < N){

    float dx, dy, dz, rr, hij, sig, q, hij3;
    float WIij;
    float ss = 0.0f;

    for(int j = 0; j < N; j++){
      dx = x[i] - x[j];
      dy = y[i] - y[j];
      dz = z[i] - z[j];

      rr = sqrt(dx*dx + dy*dy + dz*dz);
      hij = 0.5f * (h[i] + h[j]);

      if(rr <= 2.0f * hij){

        sig = 1.0 / my_pi;
        q = rr/hij;
        hij3 = hij * hij * hij;
        WIij = 0.0f;

        if(q <= 1.0){
          WIij = sig/hij3 * (1.0f - (3.0f/2.0f)*q*q + (3.0f/4.0f)*q*q*q);
        }

        if((q > 1.0f) && (q <= 2.0)){
          WIij = sig/hij3 * (1.0f/4.0f) * (2.0f - q)*(2.0f - q)*(2.0f - q);
        }

        ss += mass[j] * WIij; 
      }
    }
    rho[i] = ss;
  }

}


int main(){

  // Reading Hydra file.
  string fname = "Hydra_130k.csv";

  vector<vector<string>> content;
  vector<string> row;
  string line, word;
  
  fstream file (fname, ios::in);
  if(file.is_open())
  {
  while(getline(file, line))
  {
  row.clear();
  
  stringstream str(line);
  
  while(getline(str, word, ','))
  row.push_back(word);
  content.push_back(row);
  }
  }
  else
  cout<<"Could not open the file\n";

  float *x,*y,*z, *h, *mass, *rho;
  float *d_x,*d_y,*d_z, *d_h, *d_mass, *d_rho;

  x = new float[N];
  y = new float[N];
  z = new float[N];

  rho = new float[N];
  h = new float[N];
  mass = new float[N];

  // 0  1  2  3   4   5    6   7  8  9  10
  // x, y, z, vx, vy, vz, rho, P, c, h, m.

  for(int i=0; i<N; i++){

    x[i] = stof(content[i][0]);
    y[i] = stof(content[i][1]);
    z[i] = stof(content[i][2]);

    rho[i] = stof(content[i][6]);
    h[i] = stof(content[i][9]);
    mass[i] = stof(content[i][10]);
  }

  cudaMalloc(&d_x, N*sizeof(float));
  cudaMalloc(&d_y, N*sizeof(float));
  cudaMalloc(&d_z, N*sizeof(float));

  cudaMalloc(&d_rho, N*sizeof(float));
  cudaMalloc(&d_h, N*sizeof(float));
  cudaMalloc(&d_mass, N*sizeof(float));

  // Copy from Host to Device.
  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_z, z, N*sizeof(float), cudaMemcpyHostToDevice);

  cudaMemcpy(d_rho, rho, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_h, h, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_mass, mass, N*sizeof(float), cudaMemcpyHostToDevice);

  // Launching the kernel on GPU
  int blockSize = 256; // number of threads in a block
  int gridSize = (N + blockSize - 1) / blockSize; // Number of blocks in a grid

  getDensity<<<gridSize, blockSize>>>(d_x, d_y, d_z, d_mass, d_h, d_rho);

  // Wait for the GPU to finish before accessing the Host
  cudaDeviceSynchronize();


  // visual inspection
  for(int i = N-10; i < N; i++){
    cout << rho[i] << endl;
  }

  // Free memory.
  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(d_z);

  cudaFree(d_rho);
  cudaFree(d_h);
  cudaFree(d_mass);

  delete[] x;
  delete[] y;
  delete[] z;

  delete[] rho;
  delete[] h;
  delete[] mass;

}

Overwriting test.cu


In [18]:
%%shell
nvcc test.cu -o test



In [19]:
%%shell
./test

0.415199
0.316639
0.221243
0.323365
0.205203
0.143352
0.213201
0.596869
0.17019
0.173833




In [20]:
%%shell
nvprof ./test

==420== NVPROF is profiling process 420, command: ./test
0.415199
0.316639
0.221243
0.323365
0.205203
0.143352
0.213201
0.596869
0.17019
0.173833
==420== Profiling application: ./test
==420== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.95%  507.76ms         1  507.76ms  507.76ms  507.76ms  getDensity(float*, float*, float*, float*, float*, float*)
                    0.05%  272.79us         6  45.465us  45.119us  45.951us  [CUDA memcpy HtoD]
      API calls:   60.97%  507.81ms         1  507.81ms  507.81ms  507.81ms  cudaDeviceSynchronize
                   38.81%  323.23ms         6  53.872ms  2.5930us  323.12ms  cudaMalloc
                    0.12%  1.0182ms         6  169.70us  138.81us  188.33us  cudaMemcpy
                    0.04%  359.69us         1  359.69us  359.69us  359.69us  cuDeviceTotalMem
                    0.03%  258.18us         6  43.030us  2.8320us  146.32us  cudaFree
                    0

