In [3]:
%%writefile heat_diffusion.cu

Writing heat_diffusion.cu


In [13]:
%%writefile heat_diffusion.cu
#include <cuda_runtime.h>
#include <iostream>

#define N 100
#define T 1000
#define DX 0.1
#define DT 0.01
#define ALPHA 0.1
#define BLOCK_SIZE 16
#define TILE_SIZE (BLOCK_SIZE-2)

#define CHECK_CUDA(cmd) \
do { \
    cudaError_t error = cmd; \
    if (error != cudaSuccess) { \
        std::cerr << "CUDA error: " << cudaGetErrorString(error) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
        exit(1); \
    } \
} while(0)

__global__ void heat_diffusion_kernel(const double* u, double* u_new, int n) {
    __shared__ double tile[BLOCK_SIZE][BLOCK_SIZE];

    int gx = blockIdx.x * TILE_SIZE + threadIdx.x - 1;
    int gy = blockIdx.y * TILE_SIZE + threadIdx.y - 1;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    if (gx >= 0 && gx < n && gy >= 0 && gy < n)
        tile[ty][tx] = u[gx*n + gy];
    else
        tile[ty][tx] = 0.0;

    __syncthreads();

    if (tx>0 && tx<BLOCK_SIZE-1 && ty>0 && ty<BLOCK_SIZE-1) {
        if (gx>0 && gx<n-1 && gy>0 && gy<n-1) {
            double d2x = (tile[ty][tx+1] - 2*tile[ty][tx] + tile[ty][tx-1])/(DX*DX);
            double d2y = (tile[ty+1][tx] - 2*tile[ty][tx] + tile[ty-1][tx])/(DX*DX);
            u_new[gx*n + gy] = tile[ty][tx] + ALPHA*DT*(d2x+d2y);
        }
    }
}

int main() {
    double *u, *u_new;
    double *d_u, *d_u_new;

    CHECK_CUDA(cudaMallocHost(&u, N*N*sizeof(double)));
    CHECK_CUDA(cudaMallocHost(&u_new, N*N*sizeof(double)));

    #pragma omp parallel for collapse(2)
    for(int i=0;i<N;i++) {
        for(int j=0;j<N;j++) {
            u[i*N+j] = (i==0 || j==0 || i==N-1 || j==N-1) ? 100.0 : 0.0;
            u_new[i*N+j] = u[i*N+j];
        }
    }

    CHECK_CUDA(cudaMalloc(&d_u, N*N*sizeof(double)));
    CHECK_CUDA(cudaMalloc(&d_u_new, N*N*sizeof(double)));

    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));
    CHECK_CUDA(cudaMemcpyAsync(d_u, u, N*N*sizeof(double), cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMemcpyAsync(d_u_new, u_new, N*N*sizeof(double), cudaMemcpyHostToDevice, stream));

    dim3 block(BLOCK_SIZE,BLOCK_SIZE);
    dim3 grid((N+TILE_SIZE-1)/TILE_SIZE,(N+TILE_SIZE-1)/TILE_SIZE);

    for(int t=0;t<T;t++) {
        heat_diffusion_kernel<<<grid,block,0,stream>>>(d_u,d_u_new,N);
        double *tmp = d_u;
        d_u = d_u_new;
        d_u_new = tmp;
    }

    CHECK_CUDA(cudaMemcpyAsync(u, d_u, N*N*sizeof(double), cudaMemcpyDeviceToHost, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));

    for(int i=0;i<N;i++){
        for(int j=0;j<N;j++)
            std::cout << u[i*N+j] << " ";
        std::cout << std::endl;
    }

    CHECK_CUDA(cudaStreamDestroy(stream));
    CHECK_CUDA(cudaFree(d_u));
    CHECK_CUDA(cudaFree(d_u_new));
    CHECK_CUDA(cudaFreeHost(u));
    CHECK_CUDA(cudaFreeHost(u_new));

    return 0;
}


Overwriting heat_diffusion.cu


In [15]:
!nvcc heat_diffusion.cu -o heat_diffusion


In [16]:
!./heat_diffusion


100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 
100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 
100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 
100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0