#Conways game of life in CUDA

#Notes
1)To make it inot an file we use:
"%%writefile buu.cu"  


2)We compile with the line  

"!nvcc -arch=sm_70 -o buu buu.cu"

3)and we run it with the line  

"!./buu"

The below lines are used to used to install the library that makes it compariblw to run the code on the jupyter notebook.

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-1r5cfy4c
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-1r5cfy4c
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10742 sha256=fdb30b442a03a2b978d94f242f4f3839c3b517111d4c5bcc31496f40b5aacbba
  Stored in directory: /tmp/pip-ephem-wheel-cache-51r9cqej/wheels/ef/1d/c6/f7e47f1aa1bc9d05c4120d94f90a79cf28603ef343b0dd43ff
Successfully bu

In [None]:
%load_ext nvcc4jupyter


Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpl08l6myl".


Run this to check if the library is installed properly or not.


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


The below code is simple demostration of how CUDA runs on the notebook.

In [None]:
%%cuda
#include <iostream>
    int
    main()
{
    std::cout << "Welcome To world\n";
    return 0;
}


Welcome To world



# GPU

The below code is for GPU

In [19]:
%%writefile buu.cpp
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <random>

#define WIDTH 512    // size of the grid and no of iterations
#define HEIGHT 512
#define ITERATIONS 1000

__device__ int count_neighbors(int* grid, int x, int y, int width, int height) {
    int count = 0;
    for (int dx = -1; dx <= 1; ++dx) {
        for (int dy = -1; dy <= 1; ++dy) {
            if (dx == 0 && dy == 0) continue;
            int nx = (x + dx + width) % width;
            int ny = (y + dy + height) % height;
            count += grid[ny * width + nx];
        }
    }
    return count;
}

__global__ void game_of_life_step(int* current, int* next, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = y * width + x;
        int neighbors = count_neighbors(current, x, y, width, height);
        int state = current[idx];
        if (state == 1 && (neighbors == 2 || neighbors == 3)) {
            next[idx] = 1;
        } else if (state == 0 && neighbors == 3) {
            next[idx] = 1;
        } else {
            next[idx] = 0;
        }
    }
}



void set_random_dense_pattern(int* grid, int width, int height, float fill_ratio = 0.6f) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::bernoulli_distribution d(fill_ratio);

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
            grid[y * width + x] = d(gen) ? 1 : 0;
        }
    }
}

int main() {
    int size = WIDTH * HEIGHT * sizeof(int);

    int* h_grid = new int[WIDTH * HEIGHT]();
    int* h_result = new int[WIDTH * HEIGHT]();

    // Fill about 60% of the screen with live cells
    set_random_dense_pattern(h_grid, WIDTH, HEIGHT, 0.6f);


    int* d_current;
    int* d_next;
    cudaMalloc(&d_current, size);
    cudaMalloc(&d_next, size);
    cudaMemcpy(d_current, h_grid, size, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((WIDTH + 15) / 16, (HEIGHT + 15) / 16);

    // CUDA timing setup
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // gPU timing start
    auto gpu_start = std::chrono::high_resolution_clock::now();

    cudaEventRecord(start); // GPU timing start

    for (int i = 0; i < ITERATIONS; ++i) {
        game_of_life_step<<<numBlocks, threadsPerBlock>>>(d_current, d_next, WIDTH, HEIGHT);
        cudaDeviceSynchronize(); // Wait for kernel to finish

        std::swap(d_current, d_next);
        cudaMemcpy(h_result, d_current, size, cudaMemcpyDeviceToHost);
        //print_grid(h_result, WIDTH, HEIGHT);
    }

    cudaEventRecord(stop);            // GPU timing stop
    cudaEventSynchronize(stop);       // Ensure stop event completed

    // gPU timing end
    auto gpu_end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> gpu_duration = gpu_end - gpu_start;

    // Calculate GPU time
    // Output timing
    std::cout << "GPU kernel time: " << gpu_duration.count() << " ms\n";

    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_current);
    cudaFree(d_next);
    delete[] h_grid;
    delete[] h_result;

    return 0;
}


Writing buu.cpp


In [20]:
!mv buu.cpp buu.cu
!nvcc -arch=sm_70 -o buu buu.cu

In [21]:
!./buu

GPU kernel time: 1838.96 ms


#CPU TIME

The below code is for CPU

In [22]:
%%writefile buu_cpu.cpp
#include <iostream>
#include <chrono>
#include <random>

#define WIDTH 512
#define HEIGHT 512
#define ITERATIONS 1000



void print_grid(int* grid) {
    for (int y = 0; y < HEIGHT; ++y) {
        for (int x = 0; x < WIDTH; ++x) {
            std::cout << (grid[y * WIDTH + x] ? '0' : '.');
        }
        std::cout << '\n';
    }
    std::cout << std::string(WIDTH, '=') << '\n';
}



void set_random_dense_pattern(int* grid, float fill_ratio = 0.6f) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::bernoulli_distribution d(fill_ratio);

    for (int y = 0; y < HEIGHT; ++y) {
        for (int x = 0; x < WIDTH; ++x) {
            grid[y * WIDTH + x] = d(gen) ? 1 : 0;
        }
    }
}

int main() {
    int size = WIDTH * HEIGHT * sizeof(int);

    int* current = new int[WIDTH * HEIGHT]();
    int* next = new int[WIDTH * HEIGHT]();

    // Fill about 60% of the screen with live cells
    set_random_dense_pattern(current, 0.6f);



    auto cpu_start = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < ITERATIONS; ++i) {
      for(int y = 0; y<HEIGHT; y++) {
        for(int x = 0; x<WIDTH; x++) {

          int idx = y * WIDTH + x;

          int neighbors = 0;
          for (int dx = -1; dx <= 1; ++dx) {
              for (int dy = -1; dy <= 1; ++dy) {
                  if (dx == 0 && dy == 0) continue;
                  int nx = (x + dx + WIDTH) % WIDTH;
                  int ny = (y + dy + HEIGHT) % HEIGHT;
                  neighbors += current[ny * WIDTH + nx];
              }
          }


        int state = current[idx];
        if (state == 1 && (neighbors == 2 || neighbors == 3)) {
            next[idx] = 1;
        } else if (state == 0 && neighbors == 3) {
            next[idx] = 1;
        } else {
            next[idx] = 0;
        }
        }
      }

      int * temp = current;
      current = next;
      next = temp;

    }


    // CPU timing end
    auto cpu_end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> cpu_duration = cpu_end - cpu_start;



    // Output timing
    std::cout << "CPU total time: " << cpu_duration.count() << " ms" << std::endl;


    // Cleanup
    // cudaEventDestroy(start);
    // cudaEventDestroy(stop);
    // cudaFree(current);
    // cudaFree(next);
    delete[] current;
    delete[] next;

    return 0;
}


Overwriting buu_cpu.cpp


In [23]:
!g++ -o buu buu_cpu.cpp

In [24]:
!./buu

CPU total time: 53306.3 ms


# Animation

For animation,we have reduced the number of iterations.

In [28]:
%%writefile ani.cu
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <random>

#define WIDTH 512
#define HEIGHT 512
#define ITERATIONS 10

__device__ int count_neighbors(int* grid, int x, int y, int width, int height) {
    int count = 0;
    for (int dx = -1; dx <= 1; ++dx) {
        for (int dy = -1; dy <= 1; ++dy) {
            if (dx == 0 && dy == 0) continue;
            int nx = (x + dx + width) % width;
            int ny = (y + dy + height) % height;
            count += grid[ny * width + nx];
        }
    }
    return count;
}

__global__ void game_of_life_step(int* current, int* next, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = y * width + x;
        int neighbors = count_neighbors(current, x, y, width, height);
        int state = current[idx];
        if (state == 1 && (neighbors == 2 || neighbors == 3)) {
            next[idx] = 1;
        } else if (state == 0 && neighbors == 3) {
            next[idx] = 1;
        } else {
            next[idx] = 0;
        }
    }
}

void print_grid(int* grid, int width, int height) {
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
            std::cout << (grid[y * width + x] ? '0' : '.');
        }
        std::cout << '\n';
    }
    std::cout << std::string(width, '=') << '\n';
}

void set_random_dense_pattern(int* grid, int width, int height, float fill_ratio = 0.6f) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::bernoulli_distribution d(fill_ratio);

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
            grid[y * width + x] = d(gen) ? 1 : 0;
        }
    }
}

int main() {
    int size = WIDTH * HEIGHT * sizeof(int);

    int* h_grid = new int[WIDTH * HEIGHT]();
    int* h_result = new int[WIDTH * HEIGHT]();

    // Fill about 60% of the screen with live cells
    set_random_dense_pattern(h_grid, WIDTH, HEIGHT, 0.6f);

    int* d_current;
    int* d_next;
    cudaMalloc(&d_current, size);
    cudaMalloc(&d_next, size);
    cudaMemcpy(d_current, h_grid, size, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((WIDTH + 15) / 16, (HEIGHT + 15) / 16);

    // CUDA timing setup
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // CPU timing start
    auto cpu_start = std::chrono::high_resolution_clock::now();

    cudaEventRecord(start); // GPU timing start

    for (int i = 0; i < ITERATIONS; ++i) {
        game_of_life_step<<<numBlocks, threadsPerBlock>>>(d_current, d_next, WIDTH, HEIGHT);
        cudaDeviceSynchronize(); // Wait for kernel to finish

        std::swap(d_current, d_next);
        cudaMemcpy(h_result, d_current, size, cudaMemcpyDeviceToHost);
        print_grid(h_result, WIDTH, HEIGHT);
    }

    cudaEventRecord(stop);            // GPU timing stop
    cudaEventSynchronize(stop);       // Ensure stop event completed

    // CPU timing end
    auto cpu_end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> cpu_duration = cpu_end - cpu_start;

    // Calculate GPU time
    float gpu_milliseconds = 0.0f;
    cudaEventElapsedTime(&gpu_milliseconds, start, stop);

    // Output timing


    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_current);
    cudaFree(d_next);
    delete[] h_grid;
    delete[] h_result;

    return 0;
}



Overwriting ani.cu


In [29]:
!nvcc -arch=sm_70 -o ani ani.cu

In [30]:
!./ani

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
.........0..................................0...........................0.....................0........0........0....0..00...0.0......0.0000......0000......0.0....................00............0.0......0.0...............00.....0.......000................................0.0.0.0.0.0.0.0..........0.....0.......0..........0.......000...........0.....................................0.000.....................0....00..0.0.........0.......00................00.......0.0....0......0...0.....000..............0.0......
...................................00.0................0.........0.0.........00.............0..................0.......000...0.0......00.0.0............0.0.0.0.00.0...0....00.0...........00....0.0.0.......0....0....00.0.00.....0.00....0.......................................0........................000................0.......0000...........0..........0.............0.............0....0........0....0.....0.0......0.0....