In [1]:
%%writefile CMakeLists.txt
# First-class CUDA language support needs CMake 3.8, and the CUDA_ARCHITECTURES
# target property used below was added in CMake 3.18.
cmake_minimum_required(VERSION 3.18)
project(CudaOpenCVProject LANGUAGES CXX CUDA)

# Locate OpenCV; provides OpenCV_INCLUDE_DIRS and OpenCV_LIBS.
find_package(OpenCV REQUIRED)

# Single executable built from the CUDA source file.
add_executable(CudaImage CudaImage.cu)

# Target GPU compute capability (replace 75 with your GPU's compute capability).
# This property is the only place the architecture is selected: appending
# "-arch=..." to CMAKE_CUDA_FLAGS as an unquoted list would insert a ';' into
# the flag string and duplicate what this property already requests.
set_target_properties(CudaImage PROPERTIES CUDA_ARCHITECTURES 75)

# Scope the OpenCV headers and libraries to this target only.
target_include_directories(CudaImage PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(CudaImage PRIVATE ${OpenCV_LIBS})

Writing CMakeLists.txt


In [2]:
%%writefile CudaImage.cu
#include <iostream>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <cmath>

// Device helper (not a kernel): evaluates the 2D Gaussian
//   G(x, y) = exp(-(x^2 + y^2) / (2 * sigma^2)) / (2 * pi * sigma^2)
// at offset (x, y) from the centre tap.
// sigma must be non-zero; otherwise both divisions below divide by zero.
__device__ float gaussianWeight(float x, float y, float sigma) {
    // Single-precision pi. The M_PI macro is a double constant (and is not
    // guaranteed by the C++ standard), so it would promote the final division
    // to double precision.
    const float kPi = 3.14159265358979323846f;
    const float twoSigmaSq = 2.0f * sigma * sigma;
    const float exponent = (x * x + y * y) / twoSigmaSq;
    return expf(-exponent) / (kPi * twoSigmaSq);
}

// CUDA kernel: 3x3 Gaussian blur of a single-channel 8-bit image.
//   input  - source pixels, indexed as input[row * width + col]
//   output - destination pixels, same indexing as input
//   width  - number of columns
//   height - number of rows
//   sigma  - standard deviation forwarded to gaussianWeight()
// Each thread computes one output pixel. Neighbours that fall outside the
// image are skipped and the result is renormalised by the weights actually
// used, so border pixels are averaged over fewer taps.
__global__ void gaussianBlurCUDA(const unsigned char* input, unsigned char* output,
                                 int width, int height, float sigma) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads mapped past the image edge have nothing to do.
    if (col >= width || row >= height) {
        return;
    }

    float weightedSum = 0.0f;
    float weightTotal = 0.0f;

    // Walk the 3x3 neighbourhood row by row (widen the ranges for a larger kernel).
    for (int offY = -1; offY <= 1; ++offY) {
        const int srcRow = row + offY;
        if (srcRow < 0 || srcRow >= height) {
            continue;
        }
        for (int offX = -1; offX <= 1; ++offX) {
            const int srcCol = col + offX;
            if (srcCol < 0 || srcCol >= width) {
                continue;
            }
            const float w = gaussianWeight(offX, offY, sigma);
            weightedSum += input[srcRow * width + srcCol] * w;
            weightTotal += w;
        }
    }

    // The centre tap is always in bounds, so weightTotal is never zero here.
    // Adding 0.5 before the truncating cast rounds to the nearest integer.
    output[row * width + col] = (unsigned char)((weightedSum / weightTotal) + 0.5f);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Loads a grayscale image, blurs it once with OpenCV on the CPU and once with
// gaussianBlurCUDA on the GPU, prints both wall-clock timings, and writes each
// result to its own JPEG. Returns 0 on success and -1 on any failure.
int main() {
    // Image Path
    const std::string imagePath = "/content/SampleImage.jpg"; // Replace with your uploaded image path
    cv::Mat image = cv::imread(imagePath, cv::IMREAD_GRAYSCALE);
    if (image.empty()) {
        std::cerr << "OpenCV version: " << CV_VERSION << std::endl;
        std::cerr << "Image load failed!" << std::endl;
        return -1;
    }

    // Same sigma for both implementations so the outputs are comparable.
    const float sigma = 3.0f;

    cv::Mat blurredImageCPU(image.size(), image.type());
    cv::Mat blurredImageGPU(image.size(), image.type());

    // CPU Gaussian Blur (for timing comparison); result goes to the CPU buffer.
    auto startCPU = std::chrono::high_resolution_clock::now();
    cv::GaussianBlur(image, blurredImageCPU, cv::Size(3, 3), sigma);
    auto endCPU = std::chrono::high_resolution_clock::now();

    // One byte per pixel: the image was loaded as single-channel 8-bit.
    const size_t numBytes = image.total();

    // Device buffers start as nullptr so the cleanup lambda is safe to call
    // on every path (cudaFree on a null pointer performs no operation).
    unsigned char* d_input = nullptr;
    unsigned char* d_output = nullptr;
    auto releaseDevice = [&]() {
        cudaFree(d_input);
        cudaFree(d_output);
    };

    // Allocate device memory and copy the input image to the device.
    if (!cudaOk(cudaMalloc(&d_input, numBytes), "cudaMalloc(d_input)") ||
        !cudaOk(cudaMalloc(&d_output, numBytes), "cudaMalloc(d_output)") ||
        !cudaOk(cudaMemcpy(d_input, image.data, numBytes, cudaMemcpyHostToDevice),
                "host-to-device copy")) {
        releaseDevice();
        return -1;
    }

    // Kernel launch configuration: 16x16 threads per block, grid rounded up
    // so that every pixel is covered.
    dim3 blockSize(16, 16);  // 2D block
    dim3 gridSize((image.cols + blockSize.x - 1) / blockSize.x,
                  (image.rows + blockSize.y - 1) / blockSize.y);  // 2D grid

    // CUDA Gaussian Blur. Statuses are captured inside the timed region but
    // only reported afterwards, so error printing never skews the measurement.
    auto startGPU = std::chrono::high_resolution_clock::now();
    gaussianBlurCUDA<<<gridSize, blockSize>>>(d_input, d_output, image.cols, image.rows, sigma);
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    auto endGPU = std::chrono::high_resolution_clock::now();

    if (!cudaOk(launchStatus, "kernel launch") ||
        !cudaOk(syncStatus, "kernel execution")) {
        releaseDevice();
        return -1;
    }

    // Copy the GPU result back into the GPU output image.
    const bool copiedBack = cudaOk(
        cudaMemcpy(blurredImageGPU.data, d_output, numBytes, cudaMemcpyDeviceToHost),
        "device-to-host copy");

    // Device buffers are no longer needed, whatever the copy outcome.
    releaseDevice();
    if (!copiedBack) {
        return -1;
    }

    // Calculate execution times
    auto cpuDuration = std::chrono::duration<double, std::milli>(endCPU - startCPU).count();
    auto gpuDuration = std::chrono::duration<double, std::milli>(endGPU - startGPU).count();

    std::cout << "CPU Time: " << cpuDuration << " ms" << std::endl;
    std::cout << "GPU Time: " << gpuDuration << " ms" << std::endl;

    // Save ONLY blurred images, each under the name of the path that produced it.
    const bool savedCPU = cv::imwrite("/content/cpu_blurred_image.jpg", blurredImageCPU);
    const bool savedGPU = cv::imwrite("/content/gpu_blurred_image.jpg", blurredImageGPU);
    if (!savedCPU || !savedGPU) {
        std::cerr << "Failed to write blurred image(s)!" << std::endl;
        return -1;
    }

    // Display confirmation message
    std::cout << "Blurred images saved as: \n"
              << " - cpu_blurred_image.jpg\n"
              << " - gpu_blurred_image.jpg\n";

    return 0;
}

Writing CudaImage.cu


In [5]:
!mkdir -p build
%cd build
!cmake ..
!make

/content/build
-- The CXX compiler identification is GNU 11.4.0
-- The CUDA compiler identification is NVIDIA 12.2.140
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Found OpenCV: /usr (found version "4.5.4") 
-- Configuring done (4.3s)
-- Generating done (0.0s)
-- Build files have been written to: /content/build
[ 50%] [32mBuilding CUDA object CMakeFiles/CudaImage.dir/CudaImage.cu.o[0m
[100%] [32m[1mLinking CUDA executable CudaImage[0m
[100%] Built target CudaImage


In [6]:
!./CudaImage

CPU Time: 22.5198 ms
GPU Time: 0.48942 ms
Blurred images saved as: 
 - cpu_blurred_image.jpg
 - gpu_blurred_image.jpg
