In [8]:
%%writefile CMakeLists.txt
# Build script for the CudaImage executable (a CUDA program that uses OpenCV).
#
# 3.18 is the first CMake release that provides the CUDA_ARCHITECTURES target
# property used below.
cmake_minimum_required(VERSION 3.18)
project(CudaOpenCVProject LANGUAGES CXX CUDA)

# Find OpenCV (any 4.x release). The package config sets OpenCV_INCLUDE_DIRS
# and OpenCV_LIBS, so no include path needs to be hardcoded here.
find_package(OpenCV 4.0.0 REQUIRED)

# Add CUDA executable with explicitly specifying source file
add_executable(CudaImage CudaImage.cu)

# GPU architecture to generate code for (replace 75 with your GPU's compute
# capability). This property is the only place the architecture is set;
# appending -arch=... to CMAKE_CUDA_FLAGS as well would duplicate it.
set_target_properties(CudaImage PROPERTIES CUDA_ARCHITECTURES 75)

# Scope the OpenCV headers and libraries to this target only.
target_include_directories(CudaImage PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(CudaImage PRIVATE ${OpenCV_LIBS})

Overwriting CMakeLists.txt


In [9]:
%%writefile CudaImage.cu
#include <iostream>
#include <chrono>
#include </usr/include/opencv4/opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <cmath>
#include <boost/filesystem.hpp>


// Helper (not a kernel) that evaluates the 2D Gaussian
//     exp(-(x^2 + y^2) / (2 * sigma^2)) / (pi * 2 * sigma^2)
// at offset (x, y) from the window center.
//
// x, y   offset from the center, in pixels
// sigma  standard deviation; must be non-zero, otherwise the divisions
//        below produce inf/NaN
//
// All arithmetic stays in single precision: a double-valued pi (M_PI) would
// promote the normalization multiply and the final divide to double.
// Marked __host__ __device__ so the same weights can also be computed on the CPU.
__host__ __device__ float gaussianWeight(float x, float y, float sigma) {
    const float kPi = 3.14159265358979323846f;
    const float twoSigmaSq = 2.0f * sigma * sigma;
    const float t = (x * x + y * y) / twoSigmaSq;
    return expf(-t) / (kPi * twoSigmaSq);
}

// CUDA kernel: 3x3 Gaussian blur of an image stored one byte per pixel and
// addressed as row * width + column.
//
// Each thread computes one output pixel from a 2D launch; threads whose
// coordinates fall outside the image do nothing. Neighbours outside the image
// are skipped and the result is divided by the sum of the weights actually
// used, so border pixels are averaged over their valid neighbours only.
//
// input   source pixels (width * height bytes)
// output  destination pixels (width * height bytes)
// width   image width in pixels
// height  image height in pixels
// sigma   standard deviation forwarded to gaussianWeight
__global__ void gaussianBlurCUDA(const unsigned char* input, unsigned char* output,
                                 int width, int height, float sigma) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads past the right/bottom edge have no pixel to produce.
    if (col >= width || row >= height) {
        return;
    }

    // Clip the 3x3 window to the image up front instead of testing every tap.
    const int rowFirst = (row > 0) ? row - 1 : row;
    const int rowLast  = (row < height - 1) ? row + 1 : row;
    const int colFirst = (col > 0) ? col - 1 : col;
    const int colLast  = (col < width - 1) ? col + 1 : col;

    float weighted = 0.0f;   // sum of pixel * weight
    float weightSum = 0.0f;  // sum of weights, used to normalize

    for (int ny = rowFirst; ny <= rowLast; ++ny) {
        for (int nx = colFirst; nx <= colLast; ++nx) {
            const float w = gaussianWeight(nx - col, ny - row, sigma);
            weighted += input[ny * width + nx] * w;
            weightSum += w;
        }
    }

    // Normalize, then add 0.5 so the truncating cast rounds to nearest.
    output[row * width + col] = (unsigned char)((weighted / weightSum) + 0.5f);
}

// Loads a grayscale image, blurs it on the GPU with gaussianBlurCUDA, reports
// the kernel time and writes the result next to the input under /content.
//
// Returns 0 on success and -1 on any failure (image load, CUDA error, or
// image write). Device buffers are released on every exit path.
int main() {
    // Image Path
    const std::string imagePath = "/content/SampleImage.jpeg"; // Replace with your uploaded image path
    cv::Mat image = cv::imread(imagePath, cv::IMREAD_GRAYSCALE);
    if (image.empty()) {
        std::cerr << "OpenCV version: " << CV_VERSION << std::endl;
        std::cerr << "Image load failed!" << std::endl;
        return -1;
    }

    // The kernel addresses pixels as y * width + x, so the host data must not
    // contain row padding.
    if (!image.isContinuous()) {
        image = image.clone();
    }

    cv::Mat blurredImageGPU(image.size(), image.type());
    const size_t imageBytes = image.total() * image.elemSize();

    // Device buffers start as nullptr so the cleanup below is safe at any
    // point (cudaFree(nullptr) is a no-op).
    unsigned char *d_input = nullptr, *d_output = nullptr;

    // Reports a CUDA failure, releases whatever was allocated, and yields the
    // process exit code.
    auto fail = [&](const char* what, cudaError_t code) {
        std::cerr << what << cudaGetErrorString(code) << std::endl;
        cudaFree(d_input);
        cudaFree(d_output);
        return -1;
    };

    // Allocate device memory
    cudaError_t err = cudaMalloc(&d_input, imageBytes);
    if (err != cudaSuccess) {
        return fail("Failed to allocate device memory - ", err);
    }
    err = cudaMalloc(&d_output, imageBytes);
    if (err != cudaSuccess) {
        return fail("Failed to allocate device memory - ", err);
    }

    // Copy input image to device
    err = cudaMemcpy(d_input, image.data, imageBytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        return fail("Failed to copy data from host to device - ", err);
    }

    // Kernel launch configuration: one thread per pixel, grid rounded up so
    // the whole image is covered.
    dim3 blockSize(16, 16);  // 2D block
    dim3 gridSize((image.cols + blockSize.x - 1) / blockSize.x,
                  (image.rows + blockSize.y - 1) / blockSize.y);  // 2D grid

    // CUDA Gaussian Blur. The launch is asynchronous, so the clock is stopped
    // only after the device has finished.
    auto startGPU = std::chrono::high_resolution_clock::now();
    gaussianBlurCUDA<<<gridSize, blockSize>>>(d_input, d_output, image.cols, image.rows, 3.0f);
    const cudaError_t launchErr = cudaGetLastError();  // invalid launch configuration etc.
    const cudaError_t syncErr = cudaDeviceSynchronize();  // errors raised while the kernel ran
    auto endGPU = std::chrono::high_resolution_clock::now();

    if (launchErr != cudaSuccess) {
        return fail("Kernel launch failed - ", launchErr);
    }
    if (syncErr != cudaSuccess) {
        return fail("Kernel execution failed - ", syncErr);
    }

    // Calculate execution times
    auto gpuDuration = std::chrono::duration<double, std::milli>(endGPU - startGPU).count();
    std::cout << "GPU Time: " << gpuDuration << " ms" << std::endl;

    // Copy output image back to host
    err = cudaMemcpy(blurredImageGPU.data, d_output, imageBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        return fail("Failed to copy data from device to host - ", err);
    }

    // Release memory; the device is not needed past this point.
    cudaFree(d_input);
    cudaFree(d_output);

    // Get the extension of the input image. A path without a dot in its file
    // name has no extension; fall back to PNG instead of letting substr throw.
    const std::string::size_type dotPos = imagePath.find_last_of('.');
    const std::string::size_type slashPos = imagePath.find_last_of('/');
    const bool hasExtension =
        dotPos != std::string::npos &&
        (slashPos == std::string::npos || dotPos > slashPos);
    const std::string extension = hasExtension ? imagePath.substr(dotPos) : std::string(".png");

    // Save ONLY blurred images
    const std::string outputPath = "/content/gpu_blurred_image" + extension;
    if (!cv::imwrite(outputPath, blurredImageGPU)) {
        std::cerr << "Failed to write output image: " << outputPath << std::endl;
        return -1;
    }

    // Display confirmation message
    std::cout << "Blurred image saved as: \n"
              << " - gpu_blurred_image" + extension + "\n";

    return 0;
}

Overwriting CudaImage.cu


In [10]:
# Configure in the current directory, then run the generated build.
# (`make .` would ask make for a target literally named "." and build nothing.)
!cmake .
!cmake --build .

-- Configuring done (0.0s)
-- Generating done (0.0s)
-- Build files have been written to: /content


In [11]:
# Compile CudaImage.cu directly with nvcc (no CMake involved), taking the
# OpenCV compile and link flags from pkg-config; the binary is ./CudaImage.
!nvcc `pkg-config --cflags --libs opencv4` CudaImage.cu -o CudaImage -I/usr/include/opencv4

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^



In [12]:
# Run the binary produced by the nvcc command; it prints the GPU time and the
# name of the saved blurred image.
!./CudaImage

GPU Time: 0.411477 ms
Blurred image saved as: 
 - gpu_blurred_image.jpeg
