<a href="https://colab.research.google.com/github/ishaanagarwal11/Sample/blob/main/CUDA%20Image%20Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-k9dnbz6k
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-k9dnbz6k
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... done
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10740 sha256=4bbc71ae6f58940e713435e47d01f445cd32bbd97f6535216d51b147c07d7d08
  Stored in directory: /tmp/pip-ephem-wheel-cache-2s8al2_n/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built nvcc4jupyter

In [None]:
%load_ext nvcc4jupyter


Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpg69odhzb".


In [None]:
%%writefile sobelEdgeDetectionFilter.cu
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// Sobel edge detector for a single-channel, 8-bit, row-major image.
//
// Launch layout: 2D grid of 2D blocks; the thread with global coordinates
// (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y)
// produces the output pixel at that (x, y). Threads outside the image exit.
//
//   srcImage  input pixels, width * height bytes
//   dstImage  output pixels, width * height bytes
//   width     image width in pixels
//   height    image height in pixels
//
// Each interior pixel receives min(255, |Gx| + |Gy|). The raw sum can reach
// 2040, so it is saturated before being narrowed to one byte. Pixels on the
// one-pixel border have no complete 3x3 neighbourhood and are written as 0,
// so every byte of dstImage is defined after the kernel completes.
__global__ void sobelFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height) {
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Padding threads of the last blocks fall outside the image.
    if (x >= width || y >= height) {
        return;
    }

    // 64-bit offset so y * width cannot wrap for very large images.
    const size_t center = (size_t)y * width + x;

    // Written as additions so nothing wraps when width or height is 1.
    if (x == 0 || y == 0 || x + 1 >= width || y + 1 >= height) {
        dstImage[center] = 0;
        return;
    }

    const float Kx[3][3] = {{-1.0f, 0.0f, 1.0f}, {-2.0f, 0.0f, 2.0f}, {-1.0f, 0.0f, 1.0f}};
    const float Ky[3][3] = {{1.0f, 2.0f, 1.0f}, {0.0f, 0.0f, 0.0f}, {-1.0f, -2.0f, -1.0f}};

    // Accumulate both gradients in one pass over the 3x3 neighbourhood.
    float Gx = 0.0f;
    float Gy = 0.0f;
    for (int r = 0; r < 3; r++) {
        // y >= 1 here, so y - 1 + r stays non-negative.
        const size_t rowBase = (size_t)(y - 1 + r) * width;
        for (int c = 0; c < 3; c++) {
            const float pixel = srcImage[rowBase + (x - 1 + c)];
            Gx += pixel * Kx[r][c];
            Gy += pixel * Ky[r][c];
        }
    }

    const float magnitude = fabsf(Gx) + fabsf(Gy);
    dstImage[center] = (unsigned char)fminf(magnitude, 255.0f);
}

// Terminates the program when a CUDA runtime call reports a failure.
//   r  status code returned by a CUDA runtime API call
// For any status other than cudaSuccess, prints "CUDA Error: <description>"
// to stderr and exits with EXIT_FAILURE; otherwise returns normally.
void checkCudaErrors(cudaError_t r) {
    const bool succeeded = (r == cudaSuccess);
    if (succeeded) {
        return;
    }

    const char *description = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", description);
    exit(EXIT_FAILURE);
}

// Runs the Sobel kernel on "images.jpeg" (loaded as grayscale), writes the
// result to "output_sobel.jpeg" and prints the kernel time measured with
// CUDA events.
// Returns 0 on success and -1 when the image cannot be read, host memory
// cannot be allocated, or the output cannot be written. CUDA failures
// terminate the process through checkCudaErrors.
int main() {
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }

    const int width = image.cols;
    const int height = image.rows;
    // Multiply in size_t so large images cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * (size_t)height * sizeof(unsigned char);

    // Allocate host memory for output image
    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    // Allocate device memory
    unsigned char *d_inputImage = nullptr;
    unsigned char *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Any pixel the kernel leaves untouched reads back as 0 instead of
    // whatever the fresh allocation happened to contain.
    checkCudaErrors(cudaMemset(d_outputImage, 0, imageSize));

    // Define CUDA events for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Launch configuration: integer ceiling division, one thread per pixel.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    checkCudaErrors(cudaEventRecord(start));
    sobelFilter<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height);
    // A launch does not return a status; configuration errors surface here.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the kernel and the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    // Write output image; the Mat wraps the host buffer without owning it.
    Mat outputImage(height, width, CV_8UC1, h_outputImage);
    const bool written = imwrite("output_sobel.jpeg", outputImage);

    // Free memory
    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    // Destroy CUDA events
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (!written) {
        fprintf(stderr, "Failed to write output_sobel.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", milliseconds);

    return 0;
}


Writing sobelEdgeDetectionFilter.cu


In [None]:
!nvcc -o sobelEdgeDetectionFilter sobelEdgeDetectionFilter.cu `pkg-config --cflags --libs opencv4`

In [None]:
!./sobelEdgeDetectionFilter

Total time taken: 0.198688 milliseconds


In [None]:
%%writefile sobelEdgeDetectionFilter.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// Sobel edge detector for a single-channel, 8-bit, row-major image.
//
//   srcImage  input pixels, width * height bytes
//   dstImage  output pixels, width * height bytes (must not alias srcImage)
//   width     image width in pixels
//   height    image height in pixels
//
// Each interior pixel receives min(255, |Gx| + |Gy|). The raw sum can reach
// 2040, so it is saturated before being narrowed to one byte. Pixels on the
// one-pixel border have no complete 3x3 neighbourhood and are written as 0,
// so every byte of dstImage is defined on return. Null buffers or
// non-positive dimensions make the call a no-op.
void sobelFilter(unsigned char *srcImage, unsigned char *dstImage, int width, int height) {
    if (srcImage == nullptr || dstImage == nullptr || width <= 0 || height <= 0) {
        return;
    }

    // Integer coefficients: all intermediate values are small exact integers.
    static const int Kx[3][3] = {{-1, 0, 1}, {-2, 0, 2}, {-1, 0, 1}};
    static const int Ky[3][3] = {{1, 2, 1}, {0, 0, 0}, {-1, -2, -1}};

    const size_t stride = (size_t)width;

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const size_t center = (size_t)y * stride + (size_t)x;

            if (x == 0 || y == 0 || x == width - 1 || y == height - 1) {
                dstImage[center] = 0;
                continue;
            }

            // Accumulate both gradients in one pass over the neighbourhood.
            int Gx = 0;
            int Gy = 0;
            for (int ky = -1; ky <= 1; ky++) {
                const unsigned char *row = srcImage + (size_t)(y + ky) * stride;
                for (int kx = -1; kx <= 1; kx++) {
                    const int pixel = row[x + kx];
                    Gx += pixel * Kx[ky + 1][kx + 1];
                    Gy += pixel * Ky[ky + 1][kx + 1];
                }
            }

            const int Gx_abs = Gx < 0 ? -Gx : Gx;
            const int Gy_abs = Gy < 0 ? -Gy : Gy;
            const int magnitude = Gx_abs + Gy_abs;

            dstImage[center] = (unsigned char)(magnitude > 255 ? 255 : magnitude);
        }
    }
}

// Runs the CPU Sobel filter on "images.jpeg" (loaded as grayscale), writes
// the result to "output_sobel_cpu.jpeg" and prints the wall-clock time spent
// inside sobelFilter.
// Returns 0 on success and -1 when the image cannot be read, memory cannot
// be allocated, or the output cannot be written.
int main() {
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }

    const int width = image.cols;
    const int height = image.rows;
    // Multiply in size_t so large images cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * (size_t)height * sizeof(unsigned char);

    // Zero-initialised output: pixels the filter does not write read as 0
    // instead of uninitialised heap contents.
    unsigned char *outputImage = (unsigned char *)calloc(imageSize, sizeof(unsigned char));
    if (outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate memory\n");
        return -1;
    }

    // The image was already decoded as grayscale; use its pixel buffer directly.
    unsigned char *inputImage = image.data;

    // Timing
    struct timeval start, end;
    gettimeofday(&start, NULL);

    // Apply Sobel filter
    sobelFilter(inputImage, outputImage, width, height);

    // Timing
    gettimeofday(&end, NULL);
    double elapsed = (end.tv_sec - start.tv_sec) * 1000.0; // sec to ms
    elapsed += (end.tv_usec - start.tv_usec) / 1000.0;     // us to ms

    // Write output image; the Mat wraps the buffer without owning it.
    Mat output(height, width, CV_8UC1, outputImage);
    const bool written = imwrite("output_sobel_cpu.jpeg", output);

    // Free memory
    free(outputImage);

    if (!written) {
        fprintf(stderr, "Failed to write output_sobel_cpu.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", elapsed);

    return 0;
}


Writing sobelEdgeDetectionFilter.c


In [None]:
!g++ -o sobelEdgeDetectionFilterC sobelEdgeDetectionFilter.c `pkg-config --cflags --libs opencv4`

In [None]:
!./sobelEdgeDetectionFilterC

Total time taken: 74.099998 milliseconds


In [None]:
%%writefile blur.cu
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// Box (mean) blur for an interleaved 8-bit image with `channel` bytes per pixel.
//
// Launch layout: 2D grid of 2D blocks; the thread with global coordinates
// (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y)
// produces the output pixel at that (x, y). Threads outside the image exit.
//
//   srcImage  input pixels, width * height * channel bytes
//   dstImage  output pixels, width * height * channel bytes
//   width     image width in pixels
//   height    image height in pixels
//   channel   number of interleaved channels per pixel
//
// Each output value is the mean over a window reaching halfKernelSize pixels
// in every direction (21 x 21 for kernelSize = 20). Near the edges the window
// is clipped to the image and the mean is taken over the pixels that remain,
// so every pixel is written and no access leaves the buffers, including for
// images smaller than the window.
__global__ void boxFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Padding threads of the last blocks fall outside the image.
    if (x >= width || y >= height)
    {
        return;
    }

    // Define a larger kernel size for increased blur intensity
    const int kernelSize = 20;
    const int halfKernelSize = kernelSize / 2;

    // Window bounds clipped to the image (signed, so x - half cannot wrap).
    const int xBegin = max((int)x - halfKernelSize, 0);
    const int xEnd = min((int)x + halfKernelSize, (int)width - 1);
    const int yBegin = max((int)y - halfKernelSize, 0);
    const int yEnd = min((int)y + halfKernelSize, (int)height - 1);

    // Number of filter pixels actually inside the image
    const float kS = (float)((xEnd - xBegin + 1) * (yEnd - yBegin + 1));

    // 64-bit offset so the index arithmetic cannot wrap for large images.
    const size_t outBase = ((size_t)y * width + x) * channel;

    for (int c = 0; c < channel; c++)
    {
        // Sum of pixel values
        float sum = 0.0f;
        for (int wy = yBegin; wy <= yEnd; wy++)
        {
            const size_t rowBase = (size_t)wy * width;
            for (int wx = xBegin; wx <= xEnd; wx++)
            {
                sum += srcImage[(rowBase + wx) * channel + c];
            }
        }
        dstImage[outBase + c] = (unsigned char)(sum / kS);
    }
}

// Terminates the program when a CUDA runtime call reports a failure.
//   r  status code returned by a CUDA runtime API call
// For any status other than cudaSuccess, prints "CUDA Error: <description>"
// to stderr and exits with EXIT_FAILURE; otherwise returns normally.
void checkCudaErrors(cudaError_t r)
{
    const bool succeeded = (r == cudaSuccess);
    if (succeeded)
    {
        return;
    }

    const char *description = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", description);
    exit(EXIT_FAILURE);
}

// Runs the box-blur kernel on "images.jpeg" (loaded as a colour image),
// writes the result to "output_blur.jpeg" and prints the kernel time measured
// with CUDA events.
// Returns 0 on success and -1 when the image cannot be read, host memory
// cannot be allocated, or the output cannot be written. CUDA failures
// terminate the process through checkCudaErrors.
int main()
{
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_COLOR);
    if (image.empty())
    {
        printf("Error: Image not found.\n");
        return -1;
    }

    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // Multiply in size_t so large images cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * (size_t)height * (size_t)channel * sizeof(unsigned char);

    // Allocate host memory for output image
    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    // Allocate device memory
    unsigned char *d_inputImage = nullptr;
    unsigned char *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Any pixel the kernel leaves untouched reads back as 0 instead of
    // whatever the fresh allocation happened to contain.
    checkCudaErrors(cudaMemset(d_outputImage, 0, imageSize));

    // Define CUDA events for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Launch configuration: integer ceiling division, one thread per pixel.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    checkCudaErrors(cudaEventRecord(start));
    boxFilter<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height, channel);
    // A launch does not return a status; configuration errors surface here.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the kernel and the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    // Write output image; the Mat wraps the host buffer without owning it.
    Mat outputImage(height, width, CV_8UC3, h_outputImage);
    const bool written = imwrite("output_blur.jpeg", outputImage);

    // Free memory
    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    // Destroy CUDA events
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (!written)
    {
        fprintf(stderr, "Failed to write output_blur.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", milliseconds);

    return 0;
}


Overwriting blur.cu


In [None]:
!nvcc -o blur blur.cu `pkg-config --cflags --libs opencv4` && ./blur

Total time taken: 4.703936 milliseconds


In [None]:
%%writefile blur.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// Box (mean) blur for an interleaved 8-bit image with `channel` bytes per pixel.
//
//   srcImage  input pixels, width * height * channel bytes
//   dstImage  output pixels, width * height * channel bytes (must not alias srcImage)
//   width     image width in pixels
//   height    image height in pixels
//   channel   number of interleaved channels per pixel
//
// Each output value is the mean over a window reaching halfKernelSize pixels
// in every direction (21 x 21 for kernelSize = 20). Near the edges the window
// is clipped to the image and the mean is taken over the pixels that remain,
// so every pixel is written, including for images smaller than the window.
// Null buffers or non-positive dimensions make the call a no-op.
void boxFilter(unsigned char *srcImage, unsigned char *dstImage, int width, int height, int channel) {
    if (srcImage == nullptr || dstImage == nullptr || width <= 0 || height <= 0 || channel <= 0) {
        return;
    }

    // Define a larger kernel size for increased blur intensity
    const int kernelSize = 20;
    const int halfKernelSize = kernelSize / 2;

    const size_t stride = (size_t)width;
    const size_t channels = (size_t)channel;

    for (int y = 0; y < height; y++) {
        // Vertical window bounds clipped to the image.
        const int yBegin = (y - halfKernelSize < 0) ? 0 : y - halfKernelSize;
        const int yEnd = (y + halfKernelSize > height - 1) ? height - 1 : y + halfKernelSize;

        for (int x = 0; x < width; x++) {
            // Horizontal window bounds clipped to the image.
            const int xBegin = (x - halfKernelSize < 0) ? 0 : x - halfKernelSize;
            const int xEnd = (x + halfKernelSize > width - 1) ? width - 1 : x + halfKernelSize;

            // Number of filter pixels actually inside the image
            const float kS = (float)((xEnd - xBegin + 1) * (yEnd - yBegin + 1));
            const size_t outBase = ((size_t)y * stride + (size_t)x) * channels;

            for (size_t c = 0; c < channels; c++) {
                float sum = 0;
                for (int wy = yBegin; wy <= yEnd; wy++) {
                    const size_t rowBase = (size_t)wy * stride;
                    for (int wx = xBegin; wx <= xEnd; wx++) {
                        sum += srcImage[(rowBase + (size_t)wx) * channels + c];
                    }
                }
                dstImage[outBase + c] = (unsigned char)(sum / kS);
            }
        }
    }
}

// Runs the CPU box blur on "images.jpeg" (loaded as a colour image), writes
// the result to "output_blur_cpu.jpeg" and prints the wall-clock time spent
// inside boxFilter.
// Returns 0 on success and -1 when the image cannot be read, memory cannot
// be allocated, or the output cannot be written.
int main() {
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_COLOR);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }

    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // Multiply in size_t so large images cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * (size_t)height * (size_t)channel * sizeof(unsigned char);

    // Zero-initialised output: pixels the filter does not write read as 0
    // instead of uninitialised heap contents.
    unsigned char *outputImage = (unsigned char *)calloc(imageSize, sizeof(unsigned char));
    if (outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate memory\n");
        return -1;
    }

    // Use the decoded image's pixel buffer directly as the filter input.
    unsigned char *inputImage = image.data;

    // Timing
    struct timeval start, end;
    gettimeofday(&start, NULL);

    // Apply box filter
    boxFilter(inputImage, outputImage, width, height, channel);

    // Timing
    gettimeofday(&end, NULL);
    double elapsed = (end.tv_sec - start.tv_sec) * 1000.0; // sec to ms
    elapsed += (end.tv_usec - start.tv_usec) / 1000.0;     // us to ms

    // Write output image; the Mat wraps the buffer without owning it.
    Mat output(height, width, CV_8UC3, outputImage);
    const bool written = imwrite("output_blur_cpu.jpeg", output);

    // Free memory
    free(outputImage);

    if (!written) {
        fprintf(stderr, "Failed to write output_blur_cpu.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", elapsed);

    return 0;
}


Overwriting blur.c


In [None]:
!g++ -o blurC blur.c `pkg-config --cflags --libs opencv4` && ./blurC

Total time taken: 3278.642090 milliseconds


In [None]:
%%writefile sharpening.cu
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// 3x3 sharpening filter for an interleaved 8-bit image with `channel` bytes
// per pixel, using the kernel {{-1,-1,-1},{-1,9,-1},{-1,-1,-1}}.
//
// Launch layout: 2D grid of 2D blocks; the thread with global coordinates
// (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y)
// produces the output pixel at that (x, y). Threads outside the image exit.
//
//   srcImage  input pixels, width * height * channel bytes
//   dstImage  output pixels, width * height * channel bytes
//   width     image width in pixels
//   height    image height in pixels
//   channel   number of interleaved channels per pixel
//
// The weighted sum ranges over [-2040, 2295], so it is saturated to [0, 255]
// before being narrowed to one byte. Pixels on the one-pixel border have no
// complete neighbourhood and are copied from the source unchanged, so every
// byte of dstImage is defined after the kernel completes.
__global__ void sharpeningFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Padding threads of the last blocks fall outside the image.
    if (x >= width || y >= height)
    {
        return;
    }

    const float kernel[3][3] = {{-1.0f, -1.0f, -1.0f}, {-1.0f, 9.0f, -1.0f}, {-1.0f, -1.0f, -1.0f}};

    // Written as additions so nothing wraps when width or height is 1.
    const bool onBorder = (x == 0) || (y == 0) || (x + 1 >= width) || (y + 1 >= height);

    // 64-bit offset so the index arithmetic cannot wrap for large images.
    const size_t pixelBase = ((size_t)y * width + x) * channel;

    for (int c = 0; c < channel; c++)
    {
        if (onBorder)
        {
            dstImage[pixelBase + c] = srcImage[pixelBase + c];
            continue;
        }

        // Weighted sum over the 3x3 neighbourhood
        float sum = 0.0f;
        for (int r = 0; r < 3; r++)
        {
            // y >= 1 here, so y - 1 + r stays non-negative.
            const size_t rowBase = (size_t)(y - 1 + r) * width;
            for (int k = 0; k < 3; k++)
            {
                const float fl = srcImage[(rowBase + (x - 1 + k)) * channel + c];
                sum += fl * kernel[r][k];
            }
        }
        dstImage[pixelBase + c] = (unsigned char)fminf(fmaxf(sum, 0.0f), 255.0f);
    }
}

// Terminates the program when a CUDA runtime call reports a failure.
//   r  status code returned by a CUDA runtime API call
// For any status other than cudaSuccess, prints "CUDA Error: <description>"
// to stderr and exits with EXIT_FAILURE; otherwise returns normally.
void checkCudaErrors(cudaError_t r)
{
    const bool succeeded = (r == cudaSuccess);
    if (succeeded)
    {
        return;
    }

    const char *description = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", description);
    exit(EXIT_FAILURE);
}

// Runs the sharpening kernel on "images.jpeg" (loaded as grayscale), writes
// the result to "output_sharpened.jpeg" and prints the kernel time measured
// with CUDA events.
// Returns 0 on success and -1 when the image cannot be read, host memory
// cannot be allocated, or the output cannot be written. CUDA failures
// terminate the process through checkCudaErrors.
int main()
{
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty())
    {
        printf("Error: Image not found.\n");
        return -1;
    }

    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // The kernel indexes with `channel`, so the buffers are sized with it too;
    // size_t arithmetic keeps large images from overflowing int.
    const size_t imageSize = (size_t)width * (size_t)height * (size_t)channel * sizeof(unsigned char);

    // Allocate host memory for output image
    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    // Allocate device memory
    unsigned char *d_inputImage = nullptr;
    unsigned char *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Any pixel the kernel leaves untouched reads back as 0 instead of
    // whatever the fresh allocation happened to contain.
    checkCudaErrors(cudaMemset(d_outputImage, 0, imageSize));

    // Define CUDA events for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Launch configuration: integer ceiling division, one thread per pixel.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    checkCudaErrors(cudaEventRecord(start));
    sharpeningFilter<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height, channel);
    // A launch does not return a status; configuration errors surface here.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the kernel and the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    // Write output image; the Mat wraps the host buffer without owning it.
    Mat outputImage(height, width, CV_8UC1, h_outputImage);
    const bool written = imwrite("output_sharpened.jpeg", outputImage);

    // Free memory
    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    // Destroy CUDA events
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (!written)
    {
        fprintf(stderr, "Failed to write output_sharpened.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", milliseconds);

    return 0;
}


Overwriting sharpening.cu


In [None]:
!nvcc -o sharpening sharpening.cu `pkg-config --cflags --libs opencv4` && ./sharpening

Total time taken: 0.206752 milliseconds


In [None]:
%%writefile sharpening.c
#include <stdio.h>
#include <stdlib.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// 3x3 sharpening filter for an interleaved 8-bit image with `channel` bytes
// per pixel, using the kernel {{-1,-1,-1},{-1,9,-1},{-1,-1,-1}}.
//
//   srcImage  input pixels, width * height * channel bytes
//   dstImage  output pixels, width * height * channel bytes (must not alias srcImage)
//   width     image width in pixels
//   height    image height in pixels
//   channel   number of interleaved channels per pixel
//
// The weighted sum ranges over [-2040, 2295], so it is saturated to [0, 255]
// before being narrowed to one byte. Pixels on the one-pixel border have no
// complete neighbourhood and are copied from the source unchanged, so every
// byte of dstImage is defined on return. Null buffers, zero dimensions or a
// non-positive channel count make the call a no-op (the unsigned
// `height - 1` / `width - 1` bounds would otherwise wrap for empty images).
void sharpeningFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
    if (srcImage == nullptr || dstImage == nullptr || width == 0 || height == 0 || channel <= 0)
    {
        return;
    }

    // Integer coefficients: all intermediate values are small exact integers.
    static const int kernel[3][3] = {{-1, -1, -1}, {-1, 9, -1}, {-1, -1, -1}};

    const size_t channels = (size_t)channel;

    for (unsigned int y = 0; y < height; ++y)
    {
        for (unsigned int x = 0; x < width; ++x)
        {
            const bool onBorder = (x == 0) || (y == 0) || (x + 1 == width) || (y + 1 == height);
            const size_t pixelBase = ((size_t)y * width + x) * channels;

            for (size_t c = 0; c < channels; ++c)
            {
                if (onBorder)
                {
                    dstImage[pixelBase + c] = srcImage[pixelBase + c];
                    continue;
                }

                int sum = 0;
                for (int ky = 0; ky < 3; ++ky)
                {
                    // y >= 1 here, so y + ky - 1 stays non-negative.
                    const size_t rowBase = (size_t)(y + ky - 1) * width;
                    for (int kx = 0; kx < 3; ++kx)
                    {
                        const size_t index = (rowBase + (x + kx - 1)) * channels + c;
                        sum += (int)srcImage[index] * kernel[ky][kx];
                    }
                }

                const int clamped = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
                dstImage[pixelBase + c] = (unsigned char)clamped;
            }
        }
    }
}

// Runs the CPU sharpening filter on "images.jpeg" (loaded as grayscale),
// writes the result to "output_sharpened_cpu.jpeg" and prints the CPU time
// (from clock()) spent inside sharpeningFilter.
// The output name carries a "_cpu" suffix, like the other CPU programs in
// this notebook, so it no longer overwrites the GPU program's
// "output_sharpened.jpeg".
// Returns 0 on success and -1 when the image cannot be read, memory cannot
// be allocated, or the output cannot be written.
int main()
{
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty())
    {
        printf("Error: Image not found.\n");
        return -1;
    }

    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // The filter indexes with `channel`, so the buffer is sized with it too;
    // size_t arithmetic keeps large images from overflowing int.
    const size_t imageSize = (size_t)width * (size_t)height * (size_t)channel * sizeof(unsigned char);

    // Zero-initialised output: pixels the filter does not write read as 0
    // instead of uninitialised heap contents.
    unsigned char *h_outputImage = (unsigned char *)calloc(imageSize, sizeof(unsigned char));
    if (h_outputImage == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *inputImage = image.data;

    // Start time
    const clock_t start = clock();

    // Apply sharpening filter
    sharpeningFilter(inputImage, h_outputImage, width, height, channel);

    // End time
    const clock_t end = clock();

    // Calculate elapsed CPU time in milliseconds
    const double cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC * 1000.0;

    // Write output image; the Mat wraps the buffer without owning it.
    Mat outputImage(height, width, CV_8UC1, h_outputImage);
    const bool written = imwrite("output_sharpened_cpu.jpeg", outputImage);

    // Free memory
    free(h_outputImage);

    if (!written)
    {
        fprintf(stderr, "Failed to write output_sharpened_cpu.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", cpu_time_used);

    return 0;
}



Writing sharpening.c


In [None]:
!g++ -o sharpening sharpening.c `pkg-config --cflags --libs opencv4` -std=c++11 && ./sharpening

Total time taken: 78.918000 milliseconds


In [None]:
%%writefile red.cu
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// Doubles the red channel of a 3-channel interleaved BGR image, saturating at
// 255, and copies the blue and green channels through unchanged.
//
// Launch layout: 2D grid of 2D blocks; the thread with global coordinates
// (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y)
// handles the pixel at that (x, y). Threads outside the image exit.
//
//   srcImage  input pixels, width * height * 3 bytes in B, G, R order
//   dstImage  output pixels, same layout and size
//   width     image width in pixels
//   height    image height in pixels
//
// All three bytes of each output pixel are written; writing only the red byte
// would leave blue and green as whatever dstImage held before the launch.
__global__ void redChannelManipulation(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Padding threads of the last blocks fall outside the image.
    if (x >= width || y >= height)
    {
        return;
    }

    // 64-bit offset so the index arithmetic cannot wrap for large images.
    const size_t base = ((size_t)y * width + x) * 3;

    // Blue (index 0) and green (index 1) pass through untouched.
    dstImage[base] = srcImage[base];
    dstImage[base + 1] = srcImage[base + 1];

    // BGR ordering: red channel is at index 2. Double it, capped at 255.
    const int doubled = srcImage[base + 2] * 2;
    dstImage[base + 2] = (unsigned char)min(255, doubled);
}

// Terminates the program when a CUDA runtime call reports a failure.
//   r  status code returned by a CUDA runtime API call
// For any status other than cudaSuccess, prints "CUDA Error: <description>"
// to stderr and exits with EXIT_FAILURE; otherwise returns normally.
void checkCudaErrors(cudaError_t r) {
    const bool succeeded = (r == cudaSuccess);
    if (succeeded) {
        return;
    }

    const char *description = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", description);
    exit(EXIT_FAILURE);
}

// Runs the red-channel kernel on "images.jpeg" (loaded as a colour image),
// writes the result to "output_red_modified.jpeg" and prints the kernel time
// measured with CUDA events.
// Returns 0 on success and -1 when the image cannot be read, host memory
// cannot be allocated, or the output cannot be written. CUDA failures
// terminate the process through checkCudaErrors.
int main() {
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_COLOR);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }
    const int width = image.cols;
    const int height = image.rows;
    // 3 channels (BGR); size_t arithmetic keeps large images from overflowing int.
    const size_t imageSize = (size_t)width * (size_t)height * 3 * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *d_inputImage = nullptr;
    unsigned char *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Seed the output with the source pixels so any byte the kernel leaves
    // untouched carries the original value rather than uninitialised memory.
    checkCudaErrors(cudaMemcpy(d_outputImage, d_inputImage, imageSize, cudaMemcpyDeviceToDevice));

    // Define CUDA events for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Launch configuration: integer ceiling division, one thread per pixel.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    checkCudaErrors(cudaEventRecord(start));
    redChannelManipulation<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height);
    // A launch does not return a status; configuration errors surface here.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the kernel and the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    // Write output image; the Mat wraps the host buffer without owning it.
    Mat outputImage(height, width, CV_8UC3, h_outputImage); // 3 channels (BGR)
    const bool written = imwrite("output_red_modified.jpeg", outputImage);

    // Free memory
    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    // Destroy CUDA events
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (!written) {
        fprintf(stderr, "Failed to write output_red_modified.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", milliseconds);

    return 0;
}


Writing red.cu


In [None]:
!nvcc -o red red.cu `pkg-config --cflags --libs opencv4` && ./red

Total time taken: 0.239840 milliseconds


In [None]:
%%writefile red.c
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// Doubles the red channel of a 3-channel interleaved BGR image, saturating at
// 255, and copies the blue and green channels through unchanged.
//
//   srcImage  input pixels, width * height * 3 bytes in B, G, R order
//   dstImage  output pixels, same layout and size
//   width     image width in pixels
//   height    image height in pixels
//
// All three bytes of each output pixel are written; writing only the red byte
// would leave blue and green as whatever dstImage held before the call.
// Null buffers make the call a no-op.
void redChannelManipulation(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
    if (srcImage == nullptr || dstImage == nullptr)
    {
        return;
    }

    // The operation is per-pixel, so walk the image as one flat run of pixels.
    const size_t pixelCount = (size_t)width * (size_t)height;

    for (size_t p = 0; p < pixelCount; ++p)
    {
        const unsigned char *in = srcImage + p * 3;
        unsigned char *out = dstImage + p * 3;

        // Blue (index 0) and green (index 1) pass through untouched.
        out[0] = in[0];
        out[1] = in[1];

        // BGR ordering: red channel is at index 2. Double it, capped at 255.
        const int doubled = in[2] * 2;
        out[2] = (unsigned char)(doubled > 255 ? 255 : doubled);
    }
}

// Runs the CPU red-channel manipulation on "images.jpeg" (loaded as a colour
// image), writes the result to "output_red_modified_cpu.jpeg" and prints the
// CPU time (from clock()) spent inside redChannelManipulation.
// The output name carries a "_cpu" suffix, like the other CPU programs in
// this notebook, so it no longer overwrites the GPU program's
// "output_red_modified.jpeg".
// Returns 0 on success and -1 when the image cannot be read, memory cannot
// be allocated, or the output cannot be written.
int main()
{
    // Read input image
    Mat image = imread("images.jpeg", IMREAD_COLOR);
    if (image.empty())
    {
        printf("Error: Image not found.\n");
        return -1;
    }
    const int width = image.cols;
    const int height = image.rows;
    // 3 channels (BGR); size_t arithmetic keeps large images from overflowing int.
    const size_t imageSize = (size_t)width * (size_t)height * 3 * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    // Seed the output with the source pixels so any byte the manipulation
    // leaves untouched carries the original value rather than uninitialised
    // heap contents. Done before the timer starts.
    for (size_t i = 0; i < imageSize; ++i)
    {
        h_outputImage[i] = image.data[i];
    }

    const clock_t start = clock(); // Start measuring CPU time

    // Perform red channel manipulation
    redChannelManipulation(image.data, h_outputImage, width, height);

    const clock_t end = clock(); // Stop measuring CPU time

    // Calculate CPU time in milliseconds
    const double cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC * 1000.0;

    // Write output image; the Mat wraps the buffer without owning it.
    Mat outputImage(height, width, CV_8UC3, h_outputImage); // 3 channels (BGR)
    const bool written = imwrite("output_red_modified_cpu.jpeg", outputImage);

    // Free memory
    free(h_outputImage);

    if (!written)
    {
        fprintf(stderr, "Failed to write output_red_modified_cpu.jpeg\n");
        return -1;
    }

    // Print elapsed time
    printf("Total time taken: %f milliseconds\n", cpu_time_used);

    return 0;
}


Writing red.c


In [None]:
!g++ -o red red.c `pkg-config --cflags --libs opencv4` && ./red

Total time taken: 12.235000 milliseconds
