# HPC Assignment 18
## Image Processing CUDA

## Implement the following Image Processing operations in sequential and parallel using CUDA Programming.

### Gaussian blur

#### Describe Gaussian Blur in brief.

#### Where parallelism can be inserted?

#### Analyze the performance in serial and parallel model.

##### Serial

In [1]:
import os
import cv2
import numpy as np
import time

In [2]:
def gaussian_kernel(size, sigma=1):
    """
    Generate a Gaussian kernel.

    Parameters:
    - size: size of the kernel (odd number)
    - sigma: standard deviation of the Gaussian distribution

    Returns:
    - Gaussian kernel
    """
    kernel = np.fromfunction(lambda x, y: (1/(2*np.pi*sigma**2)) * np.exp(-((x-(size-1)/2)**2 + (y-(size-1)/2)**2)/(2*sigma**2)), (size, size))
    return kernel / np.sum(kernel)

In [3]:
def gaussian_blur_sequential(image, kernel_size):
    """
    Apply Gaussian blur to the image sequentially.

    Parameters:
    - image: input image
    - kernel_size: size of the Gaussian kernel

    Returns:
    - blurred image
    """
    kernel = gaussian_kernel(kernel_size)
    blurred_image = cv2.filter2D(image, -1, kernel)  # Using OpenCV's filter2D for convolution
    return blurred_image

In [4]:
def process_images_in_folder(folder_path, kernel_size):
    """
    Process all images in the given folder by applying Gaussian blur sequentially.

    Parameters:
    - folder_path: path to the folder containing images
    - kernel_size: size of the Gaussian kernel

    Returns:
    - dictionary containing image filenames as keys and time taken for processing as values
    """
    times = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(folder_path, filename)
            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
            start_time = time.time()
            blurred_image = gaussian_blur_sequential(image, kernel_size)
            end_time = time.time()
            times[filename] = end_time - start_time
    return times

In [7]:
folder_path = "D:/data set/minion/"
kernel_size = 3

In [8]:
start_time_total = time.time()
times_per_image = process_images_in_folder(folder_path, kernel_size)
end_time_total = time.time()
total_time = end_time_total - start_time_total

In [9]:
print("Time taken for Gaussian blur on each image:")
for filename, time_taken in times_per_image.items():
    print(f"{filename}: {time_taken:.4f} seconds")

Time taken for Gaussian blur on each image:
1.jpg: 0.0066 seconds
10.jpg: 0.0000 seconds
11.jpg: 0.0105 seconds
2.jpg: 0.0080 seconds
3.jpg: 0.0780 seconds
4.jpg: 0.0155 seconds
5.jpg: 0.0239 seconds
6.jpg: 0.0000 seconds
7.jpg: 0.0000 seconds
8.jpg: 0.0000 seconds
9.jpg: 0.0000 seconds


In [10]:
print(f"\nTotal time taken for all images: {total_time:.4f} seconds")


Total time taken for all images: 0.5684 seconds


##### Parallel

In [5]:
from numba import cuda
import os
import cv2
import numpy as np
import time

In [2]:
@cuda.jit
def gaussian_blur_parallel(input_image, output_image, mask, width, height):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y
    
    row_o = by * bh + ty
    col_o = bx * bw + tx
    row_i = row_o - 2  # RADIUS = 2 for MASK_WIDTH = 5
    col_i = col_o - 2
    
    result = 0.0
    for i in range(5):
        for j in range(5):
            current_row = row_i + i
            current_col = col_i + j
            if current_row >= 0 and current_row < height and current_col >= 0 and current_col < width:
                result += input_image[current_row, current_col] * mask[i, j]
    output_image[row_o, col_o] = result

In [3]:
def process_images_in_folder_parallel_gaussian(folder_path):
    times = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(folder_path, filename)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            height, width = image.shape
            output_image = np.zeros_like(image)
            
            # Allocate device memory
            d_input_image = cuda.to_device(image)
            d_output_image = cuda.to_device(output_image)
            d_mask = cuda.to_device(np.array([[1, 4, 6, 4, 1],
                                               [4, 16, 24, 16, 4],
                                               [6, 24, 36, 24, 6],
                                               [4, 16, 24, 16, 4],
                                               [1, 4, 6, 4, 1]]))
            
            # Define grid and block dimensions
            threadsperblock = (16, 16)
            blockspergrid_x = (width + threadsperblock[0] - 1) // threadsperblock[0]
            blockspergrid_y = (height + threadsperblock[1] - 1) // threadsperblock[1]
            blockspergrid = (blockspergrid_x, blockspergrid_y)
            
            start_time = time.time()
            gaussian_blur_parallel[blockspergrid, threadsperblock](d_input_image, d_output_image, d_mask, width, height)
            cuda.synchronize()  # Ensure all threads have finished before measuring time
            end_time = time.time()
            
            output_image = d_output_image.copy_to_host()
            times[filename] = end_time - start_time
    return times

In [16]:
folder_path = "D:/data set/minion/"

In [8]:
start_time_total = time.time()
times_per_image_parallel_gaussian = process_images_in_folder_parallel_gaussian(folder_path)
end_time_total = time.time()
total_time_parallel_gaussian = end_time_total - start_time_total

In [14]:
print("Time taken for parallel Gaussian blur on each image:")
for filename, time_taken in times_per_image_parallel_gaussian.items():
    print(f"{filename}: {time_taken:.4f} seconds")

Time taken for parallel Gaussian blur on each image:
1.jpg: 0.4579 seconds
10.jpg: 0.0000 seconds
11.jpg: 0.0000 seconds
2.jpg: 0.0000 seconds
3.jpg: 0.0158 seconds
4.jpg: 0.0000 seconds
5.jpg: 0.0081 seconds
6.jpg: 0.0000 seconds
7.jpg: 0.0000 seconds
8.jpg: 0.0000 seconds
9.jpg: 0.0000 seconds


In [15]:
print(f"\nTotal time taken for parallel Gaussian blur on all images: {total_time_parallel_gaussian:.4f} seconds")


Total time taken for parallel Gaussian blur on all images: 3.1262 seconds


In [20]:
import os
import cv2
import cupy as cp
import time

In [21]:
# Define Gaussian kernel
def gaussian_kernel(size, sigma=1):
    kernel = cp.fromfunction(lambda x, y: (1/(2*cp.pi*sigma**2)) * cp.exp(-((x-(size-1)/2)**2 + (y-(size-1)/2)**2)/(2*sigma**2)), (size, size))
    return kernel / cp.sum(kernel)

In [None]:
# Parallel Gaussian blur using CUDA
def gaussian_blur_parallel(image, kernel_size):
    kernel = gaussian_kernel(kernel_size)
    image_gpu = cp.asarray(image)
    kernel_gpu = cp.asarray(kernel)
    blurred_image_gpu = cp.zeros_like(image_gpu)

    # CUDA kernel for convolution
    kernel_size = cp.int32(kernel_size)
    image_height, image_width = image_gpu.shape
    threads_per_block = (32, 32)
    blocks_per_grid_x = (image_width + threads_per_block[0] - 1) // threads_per_block[0]
    blocks_per_grid_y = (image_height + threads_per_block[1] - 1) // threads_per_block[1]
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

    gaussian_blur_kernel[blocks_per_grid, threads_per_block](image_gpu, blurred_image_gpu, kernel_gpu, kernel_size, image_height, image_width)

    return blurred_image_gpu.get()

In [23]:
# CUDA kernel function for Gaussian blur
@cp.fuse()
def gaussian_blur_kernel(image, blurred_image, kernel, kernel_size, image_height, image_width):
    x, y = cp.meshgrid(cp.arange(-kernel_size//2 + 1, kernel_size//2 + 1), cp.arange(-kernel_size//2 + 1, kernel_size//2 + 1))
    for i in range(image_height):
        for j in range(image_width):
            for k in range(kernel_size):
                for l in range(kernel_size):
                    ii = i + k - kernel_size // 2
                    jj = j + l - kernel_size // 2
                    if ii >= 0 and ii < image_height and jj >= 0 and jj < image_width:
                        blurred_image[i, j] += image[ii, jj] * kernel[k, l]

In [24]:
# Process images in folder using parallel Gaussian blur
def process_images_in_folder_parallel(folder_path, kernel_size):
    times = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(folder_path, filename)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            start_time = time.time()
            blurred_image = gaussian_blur_parallel(image, kernel_size)
            end_time = time.time()
            times[filename] = end_time - start_time
    return times

In [25]:
# Set the folder path containing images
folder_path = "D:/data set/minion/"
kernel_size = 5

In [26]:
# Process images and measure time taken for parallel Gaussian blur
start_time_total = time.time()
times_per_image_parallel = process_images_in_folder_parallel(folder_path, kernel_size)
end_time_total = time.time()
total_time_parallel = end_time_total - start_time_total

# Print time taken for parallel Gaussian blur on each image
print("Time taken for parallel Gaussian blur on each image:")
for filename, time_taken in times_per_image_parallel.items():
    print(f"{filename}: {time_taken:.4f} seconds")

# Print total time taken for all images
print(f"\nTotal time taken for parallel Gaussian blur on all images: {total_time_parallel:.4f} seconds")

TypeError: Unsupported type <class 'numpy.ndarray'>

In [29]:
import os
import cv2
import cupy as cp
import time

# Define Gaussian kernel
def gaussian_kernel(size, sigma=1):
    x, y = cp.meshgrid(cp.arange(size), cp.arange(size))
    x = x - (size - 1) / 2
    y = y - (size - 1) / 2
    kernel = cp.exp(-(x**2 + y**2) / (2 * sigma**2)) / (2 * cp.pi * sigma**2)
    return kernel / cp.sum(kernel)

# Parallel Gaussian blur using CUDA
def gaussian_blur_parallel(image, kernel_size):
    kernel = gaussian_kernel(kernel_size)
    image_gpu = cp.asarray(image)
    kernel_gpu = cp.asarray(kernel)
    blurred_image_gpu = cp.zeros_like(image_gpu)

    # CUDA kernel for convolution
    kernel_size = cp.int32(kernel_size)
    image_height, image_width = image_gpu.shape
    threads_per_block = (32, 32)
    blocks_per_grid_x = (image_width + threads_per_block[0] - 1) // threads_per_block[0]
    blocks_per_grid_y = (image_height + threads_per_block[1] - 1) // threads_per_block[1]
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

    gaussian_blur_kernel[blocks_per_grid, threads_per_block](image_gpu, blurred_image_gpu, kernel_gpu, kernel_size, image_height, image_width)

    return blurred_image_gpu.get()

# CUDA kernel function for Gaussian blur
@cp.fuse()
def gaussian_blur_kernel(image, blurred_image, kernel, kernel_size, image_height, image_width):
    x, y = cp.meshgrid(cp.arange(-kernel_size//2 + 1, kernel_size//2 + 1), cp.arange(-kernel_size//2 + 1, kernel_size//2 + 1))
    for i in range(image_height):
        for j in range(image_width):
            for k in range(kernel_size):
                for l in range(kernel_size):
                    ii = i + k - kernel_size // 2
                    jj = j + l - kernel_size // 2
                    if ii >= 0 and ii < image_height and jj >= 0 and jj < image_width:
                        blurred_image[i, j] += image[ii, jj] * kernel[k, l]

# Process images in folder using parallel Gaussian blur
def process_images_in_folder_parallel(folder_path, kernel_size):
    times = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(folder_path, filename)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            start_time = time.time()
            blurred_image = gaussian_blur_parallel(image, kernel_size)
            end_time = time.time()
            times[filename] = end_time - start_time
    return times

# Set the folder path containing images
folder_path = "D:/data set/minion"
kernel_size = 5

# Process images and measure time taken for parallel Gaussian blur
start_time_total = time.time()
times_per_image_parallel = process_images_in_folder_parallel(folder_path, kernel_size)
end_time_total = time.time()
total_time_parallel = end_time_total - start_time_total

# Print time taken for parallel Gaussian blur on each image
print("Time taken for parallel Gaussian blur on each image:")
for filename, time_taken in times_per_image_parallel.items():
    print(f"{filename}: {time_taken:.4f} seconds")

# Print total time taken for all images
print(f"\nTotal time taken for parallel Gaussian blur on all images: {total_time_parallel:.4f} seconds")


RuntimeError: CuPy failed to load nvrtc64_120_0.dll: FileNotFoundError: Could not find module 'nvrtc64_120_0.dll' (or one of its dependencies). Try using the full path with constructor syntax.

### FFT- Fast Fourier Transform

#### Describe FFT in brief

#### Where parallelism can be inserted?

#### Analyze the performance in serial and parallel model.

##### serial

In [11]:
def fft_sequential(image):
    """
    Compute the Fast Fourier Transform (FFT) of the image sequentially.

    Parameters:
    - image: input image

    Returns:
    - FFT of the image
    """
    fft_image = np.fft.fft2(image)
    return fft_image

In [12]:
def process_images_in_folder_fft(folder_path):
    """
    Process all images in the given folder by computing FFT sequentially.

    Parameters:
    - folder_path: path to the folder containing images

    Returns:
    - dictionary containing image filenames as keys and time taken for processing as values
    """
    times = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(folder_path, filename)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            start_time = time.time()
            fft_image = fft_sequential(image)
            end_time = time.time()
            times[filename] = end_time - start_time
    return times

In [13]:
folder_path = "D:/data set/minion/"

In [14]:
start_time_total = time.time()
times_per_image_fft = process_images_in_folder_fft(folder_path)
end_time_total = time.time()
total_time_fft = end_time_total - start_time_total

In [15]:
print("Time taken for FFT on each image:")
for filename, time_taken in times_per_image_fft.items():
    print(f"{filename}: {time_taken:.4f} seconds")

Time taken for FFT on each image:
1.jpg: 0.0221 seconds
10.jpg: 0.0744 seconds
11.jpg: 0.0566 seconds
2.jpg: 0.0465 seconds
3.jpg: 1.5794 seconds
4.jpg: 0.0519 seconds
5.jpg: 0.2540 seconds
6.jpg: 0.0714 seconds
7.jpg: 0.0600 seconds
8.jpg: 0.0937 seconds
9.jpg: 0.0313 seconds


In [16]:
print(f"\nTotal time taken for FFT on all images: {total_time_fft:.4f} seconds")


Total time taken for FFT on all images: 2.5566 seconds


##### Parallel