<a href="https://colab.research.google.com/github/klimaviu/big-data-analysis-2024/blob/main/bigdata_1_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Comparing parallelization techniques with sequential image processing

Disclaimer: the code below was generated by iteratively consulting ChatGPT and adjusting its output.

In [None]:
import os
import cv2
import numpy as np
import zipfile
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time
from google.colab import drive
# Mount Google Drive so that the paths under /content/drive below resolve.
drive.mount('/content/drive')

# Input archive read by load_images, and the directory save_image writes into.
zip_path = '/content/drive/My Drive/data_set_VU_test1.zip'
output_directory = '/content/drive/My Drive/processed_images'

# Module-level thread pool (10 workers) used by the executor.map-based
# functions below; it is never shut down in the code shown here.
executor = ThreadPoolExecutor(max_workers=10)

def save_image(image_data, filename):
    """Write an image into ``output_directory`` under ``filename``.

    The output directory is created on demand, and a failed write is reported
    instead of being silently ignored.

    Args:
        image_data: Image array to encode and write.
        filename: File name (not a full path); OpenCV chooses the encoder
            from its extension.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    # exist_ok=True keeps this safe when several worker threads save at once.
    os.makedirs(output_directory, exist_ok=True)
    target_path = os.path.join(output_directory, filename)
    # cv2.imwrite signals some failures only through its boolean return value.
    if not cv2.imwrite(target_path, image_data):
        raise OSError(f"Failed to write image to {target_path}")

# Define individual processing functions
def convert_to_bw(image_data, filename):
    """Binarize an image with Otsu thresholding and save the result.

    Args:
        image_data: Image array passed to ``cv2.threshold``.
        filename: Source file name; the output is saved as
            ``<stem>_bw<ext>`` so the original extension is kept.

    Returns:
        The thresholded image (pixel values 0 or 255).
    """
    # With THRESH_OTSU set, the threshold argument (0) is a placeholder.
    _, bw_image = cv2.threshold(image_data, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Put the suffix before the extension: "img.jpg_bw" has no extension that
    # an image writer can recognise, whereas "img_bw.jpg" does.
    stem, ext = os.path.splitext(filename)
    save_image(bw_image, f"{stem}_bw{ext}")
    return bw_image

def apply_blur(image_data, filename):
    """Apply a 5x5 Gaussian blur to an image and save the result.

    Args:
        image_data: Image array passed to ``cv2.GaussianBlur``.
        filename: Source file name; the output is saved as
            ``<stem>_blur<ext>`` so the original extension is kept.

    Returns:
        The blurred image.
    """
    # The sigma argument is 0, so OpenCV derives it from the kernel size.
    blurred_image = cv2.GaussianBlur(image_data, (5, 5), 0)
    # Put the suffix before the extension: "img.jpg_blur" has no extension
    # that an image writer can recognise, whereas "img_blur.jpg" does.
    stem, ext = os.path.splitext(filename)
    save_image(blurred_image, f"{stem}_blur{ext}")
    return blurred_image

def add_noise(image_data, filename):
    """Invert randomly chosen pixels of a 2-D image and save the result.

    The number of candidate pixels is 10% of the count of zero-valued (black)
    pixels. Coordinates are drawn uniformly over the whole image with
    replacement, so fewer distinct pixels than that may actually change.

    Args:
        image_data: 2-D image array (its shape is unpacked as ``(h, w)``).
        filename: Source file name; the output is saved as
            ``<stem>_noise<ext>`` so the original extension is kept.

    Returns:
        A noisy copy of ``image_data``; the input array is not modified.
    """
    black_pixels = np.sum(image_data == 0)
    num_noise_pixels = int(0.1 * black_pixels)
    h, w = image_data.shape
    noise_x = np.random.randint(0, w, num_noise_pixels)
    noise_y = np.random.randint(0, h, num_noise_pixels)
    noisy_image = np.copy(image_data)
    # Invert each selected pixel (v -> 255 - v). The right-hand side is
    # evaluated once, so a coordinate drawn twice is still inverted only once.
    noisy_image[noise_y, noise_x] = 255 - noisy_image[noise_y, noise_x]
    # Put the suffix before the extension: "img.jpg_noise" has no extension
    # that an image writer can recognise, whereas "img_noise.jpg" does.
    stem, ext = os.path.splitext(filename)
    save_image(noisy_image, f"{stem}_noise{ext}")
    return noisy_image

# Load images from ZIP file
def load_images(zip_path):
    """Load grayscale images from the ``Images/`` entries of a ZIP archive.

    Only archive members whose name ends with ``.jpg`` and contains
    ``Images/`` are considered. Entries that are empty or cannot be decoded
    are skipped with a printed notice, so later stages never receive ``None``
    in place of an image.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        A list of ``(image, base_filename)`` tuples, in archive order.
    """
    images_data = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        jpg_files = [f for f in zip_ref.namelist() if f.endswith('.jpg') and 'Images/' in f]
        for file_name in jpg_files:
            raw_bytes = zip_ref.read(file_name)
            image = None
            if raw_bytes:
                image_array = np.frombuffer(raw_bytes, np.uint8)
                # imdecode yields None when the buffer is not a valid image.
                image = cv2.imdecode(image_array, cv2.IMREAD_GRAYSCALE)
            if image is None:
                print(f"Skipping undecodable image: {file_name}")
                continue
            images_data.append((image, os.path.basename(file_name)))
    return images_data

# Function to process a single image for all tasks
def process_image_for_all_tasks(img_data):
    """Run the three transforms on one image as a chain.

    ``img_data`` is an ``(image, filename)`` pair. Each step receives the
    output of the previous one (threshold -> blur -> noise) together with the
    same filename, and each step saves its own result. Nothing is returned.
    """
    current, name = img_data
    current = convert_to_bw(current, name)
    current = apply_blur(current, name)
    add_noise(current, name)

# Function to process all images sequentially
def process_images_sequential(images_data):
    """Process every image one after another and return the elapsed seconds.

    ``images_data`` is an iterable of ``(image, filename)`` pairs; each pair
    is handed to ``process_image_for_all_tasks`` in order.
    """
    began = time.time()
    for item in images_data:
        process_image_for_all_tasks(item)
    finished = time.time()
    return finished - began

def process_all_images_for_all_tasks_parallel(images_data, b=50):
    """Process images in batches on a thread pool and return elapsed seconds.

    The images are split into consecutive batches of at most ``b`` items. For
    every batch, one job per transform (threshold, blur, noise) is submitted,
    and each job applies its transform to the original images of that batch.

    Args:
        images_data: Sequence of ``(image, filename)`` pairs.
        b: Batch size; must be at least 1.

    Returns:
        Wall-clock seconds spent submitting and completing all jobs.

    Raises:
        ValueError: If ``b`` is smaller than 1.
    """
    # A zero step makes range() fail and a negative one silently yields no
    # batches, so reject both up front with a clear message.
    if b < 1:
        raise ValueError(f"batch size b must be at least 1, got {b}")

    start_time = time.time()

    # Split the images into consecutive batches of at most b items.
    image_batches = [images_data[i:i + b] for i in range(0, len(images_data), b)]

    with ThreadPoolExecutor() as pool:
        futures = [
            pool.submit(process_batch, batch, task_func)
            for batch in image_batches
            for task_func in (convert_to_bw, apply_blur, add_noise)
        ]
        # result() re-raises any worker exception. The returned images are
        # deliberately not collected, so they are not all kept in memory.
        for future in futures:
            future.result()

    return time.time() - start_time

def process_batch(batch, task_func):
    """Apply ``task_func(image, filename)`` to each pair in ``batch``.

    Returns the list of results in the same order as ``batch``.
    """
    results = []
    for image, name in batch:
        results.append(task_func(image, name))
    return results

# Function to process images for a specific task in parallel using global executor and map
def process_images_for_task_parallel(task_func, images_data):
    """Run one transform over all images on the shared pool; return seconds.

    Each ``(image, filename)`` pair in ``images_data`` becomes one job on the
    module-level ``executor``. The call blocks until every job has finished,
    and an exception raised by a job propagates to the caller.
    """
    began = time.time()

    def run_one(pair):
        return task_func(pair[0], pair[1])

    # Drain the iterator so that all jobs complete; the results are not needed.
    for _ in executor.map(run_one, images_data):
        pass
    return time.time() - began

def process_images_for_all_tasks_parallel(images_data):
    """Run every (image, transform) combination on the shared pool.

    One job is created per image and per transform (threshold, blur, noise),
    each working on the original image. All jobs are mapped over the
    module-level ``executor``, and the elapsed wall-clock seconds are returned.
    """
    began = time.time()

    # Jobs are ordered image by image, with the three transforms for each.
    jobs = []
    for image, name in images_data:
        for transform in (convert_to_bw, apply_blur, add_noise):
            jobs.append((transform, image, name))

    def run_job(job):
        transform, image, name = job
        return transform(image, name)

    # list() forces completion of all jobs before the clock is read.
    list(executor.map(run_job, jobs))
    return time.time() - began

# Evaluate performance
def evaluate_performance(zip_path):
    """Load the images once, time the three strategies and print the result.

    The strategies run in this order: sequential, task-based parallel (one
    job per image and transform), then file-based parallel (batched jobs).
    """
    images = load_images(zip_path)

    strategies = (
        process_images_sequential,                  # sequential baseline
        process_images_for_all_tasks_parallel,      # task-based parallel
        process_all_images_for_all_tasks_parallel,  # file-based parallel
    )
    timings = [strategy(images) for strategy in strategies]
    time_sequential, time_tasks_parallel, time_files_parallel = timings

    print(f"Sequential: {time_sequential:.2f}s, Task-based Parallel: {time_tasks_parallel:.2f}s, File-based Parallel: {time_files_parallel:.2f}s")

# Run the benchmark once on the configured archive and print the three timings.
evaluate_performance(zip_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Selecting the optimal batch size for file-based parallelization

In [None]:
def cross_validate_batch_size(images_data, batch_sizes, n_fold=5):
    """Measure the average batched-parallel run time for each batch size.

    For every batch size, ``process_all_images_for_all_tasks_parallel`` is run
    ``n_fold`` times and the wall-clock times are averaged.

    Args:
        images_data: Sequence of ``(image, filename)`` pairs.
        batch_sizes: Iterable of batch sizes to benchmark.
        n_fold: Number of repetitions per batch size; must be at least 1.

    Returns:
        A list of average times in seconds, one per entry of ``batch_sizes``
        and in the same order.

    Raises:
        ValueError: If ``n_fold`` is smaller than 1.
    """
    if n_fold < 1:
        raise ValueError(f"n_fold must be at least 1, got {n_fold}")

    performance_metrics = []
    for b in batch_sizes:
        total_time = 0.0
        for _ in range(n_fold):
            start_time = time.time()
            process_all_images_for_all_tasks_parallel(images_data, b)
            total_time += time.time() - start_time
        # Divide by the actual number of repetitions, not a fixed constant.
        performance_metrics.append(total_time / n_fold)

    return performance_metrics

# Load the images from the archive for the batch-size benchmark.
images_data = load_images(zip_path)
# Candidate batch sizes passed to the batched thread-pool strategy.
batch_sizes = [1, 5, 10, 50, 75, 100, 125, 150]

# One averaged timing per batch size, in the same order as batch_sizes.
time_based_on_batch_size = cross_validate_batch_size(images_data, batch_sizes)

In [None]:
import pandas as pd
import seaborn.objects as so

# Tabulate the averaged timings against the batch sizes that produced them.
time_vs_batch_size = pd.DataFrame(
    {"batch_size": batch_sizes, "avg_time": time_based_on_batch_size}
)

# Bare expression kept from the original cell; presumably not rendered, since
# it is not the last expression in the cell.
time_vs_batch_size

# Line plot with circular markers: average time as a function of batch size.
(
    so.Plot(time_vs_batch_size, "batch_size", "avg_time")
    .add(so.Line(marker="o", edgecolor="w"), linestyle=None)
    .label(title="Batch size vs. average time")
)
