<a href="https://colab.research.google.com/github/ievapociute/big-data-analysis-proj/blob/main/Testing_Parallel_Job_Efficiency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
#!pip install psutil memory_profiler


Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [12]:
import time
from multiprocessing import Pool
import psutil
from memory_profiler import memory_usage

In [44]:
def calculate_performance(function):
    """
    Profile a zero-argument callable and print its performance metrics.

    Three metrics are printed:
    - execution time: wall-clock seconds spent inside the ``memory_usage``
      call that runs ``function`` (so it includes the profiler's own overhead);
    - peak memory usage: the highest sample returned by ``memory_usage``
      (sampled every 0.1 s, child processes included) minus this process's
      RSS measured just before the run, in MiB;
    - average CPU utilization: the system-wide value ``psutil.cpu_percent``
      reports for the period between the priming call and the end of the run.

    Parameters:
    - function: The function to be profiled. It is called without arguments.

    Returns:
    - The measured execution time in seconds (float).
    """
    # Callables such as functools.partial have no __name__; fall back to repr().
    label = getattr(function, "__name__", repr(function))

    # Baseline resident set size of the current process, converted to MiB.
    baseline_memory = psutil.Process().memory_info().rss / (1024 * 1024)

    # Priming call: with interval=None psutil reports utilization since the
    # previous call, so this value is unrelated to `function` and is discarded.
    psutil.cpu_percent(interval=None)

    # perf_counter is monotonic, so clock adjustments cannot distort the result.
    start_time = time.perf_counter()

    # Run the function while sampling memory every 0.1 seconds.
    # NOTE(review): include_children=True probably also counts memory_profiler's
    # own monitoring subprocess, which would inflate the figure - confirm.
    mem_usage = memory_usage((function, ), interval=0.1, include_children=True)

    execution_time = time.perf_counter() - start_time

    # Utilization accumulated since the priming call, i.e. while the function ran.
    cpu_percent_during = psutil.cpu_percent(interval=None)

    # Additional memory used during function execution, relative to the baseline.
    peak_memory_used = max(mem_usage) - baseline_memory

    print(f"Execution Time for {label}: {execution_time:.3f} seconds")
    print(f"Peak Memory Usage for {label}: {peak_memory_used:.3f} MiB")
    print(f"Average CPU Utilization for {label}: {cpu_percent_during:.2f}% \n")

    return execution_time

In [40]:
def sum_of_squares(n):
    """Return 0**2 + 1**2 + ... + (n - 1)**2 by summing every term.

    The brute-force summation is the point: this is the CPU-bound workload
    of the sequential benchmark.
    """
    squares = (value * value for value in range(n))
    return sum(squares)

def sequential_version(n=10**7):
    """Run the single-process benchmark and print its result.

    Parameters:
    - n: Workload size; the squares of 0 .. n-1 are summed. Defaults to
      10**7, so calling without arguments behaves as before.
    """
    result = sum_of_squares(n)
    print(f"Result (Sequential): {result}")


def sum_of_squares_range(start, end):
    """Return the sum of x*x for every integer x with start <= x < end.

    This is the unit of work handed to each worker process.
    """
    squares = (value * value for value in range(start, end))
    return sum(squares)

def sum_of_squares_parallel(n, process_count=4):
    """Sum the squares of 0 .. n-1 using a pool of worker processes.

    The interval [0, n) is cut into ``process_count`` contiguous chunks whose
    sizes differ by at most one, so every integer is covered exactly once even
    when ``n`` is not a multiple of ``process_count``.

    Parameters:
    - n: Exclusive upper bound of the integers to square and sum.
    - process_count: Number of worker processes (and chunks). Must be >= 1.

    Returns:
    - The total sum of squares.

    Raises:
    - ValueError: If ``process_count`` is smaller than 1.
    """
    if process_count < 1:
        # Fail before spawning anything; Pool rejects such values with ValueError too.
        raise ValueError("process_count must be at least 1")

    # Chunk boundaries: bounds[0] == 0 and bounds[-1] == n, so no remainder is lost.
    bounds = [i * n // process_count for i in range(process_count + 1)]
    ranges = list(zip(bounds[:-1], bounds[1:]))

    with Pool(processes=process_count) as pool:
        # starmap unpacks each (start, end) pair into sum_of_squares_range's arguments
        results = pool.starmap(sum_of_squares_range, ranges)

    return sum(results)

def parallel_version(n=10**7, process_count=4):
    """Run the multi-process benchmark and print its result.

    Parameters:
    - n: Workload size; the squares of 0 .. n-1 are summed. Defaults to
      10**7, so calling without arguments behaves as before.
    - process_count: Number of worker processes passed on to
      sum_of_squares_parallel. Defaults to 4, that function's own default.
    """
    result = sum_of_squares_parallel(n, process_count)
    print(f"Result (Parallel): {result}")


In [45]:
if __name__ == "__main__":
    # Benchmark both implementations, then print the parallel run's
    # speedup and efficiency relative to the sequential baseline.

    # Each call prints its own metrics and returns the wall-clock seconds.
    baseline_seconds = calculate_performance(sequential_version)
    parallel_seconds = calculate_performance(parallel_version)

    # Worker processes used by the parallel run (4, the default process_count
    # of sum_of_squares_parallel); efficiency is speedup per worker.
    worker_count = 4

    speedup = baseline_seconds / parallel_seconds
    efficiency = speedup / worker_count

    # Speedup: 1 = parallel and sequential are equal; >1 = parallel version is quicker
    print(f"Speedup: {speedup:.3f}")
    # Efficiency: 1 = perfect; <1 = diminishing returns of parallelization
    print(f"Efficiency: {efficiency:.3f}")

Result (Sequential): 333333283333335000000
Execution Time for sequential_version: 2.132 seconds
Peak Memory Usage for sequential_version: 90.652 MiB
CPU Utilization Change for sequential_version: 74.80% 

Result (Parallel): 333333283333335000000
Execution Time for parallel_version: 1.349 seconds
Peak Memory Usage for parallel_version: 453.043 MiB
CPU Utilization Change for parallel_version: 98.50% 

Speedup: 1.581
Efficiency: 0.395
