# Lab 4: Divide and Conquer Algorithms

> Date: 11/08/2024
> Author: Adam Haile, Leigh Goetsch, Aiden Miller
> Class: CSC3310
>
> ### Learning Outcomes
>
> - Design an algorithm for a given computational problem statement
> - Justify the correctness of an algorithm
> - Perform asymptotic complexity analysis of the run time of an algorithm
> - Generate test cases for an algorithm
> - Correctly implement an algorithm from pseudocode
> - Design and execute benchmarks for an algorithm



## Proposed Solution - Quickselect Algorithm
The quickselect algorithm is similar to the quicksort algorithm, but with modifications so that it instead returns the $k^{th}$ smallest value in a list, rather than a sorted list.

In [None]:
import random

In [None]:
def partition(lst, low, high):
    """
    Rearrange lst[low:high+1] in place around a randomly picked pivot.

    A pivot position is drawn with random.randint(low, high) and the pivot
    is parked at index `high`. The range is then scanned once: every
    element that compares <= the pivot is swapped towards the front.
    Finally the pivot is swapped in right after those elements.

    Returns the index where the pivot ends up. Within the range, elements
    before that index are <= the pivot and elements after it are greater.
    """
    chosen = random.randint(low, high)
    lst[chosen], lst[high] = lst[high], lst[chosen]
    pivot_value = lst[high]

    # `boundary` is the next free slot for an element that is <= the pivot.
    boundary = low
    scan = low
    while scan < high:
        if lst[scan] <= pivot_value:
            lst[boundary], lst[scan] = lst[scan], lst[boundary]
            boundary += 1
        scan += 1

    # Drop the pivot between the "<=" part and the ">" part.
    lst[boundary], lst[high] = lst[high], lst[boundary]
    return boundary

def select(lst, low, high, k_smallest):
    """
    Return the element that would be at index k_smallest if lst were sorted.

    The search is restricted to lst[low:high+1]; k_smallest is a 0-based
    index into lst and is expected to lie within [low, high]. lst is
    reordered in place by the partition steps.

    Implemented as a loop instead of recursion. Each partition step only
    narrows the [low, high] window, so no call stack is needed. The
    recursive form could exceed Python's recursion limit when the window
    shrinks by one element per step (for example on a long list of equal
    values).
    """
    while low != high:
        pivot_index = partition(lst, low, high)

        if k_smallest == pivot_index:
            return lst[k_smallest]
        if k_smallest < pivot_index:
            # Target lies among the elements <= pivot: drop the right side.
            high = pivot_index - 1
        else:
            # Target lies among the elements > pivot: drop the left side.
            low = pivot_index + 1

    # Window of one element: it is the answer.
    return lst[low]

def quickselect(lst, k):
    """
    Return the k-th smallest element of lst using Quickselect.

    k counts from 1: k=1 is the smallest element and k=len(lst) the
    largest. The search runs on lst itself via select(), so the order of
    its elements may change.

    Raises ValueError when k is not in 1..len(lst).
    """
    below_range = k < 1
    if below_range or k > len(lst):
        raise ValueError("k is out of bounds")

    # select() works with 0-based positions over the whole list.
    last_index = len(lst) - 1
    target_index = k - 1
    return select(lst, 0, last_index, target_index)

## Tests


In [None]:
def create_random_list(n, min=1, max=100, seed=42):
    """
    Return a list of n distinct integers sampled from range(min, max).

    n:    number of values to draw.
    min:  smallest value that can be drawn (inclusive).
    max:  upper bound of the range (exclusive).
    seed: value passed to random.seed() before sampling. The default of 42
          keeps the output reproducible; pass None to let Python pick a
          non-deterministic seed.

    Note: this reseeds the module-level random generator, so later calls
    to functions in `random` are affected by it as well.
    random.sample raises ValueError if n exceeds the size of the range.
    """
    # `min`/`max` shadow the builtins inside this function; the names are
    # kept because callers pass them as keyword arguments.
    random.seed(seed)
    return random.sample(range(min, max), n)
# Shared fixture: a 10-element random list and a sorted copy of it that
# serves as the reference ordering for the checks that follow.
lst = create_random_list(10)
sorted_lst = list(lst)
sorted_lst.sort()
print(lst, sorted_lst, sep="\n")

In [None]:
# Test 1 (sunny path): for every rank of the 10-element fixture list,
# quickselect must return the matching entry of the sorted reference.
for x, expected in enumerate(sorted_lst, start=1):
    assert quickselect(lst, x) == expected

In [None]:
# Test 2: input dominated by duplicates (nine 1s and a single 2).
lst = [1] * 5 + [2] + [1] * 4
# Ranks 1..9 all land on a 1 ...
for x in range(1, 10):
    assert quickselect(lst, x) == 1
# ... and only the largest rank reaches the 2.
assert quickselect(lst, 10) == 2

In [None]:
# Test 3: values of wildly different magnitude in the same list
lst1 = create_random_list(10, 1, 1000)
# Upper bound has 18 zeroes; per the authors' note, going to 19 zeroes
# causes a C ssize_t error.
lst2 = create_random_list(10, 1, 1000000000000000000)
lst = lst1 + lst2
sorted_lst = sorted(lst)
# The combined list has 20 elements, so check every rank rather than only
# the first 10 (which never reached most of the large values).
for x in range(1, len(lst) + 1):
    assert quickselect(lst, x) == sorted_lst[x - 1]
print(sorted_lst)

In [None]:
# Test 4: the result must not depend on the initial ordering of the list
lst = create_random_list(10)
sorted_lst = sorted(lst)
# Ten rounds; the round counter itself is unused, so it gets a throwaway
# name instead of sharing `x` with the inner loop.
for _ in range(10):
    random.shuffle(lst)
    for x in range(1, len(lst) + 1):
        assert quickselect(lst, x) == sorted_lst[x - 1]

In [None]:
# Test 5: smallest list that still needs a partition step (2 elements)
lst = [2, 1]
for rank, expected in ((1, 1), (2, 2)):
    assert quickselect(lst, rank) == expected

In [None]:
# Test 6: large input, 10000 distinct values, every rank checked
lst = create_random_list(10000, min=1, max=100000)
sorted_lst = list(lst)
sorted_lst.sort()
for x, expected in enumerate(sorted_lst, start=1):
    assert quickselect(lst, x) == expected

- Test 1: Test 10 elements to ensure that the kth smallest element corresponds to the [k-1]th index of a sorted list, Result: Pass
- Test 2: Test to ensure that lots of duplicate elements are handled correctly, Result: Pass
- Test 3: Wildly small and large values, Result: Pass
- Test 4: Shuffle list and check, ensuring that there isn't a dependence on ordering, Result: Pass
- Test 5: 2 element list, Result: Pass
- Test 6: 10000 element list, Result: Pass

## Benchmarks

Benchmark your implementation versus an approach that sorts the numbers and picks
the element at index k – 1. You should include a table and graph from benchmarking
different lists with different sizes of numbers. The benchmarks should support your
theoretically-derived run time and provide evidence that the run time of your algorithm
grows more slowly than the sorting approach.

In [None]:
import time
import random
import numpy as np


def sortselect(list, k):
    """
    Return the k-th smallest element by sorting, with k counted from 1.

    Baseline for comparing against quickselect. The input is left
    untouched because sorted() builds a new list.

    Raises ValueError when k is not in 1..len(list), matching quickselect.
    Without this check, k=0 or a negative k would silently index from the
    end of the sorted list and return the wrong element.
    """
    # The parameter name shadows the builtin `list`; it is kept so that the
    # function's signature stays unchanged for existing callers.
    if k < 1 or k > len(list):
        raise ValueError("k is out of bounds")
    return sorted(list)[k - 1]


def benchmark(algorithm, input_list, k):
    '''
    Time a single call of a selection algorithm.

    algorithm:  callable invoked as algorithm(lst, k).
    input_list: the data to select from. The algorithm receives a copy, so
                the caller's list is never modified; the copy is made
                before the timer starts and is not part of the measurement.
    k:          rank forwarded unchanged to the algorithm.

    Returns the elapsed time in seconds, measured with time.perf_counter().
    '''
    # Setup (not timed)
    working_copy = input_list.copy()

    start_time = time.perf_counter()
    algorithm(working_copy, k)  # result is irrelevant here, only the duration
    end_time = time.perf_counter()

    return end_time - start_time


# Benchmark configuration and driver

# Input sizes to measure
list_sizes = [100, 500, 1000, 5000, 10000,
              25000, 50000, 75000, 100000, 250000, 500000]

# Three input orderings per size: ascending ("best"), descending ("worst")
# and shuffled ("random"). Seeded so the shuffled inputs are reproducible.
random.seed(42)
benchmark_lists = {
    "best": [list(range(size)) for size in list_sizes],
    "worst": [list(range(size, 0, -1)) for size in list_sizes],
    "random": [random.sample(range(size), size) for size in list_sizes],
}

# One list of result rows per algorithm, keyed by the function object
benchmark_results = {quickselect: [], sortselect: []}

# Run every algorithm on every input list for k at each 1% step
for algo in benchmark_results:
    for case, lists in benchmark_lists.items():
        for lst in lists:
            n = len(lst)
            print(f"Running {algo.__name__} with size {n} - case {case}")
            step = n // 100
            k_values = range(step, n, step)
            for k in k_values:
                # Repeat the measurement 10 times and keep the mean
                times = []
                for _ in range(10):
                    times.append(benchmark(algo, lst, k))
                avg_time = np.mean(times)

                # k expressed as a percentage of the list size
                k_fraction = k * 100 // n
                if k_fraction % 20 == 0:
                    print(f"  k={k_fraction} took {avg_time:.6f} seconds")

                # Store results
                row = {
                    'case': case,
                    'size': n,
                    'k_value': k_fraction,
                    'time': avg_time,
                }
                benchmark_results[algo].append(row)

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import linregress

# fit a linear regression model to the log of the list
# sizes (s) and run times (r) to estimate the slope (m)
# log r = m log s + b


def estimate_slope(list_sizes, run_times):
    '''
    Estimate the exponent m in run_time ~ size**m.

    Takes the natural log of both inputs and fits the line
    log(run_time) = m * log(size) + b with scipy's linregress.
    Returns only the slope m; the intercept is discarded.
    '''
    log_sizes = np.log(list_sizes)
    log_times = np.log(run_times)
    fit = linregress(log_sizes, log_times)
    return fit.slope


def get_complexity(m, tolerance=0.05):
    '''
    Translate a log-log regression slope into a complexity label.

    m:         slope of log(run time) against log(input size).
    tolerance: how far m may be from 0, 1, 2 or 3 and still be labelled
               "Constant", "Linear", "Quadratic" or "Cubic". A measured
               slope is a float that practically never equals an integer
               exactly, so an exact comparison would make those labels
               unreachable. Pass tolerance=0 for exact matching.

    Returns the label as a string. Slopes below 1 that are not close to 0
    (including negative ones) are reported as sub-linear; slopes above 3
    and NaN are reported as "Out of Scope".
    '''
    # Integer exponents first, so that e.g. 0.98 is "Linear" rather than
    # falling into the sub-linear bucket.
    exact_labels = (
        (0, "Constant"),
        (1, "Linear"),
        (2, "Quadratic (e.g., n^2)"),
        (3, "Cubic (e.g., n^3)"),
    )
    for exponent, label in exact_labels:
        if abs(m - exponent) <= tolerance:
            return label

    # Everything else falls into one of the open intervals in between.
    if m < 1:
        return "Sub-linear (e.g., log n)"
    if m < 2:
        return "Between linear and quadratic (e.g., n log n)"
    if m < 3:
        return "Between quadratic and cubic (e.g., n^2 log n)"
    return "Out of Scope"


# Report 1: growth of run time with list size, separately for each input
# case and for every tenth k percentage.
print("complexity by k value:")
for algo, results in benchmark_results.items():
    print(f"Algorithm: {algo.__name__}")
    # Build the frame once per algorithm instead of once per case
    df_all = pd.DataFrame(results)
    for case in benchmark_lists:
        df = df_all[df_all['case'] == case]
        for k in df['k_value'].unique():
            if k % 10 != 0:
                continue
            df_k = df[df['k_value'] == k]
            m = estimate_slope(df_k['size'], df_k['time'])
            complexity = get_complexity(m)
            print(f"  Case: {case}, k={k}%: {complexity} (m={m:.2f})")

# Report 2: growth of run time with list size per input case, with all k
# values pooled together.
print()
print("complexity by case:")
for algo, results in benchmark_results.items():
    print(f"Algorithm: {algo.__name__}")
    df_all = pd.DataFrame(results)
    for case in benchmark_lists:
        df = df_all[df_all['case'] == case]
        m = estimate_slope(df['size'], df['time'])
        complexity = get_complexity(m)
        print(f"  Case: {case}: {complexity} (m={m:.2f})")

# Report 3: for a few fixed list sizes, how run time changes with k.
print()
print("complexity by list size:")
for algo, results in benchmark_results.items():
    print(f"Algorithm: {algo.__name__}")
    df_all = pd.DataFrame(results)
    for n in list_sizes:
        if n not in (100, 1000, 10000, 100000):
            continue
        df = df_all[df_all['size'] == n]
        # Every row here has the same 'size', so a regression against
        # 'size' has no variation in x and cannot be fitted. Within one
        # size the quantity that varies is k, so regress against that.
        m = estimate_slope(df['k_value'], df['time'])
        complexity = get_complexity(m)
        print(f"  Size: {n}: {complexity} (m={m:.2f})")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

# Marker and line colour per algorithm, keyed by the function object
markers = {
    quickselect: 'o',
    sortselect: 's'
}
colors = {
    quickselect: 'blue',
    sortselect: 'orange'
}

# Not referenced by the plots in this cell; kept in case another cell uses it.
k_values_to_plot = range(20, 100, 15)


# Plot 1: mean run time against list size (log-scaled x axis)
plt.figure(figsize=(10, 6))
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.xscale("log")
plt.xlabel("List Size", fontsize=12)
plt.ylabel("Run Time (s)", fontsize=12)
plt.title("Comparison of Quickselect vs Sortselect", fontsize=16)


for algo, results in benchmark_results.items():
    results = pd.DataFrame(results)
    # Average the run time for each list size. Only the numeric 'time'
    # column is aggregated: the frame also holds the string column 'case',
    # which cannot be averaged.
    size_results = results.groupby('size', as_index=False)['time'].mean()
    plt.plot(size_results['size'], size_results['time'], label=algo.__name__,
             marker=markers[algo], color=colors[algo],
             linestyle='-', linewidth=1.5)

# Add legend
plt.legend(title="Algorithm", loc="upper left")
plt.ylim(bottom=0)
plt.tight_layout()
plt.show()


# Plot 2: mean run time against k (as a percentage of the list size)
plt.figure(figsize=(10, 6))
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.xlabel("k Value", fontsize=12)
plt.ylabel("Run Time (s)", fontsize=12)
plt.title("Comparison of Quickselect vs Sortselect", fontsize=16)


for algo, results in benchmark_results.items():
    results = pd.DataFrame(results)
    # Average the run time for each k value (numeric 'time' column only)
    k_results = results.groupby('k_value', as_index=False)['time'].mean()
    plt.plot(k_results['k_value'], k_results['time'], label=algo.__name__,
             marker=markers[algo], color=colors[algo],
             linestyle='-', linewidth=1.5)

# Add legend
plt.legend(title="Algorithm", loc="upper left")
plt.ylim(bottom=0)
plt.tight_layout()
plt.show()