# In [None]:
import numpy as np
from multiprocessing import Pool
import matplotlib.pyplot as plt
import time

# Output CSV that greedy_clustering writes the selected representatives to.
REP_FILE = 'representatives.csv'
# Output CSV that greedy_clustering writes the data points to, each with the
# index of its assigned representative appended as the last column.
ASGNMT_FILE = 'assignments.csv'

def euclidean(a, b):
    """Return the Euclidean (L2) distance between two points.

    Args:
        a: NumPy array holding the first point.
        b: NumPy array holding the second point (same shape as ``a``).

    Returns:
        The norm of the element-wise difference ``a - b``.
    """
    difference = a - b
    return np.linalg.norm(difference)

def worker(chunk, tau):
    """Greedily pick representatives from one chunk of points.

    Points are visited in order. A point becomes a new representative only
    when no previously selected representative lies strictly closer than
    ``tau`` to it; otherwise it is considered covered and skipped.

    Args:
        chunk: iterable of points (e.g. rows of a 2-D NumPy array).
        tau: distance threshold used for the coverage test.

    Returns:
        list: the selected representatives, in the order they were found.
    """
    selected = []
    for point in chunk:
        # any() stops at the first representative that covers the point.
        covered = any(euclidean(point, rep) < tau for rep in selected)
        if covered:
            continue
        selected.append(point)
    return selected


def greedy_clustering(file_path, tau, seed, nr_workers):
    """Reduce a data set to a set of representatives via greedy clustering.

    The rows of the CSV file are shuffled, split into ``nr_workers`` chunks,
    and each chunk is reduced to local representatives in a separate process.
    The local representatives are then merged with the same greedy rule, and
    every data point is labelled with the index of a representative.

    Side effects: writes the labelled data to ``ASGNMT_FILE`` and the
    representatives to ``REP_FILE``, and prints progress information.

    Args:
        file_path (str): path of the comma-separated data set.
        tau (float): distance threshold. A point becomes a representative
            when no existing representative is strictly closer than ``tau``.
        seed (int): seed for shuffling the rows.
        nr_workers (int): number of worker processes (and data chunks).

    Returns:
        float: number of representatives divided by number of data points,
        i.e. how much smaller the reduced data set is (0.1 means the reduced
        set has 10% of the size of the original). A value in (0, 1].

    Raises:
        ValueError: if the file contains no data points.
    """

    # Read Data
    print("Read data")
    # ndmin=2 keeps a single-row file as shape (1, n_features). Without it
    # loadtxt returns a 1-D array, the shuffle below would permute the
    # coordinates of that single point, and the final hstack would fail.
    data = np.loadtxt(file_path, delimiter=',', ndmin=2)
    print(f"Dataset shape: {data.shape}")
    if data.size == 0:
        raise ValueError(f"No data points found in {file_path!r}")
    rng = np.random.default_rng(seed=seed)
    rng.shuffle(data)

    ### TODO: Insert code below
    chunks = np.array_split(data, nr_workers)

    with Pool(processes=nr_workers) as pool:
        local_representatives_list = pool.starmap(
            worker,
            [(chunk, tau) for chunk in chunks]
        )

    # Merge the local representatives into a global list. This is the same
    # greedy rule the workers apply, so reuse it on the concatenated results
    # (in chunk order) instead of duplicating the loop.
    representatives = worker(
        [rep for local in local_representatives_list for rep in local],
        tau,
    )

    # Label each data point with the first representative closer than tau.
    # A point whose local representative was dropped during the merge may
    # have no representative within tau; it falls back to the nearest one.
    assignments = []
    for point in data:
        label = next(
            (idx for idx, rep in enumerate(representatives)
             if euclidean(point, rep) < tau),
            None,
        )
        if label is None:
            label = np.argmin([euclidean(point, rep) for rep in representatives])
        assignments.append(label)

    ### TODO: Insert code above

    ### Save preprocessed data
    ### NOTE: the object 'representatives' holds an iterable (list of lists or NumPy ndarray) with the selected representatives in shape (num_representatives, 6).
    ### NOTE: the object 'data_with_labels' holds an iterable (list of lists or NumPy ndarray) with the data points and their assigned cluster (index of representative)
    ###  in shape (num_data_points, 7) where the last column is the index of the representative from the 'representatives' object.
    data_with_labels = np.hstack((data, np.array(assignments).reshape((-1, 1))))
    np.savetxt(ASGNMT_FILE, data_with_labels, delimiter=",")
    np.savetxt(REP_FILE, representatives, delimiter=",")
    ratio = len(representatives) / len(data)
    print(f"Ratio of representatives: {ratio:.4f}")  
    return ratio

if __name__ == "__main__":
    # Script entry point: run the clustering once per threshold on the small
    # data set and report the resulting reduction ratio.
    dataset = 'small_dataset.csv'
    thresholds = [11750, 33.2, 16.351]  # tau values to try
    shuffle_seed = 42
    worker_count = 8

    for tau in thresholds:
        ratio = greedy_clustering(dataset, tau, shuffle_seed, worker_count)
        if ratio is None:
            continue
        print(f"For tau={tau}, Ratio of representatives to total data points: {ratio:.4f}")

    # Disabled sanity checks on the expected reduction ratio for tau=200:
    #assert (0.08 < greedy_clustering('dataset.csv', 200, 42, 4) < 0.09) 
    #assert (0.08 < greedy_clustering('small_dataset.csv', 200, 42, 4) < 0.09) 
