In [None]:
import os
from math import log, floor
# Move the working directory one level up so that the `src` package imports and
# the ./data/... paths used below resolve from there.
# NOTE: this relative chdir is not idempotent - re-running this cell in the same
# kernel moves up one more directory each time.
os.chdir("../")

from src.config import Distributed_Affinity_Params, Naive_Algo_Params, Affinity_Params
from src.utils import getMetadataSats, getEdge, rand_index, Graph
from src.algos import DistributedAffinity, NaiveHierarchical

In [None]:
# Dataset prefix used to build the input file names under ./data/inputs/.
dataset_name = "BANKNOTE"

# Initial cluster count; it is replaced by the number of ground-truth clusters
# once the labels file is loaded.
n_clusters = 10

In [None]:
# Load the ground-truth clustering: each line of the labels file is evaluated
# and converted to a list, giving one cluster per line.
# NOTE(security): eval() executes whatever each line contains, so the labels
# file must be trusted. If every line is a plain Python literal,
# ast.literal_eval would be a safer replacement - confirm the file format first.
with open(f"./data/inputs/{dataset_name}-labels.txt") as f:
    truth_clusters = [list(eval(line.strip("\n"))) for line in f]
# The `with` block has already closed the file; no explicit close() is needed.

# The target number of clusters is the number of ground-truth clusters.
n_clusters = len(truth_clusters)

# Distinct vertices appearing in any ground-truth cluster.
vertices = list(set(vertex for cluster in truth_clusters for vertex in cluster))

# Distributed Affinity Clustering

In [None]:
# Parameters for the distributed affinity run: it reads the "<dataset>-net"
# input and uses the cluster count computed from the ground truth as k.
params = Distributed_Affinity_Params(
    sc_name="Affinity",
    data_name=f"{dataset_name}-net",
    k=n_clusters,
    eps=0.3,
)

In [None]:
# Read the graph input named by params.data_name, one entry per line with the
# newline characters stripped.
with open(f"./data/inputs/{params.data_name}.txt") as f:
    graph = [line.strip("\n") for line in f]
# The `with` block has already closed the file; no explicit close() is needed.

# getMetadataSats yields three values, unpacked here as (n_graph, m_graph, c).
n_graph, m_graph, c = getMetadataSats(graph)

In [None]:
# Summary of the loaded graph, one statistic per output line.
print(
    f"Number of vertices : {n_graph}",
    f"Number of edges : {m_graph}",
    f"Value of c : {round(c, 3)}",
    sep="\n",
)

In [None]:
# Round bound and machine count derived from c, eps and the vertex count.
mst_rounds = floor(log(c / params.eps)) + 1
n_machines = int(n_graph ** (c - params.eps))

print(f"Finding the MST of G that runs in at most {mst_rounds} rounds")
print(f"Number of machines : {n_machines}")

In [None]:
# Build the distributed affinity runner from the current `params`.
dist_affinity = DistributedAffinity(params)
# Bare expression: shows the runner's `sc` attribute as the cell output
# (presumably the Spark context named by sc_name - TODO confirm).
dist_affinity.sc

In [None]:
# Run the distributed algorithm and keep its result; later cells read it as a
# mapping through .keys() and .values().
distributed_clustering = dist_affinity.compute()

In [None]:
# Show the keys of the clustering returned by the distributed run.
print(distributed_clustering.keys())

In [None]:
# Report the computation time recorded by the distributed run, to 0.1 s.
distributed_seconds = round(dist_affinity.computation_time, 1)
print(f"The distributed algorithm ran in {distributed_seconds} seconds on the {params.data_name} dataset")

# Naive approach

In [12]:
# Parameters for the naive baseline: it reads the "<dataset>-tab" input and
# uses the same cluster count as the distributed run.
# NOTE: this rebinds `params`, which previously held the distributed
# configuration; any earlier cell re-run after this one will see these values.
params = Naive_Algo_Params(
    data_name=f"{dataset_name}-tab",
    k=n_clusters
)
# Build the naive hierarchical runner and compute its clustering.
naive_hierar = NaiveHierarchical(params)
# NOTE: the variable name is misspelled ("clsutering") but later cells refer to
# it under this exact name, so it has to be renamed everywhere at once.
naive_clsutering = naive_hierar.compute()

In [None]:
# Show the keys of the clustering returned by the naive run.
print(naive_clsutering.keys())

In [None]:
# Report the computation time recorded by the naive run, to 0.1 s.
# The sentence now reads "ran in N seconds"; the original omitted "in".
print(f"The naive hierarchical algo ran in {round(naive_hierar.computation_time, 1)} seconds on the {params.data_name} dataset")

# Comparison

In [None]:
# Score the naive clustering against the ground truth with rand_index.
naive_clusters = list(naive_clsutering.values())
r_naive = rand_index(vertices, naive_clusters, truth_clusters)

# Bare expression so the score is shown as the cell output.
r_naive

In [None]:
# Score the distributed clustering against the ground truth with rand_index.
distributed_clusters = list(distributed_clustering.values())
r_distributed = rand_index(vertices, distributed_clusters, truth_clusters)

# Bare expression so the score is shown as the cell output.
r_distributed