In [None]:
import numpy as np
import random
from model import Model
from clustering import adaptive_spectral_partition
import seaborn as sns
from copy import deepcopy
from typing import Sequence, List, Set
from sklearn.cluster import SpectralClustering
import math
from random import sample
from scipy.io import savemat, loadmat
from itertools import combinations

In [None]:
def vi(predicted_labels: Sequence[int],
       true_labels: Sequence[int]) -> float:
    """Return the variation of information (VI) between two labelings.

    Each argument gives one cluster label per node, the node being identified
    by its position in the sequence. With ``p`` and ``q`` the cluster-size
    fractions of the two labelings and ``r`` the fraction of nodes shared by
    each pair of clusters, the value computed is
    ``-sum_ij r_ij * (log(r_ij / p_i) + log(r_ij / q_j))`` with natural
    logarithms; cluster pairs with no common node contribute nothing.

    Args:
        predicted_labels: Label of each node in the predicted clustering.
        true_labels: Label of each node in the reference clustering.
            NOTE(review): assumed to be as long as ``predicted_labels``;
            the length of ``predicted_labels`` is used for both and nothing
            checks that they match.

    Returns:
        The absolute value of the sum above; it is 0 when both labelings
        describe the same partition.
    """
    def expand(labels: Sequence[int]) -> List[Set[int]]:
        # Group node indices by label. Clusters are returned in the order in
        # which each label first appears.
        members = {}
        for node, label in enumerate(labels):
            members.setdefault(label, set()).add(node)
        return list(members.values())

    n = len(predicted_labels)

    predicted = expand(predicted_labels)
    p = [len(cluster) / n for cluster in predicted]

    true = expand(true_labels)
    q = [len(cluster) / n for cluster in true]

    # r[i][j]: fraction of all nodes lying in predicted cluster i AND true
    # cluster j.
    r = [[len(pred_cluster & true_cluster) / n for true_cluster in true]
         for pred_cluster in predicted]

    def term(i, j):
        # Contribution of one cluster pair; empty overlaps are skipped so
        # that log(0) is never evaluated.
        joint = r[i][j]
        if joint > 0:
            return joint * (np.log(joint / p[i]) + np.log(joint / q[j]))
        return 0

    total = sum(term(i, j)
                for j in range(len(true))
                for i in range(len(predicted)))
    # abs() guards against a "-0.0" or tiny negative value from rounding.
    return abs(total * -1)


In [None]:
def label_to_cluster(labels):
    """Convert a per-node label sequence into a list of node-index sets.

    ``result[k]`` holds every position ``i`` with ``labels[i] == k``; labels
    are used directly as list indices, so they are expected to be
    non-negative integers.

    The result has a slot for every label value up to the largest one seen.
    A label value that never occurs simply leaves an empty set at its index
    (previously a gap in the label values raised ``IndexError`` because the
    list was sized by the number of distinct labels only).

    Args:
        labels: Iterable of integer cluster labels, one per node.

    Returns:
        List of sets of node indices, indexed by label. An empty input
        yields an empty list.
    """
    labels = list(labels)
    if not labels:
        return []
    # The distinct-label count alone is too small when label values have
    # gaps (e.g. {0, 2}); make sure the largest label has a slot as well.
    num_clusters = max(len(set(labels)), max(labels) + 1)
    v_hat = [set() for _ in range(num_clusters)]
    for i, label in enumerate(labels):
        v_hat[label].add(i)
    return v_hat


In [None]:
def clustering_to_labeling(clusters):
    """Turn a list of node collections into a per-node label array.

    Args:
        clusters: Sequence of collections of node indices; the position of a
            collection in the sequence is the label given to its nodes.

    Returns:
        Integer numpy array whose length is the summed size of all clusters
        and whose entry ``v`` is the index of the cluster containing node
        ``v``. Entries that no cluster writes to stay 0, and when a node
        appears in several clusters the last one wins.
    """
    total_nodes = sum(len(members) for members in clusters)
    labeling = np.zeros(total_nodes, dtype=int)
    for cluster_id, members in enumerate(clusters):
        node_ids = list(members)
        if node_ids:
            # Assign the whole cluster in one indexed write.
            labeling[node_ids] = cluster_id
    return labeling

In [None]:
from itertools import product


def edge_error_rate(predicted_v, true_v):
    """Fraction of ordered node pairs on which two clusterings disagree.

    For each clustering an ``n x n`` 0/1 matrix is built whose ``(i, j)``
    entry is 1 when nodes ``i`` and ``j`` belong to a common cluster (the
    diagonal included). The result is the number of differing entries
    divided by ``n * n``.

    Args:
        predicted_v: Iterable of collections of node indices (predicted
            clusters).
        true_v: Iterable of collections of node indices (reference
            clusters); ``n`` is the summed size of these collections.

    Returns:
        The disagreement rate, between 0 and 1.
    """
    n = sum(len(cluster) for cluster in true_v)

    def co_membership(clusters):
        # 1 at (i, j) whenever i and j share a cluster, 0 elsewhere.
        matrix = np.zeros((n, n), dtype=int)
        for cluster in clusters:
            members = list(cluster)
            # np.ix_ selects the full members x members sub-block.
            matrix[np.ix_(members, members)] = 1
        return matrix

    A_hat = co_membership(predicted_v)
    A = co_membership(true_v)
    return (A_hat != A).sum() / (n * n)

In [None]:
# Load the 'Pconf' variable from the all-sports MATLAB file (the path is
# relative to the notebook's working directory). The experiment cell below
# installs it as the model's pair_prob_mat and uses its length as the number
# of nodes n.
p_conf = loadmat('../../mat/allSportsDataForMatlab.mat')['Pconf']

In [None]:
# Ground-truth cluster label per node: the first row of the stored
# 'all_sports_ground_truth' array, shifted down by one.
# NOTE(review): the shift presumably converts 1-based MATLAB labels into the
# 0-based values that label_to_cluster uses as list indices — confirm
# against the data file.
labels_true = loadmat('../../mat/all_sports_ground_truth.mat')['all_sports_ground_truth'][0] - 1

In [None]:
# --- Experiment: repeated adaptive spectral partitioning on the loaded data ---

# Number of nodes, and the kernel size ceil(n / (5 * ln n)) handed to the
# partitioning routine.
n = len(p_conf)
num_nodes_kernel = int(np.ceil(n / (5 * np.log(n))))

# Per-run accumulators: every repetition appends exactly one entry to each.
v_hats = []
v_hats_removed = []
overflow_nodes = []

T_s = []
vi_s = []
vi_no_over_s = []
edge_err_s = []
edge_err_no_over_s = []
K_pred_s = []
T_actual = []
p_hats = []
q_hats = []
mean_T = []

# Seed both generators once so the whole sequence of runs is reproducible.
random.seed(0)
np.random.seed(0)
alpha = [1 / 1 for _ in range(2)]
model = Model(2, n, alpha)
# Replace the model's pairwise probability matrix with the one from the file.
model.pair_prob_mat = p_conf
T = 60000

# The reference clustering is the same for every run, so build it once.
v_true = label_to_cluster(labels_true)

for it in range(10):
    # The per-run overflow result gets its own name: binding it to
    # `overflow_nodes` would clobber the accumulator above and make the later
    # append add the list to itself.
    v_hat, run_overflow_nodes, T_remained, p_hat, q_hat, observed_times = adaptive_spectral_partition(T, num_nodes_kernel, model)

    # Variant of the clustering in which every overflow node is pulled out of
    # its cluster and put back into a uniformly random cluster.
    v_hat_overflow_removed = deepcopy(v_hat)
    for v in run_overflow_nodes:
        for cluster in v_hat_overflow_removed:
            cluster.discard(v)

    for v in run_overflow_nodes:
        v_hat_overflow_removed[np.random.randint(len(v_hat))].add(v)

    labels_hat = clustering_to_labeling(v_hat)
    labels_hat_overflow_removed = clustering_to_labeling(v_hat_overflow_removed)

    voi = vi(labels_hat, labels_true)
    voi_overflow_removed = vi(labels_hat_overflow_removed, labels_true)

    edge_err = edge_error_rate(v_hat, v_true)
    edge_err_overflow_removed = edge_error_rate(v_hat_overflow_removed, v_true)

    # Store plain lists rather than the objects returned by the routine.
    v_hats.append(list(map(list, v_hat)))
    v_hats_removed.append(list(map(list, v_hat_overflow_removed)))
    overflow_nodes.append(list(run_overflow_nodes))

    K_predicted = len(v_hat)

    T_s.append(T)
    vi_s.append(voi)
    p_hats.append(p_hat)
    q_hats.append(q_hat)
    vi_no_over_s.append(voi_overflow_removed)
    edge_err_s.append(edge_err)
    edge_err_no_over_s.append(edge_err_overflow_removed)
    K_pred_s.append(K_predicted)
    # NOTE(review): this adds the magnitude of T_remained to the budget,
    # which is only the true usage if T_remained reports an overshoot —
    # confirm against adaptive_spectral_partition.
    T_actual.append(np.abs(T_remained) + T)
    mean_T.append(np.mean(list(observed_times.values())))

In [None]:
import pandas as pd
import json

# Raw per-run artifacts (budgets, clusterings, overflow nodes) kept next to
# the summary table.
meta_data = dict([
    ('T', T_s),
    ('v_hats', v_hats),
    ('v_hats_removed', v_hats_removed),
    ('overflow_nodes', overflow_nodes),
])

# Summary table with one row per run; the "_r" columns are the metrics of the
# clustering whose overflow nodes were reassigned at random.
df = pd.DataFrame(
    data=dict([
        ('T', T_s),
        ('VI', vi_s),
        ('VI_r', vi_no_over_s),
        ('p_hat', p_hats),
        ('q_hat', q_hats),
        ('edge_error_rate', edge_err_s),
        ('edge_error_rate_r', edge_err_no_over_s),
        ('K_predicted', K_pred_s),
        ('T_actual', T_actual),
        ('mean_T', mean_T),
    ])
)