In [5]:
import numpy as np
import tensorflow as tf
from sklearn.cluster import DBSCAN, KMeans
from matplotlib import pyplot as plt
from math import inf

In [6]:
print(f' TensorFlow version: {tf.__version__}')
print(f' NumPy versio: {np.__version__}')

 TensorFlow version: 2.10.0
 NumPy versio: 1.23.4


## Algorithms implementation

### Utils

In [114]:
def dbscan(x):
    x_list = x.tolist()
    clustering = DBSCAN(eps=2).fit(x.reshape(-1, 1))
    x_labels = clustering.labels_.tolist()
    clusters = [set() for _ in range(max(x_labels)+1)]
    print(clusters)
    print(x_labels)
    noice = set()
    for i in range(len(x_list)):
        if x_labels[i] == -1:
            noice.add(x_list[i])
        else:
            clusters[x_labels[i]].add(x_list[i])
    return clusters, noice

### Stage 1 - Flow clustering

In [115]:
def subset_of_sets(_set: set, sets):
    return len([_ for s in sets if _set.issubset(s)]) > 0


def tp_cluster(clusters: set, noise: set, tp_ratio: float, tp_deviation: float):
    """
        Parameters
        clusters : set
            Set of DBSCAN cluster sets in descending throughput order
        noise : set
            Set of DBSCAN noice flows
        tp_ratio : float
            Ratio used to determine if two DBSCAN clusters can be combined into one TPCluster
        tp_deviation : float
            The relative distance a noise flow can be away from a TPCluster to be assigned to that cluster

        Returns
        cs : set
            set of TPClusters
    """

    r = 0
    cs = set()
    for cluster in clusters:
        if not subset_of_sets(cluster, cs):
            cs.add(cluster)
            m = max(cluster)
            for cluster_k in clusters:
                if cluster_k is cluster: continue
                m_prim = max(cluster_k)
                if (1 - tp_ratio) * m < m_prim < m:
                    cs[r] += cluster_k
            r += 1
    for n_j in noise:
        delta_min = inf
        a = None
        for i in range(len(cs)):
            m = max(cs[i])
            if (-tp_deviation * m) <= (m - n_j) <= delta_min:
                delta_min = m - n_j
                a = i
        if a:
            cs[a] += n_j
        else:
            cs[0] += n_j
    return cs


### Stage 2 - FOF computation

In [149]:
def compute_fof(cs):
    """
        Parameters
        clusters : set
            Set of TPCluster sets

        Returns:
            FOF score for each flow in each cluster
    """
    f = [[_ for _ in c_i] for c_i in range(len(cs))]
    k = len(cs)
    for i in range(k):
        c_np = np.array(cs[i])
        s_labels = KMeans(n_clusters=k).fit(c_np).labels_
        s_count = max(s_labels) + 1
        c_prim = 0
        for s_i in range(s_count):
            c_prim = max(c_prim, np.sum(c_np * (s_labels == s_i)) / np.sum(s_labels == s_i))
        for j in range(len(cs[i])):
            f[i][j] = np.abs(cs[i][j] - c_prim) / np.abs(c_prim)
    return f


## Testing

In [124]:
X1 = np.random.normal(10, 1, 10)
X2 = np.random.normal(20, 1, 10)
X3 = np.random.normal(30, 1, 10)
X = np.concatenate((X1, X2, X3), axis=0)
print(dbscan(X))
# X_list = X.tolist()
# X = X.reshape(-1, 1)
# clustering = DBSCAN(eps=2).fit(X)
# X_labels = clustering.labels_.tolist()
# print(X_labels)
# print(X_list)


[set(), set(), set()]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
([{8.88207201684526, 9.146816162940052, 10.084031945135939, 10.534275625039351, 9.801052694831732, 12.222264593884878, 10.5973661896395, 8.960964647171712, 9.262700356650672, 9.194451476084557}, {18.681823377792018, 19.993007689387323, 20.855915864460226, 19.967380782270233, 20.81909999115837, 21.61455636981925, 19.733090112459152, 21.808516765189367, 21.040357994853988, 20.194217769192708}, {28.940294082493203, 29.272453610623142, 29.027821176550827, 29.168874210128074, 29.98156195250162, 30.400773198174765, 31.702914621691463, 29.929637588582587, 30.685643257600322, 31.271522814556164}], set())


In [148]:
X = np.random.normal(10, 1, 10)
k_means = KMeans(n_clusters=4).fit(X.reshape(-1, 1)).labels_
print(np.sum(X * (k_means == 0)) / np.sum(k_means == 0))

10.4463600567482
