In [5]:
import numpy as np
import tensorflow as tf
from sklearn.cluster import DBSCAN, KMeans
from matplotlib import pyplot as plt
from math import inf

In [6]:
print(f' TensorFlow version: {tf.__version__}')
print(f' NumPy versio: {np.__version__}')

 TensorFlow version: 2.10.0
 NumPy versio: 1.23.4


## Algorithms implementation

### Utils

In [112]:
def dbscan(x):
    x_list = x.tolist()
    clustering = DBSCAN(eps=2).fit(x.reshape(-1, 1))
    x_labels = clustering.labels_.tolist()
    clusters = [[] for _ in range(max(x_labels)+1)]
    print(clusters)
    print(x_labels)
    noice = []
    for i in range(len(x_list)):
        if x_labels[i] == -1:
            noice.append(x_list[i])
        else:
            clusters[x_labels[i]].append(x_list[i])
    return clusters, noice

### Stage 1 - Flow clustering

In [109]:
def subset_of_sets(_set: set, sets):
    return len([_ for s in sets if _set.issubset(s)]) > 0


def tp_cluster(clusters: set, noise: set, tp_ratio: float, tp_deviation: float):
    """
        Parameters
        clusters : set
            Set of DBSCAN cluster sets in descending throughput order
        noise : set
            Set of DBSCAN noice flows
        tp_ratio : float
            Ratio used to determine if two DBSCAN clusters can be combined into one TPCluster
        tp_deviation : float
            The relative distance a noise flow can be away from a TPCluster to be assigned to that cluster

        Returns
        cs : set
            set of TPClusters
    """

    r = 0
    cs = []
    for cluster in clusters:
        if not subset_of_sets(cluster, cs):
            cs.append(cluster)
            m = max(cluster)
            for cluster_k in clusters:
                if cluster_k is cluster: continue
                m_prim = max(cluster_k)
                if (1 - tp_ratio) * m < m_prim < m:
                    cs[r] += cluster_k
            r += 1
    for n_j in noise:
        delta_min = inf
        a = None
        for i in range(len(cs)):
            m = max(cs[i])
            if (-tp_deviation * m) <= (m - n_j) <= delta_min:
                delta_min = m - n_j
                a = i
        if a:
            cs[a] += n_j
        else:
            cs[0] += n_j
    return cs


### Stage 2 - FOF computation

In [110]:
def compute_fof(clusters):
    """
        Parameters
        clusters : set
            Set of TPCluster sets

        Returns:
            FOF score for each flow in each cluster
    """
    k = len(clusters)
    for c_i in clusters:
        s = KMeans(n_clusters=k).fit(np.array(c_i))
        c_i_prim = max()

## Testing

In [113]:
X1 = np.random.normal(10, 1, 10)
X2 = np.random.normal(20, 1, 10)
X3 = np.random.normal(30, 1, 10)
X = np.concatenate((X1, X2, X3), axis=0)
print(dbscan(X))
# X_list = X.tolist()
# X = X.reshape(-1, 1)
# clustering = DBSCAN(eps=2).fit(X)
# X_labels = clustering.labels_.tolist()
# print(X_labels)
# print(X_list)

[[], [], []]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
([[9.734024296675189, 10.030642713019452, 11.846400303539433, 9.131991272465905, 11.080310207003981, 12.269035371173128, 10.806924453755322, 8.61498353397189, 10.235552246670562, 8.580138089749353], [19.67926213444093, 20.9429052313879, 19.223165510964897, 17.75116355107008, 19.65613929608209, 20.081618773816892, 18.806581854896645, 21.88592524428429, 21.004368486680935, 20.327905747263316], [32.0373963062916, 28.897773655470274, 30.640691183459793, 28.63815949129994, 30.38541875010265, 30.373793978207246, 31.730555522824222, 32.341630293267464, 31.776490032379677, 28.93561965653828]], [])
