# FlowClus – Clean Clustering Notebook for Colab

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the NetFlow training sample from the Colab working directory.
df = pd.read_csv("/content/netflow_sampling_250_50-50_train.csv")

# Derived per-flow features.
# duration: difference between the 'last' and 'first' columns.
df['duration'] = df['last'].sub(df['first'])
# bytes_per_pkt: octets per packet; the +1 keeps the denominator non-zero
# when 'dpkts' is 0.
df['bytes_per_pkt'] = df['doctets'].div(df['dpkts'].add(1))

# Columns fed to every clustering model below.
features = [
    'dpkts', 'doctets',
    'srcport', 'dstport',
    'prot', 'tcp_flags',
    'duration', 'bytes_per_pkt',
]

# Missing values become 0, then each column is standardized.
X = df.loc[:, features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 2..9 and record the within-cluster
# sum of squares (inertia) of each fit.
K = range(2, 10)
inertias = []
for k in K:
    model = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertias.append(model.inertia_)

# Plot inertia against k so the "elbow" can be picked visually.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K, inertias, 'o-')
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia (WCSS)")
ax.set_title("Elbow Method for KMeans")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, pairwise_distances
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

def dunn_index(X, labels):
    """Return the Dunn index of a clustering.

    The index is the smallest distance between points of two different
    clusters divided by the largest distance between two points of the
    same cluster (the largest cluster diameter). Larger is better.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates.
    labels : array-like of shape (n_samples,)
        Cluster label of every sample. The label ``-1`` marks noise and is
        ignored.

    Returns
    -------
    float
        The Dunn index, or ``nan`` when fewer than two non-noise clusters
        exist. When every cluster has zero diameter (for example all
        clusters are singletons) the result is ``inf``, or ``nan`` if the
        minimum separation is zero as well.
    """
    from itertools import combinations

    # Accept lists / pandas objects as well as arrays: the boolean masks
    # below require element-wise comparison.
    X = np.asarray(X)
    labels = np.asarray(labels)

    cluster_ids = np.unique(labels)
    cluster_ids = cluster_ids[cluster_ids != -1]
    if len(cluster_ids) < 2:
        return np.nan

    # Slice each cluster once instead of once per pair.
    members = [X[labels == cid] for cid in cluster_ids]

    # A singleton cluster has diameter 0, so including it never changes the
    # maximum but guarantees the maximum is taken over a non-empty set.
    max_diameter = max(np.max(cdist(pts, pts)) for pts in members)
    min_separation = min(
        np.min(cdist(a, b)) for a, b in combinations(members, 2)
    )

    # A zero diameter yields inf (or nan for 0/0) without a warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.float64(min_separation) / np.float64(max_diameter)

def beta_cv(X, labels):
    """Return the BetaCV measure of a clustering.

    BetaCV is computed here as the mean of the per-cluster mean
    intra-cluster pairwise distances divided by the mean distance over all
    pairs of points that belong to different clusters. Smaller is better.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates.
    labels : array-like of shape (n_samples,)
        Cluster label of every sample. The label ``-1`` marks noise; noise
        points take part in neither the intra nor the inter distances.

    Returns
    -------
    float
        The BetaCV value, or ``nan`` when there is no intra-cluster pair
        (every cluster is a singleton) or no inter-cluster pair (fewer than
        two non-noise clusters).

    Notes
    -----
    Distances are Euclidean, computed with ``cdist`` on the non-noise points
    only. All pair selection is vectorized; no Python-level loop over point
    pairs is performed.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Drop noise up front so no distance is computed for it.
    clustered = labels != -1
    points = X[clustered]
    point_labels = labels[clustered]
    n_points = len(point_labels)
    if n_points < 2:
        return np.nan

    # One entry per unordered pair (i < j) of non-noise points.
    rows, cols = np.triu_indices(n_points, k=1)
    pair_dist = cdist(points, points)[rows, cols]

    # Map arbitrary labels to 0..k-1 so they can index bincount.
    _, cluster_idx = np.unique(point_labels, return_inverse=True)
    cluster_idx = np.ravel(cluster_idx)
    same_cluster = cluster_idx[rows] == cluster_idx[cols]

    inter = pair_dist[~same_cluster]
    if inter.size == 0 or not same_cluster.any():
        return np.nan

    # Mean intra distance per cluster; clusters without any pair
    # (singletons) are skipped.
    owner = cluster_idx[rows][same_cluster]
    pair_sums = np.bincount(owner, weights=pair_dist[same_cluster])
    pair_counts = np.bincount(owner)
    has_pairs = pair_counts > 0
    intra_means = pair_sums[has_pairs] / pair_counts[has_pairs]

    # A zero mean inter distance yields inf/nan without a warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return intra_means.mean() / inter.mean()

def evaluate_all(X, labels, name):
    """Print a report of internal validation metrics for one clustering.

    A header with ``name`` is always printed. If the labelling has at most
    one distinct value, or consists only of the noise label ``-1``, the
    message "Not enough clusters." is printed and nothing is computed.
    Otherwise the silhouette, Davies-Bouldin, Calinski-Harabasz, Dunn and
    BetaCV values are computed and printed one after another, in that order.

    Parameters
    ----------
    X : samples the clustering was computed on.
    labels : cluster label per sample.
    name : title shown in the report header.

    Returns
    -------
    None
    """
    print(f"=== {name} ===")

    n_distinct = len(np.unique(labels))
    if n_distinct <= 1 or np.all(labels == -1):
        print("Not enough clusters.")
        return

    # (line prefix, metric function, float format) in report order.
    report_rows = (
        ("Silhouette Score        : ", silhouette_score, ".3f"),
        ("Davies-Bouldin Index    : ", davies_bouldin_score, ".3f"),
        ("Calinski-Harabasz Score : ", calinski_harabasz_score, ".1f"),
        ("Dunn Index              : ", dunn_index, ".3f"),
        ("BetaCV                  : ", beta_cv, ".3f"),
    )
    # Each metric is computed right before its line is printed, so a failure
    # in a later metric still leaves the earlier lines on screen.
    for prefix, metric, spec in report_rows:
        value = metric(X, labels)
        print(f"{prefix}{value:{spec}}")


In [None]:
from sklearn.cluster import KMeans

# --- KMeans with four clusters ---
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_scaled)
df['kmeans'] = kmeans.labels_
evaluate_all(X_scaled, kmeans.labels_, "KMeans (k=4)")

# --- OPTICS; label -1 marks noise and is left out of the evaluation ---
optics = OPTICS(min_samples=10)
optics_labels = optics.fit_predict(X_scaled)
df['optics'] = optics_labels
optics_clustered = optics_labels != -1
evaluate_all(
    X_scaled[optics_clustered],
    optics_labels[optics_clustered],
    "OPTICS (excluding noise)",
)

# --- Gaussian mixture with four components ---
gmm = GaussianMixture(n_components=4, random_state=42)
gmm.fit(X_scaled)
df['gmm'] = gmm.predict(X_scaled)
evaluate_all(X_scaled, df['gmm'], "Gaussian Mixture (k=4)")

# --- Agglomerative clustering with four clusters ---
agglo = AgglomerativeClustering(n_clusters=4)
df['agglo'] = agglo.fit_predict(X_scaled)
evaluate_all(X_scaled, df['agglo'], "Agglomerative Clustering (k=4)")


In [None]:
# Persist the flows together with the cluster label columns added above.
output_csv = "netflow_with_clusters_clean.csv"
df.to_csv(output_csv, index=False)
print(f"Saved: {output_csv}")
