In [1]:
import matplotlib.pyplot as plt
#from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import sklearn
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import time

In [2]:
# Baseline K-Means estimator: three clusters, randomly chosen starting
# centroids, ten restarts, at most 300 iterations per restart, fixed seed.
kmeans = KMeans(n_clusters=3, init="random", n_init=10, max_iter=300, random_state=42)

In [3]:
# Load the first 50,000 rows of the capture. `nrows` stops the parser after
# those rows instead of reading the whole file and discarding the rest.
# The "No." column is dropped before any further processing.
X = pd.read_csv('truncatedHashtx.1.csv', nrows=50000).drop(columns=["No."])

In [4]:
# List of the column names with nominal features that should be one-hot encoded
onehot_features = ['Source', 'Destination', 'Protocol', 'Info']

# One-hot encode these features. The encoder is left at its default (sparse)
# output and densified explicitly: the keyword that requests dense output was
# renamed from `sparse` to `sparse_output` across scikit-learn releases, so
# `.toarray()` is the spelling that works on all of them.
onehot = sklearn.preprocessing.OneHotEncoder()
encoded = onehot.fit_transform(X[onehot_features]).toarray()

# Wrap the array in a DataFrame that shares X's row index, so the concat below
# pairs rows by construction rather than by coincidence of index values.
encoded = pd.DataFrame(encoded, index=X.index)

# One-hot encoding produces more columns than it consumes, so the original
# nominal columns are dropped and the encoded ones are appended in their place.
X = pd.concat([X.drop(columns=onehot_features), encoded], axis=1)

# The encoded columns are labelled with integers while the remaining original
# columns have string names; recent scikit-learn releases reject frames whose
# column labels mix types, so make every label a string.
X.columns = X.columns.astype(str)

# Scale features. Note that X is a NumPy array (not a DataFrame) from here on.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [7]:
# Inspect the dimensions (rows, columns) of X after encoding and scaling.
X.shape

(50000, 23934)

In [None]:
# Fit K-Means for several cluster counts and record the SSE (inertia) of each
# fit; the SSE curve and the elbow locator are then used to choose k.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

# A list holds the SSE values for each k.
# KMeans needs at least one cluster, so the candidates start at 1 (a bare
# `range(1)` would only try k=0). 1..3 matches the x-axis of the SSE plot.
sse = []
for k in range(1, 4):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

In [None]:
 # Plot SSE against the number of clusters. The x values are derived from the
 # length of `sse` (one entry per k, starting at k=1) so that x and y always
 # have the same number of points.
 plt.style.use("fivethirtyeight")
 cluster_counts = range(1, len(sse) + 1)
 plt.plot(cluster_counts, sse)
 plt.xticks(cluster_counts)
 plt.xlabel("Number of Clusters")
 plt.ylabel("SSE")
 plt.show()

In [8]:
# Locate the elbow of the SSE curve. The x values are derived from the length
# of `sse` (one entry per k, starting at k=1) so they always pair up with it.
# NOTE: KneeLocator comes from the `kneed` package, whose import is commented
# out in the import cell; it must be enabled for this cell to run.
kl = KneeLocator(
    range(1, len(sse) + 1), sse, curve="convex", direction="decreasing"
)

kl.elbow

NameError: name 'KneeLocator' is not defined

In [None]:
# Start the timer for the PCA + K-Means pipeline; the elapsed time is printed
# by the cell that clusters the projection.
start_time = time.time()

# Project the scaled features onto two principal components. PCA may use a
# randomized solver, so the seed is fixed (same value as the K-Means seed) to
# make the projection repeatable across runs.
pca = PCA(n_components=2, random_state=42)
X1_pca = pca.fit_transform(X)

In [None]:
# Cluster the 2-D PCA projection into three groups, then look up the cluster
# assigned to every row.
kmeans = KMeans(n_clusters=3, init="random", n_init=10, max_iter=300, random_state=42)
kmeans.fit(X1_pca)
label = kmeans.predict(X1_pca)

# Seconds elapsed since start_time was set in the PCA cell.
print(time.time() - start_time)

In [None]:
# Scatter-plot the PCA projection, one colour per K-Means cluster.
# KMeans was fitted with n_clusters=3, so the only labels it can emit are
# 0, 1 and 2; filtering on any other value selects no rows at all.
cluster_colors = {0: 'red', 1: 'black', 2: 'blue'}

plt.title("Colgate's Network Data KMeans and PCA")

for cluster_id, color in cluster_colors.items():
    # Rows of the projection assigned to this cluster
    members = X1_pca[label == cluster_id]
    print(len(members))
    plt.scatter(members[:, 0], members[:, 1], color=color)
plt.grid(True)

plt.show()

In [None]:
# Start the timer for the t-SNE + K-Means pipeline; the elapsed time is printed
# by the cell that clusters the embedding.
start_time = time.time()

# Embed the scaled features in two dimensions. t-SNE is stochastic, so the seed
# is fixed (same value as the K-Means seed) to make the embedding repeatable.
tsne = TSNE(n_components=2, random_state=42)
X2_tsne = tsne.fit_transform(X)

In [None]:
# Cluster the 2-D t-SNE embedding into three groups, then look up the cluster
# assigned to every row.
kmeans = KMeans(n_clusters=3, init="random", n_init=10, max_iter=300, random_state=42)
kmeans.fit(X2_tsne)
label = kmeans.predict(X2_tsne)

# Seconds elapsed since start_time was set in the t-SNE cell.
print(time.time() - start_time)

In [None]:
# Scatter-plot the t-SNE embedding, one colour per K-Means cluster.
# The cluster assignments live in `label` (bound by the preceding clustering
# cell). KMeans was fitted with n_clusters=3, so the only labels it can emit
# are 0, 1 and 2; filtering on any other value selects no rows at all.
cluster_colors = {0: 'black', 1: 'red', 2: 'blue'}

plt.title("Colgate's Network Data KMeans and t-SNE")

for cluster_id, color in cluster_colors.items():
    # Rows of the embedding assigned to this cluster
    members = X2_tsne[label == cluster_id]
    print(len(members))
    plt.scatter(members[:, 0], members[:, 1], color=color)
plt.grid(True)

plt.show()