In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import time
from collections import Counter
from imblearn.over_sampling import SMOTE

In [2]:
feature_file_name = "DNN_features_June20.txt"
label_file_name = "DNN_WeightLevel_Labels_June20.txt"

In [3]:
#PCA dimensionality reduction is applied (weight level labels)
dim_reduction_n = 2
x = np.loadtxt(feature_file_name)
y = np.loadtxt(label_file_name)
dim_reduction_x = PCA(n_components = dim_reduction_n).fit_transform(x)
print("original x shape: ", x.shape, " and PCA output shape: " , dim_reduction_x.shape )
starting = time.time()
model1 = KMeans(n_clusters=3).fit(dim_reduction_x)
after = time.time() - starting;
print("took ", after, " seconds")
print("cluster centers: ",model1.cluster_centers_ )
predicted_labels = model1.predict(dim_reduction_x) # KMEANS AFTER PCA
err = 0
for i in range(dim_reduction_x.shape[0]):
    pred = predicted_labels[i]
    real = int(y[i])
    print("predicted: ", pred, " and real: ", real)
    if(pred != real):
        err += 1
print("you have ", err, " errors from", dim_reduction_x.shape[0]," samples")
print("accuracy is ", 1-err/dim_reduction_x.shape[0], " for K-means after PCA")
np.savetxt("kmeans_predicted_labels_PCA",predicted_labels)

original x shape:  (213, 16)  and PCA output shape:  (213, 2)
took  0.05015993118286133  seconds
cluster centers:  [[-470.58065242   36.9777513 ]
 [ 714.91810142  114.24168267]
 [ 186.38037519  -91.73999178]]
predicted:  0  and real:  0
predicted:  2  and real:  1
predicted:  2  and real:  2
predicted:  0  and real:  0
predicted:  0  and real:  0
predicted:  0  and real:  1
predicted:  2  and real:  1
predicted:  2  and real:  2
predicted:  0  and real:  0
predicted:  0  and real:  0
predicted:  2  and real:  1
predicted:  1  and real:  2
predicted:  2  and real:  2
predicted:  0  and real:  0
predicted:  0  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  1
predicted:  2  and real:  2
predicted:  0  and real:  0
predicted:  0  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  1
predicted:  0  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  1
predicted:  2  and real:  1
predicted:  0  and real:  0
predict

In [4]:
# PCA not used, we directly fed all features into the model (weight level labels)
x = np.loadtxt(feature_file_name)
y = np.loadtxt(label_file_name)
starting = time.time()
model2 = KMeans(n_clusters=3).fit(x)
after = time.time() - starting;
print("took ", after, " seconds")
print("cluster centers: ",model2.cluster_centers_ )
predicted_labels = model2.predict(x) # KMEANS DIRECTLY
err = 0
for i in range(x.shape[0]):
    pred = predicted_labels[i]
    real = int(y[i])
    print("predicted: ", pred, " and real: ", real)
    if(pred != real):
        err += 1
print("you have ", err, " errors from", x.shape[0]," samples")
print("accuracy is ", 1-err/x.shape[0], " for K-means directly")
np.savetxt("kmeans_predicted_labels",predicted_labels)

took  0.10083603858947754  seconds
cluster centers:  [[1.45643718e+00 4.39946866e-01 3.58594402e-01 1.47658905e-01
  1.57039009e+00 1.74071478e+02 6.97050352e+02 7.13803308e+01
  5.08963150e-01 1.12398231e+00 2.07538965e-01 9.37130260e-02
  1.09801958e+00 4.46449780e+01 9.56043956e-01 4.33516484e+00]
 [2.62968107e+00 8.75852726e-01 1.14906101e+00 2.39457512e-01
  2.84210571e+00 5.50326074e+02 1.24900800e+03 1.15220831e+02
  6.64242905e-01 1.45419738e+00 3.33122988e-01 4.95461342e-01
  2.00680286e+00 4.56311155e+01 9.64285714e-01 8.32738095e+00]
 [3.44465184e+00 8.58974947e-01 1.05784558e+00 2.54926026e-01
  3.59967447e+00 5.69111000e+02 1.81555921e+03 1.36453689e+02
  6.20318500e-01 1.66318553e+00 3.67238289e-01 7.69526763e-01
  2.62516711e+00 4.52178395e+01 9.21052632e-01 8.94736842e+00]]
predicted:  0  and real:  0
predicted:  1  and real:  1
predicted:  1  and real:  2
predicted:  0  and real:  0
predicted:  0  and real:  0
predicted:  0  and real:  1
predicted:  1  and real:  1
pre

In [7]:
# K-means after UMAP for weight level labels
import umap
x = np.loadtxt(feature_file_name)
y = np.loadtxt(label_file_name)
reducer = umap.UMAP()
x = reducer.fit_transform(x)
print("shape after umap",x.shape)
starting = time.time()
model3 = KMeans(n_clusters=3).fit(x)
after = time.time() - starting;
print("took ", after, " seconds")
print("cluster centers: ",model3.cluster_centers_ )
predicted_labels = model3.predict(x) # KMEANS AFTER UMAP
err = 0
for i in range(x.shape[0]):
    pred = predicted_labels[i]
    real = int(y[i])
    print("predicted: ", pred, " and real: ", real)
    if(pred != real):
        err += 1
print("you have ", err, " errors from", x.shape[0]," samples")
print("accuracy is ", 1-err/x.shape[0], " for K-means after UMAP")
np.savetxt("kmeans_predicted_labels_after_umap",predicted_labels)

shape after umap (213, 2)
took  0.06919360160827637  seconds
cluster centers:  [[ 8.344865    6.873944  ]
 [-0.09989309 11.907524  ]
 [ 6.485651   10.798017  ]]
predicted:  1  and real:  0
predicted:  2  and real:  1
predicted:  2  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  1  and real:  1
predicted:  2  and real:  1
predicted:  2  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  2  and real:  1
predicted:  0  and real:  2
predicted:  2  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  1
predicted:  0  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  0  and real:  0
predicted:  0  and real:  1
predicted:  1  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  0
predicted:  2  and real:  1
predicted:  0  and real:  1
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  2  and real:  0

In [12]:
# apply SMOTE, then KMEANS
x = np.loadtxt(feature_file_name)
y = np.loadtxt(label_file_name)
counter = Counter(y)
print(counter)

oversample = SMOTE()
x,y = oversample.fit_resample(x,y)
print("After oversampling...")
print("X = ", x.shape)
print("y = ", y.shape)
counter = Counter(y)
print(counter)

dim_reduction_n = 2
dim_reduction_x = PCA(n_components = dim_reduction_n).fit_transform(x)
print("original x shape: ", x.shape, " and PCA output shape: " , dim_reduction_x.shape )

starting = time.time()
model4 = KMeans(n_clusters=3).fit(dim_reduction_x)
after = time.time() - starting;
print("took ", after, " seconds")
print("cluster centers: ",model4.cluster_centers_ )
predicted_labels = model4.predict(dim_reduction_x) # KMEANS AFTER SMOTE AND PCA
err = 0
for i in range(dim_reduction_x.shape[0]):
    pred = predicted_labels[i]
    real = int(y[i])
    print("predicted: ", pred, " and real: ", real)
    if(pred != real):
        err += 1
print("you have ", err, " errors from", dim_reduction_x.shape[0]," samples")
print("accuracy is ", 1-err/dim_reduction_x.shape[0], " for K-means after SMOTe and PCA")
np.savetxt("Kmeans_predicted_labels_after_SMOTE_and_PCA",predicted_labels)

Counter({0.0: 89, 1.0: 72, 2.0: 52})
After oversampling...
X =  (267, 16)
y =  (267,)
Counter({0.0: 89, 1.0: 89, 2.0: 89})
original x shape:  (267, 16)  and PCA output shape:  (267, 2)
took  0.0746450424194336  seconds
cluster centers:  [[ 110.18155551  -80.76497822]
 [-530.91140117   45.76396339]
 [ 633.88323662  107.69140029]]
predicted:  1  and real:  0
predicted:  0  and real:  1
predicted:  0  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  1  and real:  1
predicted:  0  and real:  1
predicted:  0  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  0  and real:  1
predicted:  2  and real:  2
predicted:  0  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  0  and real:  0
predicted:  0  and real:  1
predicted:  0  and real:  2
predicted:  1  and real:  0
predicted:  1  and real:  0
predicted:  0  and real:  0
predicted:  0  and real:  1
predicted:  1  and real:  0
predicted:  0  and real: 