In [1]:
import numpy as np
import os

from sklearn import metrics, cluster, preprocessing
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd
from itertools import cycle, islice

# Data Preparation/Normalization


The following cells import the Pulsar dataset to be clustered, transforms the dataset from a dataframe to a numpy array, and normalizes the values using Sklearn's built in RobustScalar and StandardScaler.

In [2]:
 path = "../data"

pulsar_data = pd.read_csv(path + "/input/HTRU2/HTRU_2.csv", 
                          low_memory=False, index_col=False, usecols=[*range(0, 7)], header=0, 
                          names=["mean_IP", "standDev_IP", "excessKurt_IP", "skewness_IP", 
                                 "mean_DMSNR", "standDev_DMSNR", "excessKurt_DMSNR", "skewness_DMSNR"])
ground_truth = pd.read_csv(path + "/input/HTRU2/HTRU_2.csv", 
                           low_memory=False, index_col=False, usecols=[8], header=0, 
                           names=["pulsar"])

print(pulsar_data.head())
print(ground_truth.head())
print(ground_truth.tail())


      mean_IP  standDev_IP  excessKurt_IP  skewness_IP  mean_DMSNR  \
0  102.507812    58.882430       0.465318    -0.515088    1.677258   
1  103.015625    39.341649       0.323328     1.051164    3.121237   
2  136.750000    57.178449      -0.068415    -0.636238    3.642977   
3   88.726562    40.672225       0.600866     1.123492    1.178930   
4   93.570312    46.698114       0.531905     0.416721    1.636288   

   standDev_DMSNR  excessKurt_DMSNR  
0       14.860146         10.576487  
1       21.744669          7.735822  
2       20.959280          6.896499  
3       11.468720         14.269573  
4       14.545074         10.621748  
   pulsar
0       0
1       0
2       0
3       0
4       0
       pulsar
17892       0
17893       0
17894       0
17895       0
17896       0


In [3]:
X = pulsar_data.to_numpy()
print(X) 

[[ 1.02507812e+02  5.88824300e+01  4.65318154e-01 ...  1.67725752e+00
   1.48601457e+01  1.05764867e+01]
 [ 1.03015625e+02  3.93416494e+01  3.23328365e-01 ...  3.12123746e+00
   2.17446687e+01  7.73582202e+00]
 [ 1.36750000e+02  5.71784487e+01 -6.84146380e-02 ...  3.64297659e+00
   2.09592803e+01  6.89649891e+00]
 ...
 [ 1.19335938e+02  5.99359394e+01  1.59363100e-01 ...  2.14306020e+01
   5.88720002e+01  2.49951714e+00]
 [ 1.14507812e+02  5.39023997e+01  2.01161383e-01 ...  1.94648829e+00
   1.33817306e+01  1.00079673e+01]
 [ 5.70625000e+01  8.57973403e+01  1.40639105e+00 ...  1.88306020e+02
   6.47125623e+01 -1.59752658e+00]]


In [4]:
transformer = preprocessing.RobustScaler().fit(X)
transformer_2 = preprocessing.StandardScaler().fit(X)
transformer_3 = preprocessing.MinMaxScaler().fit(X)

X_tr = transformer.transform(X)
X_ts = transformer_2.transform(X)
X_tm = transformer_3.transform(X)
print(X_tr)
print(X_ts)
print(X_tm)

[[-4.80585424e-01  1.38028895e+00  5.42479397e-01 ... -3.17516525e-01
  -2.57299472e-01  4.35359153e-01]
 [-4.61170848e-01 -8.79424110e-01  2.24289145e-01 ...  9.01794145e-02
   2.34774745e-01 -1.41837223e-01]
 [ 8.28554361e-01  1.18323905e+00 -6.53582469e-01 ...  2.37488196e-01
   1.78638771e-01 -3.12379790e-01]
 ...
 [ 1.62783751e-01  1.50211770e+00 -1.43146778e-01 ...  5.25967894e+00
   2.88846658e+00 -1.20580516e+00]
 [-2.18040621e-02  8.04393855e-01 -4.94794355e-02 ... -2.41501417e-01
  -3.62969819e-01  3.19841348e-01]
 [-2.21804062e+00  4.49275285e+00  2.65136500e+00 ...  5.23755902e+01
   3.30592324e+00 -2.03828594e+00]]
[[-0.33410698  1.80237928 -0.01182199 ... -0.37111042 -0.58893062
   0.50440885]
 [-0.31431076 -1.05327065 -0.14526773 ... -0.3221168  -0.2353432
  -0.12599666]
 [ 1.0007676   1.55336292 -0.51343811 ... -0.30441441 -0.27568056
  -0.31226078]
 ...
 [ 0.32190941  1.956337   -0.2993666  ...  0.29911214  1.67150747
  -1.28804701]
 [ 0.13369301  1.07460777 -0.2600834

Clustering

The following cells use KMeans Clustering on the normalized dataset and receives an array of labels for each instance that was clustered. A ground truth matrix is generated from the new data using ground truth values from the original dataset.

In this instance, number of clusters is 8 and max_iter=300 were chosen values for clustering. 

In [5]:
km_raw = cluster.KMeans(n_clusters=2, init='k-means++', max_iter=300).fit(X)
km_r = cluster.KMeans(n_clusters=2, init='k-means++', max_iter=300).fit(X_tr)
km_s = cluster.KMeans(n_clusters=2, init='k-means++', max_iter=300).fit(X_ts)
km_m = cluster.KMeans(n_clusters=2, init='k-means++', max_iter=300).fit(X_tm)
print(np.unique(km_raw.labels_, return_counts = True))
print(np.unique(km_r.labels_, return_counts = True))
print(np.unique(km_s.labels_, return_counts = True))
print(np.unique(km_m.labels_, return_counts = True))

(array([0, 1], dtype=int32), array([16094,  1803]))
(array([0, 1], dtype=int32), array([16444,  1453]))
(array([0, 1], dtype=int32), array([16752,  1145]))
(array([0, 1], dtype=int32), array([ 2328, 15569]))


In [6]:
labels_truth = ground_truth.to_numpy().transpose()[0]
print(np.unique(labels_truth, return_counts=True))
print(np.unique(np.less(km_r.labels_, labels_truth), return_counts=True))
km_r.labels_[km_r.labels_ == 1 ] = -1
km_r.labels_[km_r.labels_ == 0 ] = 1
km_r.labels_[km_r.labels_ == -1 ] = 0
print(np.unique(np.less(km_s.labels_, labels_truth), return_counts=True))
km_s.labels_[km_s.labels_ == 1 ] = -1
km_s.labels_[km_s.labels_ == 0 ] = 1
km_s.labels_[km_s.labels_ == -1 ] = 0
print(np.unique(np.less(km_m.labels_, labels_truth), return_counts=True))
print('KMeans Clustering Metrics:')
labels_pred = km_r.labels_
print('robust_scalar\\')
rand_r = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_r)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_r = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_r)
labels_pred = km_s.labels_
print('standard_scalar\\')
rand_s = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_s)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_s = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_s)
labels_pred = km_m.labels_
print('standard_scalar\\')
print('rand score = ', metrics.rand_score(labels_truth, labels_pred))
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
print('fowlkes-mallow = ', metrics.fowlkes_mallows_score(labels_truth, labels_pred))

(array([0, 1]), array([16258,  1639]))
(array([False,  True]), array([17072,   825]))
(array([False,  True]), array([17348,   549]))
(array([False,  True]), array([16632,  1265]))
KMeans Clustering Metrics:
robust_scalar\
rand score =  0.8497717118636621
adjusted mutual info score =  0.2300398292337505
fowlkes-mallow =  0.9108597608201152
standard_scalar\
rand score =  0.9347769555732026
adjusted mutual info score =  0.5839365376469203
fowlkes-mallow =  0.9622992088126792
standard_scalar\
rand score =  0.8523000373492694
adjusted mutual info score =  0.34045961714370293
fowlkes-mallow =  0.9087374879950447


In [7]:
Kmeans_Metrics = pd.DataFrame([["K-means", "z-score", rand_s, fm_s], ["K-means", "robust scalar", rand_r, fm_r]], columns=["Clustering Algorithm", "Normalization Method", 
                                           "Rand Index", "Fowlkes-Mallow"])

Kmeans_Metrics.to_csv(path + "/output/kmeans_metrics.csv")
Kmeans_Metrics

Unnamed: 0,Clustering Algorithm,Normalization Method,Rand Index,Fowlkes-Mallow
0,K-means,z-score,0.934777,0.962299
1,K-means,robust scalar,0.849772,0.91086
