### DBSCAN, Spectral Clustering, and Agglomerative on HTRU2 Pulsar Data

In [1]:
import numpy as np
import os

from sklearn import metrics, cluster, preprocessing
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from itertools import cycle, islice

import seaborn as sns
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
path = "../data"

pulsar_data = pd.read_csv(path + "/input/HTRU2/HTRU_2.csv", 
                          low_memory=False, index_col=False, usecols=[*range(0, 8)], header=0, 
                          names=["mean_IP", "standDev_IP", "excessKurt_IP", "skewness_IP", 
                                 "mean_DMSNR", "standDev_DMSNR", "excessKurt_DMSNR", "skewness_DMSNR"])
ground_truth = pd.read_csv(path + "/input/HTRU2/HTRU_2.csv", 
                           low_memory=False, index_col=False, usecols=[8], header=0, 
                           names=["pulsar"])
print(pulsar_data.head())
print(ground_truth.head())
print(ground_truth.tail())

      mean_IP  standDev_IP  excessKurt_IP  skewness_IP  mean_DMSNR  \
0  102.507812    58.882430       0.465318    -0.515088    1.677258   
1  103.015625    39.341649       0.323328     1.051164    3.121237   
2  136.750000    57.178449      -0.068415    -0.636238    3.642977   
3   88.726562    40.672225       0.600866     1.123492    1.178930   
4   93.570312    46.698114       0.531905     0.416721    1.636288   

   standDev_DMSNR  excessKurt_DMSNR  skewness_DMSNR  
0       14.860146         10.576487      127.393580  
1       21.744669          7.735822       63.171909  
2       20.959280          6.896499       53.593661  
3       11.468720         14.269573      252.567306  
4       14.545074         10.621748      131.394004  
   pulsar
0       0
1       0
2       0
3       0
4       0
       pulsar
17892       0
17893       0
17894       0
17895       0
17896       0


In [3]:
X = pulsar_data.to_numpy()
print(X)

[[ 1.02507812e+02  5.88824300e+01  4.65318154e-01 ...  1.48601457e+01
   1.05764867e+01  1.27393580e+02]
 [ 1.03015625e+02  3.93416494e+01  3.23328365e-01 ...  2.17446687e+01
   7.73582202e+00  6.31719091e+01]
 [ 1.36750000e+02  5.71784487e+01 -6.84146380e-02 ...  2.09592803e+01
   6.89649891e+00  5.35936607e+01]
 ...
 [ 1.19335938e+02  5.99359394e+01  1.59363100e-01 ...  5.88720002e+01
   2.49951714e+00  4.59517265e+00]
 [ 1.14507812e+02  5.39023997e+01  2.01161383e-01 ...  1.33817306e+01
   1.00079673e+01  1.34238910e+02]
 [ 5.70625000e+01  8.57973403e+01  1.40639105e+00 ...  6.47125623e+01
  -1.59752658e+00  1.42947536e+00]]


In [4]:
X_tr = preprocessing.RobustScaler().fit_transform(X)
X_ts = preprocessing.StandardScaler().fit_transform(X)
X_tp = preprocessing.PowerTransformer().fit_transform(X)
X_tq = preprocessing.QuantileTransformer(output_distribution='normal').fit_transform(X)
print(X_tr)
print(X_ts)
print(X_tp)
print(X_tq)

[[-0.48058542  1.38028895  0.5424794  ... -0.25729947  0.43535915
   0.424753  ]
 [-0.46117085 -0.87942411  0.22428914 ...  0.23477475 -0.14183722
  -0.19066953]
 [ 0.82855436  1.18323905 -0.65358247 ...  0.17863877 -0.31237979
  -0.28245583]
 ...
 [ 0.16278375  1.5021177  -0.14314678 ...  2.88846658 -1.20580516
  -0.75199786]
 [-0.02180406  0.80439386 -0.04947944 ... -0.36296982  0.31984135
   0.49035033]
 [-2.21804062  4.49275285  2.651365   ...  3.30592324 -2.03828594
  -0.78233406]]
[[-0.33410698  1.80237928 -0.01182199 ... -0.58893062  0.50440885
   0.21155997]
 [-0.31431076 -1.05327065 -0.14526773 ... -0.2353432  -0.12599666
  -0.39137949]
 [ 1.0007676   1.55336292 -0.51343811 ... -0.27568056 -0.31226078
  -0.48130403]
 ...
 [ 0.32190941  1.956337   -0.2993666  ...  1.67150747 -1.28804701
  -0.94132199]
 [ 0.13369301  1.07460777 -0.26008348 ... -0.66486165  0.37824198
   0.27582674]
 [-2.1057167   5.73566952  0.87262308 ...  1.97147732 -2.19727045
  -0.97104286]]
[[-0.52297968  1

In [8]:
db_raw = cluster.DBSCAN(eps=1.0, min_samples=100).fit_predict(X)
db_s = cluster.DBSCAN(eps=1.0, min_samples=100).fit(X_ts)
db_r = cluster.DBSCAN(eps=1.0, min_samples=100).fit(X_tr)
db_p = cluster.DBSCAN(eps=1.0, min_samples=100).fit(X_tp)
db_q = cluster.DBSCAN(eps=1.0, min_samples=100).fit(X_tq)
print(np.unique(db_raw.labels_, return_counts=True))
print(np.unique(db_s.labels_, return_counts=True))
print(np.unique(db_r.labels_, return_counts=True))
print(np.unique(db_p.labels_, return_counts=True))
print(np.unique(db_q.labels_, return_counts=True))

(array([-1]), array([17897]))
(array([-1,  0]), array([ 1832, 16065]))
(array([-1,  0]), array([ 3098, 14799]))
(array([-1,  0]), array([  786, 17111]))
(array([-1,  0]), array([ 1189, 16708]))


In [9]:
# sc_raw = cluster.SpectralClustering(2, affinity='nearest_neighbors', n_jobs=-1).fit(X)
sc_s = cluster.SpectralClustering(2, affinity='nearest_neighbors', n_jobs=-1).fit(X_ts)
sc_r = cluster.SpectralClustering(2, affinity='nearest_neighbors', n_jobs=-1).fit(X_tr)
sc_p = cluster.SpectralClustering(2, affinity='nearest_neighbors', n_jobs=-1).fit(X_tp)
sc_q = cluster.SpectralClustering(2, affinity='nearest_neighbors', n_jobs=-1).fit(X_tq)
#print(np.unique(sc_raw.labels_, return_counts=True))
print(np.unique(sc_s.labels_, return_counts=True))
print(np.unique(sc_r.labels_, return_counts=True))
print(np.unique(sc_p.labels_, return_counts=True))
print(np.unique(sc_q.labels_, return_counts=True))

(array([0, 1], dtype=int32), array([ 2053, 15844]))
(array([0, 1], dtype=int32), array([15992,  1905]))
(array([0, 1], dtype=int32), array([16749,  1148]))
(array([0, 1], dtype=int32), array([16543,  1354]))


In [10]:
# ag_raw = cluster.AgglomerativeClustering(linkage='ward', n_clusters=2).fit(X)
ag_s = cluster.AgglomerativeClustering(linkage='ward', n_clusters=2).fit(X_ts)
ag_r = cluster.AgglomerativeClustering(linkage='ward', n_clusters=2).fit(X_tr)
ag_p = cluster.AgglomerativeClustering(linkage='ward', n_clusters=2).fit(X_tp)
ag_q = cluster.AgglomerativeClustering(linkage='ward', n_clusters=2).fit(X_tq)
# print(np.unique(ag_raw.labels_, return_counts=True))
print(np.unique(ag_s.labels_, return_counts=True))
print(np.unique(ag_r.labels_, return_counts=True))
print(np.unique(ag_p.labels_, return_counts=True))
print(np.unique(ag_q.labels_, return_counts=True))

(array([0, 1]), array([17074,   823]))
(array([0, 1]), array([ 1418, 16479]))
(array([0, 1]), array([ 6315, 11582]))
(array([0, 1]), array([ 7508, 10389]))


In [11]:
labels_truth = ground_truth.to_numpy().transpose()[0]
print(np.unique(labels_truth, return_counts=True))
print(np.unique(np.less(db_s.labels_, labels_truth), return_counts=True))
print(np.unique(np.less(db_r.labels_, labels_truth), return_counts=True))
print(np.unique(np.less(db_p.labels_, labels_truth), return_counts=True))
print(np.unique(np.less(db_q.labels_, labels_truth), return_counts=True))
db_s.labels_[db_s.labels_ == 0 ] = 1
db_s.labels_[db_s.labels_ == -1 ] = 0
db_r.labels_[db_r.labels_ == 0 ] = 1
db_r.labels_[db_r.labels_ == -1 ] = 0
db_p.labels_[db_p.labels_ == 0 ] = 1
db_p.labels_[db_p.labels_ == -1 ] = 0
db_q.labels_[db_q.labels_ == 0 ] = 1
db_q.labels_[db_q.labels_ == -1 ] = 0
print('DBSCAN Metrics:')
labels_pred = db_s.labels_
print('z-score\\')
rand_s_db = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_s_db)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_s_db = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_s_db)
labels_pred = db_r.labels_
print('robust_scalar\\')
rand_r_db = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_r_db)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_r_db = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_r_db)
labels_pred = db_p.labels_
print('power_scalar\\')
print('rand score = ', metrics.rand_score(labels_truth, labels_pred))
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
print('fowlkes-mallow = ', metrics.fowlkes_mallows_score(labels_truth, labels_pred))
labels_pred = db_q.labels_
print('quantile_scalar\\')
print('rand score = ', metrics.rand_score(labels_truth, labels_pred))
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
print('fowlkes-mallow = ', metrics.fowlkes_mallows_score(labels_truth, labels_pred))

(array([0, 1]), array([16258,  1639]))
(array([False,  True]), array([15532,  2365]))
(array([False,  True]), array([14621,  3276]))
(array([False,  True]), array([15543,  2354]))
(array([False,  True]), array([15175,  2722]))
DBSCAN Metrics:
z-score\
rand score =  0.8691961169848157
adjusted mutual info score =  0.3353422387796893
fowlkes-mallow =  0.9207676098201846
robust_scalar\
rand score =  0.8177319809132819
adjusted mutual info score =  0.34579179541470323
fowlkes-mallow =  0.8848640541862351
power_scalar\
rand score =  0.7774057851378182
adjusted mutual info score =  -0.00011338809742985674
fowlkes-mallow =  0.8737456409746072
quantile_scalar\
rand score =  0.7503777451606869
adjusted mutual info score =  -9.243087768534326e-05
fowlkes-mallow =  0.8542460878255708


In [14]:
print(np.unique(sc_s.labels_, return_counts=True))
print(np.unique(sc_r.labels_, return_counts=True))
print(np.unique(np.less(sc_r.labels_, labels_truth), return_counts=True))
sc_r.labels_[sc_r.labels_ == 1 ] = -1
sc_r.labels_[sc_r.labels_ == 0 ] = 1
sc_r.labels_[sc_r.labels_ == -1 ] = 0
print(np.unique(sc_p.labels_, return_counts=True))
print(np.unique(np.less(sc_p.labels_, labels_truth), return_counts=True))
sc_p.labels_[sc_p.labels_ == 1 ] = -1
sc_p.labels_[sc_p.labels_ == 0 ] = 1
sc_p.labels_[sc_p.labels_ == -1 ] = 0
print(np.unique(sc_q.labels_, return_counts=True))
print(np.unique(np.less(sc_q.labels_, labels_truth), return_counts=True))
sc_q.labels_[sc_q.labels_ == 1 ] = -1
sc_q.labels_[sc_q.labels_ == 0 ] = 1
sc_q.labels_[sc_q.labels_ == -1 ] = 0
print('Spectral Clustering Metrics:')
labels_pred = sc_s.labels_
print('z-score\\')
rand_s_sp = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_s_sp)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_s_sp = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_s_sp)
labels_pred = sc_r.labels_
print('robust_scalar\\')
rand_r_sp = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_s_sp)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_r_sp = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_r_sp)
labels_pred = sc_p.labels_
print('power_scalar\\')
print('rand score = ', metrics.rand_score(labels_truth, labels_pred))
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
print('fowlkes-mallow = ', metrics.fowlkes_mallows_score(labels_truth, labels_pred))
labels_pred = sc_q.labels_
print('quantile_scalar\\')
rand_q_sp = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_q_sp)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_q_sp = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_q_sp)

(array([0, 1], dtype=int32), array([ 2053, 15844]))
(array([0, 1], dtype=int32), array([ 1905, 15992]))
(array([False,  True]), array([16813,  1084]))
(array([0, 1], dtype=int32), array([ 1148, 16749]))
(array([False,  True]), array([16780,  1117]))
(array([0, 1], dtype=int32), array([16543,  1354]))
(array([False,  True]), array([17493,   404]))
Spectral Clustering Metrics:
z-score\
rand score =  0.8760459600082317
adjusted mutual info score =  0.38588422416735835
fowlkes-mallow =  0.9242117562975304
robust_scalar\
rand score =  0.8760459600082317
adjusted mutual info score =  0.3059737760359129
fowlkes-mallow =  0.9137161375039314
power_scalar\
rand score =  0.9401080873319986
adjusted mutual info score =  0.6184093669305695
fowlkes-mallow =  0.9654007020066465
quantile_scalar\
rand score =  0.9432592211894273
adjusted mutual info score =  0.6184348081852952
fowlkes-mallow =  0.9666181362283472


In [18]:
print(np.unique(ag_s.labels_, return_counts=True))
print(np.unique(np.less(ag_s.labels_, labels_truth), return_counts=True))
ag_s.labels_[ag_s.labels_ == 1 ] = -1
ag_s.labels_[ag_s.labels_ == 0 ] = 1
ag_s.labels_[ag_s.labels_ == -1 ] = 0
print(np.unique(ag_r.labels_, return_counts=True))
print(np.unique(ag_p.labels_, return_counts=True))
print(np.unique(ag_q.labels_, return_counts=True))
print('Agglomerative Clustering Metrics:')
labels_pred = ag_s.labels_
print('z-score\\')
rand_s_ag = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_s_ag)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_s_ag = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_s_ag)
labels_pred = ag_r.labels_
print('robust_scalar\\')
rand_r_ag = metrics.rand_score(labels_truth, labels_pred)
print('rand score = ', rand_r_ag)
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
fm_r_ag = metrics.fowlkes_mallows_score(labels_truth, labels_pred)
print('fowlkes-mallow = ', fm_r_ag)
labels_pred = ag_p.labels_
print('power_scalar\\')
print('rand score = ', metrics.rand_score(labels_truth, labels_pred))
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
print('fowlkes-mallow = ', metrics.fowlkes_mallows_score(labels_truth, labels_pred))
labels_pred = ag_q.labels_
print('quantile_scalar\\')
print('rand score = ', metrics.rand_score(labels_truth, labels_pred))
print('adjusted mutual info score = ', metrics.adjusted_mutual_info_score(labels_truth, labels_pred))
print('fowlkes-mallow = ', metrics.fowlkes_mallows_score(labels_truth, labels_pred))

(array([0, 1]), array([  823, 17074]))
(array([False,  True]), array([17085,   812]))
(array([0, 1]), array([ 1418, 16479]))
(array([0, 1]), array([ 6315, 11582]))
(array([0, 1]), array([ 7508, 10389]))
Agglomerative Clustering Metrics:
z-score\
rand score =  0.9107329106610621
adjusted mutual info score =  0.4787160075295218
fowlkes-mallow =  0.949833515016091
robust_scalar\
rand score =  0.85511996588835
adjusted mutual info score =  0.24407711090769255
fowlkes-mallow =  0.9142221346057176
power_scalar\
rand score =  0.5999081467241558
adjusted mutual info score =  0.14903181871034105
fowlkes-mallow =  0.7257392163574735
quantile_scalar\
rand score =  0.5505325149581288
adjusted mutual info score =  0.11928905525280653
fowlkes-mallow =  0.6859396945269424


In [20]:
SP_DB_AG_Metrics = pd.DataFrame([["Spectral Clustering", "z-score", rand_s_sp, fm_s_sp], 
                               ["Spectral Clustering", "robust scalar", rand_r_sp, fm_r_sp],
                               ["Spectral Clustering", "quantile scalar", rand_q_sp, fm_q_sp],
                               ["DBSCAN Clustering", "z-score", rand_s_db, fm_s_db], 
                               ["DBSCAN Clustering", "robust scalar", rand_r_db, fm_r_db], 
                               ["Agglomerative Clustering", "z-score", rand_s_ag, fm_s_ag], 
                               ["Agglomerative Clustering", "robust scalar", rand_r_ag, fm_r_ag]],
                               columns=["Clustering Algorithm", "Normalization Method", "Rand Index", "Fowlkes-Mallow"])

SP_DB_AG_Metrics.to_csv(path + "/output/SP_DB_AG_metrics.csv")
SP_DB_AG_Metrics

Unnamed: 0,Clustering Algorithm,Normalization Method,Rand Index,Fowlkes-Mallow
0,Spectral Clustering,z-score,0.876046,0.924212
1,Spectral Clustering,robust scalar,0.858046,0.913716
2,Spectral Clustering,quantile scalar,0.943259,0.966618
3,DBSCAN Clustering,z-score,0.869196,0.920768
4,DBSCAN Clustering,robust scalar,0.817732,0.884864
5,Agglomerative Clustering,z-score,0.910733,0.949834
6,Agglomerative Clustering,robust scalar,0.85512,0.914222
