Phython code for comparing performance of 4 clustering applied to 3 dataset with different properties

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import seaborn as sns
from sklearn.cluster import KMeans, OPTICS
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, silhouette_samples
from scipy.spatial.distance import pdist, squareform
from mpl_toolkits.mplot3d import Axes3D
from yellowbrick.cluster import SilhouetteVisualizer
import umap
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import torch

from random import sample

import statsmodels.api as sm
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering

import time

In [2]:
# read input files 
device = torch.device("mps")
url = 'CC GENERAL.csv'
df = pd.read_csv(os.getcwd()+'/Clustering/'+url)

# find structure of data set
print(df.shape)
print(df.describe())

(8950, 18)
            BALANCE  BALANCE_FREQUENCY     PURCHASES  ONEOFF_PURCHASES  \
count   8950.000000        8950.000000   8950.000000       8950.000000   
mean    1564.474828           0.877271   1003.204834        592.437371   
std     2081.531879           0.236904   2136.634782       1659.887917   
min        0.000000           0.000000      0.000000          0.000000   
25%      128.281915           0.888889     39.635000          0.000000   
50%      873.385231           1.000000    361.280000         38.000000   
75%     2054.140036           1.000000   1110.130000        577.405000   
max    19043.138560           1.000000  49039.570000      40761.250000   

       INSTALLMENTS_PURCHASES  CASH_ADVANCE  PURCHASES_FREQUENCY  \
count             8950.000000   8950.000000          8950.000000   
mean               411.067645    978.871112             0.490351   
std                904.338115   2097.163877             0.401371   
min                  0.000000      0.000000       

In [3]:
# check the empty values : if (values is not 0) then empty
df.isna().mean()*100

CUST_ID                             0.000000
BALANCE                             0.000000
BALANCE_FREQUENCY                   0.000000
PURCHASES                           0.000000
ONEOFF_PURCHASES                    0.000000
INSTALLMENTS_PURCHASES              0.000000
CASH_ADVANCE                        0.000000
PURCHASES_FREQUENCY                 0.000000
ONEOFF_PURCHASES_FREQUENCY          0.000000
PURCHASES_INSTALLMENTS_FREQUENCY    0.000000
CASH_ADVANCE_FREQUENCY              0.000000
CASH_ADVANCE_TRX                    0.000000
PURCHASES_TRX                       0.000000
CREDIT_LIMIT                        0.011173
PAYMENTS                            0.000000
MINIMUM_PAYMENTS                    3.497207
PRC_FULL_PAYMENT                    0.000000
TENURE                              0.000000
dtype: float64

In [4]:
# handling empty values

df.loc[(df['MINIMUM_PAYMENTS'].isnull()==True),'MINIMUM_PAYMENTS']=df['MINIMUM_PAYMENTS'].mean()
df.loc[(df['CREDIT_LIMIT'].isnull()==True),'CREDIT_LIMIT']=df['CREDIT_LIMIT'].mean()

In [5]:
df.isna().mean()*100

CUST_ID                             0.0
BALANCE                             0.0
BALANCE_FREQUENCY                   0.0
PURCHASES                           0.0
ONEOFF_PURCHASES                    0.0
INSTALLMENTS_PURCHASES              0.0
CASH_ADVANCE                        0.0
PURCHASES_FREQUENCY                 0.0
ONEOFF_PURCHASES_FREQUENCY          0.0
PURCHASES_INSTALLMENTS_FREQUENCY    0.0
CASH_ADVANCE_FREQUENCY              0.0
CASH_ADVANCE_TRX                    0.0
PURCHASES_TRX                       0.0
CREDIT_LIMIT                        0.0
PAYMENTS                            0.0
MINIMUM_PAYMENTS                    0.0
PRC_FULL_PAYMENT                    0.0
TENURE                              0.0
dtype: float64

In [6]:
# dropping header that are not necessary for clustering
if 'CUST_ID' in df.columns :
    df = df.drop(columns=['CUST_ID'])

In [7]:
df.shape

(8950, 17)

In [8]:
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,864.206542,0.0,12
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [9]:
# functions for exploratory data analysis

# function for skewness with data, method(normal|log|sqrt), draw plot(True|False)
#    Symmetrical (-0.5 < skewness < 0.5), 
#    moderately skewed (-1 to -0.5, 0.5 to 1), 
#    otherwise highly skewed.

def skewness(df, method = 'normal', plot='Fale') :
    skew_r = {}
    if (method == 'normal') :
        for feature in df:
            tempData = df.sort_values(by=feature)[feature]
            skew_r[feature] = tempData.skew()
           
        if plot=='True':
            sns.displot(tempData)

    if (method == 'log') : 
        for feature in df:
            tempData = np.log(df.sort_values(by=feature)[feature])
            skew_r[feature] = tempData.skew()
       
        if plot=='True':
            sns.displot(tempData)

    if (method == 'sqrt') : 
        for feature in df:
            tempData = np.sqrt(df.sort_values(by=feature)[feature])
            skew_r[feature] = tempData.skew()
        
        if plot=='True':
            sns.displot(tempData)
           
    return skew_r

# check whether the dataset is suitable for clustering. 
# function for calculating hopkins static used to check whether the dataset is randomly distributed
# A value close to 0.5 indicates that the data has a random distribution, and therefore is not suitable for clustering. 
# On the other hand, if the value is greater than 0.7, the dataset is likely suitable for clustering.

def hopkins(X):
    d = X.shape[1] # columns
    n = X.shape[0] # rows

    m = int(0.1 * n) # heuristic from article [1]

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(X)
    rand_X = sample(range(0, n, 1), m)

    ujd = []
    wjd = []
    for j in range(0, m):
        u_dist, _ = nbrs.kneighbors(np.random.normal(size=(1, d)).reshape(1, -1), 2, return_distance=True)
        ujd.append(u_dist[0][1])
        w_dist, _ = nbrs.kneighbors(X[rand_X[j]].reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])

    H = sum(ujd) / (sum(ujd) + sum(wjd))
    return  H


# functions for outlier remove with dataframe, method(IQR|z_score)
# remove outlier based on quartiles : IQR(Interquantile Range)
# IQR = Q3(상위 25%)-Q1(하위 25%)
# 이상치 = (Q1-1.5*IQR) 보다 작거나 Q3+1.5∗IQR보다 큰 경우
# 1.5 대신 다른 수를 곱할 수도 있음

def outlier_remove(df, method='IQR', threshold=1.5):
    
    # initialization
    outlier_percentage = {}
    df_rows = np.zeros(df.shape[0])    # outlier가 포함되어 있는 행에 대한 정보 유지
    df_filtered = df.copy()            # outlier가 제외된 데이터셋

# remove outlier based on quartiles : IQR(Interquantile Range)
# IQR = Q3(upper 25%)-Q1(lower 25%)
# outlier = ((Q1-1.5*IQR) > values) | (value > (Q3+1.5∗IQR))
# defalut threshold = 1.5
    
    if (method == 'IQR'):
        for feature in df:
            tempData = df[feature]
            data = tempData.to_numpy()
            Q1, Q3 = tempData.quantile([0.25, 0.75])
            IQR = Q3 - Q1
            Lower_range = Q1 - (threshold * IQR)       
            Upper_range = Q3 + (threshold * IQR)
            # outlier 갯수 구하기
            filtered_data = data[(Lower_range < tempData) & (tempData < Upper_range)] 
            outlier = data[(tempData < Lower_range) | (tempData > Upper_range)]
            outlier_percentage[feature] = round((len(outlier)/len(data)) * 100, 2)

            for i in range(df.shape[0]):
                if ((Lower_range > data[i]) | (data[i] > Upper_range)):
                    df_rows[i] = 1

# remove outlier based on z-score
# How many standard deviations the data deviates from the mean.
# Classify anything over 3 as an outlier based on the normal absolute value
# Z = (x-u)/a  x : data, u : mean, a : stansard deviation
# defalut threshold = 3
    if (method == 'z_score'):
        for feature in df:
            data = df[feature].to_numpy()
            z_scores = np.abs(data - np.mean(data)) / np.std(data) # Z-score 계산      
            filtered_data = data[z_scores < threshold]
            outlier = data[z_scores>threshold]
            outlier_percentage[feature] = round((len(outlier)/len(data)) * 100, 2)

            # outlier 되어야 할 행에 대한 정보 관리
            for i in range(df.shape[0]):
                if (z_scores>threshold):
                    df_rows[i] = 1
        
    return df_filtered, outlier_percentage

In [10]:
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,864.206542,0.0,12
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [11]:
# function for scaling
def scaling(df) :
    SS=StandardScaler()
    df_scaled=SS.fit_transform(df[df.columns])
    
    return df_scaled

In [12]:
df_scaled = scaling(df)

In [13]:
pd.DataFrame(df_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-0.731989,-0.249434,-0.4249,-0.356934,-0.349079,-0.466786,-0.80649,-0.678661,-0.707313,-0.675349,-0.47607,-0.511333,-0.960433,-0.528979,-0.3109675,-0.525551,0.36068
1,0.786961,0.134325,-0.469552,-0.356934,-0.454576,2.605605,-1.221758,-0.678661,-0.916995,0.573963,0.110074,-0.591796,0.688639,0.818642,0.08931021,0.234227,0.36068
2,0.447135,0.518084,-0.107668,0.108889,-0.454576,-0.466786,1.269843,2.673451,-0.916995,-0.675349,-0.47607,-0.10902,0.826062,-0.383805,-0.1016632,-0.525551,0.36068
3,0.049099,-1.016953,0.232058,0.546189,-0.454576,-0.368653,-1.014125,-0.399319,-0.916995,-0.258913,-0.329534,-0.551565,0.826062,-0.598688,4.8783050000000006e-17,-0.525551,0.36068
4,-0.358775,0.518084,-0.462063,-0.347294,-0.454576,-0.466786,-1.014125,-0.399319,-0.916995,-0.675349,-0.47607,-0.551565,-0.905464,-0.364368,-0.2657913,-0.525551,0.36068


In [14]:
#skewness normal
skew_r = skewness(df, method = 'normal', plot='False')
print(skew_r)

{'BALANCE': 2.393386042571806, 'BALANCE_FREQUENCY': -2.0232655185149087, 'PURCHASES': 8.144269064554654, 'ONEOFF_PURCHASES': 10.045082884700284, 'INSTALLMENTS_PURCHASES': 7.299119908745641, 'CASH_ADVANCE': 5.166609074074723, 'PURCHASES_FREQUENCY': 0.06016423585803502, 'ONEOFF_PURCHASES_FREQUENCY': 1.5356127835248508, 'PURCHASES_INSTALLMENTS_FREQUENCY': 0.5092011649999885, 'CASH_ADVANCE_FREQUENCY': 1.8286862664778514, 'CASH_ADVANCE_TRX': 5.721298203192299, 'PURCHASES_TRX': 4.630655265932403, 'CREDIT_LIMIT': 1.5225490377498587, 'PAYMENTS': 5.907619794397565, 'MINIMUM_PAYMENTS': 13.86735725976256, 'PRC_FULL_PAYMENT': 1.9428199409718574, 'TENURE': -2.9430172876199148}


In [15]:
#skewness log
skew_r = skewness(pd.DataFrame(df_scaled), method = 'log', plot='False')
print(skew_r)

{0: -0.998832409949357, 1: -3.4958540022682576, 2: -0.8221609053186226, 3: -0.5835662607368457, 4: -0.7707120108162759, 5: -0.9166399451572796, 6: -2.0552516173973467, 7: -0.6390591221890417, 8: -1.343234825580168, 9: -0.39507671009877393, 10: -0.00820829134562055, 11: -0.7087886787688276, 12: nan, 13: -0.8487076138690102, 14: -2.130286645439263, 15: -1.1068820846108198, 16: 0.0}


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  adjusted = values - mean


In [16]:
#skewness sqrt
skew_r = skewness(pd.DataFrame(df_scaled), method = 'sqrt', plot='False')
print(skew_r)

{0: 0.594303979811689, 1: -3.2766005479602924, 2: 1.8889739592521435, 3: 2.154745155891824, 4: 1.677140727145852, 5: 1.166932744380721, 6: -1.2885511304771178, 7: -0.10386893664029657, 8: -0.8540548272145774, 9: 0.43772404931878794, 10: 1.4849128683241837, 11: 1.2790854016996753, 12: 0.2596556590872664, 13: 1.5825141000264942, 14: 2.6431518703085275, 15: -0.34559731599400995, 16: 0.0}


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [17]:
pd.DataFrame(df_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-0.731989,-0.249434,-0.4249,-0.356934,-0.349079,-0.466786,-0.80649,-0.678661,-0.707313,-0.675349,-0.47607,-0.511333,-0.960433,-0.528979,-0.3109675,-0.525551,0.36068
1,0.786961,0.134325,-0.469552,-0.356934,-0.454576,2.605605,-1.221758,-0.678661,-0.916995,0.573963,0.110074,-0.591796,0.688639,0.818642,0.08931021,0.234227,0.36068
2,0.447135,0.518084,-0.107668,0.108889,-0.454576,-0.466786,1.269843,2.673451,-0.916995,-0.675349,-0.47607,-0.10902,0.826062,-0.383805,-0.1016632,-0.525551,0.36068
3,0.049099,-1.016953,0.232058,0.546189,-0.454576,-0.368653,-1.014125,-0.399319,-0.916995,-0.258913,-0.329534,-0.551565,0.826062,-0.598688,4.8783050000000006e-17,-0.525551,0.36068
4,-0.358775,0.518084,-0.462063,-0.347294,-0.454576,-0.466786,-1.014125,-0.399319,-0.916995,-0.675349,-0.47607,-0.551565,-0.905464,-0.364368,-0.2657913,-0.525551,0.36068


In [18]:
# measuring the cluster tendency of a data set

hopkins_score = hopkins(df_scaled)
print(hopkins_score)

0.7989505121522804


In [19]:
# measuring and removing the outlier of a data set

df_filtered, outlier_percentage = outlier_remove(df, method='IQR', threshold=1.5)

print(outlier_percentage)

{'BALANCE': 7.77, 'BALANCE_FREQUENCY': 16.68, 'PURCHASES': 9.03, 'ONEOFF_PURCHASES': 11.32, 'INSTALLMENTS_PURCHASES': 9.69, 'CASH_ADVANCE': 11.51, 'PURCHASES_FREQUENCY': 0.0, 'ONEOFF_PURCHASES_FREQUENCY': 8.74, 'PURCHASES_INSTALLMENTS_FREQUENCY': 0.0, 'CASH_ADVANCE_FREQUENCY': 5.87, 'CASH_ADVANCE_TRX': 8.98, 'PURCHASES_TRX': 8.56, 'CREDIT_LIMIT': 2.77, 'PAYMENTS': 9.03, 'MINIMUM_PAYMENTS': 8.65, 'PRC_FULL_PAYMENT': 16.47, 'TENURE': 15.26}


In [20]:
# dictionary for storing results

results = {
    "KMeans": {
        "normal": {},
        "umap": {},
        "pca": {},
    },
    "DBSCAN": {
        "normal": {},
        "umap": {},
        "pca": {}
    },
    "OPTICS": {
        "normal": {},
        "umap": {},
        "pca": {}
    },
    "GMM": {
        "normal": {},
        "umap": {},
        "pca": {}
    },
    "AGL": {
        "normal": {},
        "umap": {},
        "pca": {}
    }
}

K-Means Normal with original dataset

In [21]:
n = 3

start = time.time()
kmeans_normal = KMeans(n_clusters=n, init='k-means++', max_iter=300, n_init='auto', random_state=0)
kmeans_labels_normal = kmeans_normal.fit_predict(df_scaled)  
end = time.time()

results["KMeans"]["normal"]["Silhouette Coefficient"] = silhouette_score(df_scaled, kmeans_labels_normal)
results["KMeans"]["normal"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_scaled, kmeans_labels_normal)
results["KMeans"]["normal"]["Davies-Bouldin Index"] = davies_bouldin_score(df_scaled, kmeans_labels_normal)
results["KMeans"]["normal"]["Time"] = end - start

for metric, value in results["KMeans"]["normal"].items():
    print(f"{metric}: {value:.2f}")


Silhouette Coefficient: 0.25
Calinski-Harabasz Index: 1604.40
Davies-Bouldin Index: 1.60
Time: 0.30


K-Means with UMAP

In [22]:
distance = "euclidean" # "euclidean","cosine","correlation"

reducer = umap.UMAP(metric = distance)
embedding = reducer.fit_transform(df_scaled)

df_umap = pd.DataFrame(embedding)

In [23]:
start = time.time()
kmeans_umap = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans_labels_umap = kmeans_umap.fit_predict(df_umap)
end = time.time()

results["KMeans"]["umap"]["Silhouette Coefficient"] = silhouette_score(df_umap, kmeans_labels_umap)
results["KMeans"]["umap"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_umap, kmeans_labels_umap)
results["KMeans"]["umap"]["Davies-Bouldin Index"] = davies_bouldin_score(df_umap, kmeans_labels_umap)
results["KMeans"]["umap"]["Time"] = end-start

for metric, value in results["KMeans"]["umap"].items():
    print(f"{metric}: {value:.2f}")


Silhouette Coefficient: 0.40
Calinski-Harabasz Index: 8157.89
Davies-Bouldin Index: 0.88
Time: 0.32


K-Means with PCA

In [24]:
k=2
pca = PCA(n_components=k)
df_pca = pca.fit_transform(df_scaled)

In [25]:
start = time.time()
kmeans_pca = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans_labels_pca = kmeans_pca.fit_predict(df_pca)
end = time.time()

results["KMeans"]["pca"]["Silhouette Coefficient"] = silhouette_score(df_pca, kmeans_labels_pca)
results["KMeans"]["pca"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_pca, kmeans_labels_pca)
results["KMeans"]["pca"]["Davies-Bouldin Index"] = davies_bouldin_score(df_pca, kmeans_labels_pca)
results["KMeans"]["pca"]["Time"] = end - start

print('components=', kmeans_pca.labels_)
for metric, value in results["KMeans"]["pca"].items():
    print(f"{metric}: {value:.2f}")

components= [0 1 0 ... 0 0 0]
Silhouette Coefficient: 0.45
Calinski-Harabasz Index: 5337.48
Davies-Bouldin Index: 0.81
Time: 1.05


DBSCAN with original dataset

In [28]:
start = time.time()
dbscan_normal = DBSCAN(eps=2.2, min_samples=5)
dbscan_labels_normal = dbscan_normal.fit_predict(df_scaled)
end = time.time()

results["DBSCAN"]["normal"]["Silhouette Coefficient"] = silhouette_score(df_scaled, dbscan_labels_normal) if len(np.unique(dbscan_labels_normal)) > 1 else 0
results["DBSCAN"]["normal"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_scaled, dbscan_labels_normal) if len(np.unique(dbscan_labels_normal)) > 1 else 0
results["DBSCAN"]["normal"]["Davies-Bouldin Index"] = davies_bouldin_score(df_scaled, dbscan_labels_normal) if len(np.unique(dbscan_labels_normal)) > 1 else 0
results["DBSCAN"]["normal"]["Time"] = end - start

for metric, value in results["DBSCAN"]["normal"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.52
Calinski-Harabasz Index: 939.76
Davies-Bouldin Index: 1.97
Time: 0.19


DBSCAN with UMAP

In [29]:
start = time.time()
dbscan_umap = DBSCAN(eps=2.23, min_samples=5)
dbscan_labels_umap = dbscan_umap.fit_predict(df_umap)
end = time.time()

results["DBSCAN"]["umap"]["Silhouette Coefficient"] = silhouette_score(df_umap, dbscan_labels_umap) if len(np.unique(dbscan_labels_umap)) > 1 else 0
results["DBSCAN"]["umap"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_umap, dbscan_labels_umap)
results["DBSCAN"]["umap"]["Davies-Bouldin Index"] = davies_bouldin_score(df_umap, dbscan_labels_umap)
results["DBSCAN"]["umap"]["Time"] = end - start

for metric, value in results["DBSCAN"]["umap"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.26
Calinski-Harabasz Index: 239.44
Davies-Bouldin Index: 0.61
Time: 0.17


DBSCAN with PCA

In [32]:
start = time.time()
dbscan_pca = DBSCAN(eps=1, min_samples=5)
dbscan_labels_pca = dbscan_pca.fit_predict(df_pca)
end = time.time()

results["DBSCAN"]["pca"]["Silhouette Coefficient"] = silhouette_score(df_pca, dbscan_labels_pca) if len(np.unique(dbscan_labels_pca)) > 1 else 0
results["DBSCAN"]["pca"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_pca, dbscan_labels_pca)
results["DBSCAN"]["pca"]["Davies-Bouldin Index"] = davies_bouldin_score(df_pca, dbscan_labels_pca)
results["DBSCAN"]["pca"]["Time"] = end - start

for metric, value in results["DBSCAN"]["pca"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.80
Calinski-Harabasz Index: 1347.23
Davies-Bouldin Index: 0.78
Time: 0.17


In [33]:
start = time.time()
gmm_normal = GaussianMixture(n_components=3, covariance_type='full', init_params='kmeans', random_state=42, n_init=10)
gmm_labels_normal = gmm_normal.fit_predict(df_scaled)
end = time.time()

results["GMM"]["normal"]["Silhouette Coefficient"] = silhouette_score(df_scaled, gmm_labels_normal)
results["GMM"]["normal"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_scaled, gmm_labels_normal)
results["GMM"]["normal"]["Davies-Bouldin Index"] = davies_bouldin_score(df_scaled, gmm_labels_normal)
results["GMM"]["normal"]["Time"] = end - start

for metric, value in results["KMeans"]["normal"].items():
    print(f"{metric}: {value:.2f}")
    
#fig = plt.figure(figsize=(8,8))
#fig.set_facecolor('white')
#plt.scatter(df_scaled[:,0], df_scaled[:,1], c=labels)
#plt.show()

Silhouette Coefficient: 0.25
Calinski-Harabasz Index: 1604.40
Davies-Bouldin Index: 1.60
Time: 0.30


GMM with UMAP

In [34]:
start = time.time()
gmm_umap = GaussianMixture(n_components=3, covariance_type='full', init_params='kmeans', random_state=42, n_init=10)
gmm_labels_umap = gmm_umap.fit_predict(df_umap)
end = time.time()

results["GMM"]["umap"]["Silhouette Coefficient"] = silhouette_score(df_umap, gmm_labels_umap)
results["GMM"]["umap"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_umap, gmm_labels_umap)
results["GMM"]["umap"]["Davies-Bouldin Index"] = davies_bouldin_score(df_umap, gmm_labels_umap)
results["GMM"]["umap"]["Time"] = end - start

for metric, value in results["GMM"]["umap"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.40
Calinski-Harabasz Index: 7111.81
Davies-Bouldin Index: 0.90
Time: 0.50


GMM with PCA

In [35]:
start = time.time()
gmm_pca = GaussianMixture(n_components=3, covariance_type='full', init_params='kmeans', random_state=42, n_init=10)
gmm_labels_pca = gmm_pca.fit_predict(df_pca)
end = time.time()

results["GMM"]["pca"]["Silhouette Coefficient"] = silhouette_score(df_pca, gmm_labels_pca)
results["GMM"]["pca"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_pca, gmm_labels_pca)
results["GMM"]["pca"]["Davies-Bouldin Index"] = davies_bouldin_score(df_pca, gmm_labels_pca)
results["GMM"]["pca"]["Time"] = end - start

for metric, value in results["GMM"]["pca"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.34
Calinski-Harabasz Index: 3506.88
Davies-Bouldin Index: 1.11
Time: 2.28


Agglomerative Hierachical Clustering with original dataset

In [36]:
start = time.time()
agl_normal = AgglomerativeClustering(n_clusters=3)
agl_labels_normal = agl_normal.fit_predict(df_scaled)
end = time.time()

results["AGL"]["normal"]["Silhouette Coefficient"] = silhouette_score(df_scaled, agl_labels_normal)
results["AGL"]["normal"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_scaled, agl_labels_normal)
results["AGL"]["normal"]["Davies-Bouldin Index"] = davies_bouldin_score(df_scaled, agl_labels_normal)
results["GMM"]["normal"]["Time"] = end - start

for metric, value in results["AGL"]["normal"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.17
Calinski-Harabasz Index: 1270.50
Davies-Bouldin Index: 1.80


Agglomerative Hierachical Clustering with UMAP

In [37]:
start = time.time()
agl_umap = AgglomerativeClustering(n_clusters=3)
agl_labels_umap = agl_umap.fit_predict(df_umap)
end = time.time()

results["AGL"]["umap"]["Silhouette Coefficient"] = silhouette_score(df_umap, agl_labels_umap)
results["AGL"]["umap"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_umap, agl_labels_umap)
results["AGL"]["umap"]["Davies-Bouldin Index"] = davies_bouldin_score(df_umap, agl_labels_umap)
results["GMM"]["umap"]["Time"] = end - start

for metric, value in results["AGL"]["umap"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.38
Calinski-Harabasz Index: 6586.92
Davies-Bouldin Index: 0.79


Agglomerative Hierachical Clustering with PCA

In [38]:
start = time.time()
agl_pca = AgglomerativeClustering(n_clusters=3)
agl_labels_pca = agl_pca.fit_predict(df_pca)
end = time.time()

results["AGL"]["pca"]["Silhouette Coefficient"] = silhouette_score(df_pca, agl_labels_pca)
results["AGL"]["pca"]["Calinski-Harabasz Index"] = calinski_harabasz_score(df_pca, agl_labels_pca)
results["AGL"]["pca"]["Davies-Bouldin Index"] = davies_bouldin_score(df_pca, agl_labels_pca)
results["GMM"]["pca"]["Time"] = end - start

for metric, value in results["AGL"]["pca"].items():
    print(f"{metric}: {value:.2f}")

Silhouette Coefficient: 0.34
Calinski-Harabasz Index: 3851.51
Davies-Bouldin Index: 0.95


Results

In [39]:
from IPython.display import display, Markdown

def display_results(results):
    display(Markdown(f"### {url}"))
    display(Markdown(f"### {hopkins_score}"))
    for method, data in results.items():
        display(Markdown(f"### {method}"))
        for dataset, metrics in data.items():
            display(Markdown(f"#### {dataset}"))
            for metric, value in metrics.items():
                display(Markdown(f"- **{metric}**: {value:.6f}"))
                
display_results(results)


### CC GENERAL.csv

### 0.7989505121522804

### KMeans

#### normal

- **Silhouette Coefficient**: 0.250556

- **Calinski-Harabasz Index**: 1604.396717

- **Davies-Bouldin Index**: 1.597271

- **Time**: 0.297652

#### umap

- **Silhouette Coefficient**: 0.404058

- **Calinski-Harabasz Index**: 8157.890377

- **Davies-Bouldin Index**: 0.876059

- **Time**: 0.318717

#### pca

- **Silhouette Coefficient**: 0.452367

- **Calinski-Harabasz Index**: 5337.484796

- **Davies-Bouldin Index**: 0.811241

- **Time**: 1.051538

### DBSCAN

#### normal

- **Silhouette Coefficient**: 0.520463

- **Calinski-Harabasz Index**: 939.758014

- **Davies-Bouldin Index**: 1.973882

- **Time**: 0.192844

#### umap

- **Silhouette Coefficient**: 0.259910

- **Calinski-Harabasz Index**: 239.436945

- **Davies-Bouldin Index**: 0.610066

- **Time**: 0.166286

#### pca

- **Silhouette Coefficient**: 0.796025

- **Calinski-Harabasz Index**: 1347.226109

- **Davies-Bouldin Index**: 0.784961

- **Time**: 0.167395

### OPTICS

#### normal

#### umap

#### pca

### GMM

#### normal

- **Silhouette Coefficient**: 0.113716

- **Calinski-Harabasz Index**: 920.088826

- **Davies-Bouldin Index**: 2.608650

- **Time**: 1.337385

#### umap

- **Silhouette Coefficient**: 0.404760

- **Calinski-Harabasz Index**: 7111.813213

- **Davies-Bouldin Index**: 0.898005

- **Time**: 0.923726

#### pca

- **Silhouette Coefficient**: 0.336897

- **Calinski-Harabasz Index**: 3506.879980

- **Davies-Bouldin Index**: 1.113599

- **Time**: 0.927119

### AGL

#### normal

- **Silhouette Coefficient**: 0.173110

- **Calinski-Harabasz Index**: 1270.503917

- **Davies-Bouldin Index**: 1.797923

#### umap

- **Silhouette Coefficient**: 0.384029

- **Calinski-Harabasz Index**: 6586.922990

- **Davies-Bouldin Index**: 0.789550

#### pca

- **Silhouette Coefficient**: 0.338239

- **Calinski-Harabasz Index**: 3851.510982

- **Davies-Bouldin Index**: 0.951048