<a href="https://colab.research.google.com/github/labrijisaad/Git-Clustering/blob/main/notebooks/Acuracy Benchmark/GIT Accuracy Benchmark - Small Scale Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Install the `packages` & Prepare the `data`

In [1]:
# !pip install -i "https://test.pypi.org/simple/" git_cluster > /dev/null 
# !pip install hdbscan > /dev/null
# !pip install -U gdown > /dev/null
# !gdown 1yNwCStP3Sdf2lfvNe9h0WIZw2OQ3O2UP && unzip datasets.zip > /dev/null

#### Import the Libraries

In [2]:
import warnings ; warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

from utils import measures_calculator
from dataloaders import Real_DataLoader as Real_DataLoader

# Data Path ( TO CHANGE ! )
real_datasets_path = "../../datasets/real_datasets"

## <center><a><span style="color:green">Real Datasets - `Iris`</span></a></center>

### Load the Iris data

In [3]:
X_iris, Y_iris_true = Real_DataLoader(name='iris', path=real_datasets_path).load()
print(f'The total number of Clusters in the Iris Dataset is: {len(np.unique(Y_iris_true))}, and has {len(X_iris[0])} Features')

The total number of Clusters in the Iris Dataset is: 3, and has 4 Features


### <span style="color:red">GIT Clustring</span> <a class="anchor"></a>

In [4]:
from git_cluster import GIT

In [5]:
# Create an instance of the GIT clustering
git = GIT(k=15, target_ratio=[1, 1, 1])

# Apply the GIT algorithm to predict the clusters in the data
Y_iris_pred_git = git.fit_predict(X_iris)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_git_iris = measures_calculator(Y_iris_pred_git, Y_iris_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_git_iris["Algo"] = git.__class__.__name__
perf_metrics_git_iris["dataset"] = "iris"

perf_metrics_git_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.883153,0.706006,0.88,0.756087,1.0,3.0,GIT,iris


### <span style="color:red">HDBSCAN</span> <a class="anchor"></a>

In [6]:
import hdbscan

In [7]:
# Create an instance of the HDBSCAN clustering
hdbscan = hdbscan.HDBSCAN(min_cluster_size=30, 
                            min_samples=20, 
                            gen_min_span_tree=True)

# Apply the HDBSCAN algorithm to predict the clusters in the data
hdbscan.fit(X_iris)

# Get the predicted Clusters
Y_iris_pred_hdbscan = hdbscan.labels_

# Calculate various clustering metrics to evaluate the performance
perf_metrics_hdbscan_iris = measures_calculator(Y_iris_pred_hdbscan, Y_iris_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_hdbscan_iris["Algo"] = hdbscan.__class__.__name__
perf_metrics_hdbscan_iris["dataset"] = "iris"

perf_metrics_hdbscan_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.775507,0.562136,0.686667,0.713099,1.0,3.0,HDBSCAN,iris


### <span style="color:red">Spectral Clustering</span> <a class="anchor"></a>

In [8]:
from sklearn.cluster import SpectralClustering

In [9]:
# Create an instance of the Spectral Clustering algorithm 
spectral_clustering = SpectralClustering(n_clusters=3, assign_labels="discretize",
                                         random_state=0, affinity='rbf')

# Apply the Spectral Clustering algorithm to predict the clusters in the data
spectral_clustering.fit(X_iris)

# Get the predicted Clusters
Y_iris_pred_speclustr = spectral_clustering.labels_

# Calculate various clustering metrics to evaluate the performance
perf_metrics_speclustr_iris = measures_calculator(Y_iris_pred_speclustr, Y_iris_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_speclustr_iris["Algo"] = spectral_clustering.__class__.__name__
perf_metrics_speclustr_iris["dataset"] = "iris"

perf_metrics_speclustr_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.901225,0.743683,0.9,0.766036,1.0,3.0,SpectralClustering,iris


### <span style="color:red">K-Means</span> <a class="anchor"></a>

In [10]:
from sklearn.cluster import KMeans

In [11]:
# Create an instance of the K-means algorithm 
kmeans = KMeans(n_clusters=3, random_state=0)

# Apply the K-means algorithm to predict the clusters in the data
kmeans.fit(X_iris)

# Get the predicted Clusters
Y_iris_pred_kmeans = kmeans.labels_

# Calculate various clustering metrics to evaluate the performance
perf_metrics_kmeans_iris = measures_calculator(Y_iris_pred_kmeans, Y_iris_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_kmeans_iris["Algo"] = kmeans.__class__.__name__
perf_metrics_kmeans_iris["dataset"] = "iris"

perf_metrics_kmeans_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.888055,0.716342,0.886667,0.741912,1.0,3.0,KMeans,iris


### <span style="color:red">DBSCAN</span> <a class="anchor"></a>

In [12]:
from sklearn.cluster import DBSCAN

In [13]:
# Create an instance of the DBSCAN algorithm
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Apply the DBSCAN algorithm to predict the clusters in the data
Y_iris_pred_dbscan = dbscan.fit_predict(X_iris)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_dbscan_iris = measures_calculator(Y_iris_pred_dbscan, Y_iris_true)
perf_metrics_dbscan_iris

# Store additional information about the algorithm and dataset in the result
perf_metrics_dbscan_iris["Algo"] = dbscan.__class__.__name__
perf_metrics_dbscan_iris["dataset"] = "iris"

perf_metrics_dbscan_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.724959,0.520619,0.686667,0.604416,1.0,3.0,DBSCAN,iris


### <span style="color:red">Agglomerative Clustering</span> <a class="anchor"></a>

In [14]:
from sklearn.cluster import AgglomerativeClustering

In [15]:
# Create an instance of the Agglomerative Clustering algorithm
agg_clustering = AgglomerativeClustering(n_clusters=3)

# Apply the Agglomerative Clustering algorithm to predict the clusters in the data
Y_iris_pred_agg = agg_clustering.fit_predict(X_iris)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_agg_iris = measures_calculator(Y_iris_pred_agg, Y_iris_true)
perf_metrics_agg_iris

# Store additional information about the algorithm and dataset in the result
perf_metrics_agg_iris["Algo"] = agg_clustering.__class__.__name__
perf_metrics_agg_iris["dataset"] = "iris"

perf_metrics_agg_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.895466,0.731199,0.893333,0.770084,1.0,3.0,AgglomerativeClustering,iris


### <span style="color:red">Gaussian Mixture Models (GMM)</span> <a class="anchor"></a>

In [16]:
from sklearn.mixture import GaussianMixture

In [17]:
# Create an instance of the Gaussian Mixture Models algorithm
gmm = GaussianMixture(n_components=3, random_state=0)

# Fit the Gaussian Mixture Models algorithm to the data and predict the clusters
gmm.fit(X_iris)
Y_iris_pred_gmm = gmm.predict(X_iris)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_gmm_iris = measures_calculator(Y_iris_pred_gmm, Y_iris_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_gmm_iris["Algo"] = gmm.__class__.__name__
perf_metrics_gmm_iris["dataset"] = "iris"

perf_metrics_gmm_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.96675,0.903874,0.966667,0.899694,1.0,3.0,GaussianMixture,iris


### <span style="color:blue">Benchmark Results</span> <a class="anchor"></a>

In [18]:
full_perf_metrics_iris = pd.concat([perf_metrics_git_iris, 
                                    perf_metrics_hdbscan_iris,
                                    perf_metrics_speclustr_iris,
                                    perf_metrics_speclustr_iris,
                                    perf_metrics_kmeans_iris,
                                    perf_metrics_dbscan_iris,
                                    perf_metrics_agg_iris,
                                    perf_metrics_gmm_iris], ignore_index=True)
full_perf_metrics_iris

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.883153,0.706006,0.88,0.756087,1.0,3.0,GIT,iris
1,0.775507,0.562136,0.686667,0.713099,1.0,3.0,HDBSCAN,iris
2,0.901225,0.743683,0.9,0.766036,1.0,3.0,SpectralClustering,iris
3,0.901225,0.743683,0.9,0.766036,1.0,3.0,SpectralClustering,iris
4,0.888055,0.716342,0.886667,0.741912,1.0,3.0,KMeans,iris
5,0.724959,0.520619,0.686667,0.604416,1.0,3.0,DBSCAN,iris
6,0.895466,0.731199,0.893333,0.770084,1.0,3.0,AgglomerativeClustering,iris
7,0.96675,0.903874,0.966667,0.899694,1.0,3.0,GaussianMixture,iris


## <center><a><span style="color:green">Real Datasets - `Wine`</span></a></center>

### Load the Wine data

In [19]:
X_wine, Y_wine_true = Real_DataLoader(name='wine', path=real_datasets_path).load()
print(f'The total number of Clusters in the Wine Dataset is: {len(np.unique(Y_wine_true))}, and has {len(X_wine[0])} Features')

The total number of Clusters in the Wine Dataset is: 3, and has 13 Features


### <span style="color:red">GIT Clustring</span> <a class="anchor"></a>

In [20]:
from git_cluster import GIT

In [21]:
# Create an instance of the GIT clustering
git = GIT(k=20, target_ratio=[1, 1, 1])

# Apply the GIT algorithm to predict the clusters in the data
Y_wine_pred_git = git.fit_predict(X_wine)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_git_wine = measures_calculator(Y_wine_pred_git, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_git_wine["Algo"] = git.__class__.__name__
perf_metrics_git_wine["dataset"] = "wine"

perf_metrics_git_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.900735,0.713308,0.898876,0.75702,1.0,3.0,GIT,wine


### <span style="color:red">HDBSCAN</span> <a class="anchor"></a>

In [22]:
import hdbscan

In [23]:
# Create an instance of the HDBSCAN clustering
hdbscan = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=2, gen_min_span_tree=True)

# Apply the HDBSCAN algorithm to predict the clusters in the data
hdbscan.fit(X_wine)

# Get the predicted Clusters
Y_wine_pred_hdbscan = hdbscan.labels_

# Calculate various clustering metrics to evaluate the performance
perf_metrics_hdbscan_wine = measures_calculator(Y_wine_pred_hdbscan, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_hdbscan_wine["Algo"] = hdbscan.__class__.__name__
perf_metrics_hdbscan_wine["dataset"] = "wine"

perf_metrics_hdbscan_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.678477,0.29075,0.606742,0.402635,1.0,3.0,HDBSCAN,wine


### <span style="color:red">Spectral Clustering</span> <a class="anchor"></a>

In [24]:
from sklearn.cluster import SpectralClustering

In [25]:
# Create an instance of the Spectral Clustering algorithm 
spectral_clustering = SpectralClustering(n_clusters=3, assign_labels="discretize", 
                                         random_state=0, affinity='rbf')

# Apply the Spectral Clustering algorithm to predict the clusters in the data
spectral_clustering.fit(X_wine)

# Get the predicted Clusters
Y_wine_pred_speclustr = spectral_clustering.labels_

# Calculate various clustering metrics to evaluate the performance
perf_metrics_speclustr_wine = measures_calculator(Y_wine_pred_speclustr, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_speclustr_wine["Algo"] = spectral_clustering.__class__.__name__
perf_metrics_speclustr_wine["dataset"] = "wine"

perf_metrics_speclustr_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.466706,0.01346,0.432584,0.014041,1.0,3.0,SpectralClustering,wine


### <span style="color:red">K-Means</span> <a class="anchor"></a>

In [26]:
from sklearn.cluster import KMeans

In [27]:
# Create an instance of the K-means algorithm 
kmeans = KMeans(n_clusters=3, random_state=0)

# Apply the K-means algorithm to predict the clusters in the data
kmeans.fit(X_wine)

# Get the predicted Clusters
Y_wine_pred_kmeans = kmeans.labels_

# Calculate various clustering metrics to evaluate the performance
perf_metrics_kmeans_wine = measures_calculator(Y_wine_pred_kmeans, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_kmeans_wine["Algo"] = kmeans.__class__.__name__
perf_metrics_kmeans_wine["dataset"] = "wine"

perf_metrics_kmeans_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.689714,0.371114,0.702247,0.428757,1.0,3.0,KMeans,wine


### <span style="color:red">DBSCAN</span> <a class="anchor"></a>

In [28]:
from sklearn.cluster import DBSCAN

In [29]:
# Create an instance of the DBSCAN algorithm
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Apply the DBSCAN algorithm to predict the clusters in the data
Y_wine_pred_dbscan = dbscan.fit_predict(X_wine)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_dbscan_wine = measures_calculator(Y_wine_pred_dbscan, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_dbscan_wine["Algo"] = dbscan.__class__.__name__
perf_metrics_dbscan_wine["dataset"] = "wine"

perf_metrics_dbscan_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.570281,0.0,0.398876,0.0,1.0,3.0,DBSCAN,wine


### <span style="color:red">Agglomerative Clustering</span> <a class="anchor"></a>

In [30]:
from sklearn.cluster import AgglomerativeClustering

In [31]:
# Create an instance of the Agglomerative Clustering algorithm
agg_clustering = AgglomerativeClustering(n_clusters=3)

# Apply the Agglomerative Clustering algorithm to predict the clusters in the data
Y_wine_pred_agg = agg_clustering.fit_predict(X_wine)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_agg_wine = measures_calculator(Y_wine_pred_agg, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_agg_wine["Algo"] = agg_clustering.__class__.__name__
perf_metrics_agg_wine["dataset"] = "wine"

perf_metrics_agg_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.686376,0.368402,0.696629,0.416077,1.0,3.0,AgglomerativeClustering,wine


### <span style="color:red">Gaussian Mixture Models (GMM)</span> <a class="anchor"></a>

In [32]:
from sklearn.mixture import GaussianMixture

In [33]:
# Create an instance of the Gaussian Mixture Models algorithm
gmm = GaussianMixture(n_components=3, random_state=0)

# Fit the Gaussian Mixture Models algorithm to the data and predict the clusters
gmm.fit(X_wine)
Y_wine_pred_gmm = gmm.predict(X_wine)

# Calculate various clustering metrics to evaluate the performance
perf_metrics_gmm_wine = measures_calculator(Y_wine_pred_gmm, Y_wine_true)

# Store additional information about the algorithm and dataset in the result
perf_metrics_gmm_wine["Algo"] = gmm.__class__.__name__
perf_metrics_gmm_wine["dataset"] = "wine"

perf_metrics_gmm_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.848792,0.607484,0.848315,0.582343,1.0,3.0,GaussianMixture,wine


### <span style="color:blue">Benchmark Results</span> <a class="anchor"></a>

In [34]:
full_perf_metrics_wine = pd.concat([perf_metrics_git_wine, 
                                    perf_metrics_hdbscan_wine,
                                    perf_metrics_speclustr_wine,
                                    perf_metrics_speclustr_wine,
                                    perf_metrics_kmeans_wine,
                                    perf_metrics_dbscan_wine,
                                    perf_metrics_agg_wine,
                                    perf_metrics_gmm_wine], ignore_index=True)
full_perf_metrics_wine

Unnamed: 0,f1,ARI,ACC,NMI,cover_rate,classes,Algo,dataset
0,0.900735,0.713308,0.898876,0.75702,1.0,3.0,GIT,wine
1,0.678477,0.29075,0.606742,0.402635,1.0,3.0,HDBSCAN,wine
2,0.466706,0.01346,0.432584,0.014041,1.0,3.0,SpectralClustering,wine
3,0.466706,0.01346,0.432584,0.014041,1.0,3.0,SpectralClustering,wine
4,0.689714,0.371114,0.702247,0.428757,1.0,3.0,KMeans,wine
5,0.570281,0.0,0.398876,0.0,1.0,3.0,DBSCAN,wine
6,0.686376,0.368402,0.696629,0.416077,1.0,3.0,AgglomerativeClustering,wine
7,0.848792,0.607484,0.848315,0.582343,1.0,3.0,GaussianMixture,wine


## Connect with me 🌐
<div align="center">
  <a href="https://www.linkedin.com/in/labrijisaad/">
    <img src="https://img.shields.io/badge/LinkedIn-%230077B5.svg?&style=for-the-badge&logo=linkedin&logoColor=white" alt="LinkedIn" style="margin-bottom: 5px;"/>
  </a>
  <a href="https://github.com/labrijisaad">
    <img src="https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white" alt="GitHub" style="margin-bottom: 5px;"/>
  </a>
</div>