In [1]:
!pip install ucimlrepo &> null
print("ucimlrepo installed successfully")

!pip install pycaret &> null
print("pycaret installed successfully")

ucimlrepo installed successfully
pycaret installed successfully


In [20]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Importing Glass Identification Dataset

In [3]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Glass Identification" dataset (repository id 42).
glass_identification = fetch_ucirepo(id=42)

# Pull the feature table and the target table out of the fetched payload
# (ucimlrepo exposes both as pandas dataframes).
glass_data = glass_identification.data
X, y = glass_data.features, glass_data.targets

# Print the dataset-level metadata first, then the per-variable information.
for section in (glass_identification.metadata, glass_identification.variables):
    print(section)

{'uci_id': 42, 'name': 'Glass Identification', 'repository_url': 'https://archive.ics.uci.edu/dataset/42/glass+identification', 'data_url': 'https://archive.ics.uci.edu/static/public/42/data.csv', 'abstract': 'From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 214, 'num_features': 9, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Type_of_glass'], 'index_col': ['Id_number'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5WW2P', 'creators': ['B. German'], 'intro_paper': None, 'additional_info': {'summary': 'Vina conducted a comparison test of her rule-based system, BEAGLE, the nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is a product available through VRS Consulting, In

In [9]:
# NOTE(review): the wildcard import puts every public pycaret.clustering name into the
# notebook namespace; the cells visible here rely on it for `setup` and `create_model`.
# Other cells may depend on further names, so it is left as-is — confirm before narrowing.
from pycaret.clustering import *
# Initialise a clustering experiment on the feature frame X with verbose output off.
# `model` is the object returned by setup() (not a fitted clustering model); the
# model-listing cell calls `model.models()` on it, so keep this name bound to it.
model = setup(X, verbose = False)

#### The `setup()` function initializes the clustering environment and builds the transformation pipeline that prepares the data for further analysis and modeling.

In [7]:
# IDs of every clustering algorithm the experiment supports: the models() table is
# indexed by model ID, so its index is exactly the list of IDs.
models_list = model.models().index
print(models_list)

Index(['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics',
       'birch'],
      dtype='object', name='ID')


#### `model.models()` returns a table describing every clustering algorithm supported by PyCaret; the table's index holds the model IDs printed above.

In [10]:
# Clustering algorithms compared in this notebook, referenced by PyCaret model ID.
selected_model_ids = ['kmeans', 'meanshift', 'hclust', 'dbscan']

# Filter by ID instead of by position. Positional indexing with [0, 2, 4, 5] only works
# on the full 8-entry index: once `models_list` has been narrowed to 4 entries, running
# this cell again would raise IndexError. The ID filter gives the same result on the
# first run, is safe to re-run, and does not depend on the order PyCaret lists models in.
models_list = models_list[models_list.isin(selected_model_ids)]
print('Clustering Models Taken: ', models_list)

Clustering Models Taken:  Index(['kmeans', 'meanshift', 'hclust', 'dbscan'], dtype='object', name='ID')


In [14]:
# Preprocessing variants to compare. Each label maps to the keyword arguments that are
# forwarded to setup(): power transformation, normalisation and PCA switched on or off.
parameters = {
    label: {'transformation': use_transform, 'normalize': use_normalize, 'pca': use_pca}
    for label, use_transform, use_normalize, use_pca in [
        # label                  transform  normalize  pca
        ('No Data Processing',   False,     False,     False),
        ('Using Normalisation',  False,     True,      False),
        ('Using Transform',      True,      False,     False),
        ('Using PCA',            False,     False,     True),
        ('T+N',                  True,      True,      False),
        ('T+N+PCA',              True,      True,      True),
    ]
}

In [17]:
# NOTE(review): `results` is initialised here but never filled by this cell; it is kept
# in case a later cell expects the name to exist — confirm and remove if unused.
results = []

# Fixed seed passed to setup() so that stochastic steps produce the same metrics on
# every run (without it each setup() call draws a new random seed).
SEED = 42

# The loop variable is `model_id`, not `model`: `model` is the experiment object created
# by the first setup() call, and rebinding it to a string breaks `model.models()` when
# the model-listing cell is run again.
for model_id in models_list:
    # One single-row metrics frame per (cluster size, preprocessing variant);
    # they are concatenated once after the loops instead of growing a frame step by step.
    metric_frames = []

    for cluster_size in range(3, 6):
        for name, args in parameters.items():
            # Fresh experiment for every preprocessing variant.
            exp = setup(X, verbose=False, session_id=SEED, **args)
            # NOTE(review): in the recorded output, meanshift and dbscan report identical
            # metrics for cluster sizes 3-5, so `num_clusters` appears to have no effect
            # on them — confirm against the PyCaret documentation.
            exp.create_model(model_id, num_clusters=cluster_size, verbose=False)

            # pull() returns the metrics table of the model just created; assign() labels
            # a copy of it rather than adding columns to the returned frame in place.
            metric_frames.append(
                exp.pull().assign(name=name, cluster_size=cluster_size)
            )

    # Rows: metrics; columns: (preprocessing variant, cluster size), sorted.
    # Only the first three metric rows are kept — in the recorded output these are
    # Silhouette, Calinski-Harabasz and Davies-Bouldin.
    model_summary = (
        pd.concat(metric_frames, ignore_index=True)
        .set_index(['name', 'cluster_size'])
        .sort_index()
        .T
        .iloc[:3, :]
    )

    model_summary.to_csv(f'{model_id}.csv')

    print(model_id)
    display(model_summary)


kmeans


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.5822,0.5868,0.4441,0.3632,0.384,0.3238,0.3632,0.3005,0.3238,0.3672,0.391,0.3318,0.5822,0.5879,0.4472,0.5914,0.5878,0.5323
Calinski-Harabasz,134.998,121.4059,121.8531,70.2484,63.1173,63.1868,70.2485,63.5405,63.1868,58.288,60.0609,60.8193,134.998,122.1988,122.9427,260.0455,352.844,502.4628
Davies-Bouldin,0.9234,0.8475,0.8336,1.3286,1.2063,1.1086,1.3286,1.2135,1.1086,1.3602,1.0722,1.0509,0.9234,0.893,0.9544,0.594,0.5556,0.4913


meanshift


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.472,0.472,0.472,0.3349,0.3349,0.3349,0.3349,0.3349,0.3349,0.3507,0.3507,0.3507,0.472,0.472,0.472,0.527,0.527,0.527
Calinski-Harabasz,60.9168,60.9168,60.9168,19.9034,19.9034,19.9034,19.9034,19.9034,19.9034,24.8652,24.8652,24.8652,60.9168,60.9168,60.9168,141.0207,141.0207,141.0207
Davies-Bouldin,0.5625,0.5625,0.5625,0.7808,0.7808,0.7808,0.7808,0.7808,0.7808,0.6156,0.6156,0.6156,0.5625,0.5625,0.5625,0.4889,0.4889,0.4889


hclust


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.5838,0.5888,0.4581,0.3649,0.3843,0.3098,0.3649,0.3843,0.3098,0.4522,0.4523,0.259,0.5838,0.5888,0.4581,0.5917,0.6019,0.5365
Calinski-Harabasz,124.8313,116.568,115.5131,65.4346,59.2821,58.067,65.4347,59.2821,58.0671,50.0525,51.9503,52.9722,124.8314,116.5681,115.5133,255.2729,351.0101,441.6707
Davies-Bouldin,0.9799,0.8574,0.9688,1.381,1.2471,1.1962,1.381,1.2471,1.1962,1.1644,0.8431,1.0782,0.9799,0.8574,0.9688,0.5028,0.4255,0.471


dbscan


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2529,0.2529,0.2529,-0.1568,-0.1568,-0.1568,-0.1568,-0.1568,-0.1568,-0.0031,-0.0031,-0.0031,0.2529,0.2529,0.2529,0.0,0.0,0.0
Calinski-Harabasz,27.6361,27.6361,27.6361,3.4739,3.4739,3.4739,3.4739,3.4739,3.4739,13.5425,13.5425,13.5425,27.6362,27.6362,27.6362,0.0,0.0,0.0
Davies-Bouldin,1.3886,1.3886,1.3886,1.831,1.831,1.831,1.831,1.831,1.831,2.1117,2.1117,2.1117,1.3886,1.3886,1.3886,0.0,0.0,0.0
