<a href="https://colab.research.google.com/github/khushi2068/Clustering/blob/main/Clustering_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clustering-Assignment

In [13]:
!pip install ucimlrepo &> null
print("Installed ucimlrepo")
!pip install pycaret &>null
print("Installed pycaret")

Installed ucimlrepo
Installed pycaret


## Importing Dataset

In [36]:
from ucimlrepo import fetch_ucirepo

# Download UCI ML Repository dataset #519 (heart failure clinical records).
heart_failure_clinical_records = fetch_ucirepo(id=519)

# Split the payload into target column(s) and feature columns
# (both exposed by ucimlrepo as pandas dataframes).
targets = heart_failure_clinical_records.data.targets
features = heart_failure_clinical_records.data.features

# Print the dataset-level metadata first, then the per-variable information.
for dataset_summary in (
    heart_failure_clinical_records.metadata,
    heart_failure_clinical_records.variables,
):
    print(dataset_summary)


{'uci_id': 519, 'name': 'Heart failure clinical records', 'repository_url': 'https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records', 'data_url': 'https://archive.ics.uci.edu/static/public/519/data.csv', 'abstract': 'This dataset contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 299, 'num_features': 12, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['death_event'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5Z89R', 'creators': [], 'intro_paper': {'title': 'Machine learning can predict survival of patients with heart failure from serum creatinine and ejec

In [40]:
from pycaret.clustering import setup, create_model, plot_model
import pandas as pd

In [42]:
# Initialise a PyCaret clustering experiment on the raw features; it is used
# here to look up the catalogue of available clustering models.
clustering_setup = setup(data=features, verbose=False)

# Select the algorithms by their PyCaret model ID rather than by row position
# in the catalogue, so the choice cannot silently change if the catalogue is
# reordered. `.loc` raises KeyError if a requested ID is not available, and
# `.index` keeps the result a pandas Index named 'ID', as before.
selected_model_names = clustering_setup.models().loc[['kmeans', 'sc', 'hclust']].index

In [43]:
# Show the selected model IDs to confirm which algorithms will be evaluated.
selected_model_names

Index(['kmeans', 'sc', 'hclust'], dtype='object', name='ID')

In [44]:
# Baseline keyword arguments for setup(): every optional preprocessing step off.
_all_steps_off = {
    "remove_outliers": False,
    "transformation": False,
    "normalize": False,
    "pca": False,
}

# For each named experiment configuration, the steps that are switched on.
_enabled_steps = {
    "No Preprocessing": (),
    "Transformation": ("transformation",),
    "Normalization": ("normalize",),
    "PCA": ("pca",),
    "Norm + PCA": ("normalize", "pca"),
    "Norm + PCA + TRANS": ("transformation", "normalize", "pca"),
}

# Map configuration label -> full set of boolean preprocessing flags that is
# unpacked into setup(**flags) when the experiments are run.
preprocessing_configs = {
    label: {step: step in enabled for step in _all_steps_off}
    for label, enabled in _enabled_steps.items()
}

In [47]:
# Cluster-quality metrics kept in the final report (rows of the output table).
METRIC_NAMES = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
# Fixed PyCaret session seed so that repeated runs of this cell are reproducible.
SESSION_ID = 42

# For every selected algorithm, score each (cluster size, preprocessing
# configuration) pair, then save and display one metric table per algorithm.
for model_name in selected_model_names:
    # Collect one metrics frame per run and concatenate once after the loops,
    # instead of re-copying a growing DataFrame on every iteration.
    run_results = []
    for cluster_size in range(3, 6):
        for config_name, config_params in preprocessing_configs.items():
            # A fresh experiment per configuration so the preprocessing flags apply.
            current_setup = setup(data=features, verbose=False, session_id=SESSION_ID, **config_params)
            current_model = create_model(model_name, num_clusters=cluster_size, verbose=False)
            # pull() returns the metrics table produced by the create_model call above;
            # assign() tags it with the run's settings without mutating the pulled frame.
            run_results.append(
                current_setup.pull().assign(
                    **{'Configuration': config_name, 'Cluster Size': cluster_size}
                )
            )

    comparison_results = (
        pd.concat(run_results, ignore_index=True)
        .set_index(['Configuration', 'Cluster Size'])
    )
    # Metrics as rows, (configuration, cluster size) pairs as sorted columns.
    sorted_results = comparison_results.sort_index().T
    # Select the reported metrics by name rather than by row position, so a
    # change in the column order of the pulled table cannot alter the report.
    metric_table = sorted_results.loc[METRIC_NAMES]
    metric_table.to_csv(f'{model_name}.csv')
    print("    "+ model_name)
    display(metric_table)

    kmeans


Configuration,No Preprocessing,No Preprocessing,No Preprocessing,Norm + PCA,Norm + PCA,Norm + PCA,Norm + PCA + TRANS,Norm + PCA + TRANS,Norm + PCA + TRANS,Normalization,Normalization,Normalization,PCA,PCA,PCA,Transformation,Transformation,Transformation
Cluster Size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.5481,0.5535,0.5384,0.1036,0.1027,0.1031,0.0994,0.0999,0.0891,0.1186,0.1025,0.0966,0.5434,0.5545,0.5339,0.5495,0.571,0.6
Calinski-Harabasz,390.9995,499.6078,602.3528,29.581,26.7742,25.6776,30.9672,28.7517,24.9557,30.3955,27.4839,26.0819,391.1063,499.4446,602.9443,482.132,624.5236,809.2303
Davies-Bouldin,0.5652,0.5707,0.507,2.679,2.416,2.2835,2.666,2.425,2.4876,2.7874,2.2873,2.162,0.5708,0.5698,0.5062,0.5672,0.5106,0.4872


    sc


Configuration,No Preprocessing,No Preprocessing,No Preprocessing,Norm + PCA,Norm + PCA,Norm + PCA,Norm + PCA + TRANS,Norm + PCA + TRANS,Norm + PCA + TRANS,Normalization,Normalization,Normalization,PCA,PCA,PCA,Transformation,Transformation,Transformation
Cluster Size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,-0.4459,-0.4459,-0.4459,0.1014,0.099,0.0808,0.0722,0.0085,0.0225,0.1014,0.0981,0.0808,-0.4459,-0.4459,-0.4459,0.0554,0.2008,0.0442
Calinski-Harabasz,0.0,0.0,0.0,25.5215,24.0728,17.6941,18.6171,13.2809,14.3376,25.5215,24.1277,17.6941,0.0,0.0,0.0,7.6899,17.2691,9.6414
Davies-Bouldin,617.3178,617.3178,617.3178,2.0544,2.5158,1.9831,2.3254,2.0075,2.2676,2.0544,2.4984,1.9831,617.3178,617.3178,617.3178,0.4322,1.5155,36.1265


    hclust


Configuration,No Preprocessing,No Preprocessing,No Preprocessing,Norm + PCA,Norm + PCA,Norm + PCA,Norm + PCA + TRANS,Norm + PCA + TRANS,Norm + PCA + TRANS,Normalization,Normalization,Normalization,PCA,PCA,PCA,Transformation,Transformation,Transformation
Cluster Size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.5035,0.5395,0.541,0.0988,0.1051,0.0918,0.0795,0.0632,0.064,0.0988,0.1051,0.0918,0.5035,0.5395,0.541,0.5189,0.5408,0.5639
Calinski-Harabasz,322.0706,478.6348,558.5869,23.8497,23.1853,22.0964,23.5111,21.4581,19.9122,23.8497,23.1853,22.0964,322.0706,478.6348,558.5869,467.7175,497.6851,723.6225
Davies-Bouldin,0.556,0.5775,0.49,2.3797,2.1811,2.2433,2.7879,2.7318,2.624,2.3797,2.1811,2.2433,0.556,0.5775,0.49,0.6099,0.5251,0.4999
