<a href="https://colab.research.google.com/github/jermanalopes/MachineLearningBasic/blob/main/BestClusterAlg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import accuracy_score, silhouette_score

In [46]:
#Function to compare clustering algorithms (Kmeans, Agglomerative and DBSCAN)
def compare_algorithms(X, max_cluster, eps_values=None, min_samples=5):
  """Score KMeans, agglomerative clustering and DBSCAN with the silhouette coefficient.

  KMeans and AgglomerativeClustering are fitted once for every cluster count
  from 2 to ``max_cluster`` (inclusive). DBSCAN is fitted once for every
  radius in ``eps_values``.

  Parameters
  ----------
  X : array-like
    Feature matrix handed to each estimator's ``fit_predict``. For the DBSCAN
    scoring step it is converted with ``np.asarray`` so rows can be masked.
  max_cluster : int
    Largest number of clusters tried for KMeans and AgglomerativeClustering.
  eps_values : iterable of float, optional
    Neighbourhood radii tried for DBSCAN. Defaults to 0.1, 0.2, ..., 0.8.
  min_samples : int, default 5
    ``min_samples`` forwarded to DBSCAN.

  Returns
  -------
  list of tuple
    One ``(algorithm, parameter, silhouette)`` tuple per scored fit, in the
    order KMeans, agglomerative, DBSCAN. ``parameter`` is the number of
    clusters for 'Kmeans' and 'Aglomerative' rows and the eps radius for
    'DBSCAN' rows. DBSCAN fits that yield fewer than two real clusters are
    omitted.
  """
  results = []
  cluster_range = range(2, max_cluster + 1)

  #kmeans
  for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    labels = kmeans.fit_predict(X)
    results.append(('Kmeans', n_clusters, silhouette_score(X, labels)))

  #agglomerative
  for n_clusters in cluster_range:
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    labels = agglomerative.fit_predict(X)
    results.append(('Aglomerative', n_clusters, silhouette_score(X, labels)))

  #DBSCAN
  if eps_values is None:
    # Rounding strips the float drift arange accumulates (e.g. 0.30000000000000004)
    # so the reported eps values are the intended 0.1 ... 0.8.
    eps_values = np.round(np.arange(0.1, 0.9, 0.1), 1)
  X_rows = np.asarray(X)
  for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(X)
    # DBSCAN marks noise with the label -1. Noise is not a cluster, so it must
    # neither count towards the "at least two clusters" requirement nor be
    # scored as if its points belonged together.
    clustered = labels != -1
    n_found = len(set(labels[clustered]))
    n_clustered = int(clustered.sum())
    # The silhouette coefficient needs at least 2 clusters and fewer clusters
    # than scored samples; skip fits that cannot be scored.
    if not 2 <= n_found < n_clustered:
      continue
    silhouette_avg = silhouette_score(X_rows[clustered], labels[clustered])
    results.append(('DBSCAN', float(eps), silhouette_avg))

  return results




In [50]:
# Benchmark the clustering algorithms on the standardised Iris measurements
iris = datasets.load_iris()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(iris.data)

# Try up to 10 clusters for the algorithms that take a cluster count.
# NOTE: for DBSCAN rows the second field returned by compare_algorithms is the
# eps radius, so the 'Number of Clusters' column holds eps on those rows.
result_columns = ['Algorithm', 'Number of Clusters', 'Score']
results = compare_algorithms(X=scaled_data, max_cluster=10)
df_results = pd.DataFrame(data=results, columns=result_columns)
df_results


Unnamed: 0,Algorithm,Number of Clusters,Score
0,Kmeans,2.0,0.58175
1,Kmeans,3.0,0.463042
2,Kmeans,4.0,0.415113
3,Kmeans,5.0,0.391247
4,Kmeans,6.0,0.328478
5,Kmeans,7.0,0.327694
6,Kmeans,8.0,0.335579
7,Kmeans,9.0,0.352575
8,Kmeans,10.0,0.361405
9,Aglomerative,2.0,0.577035


In [56]:
# idxmax yields the index label of the best-scoring row (not the score itself);
# that label is then used to pull the full row for display.
max_score = df_results['Score'].idxmax()
best_result = df_results.loc[max_score]
print(best_result)

Algorithm              Kmeans
Number of Clusters        2.0
Score                 0.58175
Name: 0, dtype: object
