# Calculate the Silhouette Score of a Dataset

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
np.random.seed(0)

In [12]:
# Import seed data
seeds = pd.read_csv('Seed_Data.csv')

In [13]:
# Seperate X features
X = seeds[['A','P','C','LK','WK','A_Coef','LKG']]

In [14]:
# K-means function
def k_means(X,K): 
    # Keep track of history so you can see K-Means in action 
    centroids_history = []
    labels_history = []
    rand_index = np.random.choice(X.shape[0], K) # Randomly selects K elements from the number of rows in X. 
    print(f"rand_index: {rand_index}")
    centroids = X[rand_index] # Creates an array of K rows randomly selected from X determined by rand_index 
    print(f"centroids: {centroids}")
    centroids_history.append(centroids) # Adds the new array to the history of centroids. 
    
    while True: 
        # Euclidean distance calculated for each point relative to centroids, then np.argmin returns 
        # index location of min distance = which cluster a point is assigned to
        # labels is an array where each element is the index of the nearest centroid for point in X
        labels = np.argmin(cdist(X,centroids), axis=1)
        print(f"labels: {labels}")
        labels_history.append(labels)
        # Take mean of points within clusters to find new centroids
        new_centroids = np.array([X[labels == i].mean(axis=0)
                                 for i in range(K)])
        print(f"new centroids: {new_centroids}")
        centroids_history.append(new_centroids)
        
        # If old centroids and new centroids no longer change, K-Means is complete and ends. 
        # Otherwise, continue
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
    return centroids, labels, centroids_history, labels_history

In [15]:
# Convert seeds X feature into a numpy matrix
X_mat = X.values

In [27]:
# Run k-means function on seeds matrix
centroids, labels, centroids_history, labels_hitory = k_means(X_mat, 3)

rand_index: [115 208 197]
centroids: [[19.06   16.45    0.8854  6.416   3.719   2.248   6.163 ]
 [11.84   13.21    0.8521  5.175   2.836   3.598   5.044 ]
 [13.37   13.78    0.8849  5.32    3.128   4.67    5.091 ]]
labels: [2 2 2 2 0 2 2 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 1 2 0 1 1 2 2 1 2 2 2 2 0 0
 0 2 2 2 1 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 2 2 2 1 0 0 0 0
 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 2 2 2 2 0 2 2 2 2 2 2 2 1 1 1 2
 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 2 1 1 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 2 1 2
 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 1 2 1 1 1 2 1 2]
new centroids: [[18.49402985 16.19149254  0.8855403   6.17091045  3.70186567  3.37928507
   6.00156716]
 [11.82672131 13.2057377   0.85139016  5.20234426  2.84908197  3.80832951
   5.03580328]
 [14.1152439  14.23256098  0.87370366  5.50241463  3.20107317  3.88197561
   5.20007317]]
labels: [2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 0 2 1 2 

In [28]:
# Calculate the Silhouette Score for the Area and Length of the Kernel
silhouette_score(X[['A','LK']], labels)

0.5807182569954473

## Finding the appropriate K

## Elbow Method

In [None]:
# Pick a range of candidate values (e.g. 1 - 10)

# Calculate the average distance from centroids for each value

# Plot and find the "elbow", the point where there is a sharp drop in the graph. 

## Silhouette Method
### How well the clustering algorithm performed

In [None]:
# Pick a range of candidate values (e.g. 1 - 10)

# Plot silhouettes for each value of K

# Ideal value for Silhouette is 1, Worst value is -1 

# For any point i, calculate silhouette coefficient

# Find a(i) and b(i). Ideally a(i) < b(i). If a(i) > b(i), it is misclassified. 

# For any point i

# s(i) = b(i) - a(i) / max(b(i), a(i))