### <span style="color:blue">Title: </span>  Calculating Silhouette Scores for Two and Three Clusters
### <span style="color:blue">Code Author:</span>  Jack Young
### <span style="color:blue">Date Created:</span>  11/24/2020

The aim of the following Python code is to replicate the silhouette scores s(i) reported in reference [1] below.  The target scores are shown in Figure 2 and Figure 3 of that reference for clusterings with k = 2 and k = 3, respectively.
### <span style="color:blue">Reference:</span>
[1] Peter J. Rousseeuw, *Silhouettes: A graphical aid to the interpretation and validation of cluster analysis*,
Journal of Computational and Applied Mathematics, Volume 20, 1987, Pages 53-65, ISSN 0377-0427,
https://doi.org/10.1016/0377-0427(87)90125-7.
(http://www.sciencedirect.com/science/article/pii/0377042787901257)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class NodeMgr:
    """Compute silhouette scores s(i) for a clustering from a pairwise distance table.

    Attributes:
        distanceDf: DataFrame of pairwise dissimilarities.  A distance between two
            nodes is looked up in both orientations (node as column, node as row)
            and NaN cells are ignored, so a lower-triangular table is sufficient.
        clusterDf: DataFrame with a 'node' column (node labels) and a 'cluster'
            column (cluster id assigned to each node).
        report: dict of parallel lists ('score', 'node', 'clusterId') filled by
            the most recent getScores() call.
    """

    def __init__(self, distanceDf, clusterDf):
        self.distanceDf = distanceDf
        self.clusterDf = clusterDf
        self.report = self._newReport()

    @staticmethod
    def _newReport():
        """Return an empty report container."""
        return {'score': [], 'node': [], 'clusterId': []}

    def getScores(self):
        """Calculate the silhouette score of every node and return the final report.

        For each element i the following quantities are derived:
          1) a(i): average dissimilarity of i to all other elements of its own cluster
          2) d(i, C): average dissimilarity of i to all elements of another cluster C
          3) b(i): the minimum of d(i, C) over all other clusters (nearest outside cluster)
          4) s(i): silhouette score => s(i) = [b(i) - a(i)] / max{a(i), b(i)}

        Returns:
            DataFrame with columns 'score', 'node' and 'clusterId', sorted by
            cluster id (ascending) and then by score (descending).

        Raises:
            ValueError: if all nodes belong to one single cluster, because b(i)
                is undefined without a second cluster.
        """
        # Start from an empty report so repeated calls do not accumulate duplicate rows.
        self.report = self._newReport()

        clusterIds = np.unique(self.clusterDf['cluster'])
        if clusterIds.shape[0] == 1:
            raise ValueError('Silhouette scores require at least two clusters')

        for currentClusterId in clusterIds:
            nodeNames = self.clusterDf[self.clusterDf['cluster'] == currentClusterId]['node']
            outsideClusterIds = clusterIds[clusterIds != currentClusterId]
            isSingleton = len(nodeNames) == 1
            for nodeName in nodeNames:
                if isSingleton:
                    # a(i) is undefined for a one-element cluster; by convention s(i) = 0.
                    sScore = 0.0
                else:
                    aScore = np.mean(self.getDistanceByClusterId(nodeName, currentClusterId))
                    bScore = min(
                        np.mean(self.getDistanceByClusterId(nodeName, outsideClusterId))
                        for outsideClusterId in outsideClusterIds
                    )
                    denominator = max(aScore, bScore)
                    # Guard against 0/0 when every involved distance is zero.
                    sScore = (bScore - aScore) / denominator if denominator != 0 else 0.0
                self.appendReport(sScore, nodeName, currentClusterId)
        return self.getFinalReport()

    def getDistanceByClusterId(self, nodeName, clusterId):
        """Collect the distances between one node and the members of one cluster.

        The distance table is searched in both orientations: the node's column
        against the cluster members' rows, and the node's row against the cluster
        members' columns.  NaN cells are dropped.

        Args:
            nodeName: label of the node whose distances are wanted.
            clusterId: id of the cluster whose members are the other endpoints.

        Returns:
            Series of the distances found.  If no distance is available, a
            one-element Series holding 0 is returned instead.
        """
        clusterNodes = set(self.clusterDf[self.clusterDf['cluster'] == clusterId]['node'])
        # Keep the table's own label order so the result is deterministic.
        indexClusterNodes = [n for n in self.distanceDf.index.unique() if n in clusterNodes]
        columnClusterNodes = [n for n in self.distanceDf.columns.unique() if n in clusterNodes]

        parts = []

        # Entries of the cluster members' rows that intersect the target node's column
        if nodeName in self.distanceDf.columns and indexClusterNodes:
            columnEntries = self.distanceDf.loc[indexClusterNodes, nodeName].dropna()
            if not columnEntries.empty:
                parts.append(columnEntries)

        # Entries of the cluster members' columns that intersect the target node's row
        if nodeName in self.distanceDf.index and columnClusterNodes:
            rowEntries = self.distanceDf.loc[nodeName, columnClusterNodes].dropna()
            if not rowEntries.empty:
                parts.append(rowEntries)

        # No distance found (e.g. the node is the only member of the cluster)
        if not parts:
            return pd.Series([0.0], dtype=np.float64)
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        return pd.concat(parts)

    def appendReport(self, score, nodeName, clusterId):
        """Append one result row to the report; the score is rounded to 2 decimals."""
        self.report['score'].append(np.round(score, 2))
        self.report['node'].append(nodeName)
        self.report['clusterId'].append(clusterId)

    def getFinalReport(self):
        """Return the report as a DataFrame sorted by cluster id, then by descending score."""
        finalDf = pd.DataFrame(self.report)
        finalDf = finalDf.sort_values(by=['clusterId', 'score'], ascending=[True, False])
        return finalDf

### <span style="color:blue">Source Proximity Table (values were extracted from reference [1])</span>

In [2]:
proxData = pd.DataFrame(data=np.array([
                       [5.58, np.nan, np.nan, np.nan,np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], 
                       [7.00,6.50, np.nan, np.nan, np.nan,np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], 
                       [7.08,7.00,3.83, np.nan, np.nan, np.nan,np.nan, np.nan, np.nan, np.nan,np.nan], 
                       [4.83,5.08,8.17,5.83, np.nan, np.nan, np.nan,np.nan, np.nan, np.nan,np.nan],
                       [2.17, 5.75, 6.67, 6.92, 4.92,np.nan, np.nan, np.nan,np.nan, np.nan,np.nan],
                       [6.42,5.00,5.58,6.00,4.67,6.42,np.nan, np.nan, np.nan,np.nan, np.nan],
                       [3.42,5.50,6.42,6.42,5.00,3.92,6.17,np.nan, np.nan, np.nan,np.nan],
                       [2.50,4.92,6.25,7.33,4.50,2.25,6.33,2.75,np.nan, np.nan, np.nan],
                       [6.08,6.67,4.25,2.67,6.00,6.17,6.17,6.92,6.17,np.nan, np.nan],
                       [5.25,6.83,4.50,3.75,5.75,5.42,6.08,5.83,6.67,3.67, np.nan],
                       [4.75,3.00,6.08,6.67,5.00,5.58,4.83,6.17,5.67,6.50,6.92]],
                      dtype=np.float64),
                        columns=['BEL', 'BRA', 'CHI', 'CUB', 'EGY','FRA','IND','ISR','USA','USS','YUG'],
                      index=['BRA', 'CHI', 'CUB', 'EGY','FRA','IND','ISR','USA','USS','YUG', 'ZAI'],)

print('SOURCE PROXIMITY TABLE:\n\n{}'.format(proxData))

SOURCE PROXIMITY TABLE:

      BEL   BRA   CHI   CUB   EGY   FRA   IND   ISR   USA   USS   YUG
BRA  5.58   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
CHI  7.00  6.50   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
CUB  7.08  7.00  3.83   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
EGY  4.83  5.08  8.17  5.83   NaN   NaN   NaN   NaN   NaN   NaN   NaN
FRA  2.17  5.75  6.67  6.92  4.92   NaN   NaN   NaN   NaN   NaN   NaN
IND  6.42  5.00  5.58  6.00  4.67  6.42   NaN   NaN   NaN   NaN   NaN
ISR  3.42  5.50  6.42  6.42  5.00  3.92  6.17   NaN   NaN   NaN   NaN
USA  2.50  4.92  6.25  7.33  4.50  2.25  6.33  2.75   NaN   NaN   NaN
USS  6.08  6.67  4.25  2.67  6.00  6.17  6.17  6.92  6.17   NaN   NaN
YUG  5.25  6.83  4.50  3.75  5.75  5.42  6.08  5.83  6.67  3.67   NaN
ZAI  4.75  3.00  6.08  6.67  5.00  5.58  4.83  6.17  5.67  6.50  6.92


###  <span style="color:blue">Calculate scores for two clusters</span>

In [3]:
# Two-cluster partition: cluster id -> member nodes, listed in report order.
twoClusterMembers = {
    1: ['BEL', 'BRA', 'EGY', 'FRA', 'ISR', 'USA', 'ZAI'],
    2: ['CHI', 'CUB', 'IND', 'USS', 'YUG'],
}
# Flatten the mapping into one (node, cluster) row per node.
clusters2 = pd.DataFrame(
    [(node, clusterId) for clusterId, members in twoClusterMembers.items() for node in members],
    columns=['node', 'cluster'],
)
print('******TWO-CLUSTER CONFIGURATION*****\n{}\n'.format(clusters2))

# Score every node of the two-cluster configuration against the proximity table.
obj2 = NodeMgr(proxData, clusters2)
dfScoresTwoClusters = obj2.getScores()

print('******Silhouette Scores for Two Clusters******\n{}'.format(dfScoresTwoClusters))

******TWO-CLUSTER CONFIGURATION*****
   node  cluster
0   BEL        1
1   BRA        1
2   EGY        1
3   FRA        1
4   ISR        1
5   USA        1
6   ZAI        1
7   CHI        2
8   CUB        2
9   IND        2
10  USS        2
11  YUG        2

******Silhouette Scores for Two Clusters******
    score node  clusterId
5    0.43  USA          1
0    0.39  BEL          1
3    0.35  FRA          1
4    0.30  ISR          1
1    0.22  BRA          1
2    0.20  EGY          1
6    0.19  ZAI          1
8    0.40  CUB          2
10   0.34  USS          2
7    0.33  CHI          2
11   0.26  YUG          2
9   -0.04  IND          2


###  <span style="color:blue">Calculate scores for three clusters</span>

In [4]:
# Three-cluster partition: cluster id -> member nodes, listed in report order.
threeClusterMembers = {
    1: ['BEL', 'EGY', 'FRA', 'ISR', 'USA'],
    2: ['BRA', 'IND', 'ZAI'],
    3: ['CHI', 'CUB', 'USS', 'YUG'],
}
# Flatten the mapping into one (node, cluster) row per node.
clusters3 = pd.DataFrame(
    [(node, clusterId) for clusterId, members in threeClusterMembers.items() for node in members],
    columns=['node', 'cluster'],
)
print('******THREE-CLUSTER CONFIGURATION*****\n{}\n'.format(clusters3))

# Score every node of the three-cluster configuration against the proximity table.
obj3 = NodeMgr(proxData, clusters3)
dfScoresThreeClusters = obj3.getScores()

print('******Silhouette Scores for Three Clusters******\n{}'.format(dfScoresThreeClusters))

******THREE-CLUSTER CONFIGURATION*****
   node  cluster
0   BEL        1
1   EGY        1
2   FRA        1
3   ISR        1
4   USA        1
5   BRA        2
6   IND        2
7   ZAI        2
8   CHI        3
9   CUB        3
10  USS        3
11  YUG        3

******Silhouette Scores for Three Clusters******
    score node  clusterId
4    0.47  USA          1
2    0.44  FRA          1
0    0.42  BEL          1
3    0.37  ISR          1
1    0.02  EGY          1
7    0.28  ZAI          2
5    0.25  BRA          2
6    0.17  IND          2
9    0.48  CUB          3
10   0.44  USS          3
8    0.31  CHI          3
11   0.31  YUG          3
