In [1]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sn
import sys
import json
import os

In [18]:
def silhou_eva(X_scaled, outdir, cluster_range=range(2, 51), random_state=10):
    """Run a KMeans silhouette analysis over a range of cluster counts.

    For every ``n_clusters`` in ``cluster_range`` a KMeans model is fitted on
    ``X_scaled``, the average silhouette score is printed, and a two-panel
    figure (per-cluster silhouette plot + scatter of the first two features
    with the cluster centers) is saved to
    ``<outdir>/sihouette_<n_clusters>_cluster.png``.  After the loop the
    ``{n_clusters: average score}`` mapping is written to
    ``<outdir>/silhouette_score.json``.

    Parameters
    ----------
    X_scaled : 2-D array indexable as ``X_scaled[:, 0]`` / ``X_scaled[:, 1]``
        Samples to cluster.  Only the first two columns are drawn in the
        scatter panel, so at least two features are required.
    outdir : str
        Directory receiving the PNG files and the JSON summary.  It is
        created if it does not exist yet.
    cluster_range : iterable of int, optional
        Cluster counts to evaluate.  Defaults to 2..50 inclusive.
    random_state : int or None, optional
        Seed handed to ``KMeans`` so that repeated runs give the same
        clustering.  Pass ``None`` for a non-deterministic initialisation.

    Returns
    -------
    None
        Results are reported through stdout and the files in ``outdir``.
    """
    os.makedirs(outdir, exist_ok=True)

    # ``cm.spectral`` no longer exists in current Matplotlib; the same
    # colormap is available under the name ``nipy_spectral``.
    cmap = plt.get_cmap('nipy_spectral')

    res = {}
    for n_clusters in cluster_range:
        # Create a figure with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.
        # Silhouette coefficients lie in [-1, 1]; the x-axis is clipped to
        # [-0.1, 1] as in the original plot layout.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 inserts blank space between the silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X_scaled) + (n_clusters + 1) * 10])

        # Seeded clusterer so the scores and figures are reproducible.
        clusterer = KMeans(n_clusters=n_clusters, max_iter=1000,
                           random_state=random_state)
        cluster_labels = clusterer.fit_predict(X_scaled)

        # The silhouette_score gives the average value for all the samples.
        silhouette_avg = silhouette_score(X_scaled, cluster_labels,
                                          metric='euclidean')
        # Store a builtin float: numpy float32 scores are not JSON serializable.
        res[n_clusters] = float(silhouette_avg)
        print("For n_clusters =", n_clusters,
            "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X_scaled, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples assigned to cluster i
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cmap(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a 10-row gap before the next cluster's band
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed (first two features only)
        colors = cmap(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X_scaled[:, 0], X_scaled[:, 1], marker='.', s=30, lw=0,
                    alpha=0.7, c=colors)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')
        fig.savefig(os.path.join(outdir,
                                 'sihouette_%s_cluster.png' % n_clusters))
        # Release the figure right away; otherwise one figure per cluster
        # count stays alive until the end of the loop.
        plt.close(fig)

    plt.close('all')

    # The scores are plain floats now, so JSON serialization cannot fail on
    # numpy scalar types and the former plain-text fallback
    # (silhouette_score.txt) is no longer needed.  The context manager closes
    # the file even if writing raises.
    with open(os.path.join(outdir, 'silhouette_score.json'), 'w') as outfile:
        outfile.write(json.dumps(res))

In [16]:
##### without PCA
# Silhouette evaluation on the raw sentence vectors, first unscaled and then
# standardized.  Each run gets its own output directory.
X = np.load('../data/feature_extraction/sen_vector.npy')
outdir = '../data/silhou_evaluation_noPCA'
# os.makedirs is portable and does not fail when the cell is re-run.
os.makedirs(outdir, exist_ok=True)
print('Sihou evaluation unscaled processing...')
silhou_eva(X,outdir)
scaler = StandardScaler()
X_scaled = scaler.fit_transform( X )
# Write to the *_noPCA directory that is created for this run; the previous
# path ('../data/silhou_evaluation_scaled') is the one the PCA cell writes
# to, so the two runs overwrote each other's results.
outdir = '../data/silhou_evaluation_scaled_noPCA'
os.makedirs(outdir, exist_ok=True)
print('Sihou evaluation scaled processing...')
silhou_eva(X_scaled,outdir)

Sihou evaluation unscaled processing...
For n_clusters = 2 The average silhouette_score is : 0.32526234
For n_clusters = 3 The average silhouette_score is : 0.25989097
For n_clusters = 4 The average silhouette_score is : 0.27885577
Sihou evaluation scaled processing...
For n_clusters = 2 The average silhouette_score is : 0.28373754
For n_clusters = 3 The average silhouette_score is : 0.2626962
For n_clusters = 4 The average silhouette_score is : 0.252543


In [17]:
#### PCA
# Silhouette evaluation on the PCA-reduced sentence vectors, first unscaled
# and then standardized.
X = np.load('../data/PCA/sen_vector_PCA.npy')
outdir = '../data/silhou_evaluation'
# os.makedirs replaces the shell `mkdir`: it is portable, creates missing
# parents, and does not fail when the directory already exists.
os.makedirs(outdir, exist_ok=True)
print('Sihou evaluation unscaled processing...')
silhou_eva(X,outdir)
scaler = StandardScaler()
X_scaled = scaler.fit_transform( X )
outdir = '../data/silhou_evaluation_scaled'
os.makedirs(outdir, exist_ok=True)
print('Sihou evaluation scaled processing...')
silhou_eva(X_scaled,outdir)

Sihou evaluation unscaled processing...
For n_clusters = 2 The average silhouette_score is : 0.3603874584623517
For n_clusters = 3 The average silhouette_score is : 0.34561365705158714
For n_clusters = 4 The average silhouette_score is : 0.31255691739990427
Sihou evaluation scaled processing...
For n_clusters = 2 The average silhouette_score is : 0.15739322260632327
For n_clusters = 3 The average silhouette_score is : 0.1652766435055842
For n_clusters = 4 The average silhouette_score is : 0.16407890196160088
