# Bank Dataset Clustering Algorithms Comparison

This notebook compares 10 clustering models.
 
<br>
<br>
<font color = 'blue'>
<b>Content: </b>

1. Prepare Problems
    * [Load Libraries](#2)
    * [Load Dataset](#3)    
1. Models
    * [K-Means](#4)
    * [Affinity Propagation](#5)
    * [BIRCH](#6)
    * [DBSCAN](#7)
    * [Mini Batch K-Means](#8)
    * [Mean Shift](#9)
    * [OPTICS](#10)
    * [Spectral Clustering](#11)
    * [Gaussian Mixture Model](#12)
    * [Agglomerative Clustering](#13)
1. [References](#14)

<a id = "2"></a><br>
## Load Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import unique
from numpy import where
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

<a id = "3"></a><br>
## Load Dataset

In [None]:
# Read the segmentation CSV; its first column is used as the row index.
data = pd.read_csv(
    filepath_or_buffer='../data/segmentacion.csv',
    index_col=0,
)
# Preview the first rows (shown as the cell output).
data.head()

In [None]:
# Remove the TotAmt and TotTrans columns from `data` in place, one call each.
data.drop(columns=['TotAmt'], inplace=True)
data.drop(columns=['TotTrans'], inplace=True)

# Preview the remaining columns.
data.head()

In [None]:
from sklearn.decomposition import PCA

# Project the feature table onto its first three principal components.
# The resulting frame supplies the coordinates used by the scatter plots.
pca_bank = PCA(n_components=3)
principalComponents_bank = pca_bank.fit_transform(data)
principal_bank_Df = pd.DataFrame(
    principalComponents_bank,
    columns=[
        'principal component 1',
        'principal component 2',
        'principal component 3',
    ],
)
principal_bank_Df.head()

<a id = "4"></a><br>
## 1 - K-Means 

In [None]:
# Fit K-Means with 5 clusters on the feature table.
# random_state is pinned (as the elbow-plot loop already does) so the labels,
# centres, plot and silhouette score are reproducible from run to run.
k_means = KMeans(n_clusters=5, random_state=0)
k_means.fit(data)

### Labels

In [None]:
# Distinct cluster ids assigned by the fitted K-Means model.
unique(k_means.labels_)

In [None]:
# Cluster centres of the fitted K-Means model. The model was fitted on `data`,
# so each row is a centre expressed in the columns of `data` (not in PCA space).
centers = k_means.cluster_centers_
centers

In [None]:
# Scatter the samples on the first two principal components, coloured by
# their K-Means label.
plt.figure(figsize=(16, 9))
plt.scatter(principal_bank_Df['principal component 1'],
            principal_bank_Df['principal component 2'],
            c=k_means.labels_, s=100)
# The centres are expressed in the original feature columns, so they must be
# projected with the same fitted PCA before being drawn on the CP1/CP2 axes.
# Plotting their first two raw feature values would misplace them.
centers_pca = pca_bank.transform(pd.DataFrame(centers, columns=data.columns))
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], color='blue', marker='s', s=200)
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.title('K-Means with 5 clusters')
plt.ylim(-1.2, 1)
plt.xlim(-1, 1.5)
plt.grid()
plt.show()

In [None]:
# Elbow analysis: collect the K-Means inertia for k = 4 .. 14 and plot it
# against the number of clusters.
cluster_counts = range(4, 15)
wscc = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init="k-means++", random_state=0).fit(data)
    wscc.append(kmeans.inertia_)

plt.plot(cluster_counts, wscc, marker="*", c="black")
plt.title("Elbow plot for optimal number of clusters")

### KMeans clustering with 10 clusters

In [None]:
# Refit K-Means with 10 clusters.
# random_state is pinned (as the elbow-plot loop already does) so the labels,
# centres, plot and silhouette score are reproducible from run to run.
k_means = KMeans(n_clusters=10, random_state=0)
k_means.fit(data)

In [None]:
# Distinct cluster ids assigned by the 10-cluster K-Means model.
unique(k_means.labels_)

In [None]:
# Centres of the refitted K-Means model, expressed in the columns of `data`
# (the model was fitted on `data`, not on the PCA projection).
centers = k_means.cluster_centers_

Displaying Data in 10 clusters 

In [None]:
# Scatter the samples on the first two principal components, coloured by
# their K-Means label.
plt.figure(figsize=(16, 9))
plt.scatter(principal_bank_Df['principal component 1'],
            principal_bank_Df['principal component 2'],
            c=k_means.labels_, s=100)
# The centres are expressed in the original feature columns, so they must be
# projected with the same fitted PCA before being drawn on the CP1/CP2 axes.
# Plotting their first two raw feature values would misplace them.
centers_pca = pca_bank.transform(pd.DataFrame(centers, columns=data.columns))
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], color='blue', marker='s', s=200)
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.title('K-Means with 10 clusters')
plt.ylim(-1.2, 1)
plt.xlim(-1, 1.5)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the K-Means labelling over the feature table.
score_kmeans = metrics.silhouette_score(X=data, labels=k_means.labels_)

print("Score of K-Means = ", score_kmeans)

<a id = "5"></a><br>
## 2 - Affinity Propagation
Affinity Propagation involves finding a set of exemplars that best summarize the data.

In [None]:
# Fit Affinity Propagation (damping 0.95) on the feature table.
model_aff = AffinityPropagation(damping=0.95).fit(data)
# Training labels and exemplar coordinates exposed by the fitted model.
labels_aff = model_aff.labels_
centroids_aff = model_aff.cluster_centers_
# Pass the training data back through predict() and list the distinct ids.
yhat_aff = model_aff.predict(data)
clusters_aff = np.unique(yhat_aff)
print("Clusters of Affinity Prop.",clusters_aff)

In [None]:
# Scatter the samples on the first two principal components, coloured by
# their Affinity Propagation label.
plt.figure(figsize=(16, 9))

plt.scatter(principal_bank_Df['principal component 1'],
            principal_bank_Df['principal component 2'],
            c=labels_aff, s=100)
# The exemplars are rows in the original feature columns, so they must be
# projected with the same fitted PCA before being drawn on the CP1/CP2 axes.
# An empty exemplar array (possible if the fit did not converge) is skipped,
# because PCA.transform rejects zero-row input.
if len(centroids_aff):
    centroids_aff_pca = pca_bank.transform(
        pd.DataFrame(centroids_aff, columns=data.columns))
    plt.scatter(centroids_aff_pca[:, 0], centroids_aff_pca[:, 1],
                color='red', marker='*', s=200)
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.title('Affinity Propagation')
plt.ylim(-1.2, 1)
plt.xlim(-1, 1.5)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the Affinity Propagation labelling.
score_aff = metrics.silhouette_score(X=data, labels=labels_aff)

print("Score of Affinity Propagation = ", score_aff)

<a id = "6"></a><br>
## 3 - BIRCH
BIRCH Clustering (BIRCH is short for Balanced Iterative Reducing and Clustering using
Hierarchies) involves constructing a tree structure from which cluster centroids are extracted.

In [None]:
# Build a BIRCH model (threshold 0.01, 10 final clusters) and fit it.
model_br = Birch(threshold=0.01, n_clusters=10).fit(data)
labels_br = model_br.labels_
# Pass the training data back through predict() and list the distinct ids.
yhat_br = model_br.predict(data)
clusters_br = np.unique(yhat_br)
print("Clusters of Birch",clusters_br)

In [None]:
# Draw the BIRCH labelling on the first two principal components.
plt.figure(figsize=(16, 9))
plt.scatter(
    x=principal_bank_Df['principal component 1'],
    y=principal_bank_Df['principal component 2'],
    s=100,
    c=model_br.labels_,
)
plt.title('BIRCH with 10 clusters')
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.xlim(-1, 1.5)
plt.ylim(-1.2, 1)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the BIRCH labelling.
score_br = metrics.silhouette_score(X=data, labels=labels_br)

print("Score of Birch = ", score_br)

<a id = "7"></a><br>
## 4 - DBSCAN

DBSCAN Clustering (where DBSCAN is short for Density-Based Spatial Clustering of Applications with Noise) involves finding high-density areas in the domain and expanding those areas of the feature space around them as clusters.

In [None]:
# dbscan clustering
from numpy import where
from numpy import unique

# Feature matrix for DBSCAN: the first 23 columns of `data` as a NumPy array.
# NOTE(review): 23 is hard-coded, presumably the full column count. Confirm,
# because the later silhouette score is computed on all of `data`.
data_X = data.iloc[:, :23].values

In [None]:
# Configure DBSCAN (neighbourhood radius 0.7, at least 90 samples per core point).
model = DBSCAN(min_samples=90, eps=0.7)
# Fit on the feature matrix and take the per-sample labels in one step.
yhat = model.fit_predict(X=data_X)
# Distinct label values that were produced.
clusters = np.unique(yhat)

In [None]:
# Draw the DBSCAN labelling on the first two principal components.
plt.figure(figsize=(16, 9))
plt.scatter(
    x=principal_bank_Df['principal component 1'],
    y=principal_bank_Df['principal component 2'],
    s=100,
    c=yhat,
)
plt.title('DBSCAN Clustering')
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.xlim(-1, 1.5)
plt.ylim(-1.2, 1)
plt.grid()
plt.show()

In [None]:
# silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
# ValueError otherwise. DBSCAN chooses the number of labels itself and can
# return a single one (e.g. every sample flagged as noise, -1). In that case
# the score is recorded as NaN instead of aborting the notebook.
n_labels_dbs = len(np.unique(yhat))
if 2 <= n_labels_dbs < len(yhat):
    score_dbs = metrics.silhouette_score(data, yhat)
else:
    score_dbs = float("nan")
    print("Silhouette score undefined: DBSCAN produced", n_labels_dbs, "distinct label(s)")

print("Score of DBSCAN = ", score_dbs)

<a id = "8"></a><br>
## 5 - Mini Batch K-Means

Mini-Batch K-Means is a modified version of k-means that makes updates to the cluster centroids using mini-batches of samples rather than the entire dataset, which can make it faster for large datasets, and perhaps more robust to statistical noise.

In [None]:
# Fit Mini Batch K-Means with 10 clusters.
# random_state is pinned (as the Mini Batch elbow loop already does) so the
# labels and centres are reproducible from run to run.
model_mini = MiniBatchKMeans(n_clusters=10, random_state=0)
model_mini.fit(data)
# Pass the training data back through predict() and list the distinct ids.
yhat_mini = model_mini.predict(data)
clusters_mini = unique(yhat_mini)
print("Clusters of Mini Batch KMeans.",clusters_mini)
# Training labels and centres (in the columns of `data`) from the fitted model.
labels_mini = model_mini.labels_
centroids_mini = model_mini.cluster_centers_

In [None]:
# Elbow analysis for Mini Batch K-Means: collect the inertia for k = 4 .. 14
# and plot it against the number of clusters.
cluster_counts = range(4, 15)
wscc = []
for i in cluster_counts:
    mkmeans = MiniBatchKMeans(n_clusters=i, init="k-means++", random_state=0).fit(data)
    wscc.append(mkmeans.inertia_)

plt.plot(cluster_counts, wscc, marker="*", c="black")
plt.title("Elbow plot for Mini Batch KMeans")

In [None]:
# Refit Mini Batch K-Means with 9 clusters.
# random_state is pinned (as the Mini Batch elbow loop already does) so the
# labels, centres, plot and silhouette score are reproducible from run to run.
model_mini = MiniBatchKMeans(n_clusters=9, random_state=0)
model_mini.fit(data)
# Pass the training data back through predict() and list the distinct ids.
yhat_mini = model_mini.predict(data)
clusters_mini = unique(yhat_mini)
print("Clusters of Mini Batch KMeans.",clusters_mini)
# Training labels and centres (in the columns of `data`) from the fitted model.
labels_mini = model_mini.labels_
centroids_mini = model_mini.cluster_centers_

In [None]:
# Scatter the samples on the first two principal components, coloured by
# their Mini Batch K-Means label.
plt.figure(figsize=(16, 9))
plt.scatter(principal_bank_Df['principal component 1'],
            principal_bank_Df['principal component 2'],
            c=labels_mini, s=100)
# The centres are expressed in the original feature columns, so they must be
# projected with the same fitted PCA before being drawn on the CP1/CP2 axes.
# Plotting their first two raw feature values would misplace them.
centroids_mini_pca = pca_bank.transform(
    pd.DataFrame(centroids_mini, columns=data.columns))
plt.scatter(centroids_mini_pca[:, 0], centroids_mini_pca[:, 1],
            color='red', marker='*', s=200)
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.title('Mini Batch KMeans')
plt.ylim(-1.2, 1)
plt.xlim(-1, 1.5)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the Mini Batch K-Means labelling.
score_mini = metrics.silhouette_score(X=data, labels=labels_mini)

print("Score of Mini Batch = ", score_mini)

<a id = "9"></a><br>
## 6 - Mean Shift

Mean shift clustering involves finding and adapting centroids based on the density of examples in the feature space.

In [None]:
# Fit Mean Shift with a fixed bandwidth of 25 on the feature table.
model_ms = MeanShift(bandwidth=25).fit(data)
# Training labels and centre coordinates exposed by the fitted model.
labels_ms = model_ms.labels_
centroids_ms = model_ms.cluster_centers_
# Pass the training data back through predict() and list the distinct ids.
yhat_ms = model_ms.predict(data)
clusters_ms = np.unique(yhat_ms)
print("Clusters of Mean Shift.",clusters_ms)

In [None]:
# Scatter the samples on the first two principal components, coloured by
# their Mean Shift label.
plt.figure(figsize=(16, 9))
plt.scatter(principal_bank_Df['principal component 1'],
            principal_bank_Df['principal component 2'],
            c=labels_ms, s=100)
# The centres are expressed in the original feature columns, so they must be
# projected with the same fitted PCA before being drawn on the CP1/CP2 axes.
# Plotting their first two raw feature values would misplace them.
centroids_ms_pca = pca_bank.transform(
    pd.DataFrame(centroids_ms, columns=data.columns))
plt.scatter(centroids_ms_pca[:, 0], centroids_ms_pca[:, 1],
            color='red', marker='*', s=200)
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.title('Mean Shift')
plt.ylim(-1.2, 1)
plt.xlim(-1, 1.5)
plt.grid()
plt.show()

In [None]:
# silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
# ValueError otherwise. Mean Shift chooses the number of clusters from the
# bandwidth and can collapse everything into one cluster. In that case the
# score is recorded as NaN instead of aborting the notebook.
n_labels_ms = len(np.unique(labels_ms))
if 2 <= n_labels_ms < len(labels_ms):
    score_ms = metrics.silhouette_score(data, labels_ms)
else:
    score_ms = float("nan")
    print("Silhouette score undefined: Mean Shift produced", n_labels_ms, "distinct label(s)")

print("Score of Mean Shift = ", score_ms)

<a id = "10"></a><br>
## 7 - OPTICS

OPTICS clustering (where OPTICS is short for Ordering Points To Identify the Clustering Structure) is a modified version of DBSCAN described above.


In [None]:
# Configure OPTICS with at least 10 samples per neighbourhood.
# NOTE(review): the scikit-learn docs say `eps` is used only when
# cluster_method='dbscan'. With the default 'xi' method it appears to have
# no effect. Confirm which behaviour is intended.
model_op = OPTICS(min_samples=10, eps=0.8)
# Fit and take the per-sample labels in one step.
yhat_op = model_op.fit_predict(data)
labels_op = model_op.labels_
clusters_op = np.unique(yhat_op)
print("Clusters OPTICS.",clusters_op)

In [None]:
# silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
# ValueError otherwise. OPTICS chooses the number of labels itself, so a
# degenerate result is recorded as NaN instead of aborting the notebook.
n_labels_op = len(np.unique(labels_op))
if 2 <= n_labels_op < len(labels_op):
    score_op = metrics.silhouette_score(data, labels_op)
else:
    score_op = float("nan")
    print("Silhouette score undefined: OPTICS produced", n_labels_op, "distinct label(s)")

# The message previously said "Mean Shift" (copy-paste slip); this is the OPTICS score.
print("Score of OPTICS = ", score_op)

<a id = "11"></a><br>
## 8 - Spectral Clustering

Spectral Clustering is a general class of clustering methods, drawn from linear algebra.

In [None]:
# Spectral clustering into 10 clusters.
# random_state is pinned so the labels, and therefore the silhouette score
# compared against the other models, are reproducible from run to run.
model_sc = SpectralClustering(n_clusters=10, random_state=0)
# Fit and take the per-sample labels in one step.
yhat_sc = model_sc.fit_predict(data)
clusters_sc = unique(yhat_sc)
print("Clusters Spectral",clusters_sc)
labels_sc = model_sc.labels_

In [None]:
# Draw the spectral-clustering labelling on the first two principal components.
plt.figure(figsize=(16, 9))
plt.scatter(
    x=principal_bank_Df['principal component 1'],
    y=principal_bank_Df['principal component 2'],
    s=100,
    c=labels_sc,
)
plt.title('Spectral Clustering')
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.xlim(-1, 1.5)
plt.ylim(-1.2, 1)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the spectral-clustering labelling.
score_sc = metrics.silhouette_score(X=data, labels=labels_sc)

print("Score of Spectral = ", score_sc)

<a id = "12"></a><br>
## 9 - Gaussian Mixture Model

A Gaussian mixture model summarizes a multivariate probability density function with a mixture of Gaussian probability distributions, as its name suggests.

In [None]:
# Gaussian mixture with 10 components.
# random_state is pinned so the fitted components, and therefore the
# silhouette score compared against the other models, are reproducible.
model_gb = GaussianMixture(n_components=10, random_state=0)
model_gb.fit(data)
# Assign each sample to a component and list the distinct component ids.
yhat_gb = model_gb.predict(data)
clusters_gb = unique(yhat_gb)

In [None]:
# Draw the Gaussian-mixture assignment on the first two principal components.
plt.figure(figsize=(16, 9))
plt.scatter(
    x=principal_bank_Df['principal component 1'],
    y=principal_bank_Df['principal component 2'],
    s=100,
    c=yhat_gb,
)
plt.title('Gaussian Mixture Model')
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.xlim(-1, 1.5)
plt.ylim(-1.2, 1)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the Gaussian-mixture assignment.
score_gmm = metrics.silhouette_score(X=data, labels=yhat_gb)

print("Score of GMM = ", score_gmm)

<a id = "13"></a><br>
## 10 - Agglomerative Clustering

Agglomerative clustering involves merging examples until the desired number of clusters is achieved.

In [None]:
# Agglomerative clustering stopped at 10 clusters.
model_agg = AgglomerativeClustering(n_clusters=10)
# Fit and take the per-sample labels in one step.
yhat_agg = model_agg.fit_predict(data)
labels_agg = model_agg.labels_
clusters_agg = np.unique(yhat_agg)
print("Clusters of Agglomerative Clustering",clusters_agg)

In [None]:
# Draw the agglomerative labelling on the first two principal components.
plt.figure(figsize=(16, 9))
plt.scatter(
    x=principal_bank_Df['principal component 1'],
    y=principal_bank_Df['principal component 2'],
    s=100,
    c=labels_agg,
)
plt.title('Agglomerative Clustering')
plt.xlabel('CP1')
plt.ylabel('CP2')
plt.xlim(-1, 1.5)
plt.ylim(-1.2, 1)
plt.grid()
plt.show()

In [None]:
# Mean silhouette coefficient of the agglomerative labelling.
score_agg = metrics.silhouette_score(X=data, labels=labels_agg)

print("Score of Agglomerative Clustering = ", score_agg)

<a id = "14"></a><br>
## References

* https://machinelearningmastery.com/clustering-algorithms-with-python/