### Perform principal component analysis (PCA) and cluster the data using the first 3 principal component scores (both hierarchical and k-means clustering, using a scree plot or elbow curve). Obtain the optimum number of clusters and check whether it matches the number of clusters in the original data (the class column, which we ignored at the beginning, shows that there are 3 clusters).

In [None]:
# Cell 1: Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score as sil, calinski_harabasz_score as chs, silhouette_samples

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cell 2: Import Dataset
# Read the wine measurements from the CSV file in the working directory
# and preview the first rows.
csv_path = 'wine.csv'
wine_data = pd.read_csv(csv_path)
wine_data.head()

In [None]:
# Cell 3: Data Understanding
# Inspect the class column: which labels exist and how many samples each has.
print(wine_data.Type.unique())
print(wine_data.Type.value_counts())

# Add a 1-based sample identifier. The range is derived from the frame length
# instead of a hard-coded row count, so a CSV with a different number of rows
# does not raise a length-mismatch error.
wine_data['ID'] = range(1, len(wine_data) + 1)

# Summary statistics, schema, missing values and skewness of every column.
print(wine_data.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
wine_data.info()
print(wine_data.isnull().sum())
print(wine_data.skew())

In [None]:
# Cell 4: Exploratory Data Analysis (EDA)
# Switch seaborn to a dark theme with a large square canvas, then draw the
# histograms of the data frame's columns with 20 bins each.
figure_defaults = {'figure.figsize': (14, 14)}
sns.set(style='dark', font_scale=1.3, rc=figure_defaults)
ax = wine_data.hist(bins=20, color='blue')

In [None]:
# Cell 5: Outliers Detection
# Draw one horizontal boxplot per measurement, stacked vertically, so that
# outliers in each feature can be spotted at a glance.
outlier = wine_data.copy()

# Features to plot, in the order of the subplot rows.
feature_names = [
    'Alcohol', 'Malic', 'Ash', 'Alcalinity', 'Magnesium', 'Phenols',
    'Flavanoids', 'Nonflavanoids', 'Proanthocyanins', 'Color', 'Hue',
    'Dilution', 'Proline',
]

fig, axes = plt.subplots(len(feature_names), 1, figsize=(13, 22), sharex=False, sharey=False)
for axis, feature in zip(axes, feature_names):
    sns.boxplot(x=feature, data=outlier, palette='crest', ax=axis)
plt.tight_layout(pad=2.0)

In [None]:
# Cell 6: Correlation Heatmap
# Pairwise correlations of all columns, annotated to one decimal place.
corr_matrix = wine_data.corr()
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

# Scatter of two features against each other (Phenols vs. Flavanoids).
sns.set(style='white', rc={'figure.figsize': (9, 6)}, font_scale=1.1)
axis_font = {'fontweight': 'bold', 'fontsize': 16}
phenols = wine_data['Phenols']
flavanoids = wine_data['Flavanoids']
plt.scatter(x=phenols, y=flavanoids, color='blue', lw=0.1)
plt.xlabel('Phenols', **axis_font)
plt.ylabel('Flavanoids', **axis_font)
plt.title('Data represented by Positively Correlated Features', fontweight='bold', fontsize=18)
plt.show()

In [None]:
# Cell 7: Data Preprocessing
# Use the sample ID as the index. wine_df keeps the 'Type' column because later
# cells attach cluster labels to it and compare them against the true classes.
wine_df = wine_data.set_index('ID')

# Feature matrix for the unsupervised steps. The ID row counter (now the index)
# and the 'Type' class label are excluded: scaling the raw wine_data frame would
# feed both into PCA and clustering, leaking the very label the clusters are
# later compared against.
wine_features = wine_df.drop(columns=['Type'])

# Zero-mean / unit-variance version of the features.
standard_scaler = StandardScaler()
std_wine = standard_scaler.fit_transform(wine_features)

# Version rescaled to the [0, 1] range (MinMaxScaler default).
minmax = MinMaxScaler()
norm_wine = minmax.fit_transform(wine_features)

In [None]:
# Cell 8: PCA (Principal Component Analysis) on Standard Scaled Dataset
# Fit an unrestricted PCA first so the variance captured by every component
# can be inspected before choosing how many components to keep.
pca_var = PCA()
pca_var.fit(std_wine)

# Text styling shared by the plots in this cell.
axis_font = {'fontweight': 'bold', 'fontsize': 16}
title_font = {'fontweight': 'bold', 'fontsize': 18}

# Per-component explained-variance ratio and its running total.
var = pca_var.explained_variance_ratio_
xi = np.arange(1, 1 + std_wine.shape[1], step=1)
yi = np.cumsum(var)

# Cumulative curve, with a red reference line at 1.0 (all variance explained).
plt.figure(figsize=(10, 5))
plt.plot(xi, yi, marker='o', linestyle='--', color='b')
plt.ylim(0.0, 1.1)
plt.xlabel('Number of Components', **axis_font)
plt.xticks(xi)
plt.ylabel('Cumulative variance (%)', **axis_font)
plt.title('Explained variance by each component', **title_font)
plt.axhline(y=1, color='r', linestyle='-')
plt.gca().xaxis.grid(False)

# Individual ratios as bars on the same axes; these labels replace the ones
# set for the cumulative curve above.
plt.bar(range(1, len(var) + 1), var)
plt.xlabel('Number of Components', **axis_font)
plt.ylabel('variance (%)', **axis_font)
plt.title('Explained variance by each component', **title_font)
plt.show()

# Cumulative variance in percent (ratios rounded to 4 decimals first).
var1 = np.cumsum(np.round(var, decimals=4) * 100)

# Keep only the first three principal components for clustering.
pca = PCA(n_components=3)
pca_std = pca.fit_transform(std_wine)

# Component scores as a data frame with named columns.
pca_std_wine = pd.DataFrame(data=pca_std, columns=['PC1', 'PC2', 'PC3'])

# Samples projected onto the first two components.
pc1_scores = pca_std_wine.iloc[:, 0]
pc2_scores = pca_std_wine.iloc[:, 1]
plt.figure(figsize=(9, 6))
plt.scatter(pc1_scores, pc2_scores, s=40)
plt.title('PCA plot in 2D using Strongest Principle Components', **title_font)
plt.xlabel('PC1', **axis_font)
plt.ylabel('PC2', **axis_font)
plt.show()

In [None]:
# Cell 9: PCA on MinMax Scaled Dataset
# Same procedure as for the standard-scaled data: inspect the variance of all
# components, then keep the first three.
pca_var = PCA()
pca_var.fit(norm_wine)

# Text styling shared by the plots in this cell.
axis_font = {'fontweight': 'bold', 'fontsize': 16}
title_font = {'fontweight': 'bold', 'fontsize': 18}

# Per-component explained-variance ratio and its running total.
var = pca_var.explained_variance_ratio_
xi = np.arange(1, 1 + norm_wine.shape[1], step=1)
yi = np.cumsum(var)

# Cumulative curve, with a red reference line at 1.0 (all variance explained).
plt.figure(figsize=(10, 5))
plt.plot(xi, yi, marker='o', linestyle='--', color='b')
plt.ylim(0.0, 1.1)
plt.xlabel('Number of Components', **axis_font)
plt.xticks(xi)
plt.ylabel('Cumulative variance (%)', **axis_font)
plt.title('Explained variance by each component', **title_font)
plt.axhline(y=1, color='r', linestyle='-')
plt.gca().xaxis.grid(False)

# Individual ratios as bars on the same axes; these labels replace the ones
# set for the cumulative curve above.
plt.bar(range(1, len(var) + 1), var)
plt.xlabel('Number of Components', fontsize=16)
plt.ylabel('variance (%)', **axis_font)
plt.title('Explained variance by each component', **title_font)
plt.show()

# Cumulative variance in percent (ratios rounded to 4 decimals first).
var1 = np.cumsum(np.round(var, decimals=4) * 100)

# Keep only the first three principal components for clustering.
pca = PCA(n_components=3)
pca_norm = pca.fit_transform(norm_wine)

# Component scores as a data frame with named columns.
pca_norm_wine = pd.DataFrame(data=pca_norm, columns=['PC1', 'PC2', 'PC3'])

# Samples projected onto the first two components.
pc1_scores = pca_norm_wine.iloc[:, 0]
pc2_scores = pca_norm_wine.iloc[:, 1]
plt.figure(figsize=(9, 6))
plt.scatter(pc1_scores, pc2_scores, s=40)
plt.title('PCA plot in 2D using Strongest Principle Components', **title_font)
plt.xlabel('PC1', **axis_font)
plt.ylabel('PC2', **axis_font)
plt.show()

In [None]:
# Cell 10: KMeans Clustering - Elbow Method for PCA Standard Scaled Dataset
# For k = 2..9, fit KMeans once and record both the inertia (within-cluster
# sum of squares) and the mean silhouette score. Both metrics come from the
# same fitted model, so a single pass replaces two identical fitting loops.
inertia = []
silhouette = {}
for i in tqdm(range(2, 10)):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=15, max_iter=500, random_state=17)
    kmeans.fit(pca_std_wine)
    inertia.append(kmeans.inertia_)
    silhouette[i] = sil(pca_std_wine, kmeans.labels_, metric='euclidean')

sns.set(style="darkgrid", rc={'figure.figsize': (14, 6)}, font_scale=2)

# Left: inertia against k (x starts at 2 because k=1 was not fitted).
plt.subplot(1, 2, 1)
plt.plot(range(2, len(inertia) + 2), inertia, marker="*", lw=2, color="skyblue", ms=10)
plt.xlabel("Number of clusters", fontweight='bold', fontsize=16)
plt.title("K-Means Inertia", fontweight='bold', fontsize=18)

# Right: silhouette score per k; bars sit at positions 0..n-1 and are
# relabelled with the actual k values.
plt.subplot(1, 2, 2)
plt.bar(range(len(silhouette)), list(silhouette.values()), align='center', edgecolor="black", lw=2, color="skyblue")
plt.xticks(range(len(silhouette)), list(silhouette.keys()))
plt.xlabel("Number of clusters", fontweight='bold', fontsize=16)
plt.title("Silhouette Score", fontweight='bold', fontsize=18)
plt.show()

model = KMeans(random_state=10, max_iter=500, init='k-means++')

# Instantiate the KElbowVisualizer with the number of clusters and the metric
visualizer = KElbowVisualizer(model, k=(2, 20), metric='silhouette', timings=False)
# Fit the data and visualize
print('Elbow Plot for Standard Scaler data')
visualizer.fit(pca_std_wine)
visualizer.show()
plt.show()

In [None]:
# Cell 11: KMeans Clustering - Build Model with K=3 for PCA Standard Scaled Dataset
from yellowbrick.cluster import SilhouetteVisualizer

# Fit a 3-cluster KMeans on the standard-scaled PCA scores.
model_kmeans = KMeans(n_clusters=3, random_state=0, init='k-means++')
y_predict_kmeans = model_kmeans.fit_predict(pca_std_wine)

# Store the cluster assignment next to the original measurements.
wine_df['Kmeans_Label'] = model_kmeans.labels_

# Per-cluster feature means (K=3). Evaluated mid-cell, so the notebook does
# not render the result.
wine_df.groupby('Kmeans_Label').agg(['mean'])

# Two panels side by side: silhouette plot (left), clustered scatter (right).
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=False)
fig.set_size_inches(14, 6)

silhouette_palette = ['#922B21', '#5B2C6F', '#1B4F72']
sil_visualizer1 = SilhouetteVisualizer(model_kmeans, ax=ax1, colors=silhouette_palette)
sil_visualizer1.fit(pca_std_wine)

# Right panel: samples on PC1/PC2, coloured by cluster label.
# Labels are divided by 3 (the number of clusters) to map them into the colormap.
colors1 = cm.nipy_spectral(model_kmeans.labels_.astype(float) / 3)
pc1_scores = pca_std_wine.iloc[:, 0]
pc2_scores = pca_std_wine.iloc[:, 1]
ax2.scatter(pc1_scores, pc2_scores, marker='.', s=30, lw=0, alpha=0.7, c=colors1, edgecolor='k')

# Mark every cluster centre with a white disc and print its index on top.
centers1 = model_kmeans.cluster_centers_
ax2.scatter(centers1[:, 0], centers1[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')
for i, c in enumerate(centers1):
    ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

ax2.set_title(label="The visualization of the clustered data.", fontsize=14)
ax2.set_xlabel("Feature space for the 1st feature", fontsize=14)
ax2.set_ylabel("Feature space for the 2nd feature", fontsize=14)
figure_title = ("Silhouette analysis for KMeans clustering on sample data "
                "with n_clusters = %d" % 3)
plt.suptitle(figure_title, fontsize=16, fontweight='bold')

plt.show()

In [None]:
# Cell 12: KMeans Clustering - Elbow Method for PCA MinMax Scaled Dataset
# For k = 2..9, fit KMeans once and record both the inertia (within-cluster
# sum of squares) and the mean silhouette score. Both metrics come from the
# same fitted model, so a single pass replaces two identical fitting loops.
inertia = []
silhouette = {}
for i in tqdm(range(2, 10)):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=15, max_iter=500, random_state=17)
    kmeans.fit(pca_norm_wine)
    inertia.append(kmeans.inertia_)
    silhouette[i] = sil(pca_norm_wine, kmeans.labels_, metric='euclidean')

sns.set(style="darkgrid", rc={'figure.figsize': (14, 6)}, font_scale=2)

# Left: inertia against k (x starts at 2 because k=1 was not fitted).
plt.subplot(1, 2, 1)
plt.plot(range(2, len(inertia) + 2), inertia, marker="*", lw=2, color="skyblue", ms=10)
plt.xlabel("Number of clusters", fontweight='bold', fontsize=16)
plt.title("K-Means Inertia", fontweight='bold', fontsize=18)

# Right: silhouette score per k; bars sit at positions 0..n-1 and are
# relabelled with the actual k values.
plt.subplot(1, 2, 2)
plt.bar(range(len(silhouette)), list(silhouette.values()), align='center', edgecolor="black", lw=2, color="skyblue")
plt.xticks(range(len(silhouette)), list(silhouette.keys()))
plt.xlabel("Number of clusters", fontweight='bold', fontsize=16)
plt.title("Silhouette score", fontweight='bold', fontsize=18)
plt.show()

# Classic elbow curve including k=1, using KMeans with its default settings
# apart from the seed.
wcss = []
for i in range(1, 9):
    kmeans = KMeans(n_clusters=i, random_state=2)
    kmeans.fit(pca_norm_wine)
    wcss.append(kmeans.inertia_)

# Plot K values range vs WCSS to get Elbow graph for choosing K (no. of clusters)
plt.plot(range(1, 9), wcss, color='black')
plt.scatter(range(1, 9), wcss, color='red')
plt.title('Elbow Graph for MinMaxScaler', fontweight='bold', fontsize=18)
plt.xlabel('Number of clusters', fontweight='bold', fontsize=16)
plt.ylabel('WCSS', fontweight='bold', fontsize=16)
plt.show()

model = KMeans(random_state=10, max_iter=500, init='k-means++')

# Instantiate the KElbowVisualizer with the number of clusters and the metric
visualizer = KElbowVisualizer(model, k=(2, 20), metric='silhouette', timings=False)
# Fit the data and visualize
print('Elbow Plot for MinMaxScaler data')
visualizer.fit(pca_norm_wine)
visualizer.show()
plt.show()

In [None]:
# Cell 13: KMeans Clustering - Build Model with K=3 for PCA MinMax Scaled Dataset
from yellowbrick.cluster import SilhouetteVisualizer

# Single source for the cluster count, reused for the model, the colour
# normalisation and the figure title so the three cannot drift apart.
n_kmeans_clusters = 3

model_kmeans = KMeans(n_clusters=n_kmeans_clusters, random_state=0, init='k-means++')
y_predict_kmeans = model_kmeans.fit_predict(pca_norm_wine)

# Assign clusters to the data set
wine_df['Kmeans_Label'] = model_kmeans.labels_

# Group data by Clusters (K=3). Evaluated mid-cell, so the notebook does not
# render the result.
wine_df.groupby('Kmeans_Label').agg(['mean'])

fig, (ax1, ax2) = plt.subplots(1, 2, sharey=False)
fig.set_size_inches(14, 6)

# The visualizer is fitted on the min-max PCA scores, the same matrix the
# model in this cell was trained on (not the standard-scaled scores).
sil_visualizer1 = SilhouetteVisualizer(model_kmeans, ax=ax1, colors=['#922B21', '#5B2C6F', '#1B4F72'])
sil_visualizer1.fit(pca_norm_wine)

# 2nd Plot showing the actual clusters formed.
# Labels are divided by the number of clusters to map them into the colormap.
colors1 = cm.nipy_spectral(model_kmeans.labels_.astype(float) / n_kmeans_clusters)
ax2.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors1, edgecolor='k')

# Labeling the clusters
centers1 = model_kmeans.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(centers1[:, 0], centers1[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')

# Print each centre's index on top of its white disc.
for i, c in enumerate(centers1):
    ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

ax2.set_title(label="The visualization of the clustered data.", fontsize=14)
ax2.set_xlabel("Feature space for the 1st feature", fontsize=14)
ax2.set_ylabel("Feature space for the 2nd feature", fontsize=14)
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
              "with n_clusters = %d" % n_kmeans_clusters), fontsize=16, fontweight='bold')

plt.show()

# Samples on PC1/PC2 coloured by the KMeans label.
plt.figure(figsize=(9, 6))
plt.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], c=wine_df['Kmeans_Label'], cmap="brg", s=40)
plt.title('PCA plot in 2D', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

# Working copy with the cluster assignment in a 'Cluster' column.
wine_df1 = wine_df.copy()
wine_df1['Cluster'] = model_kmeans.labels_

# Every column except the trailing 'Cluster' column gets a histogram per cluster.
aux = wine_df1.columns.tolist()
sns.set(style="darkgrid", rc={'figure.figsize': (15, 10)}, font_scale=1.2)

for column in aux[0:len(aux) - 1]:
    grid = sns.FacetGrid(wine_df1, col="Cluster")
    grid.map(plt.hist, column, color="skyblue", lw=1, edgecolor="black")

# Column means per cluster, one uniquely named column per cluster.
cluster1 = pd.DataFrame(wine_df1.loc[wine_df1.Cluster == 0].mean(), columns=['Cluster1_Avg'])
cluster2 = pd.DataFrame(wine_df1.loc[wine_df1.Cluster == 1].mean(), columns=['Cluster2_Avg'])
cluster3 = pd.DataFrame(wine_df1.loc[wine_df1.Cluster == 2].mean(), columns=['Cluster3_Avg'])
avg_wine = pd.concat([cluster1, cluster2, cluster3], axis=1)
avg_wine

In [None]:
# Cell 14: Comparing Kmeans Clusters with the Original Classified Dataset using "Type" Feature
# Same PC1/PC2 projection twice: first coloured by the KMeans label, then by
# the 'Type' column of the original data, for a visual comparison.
plt.figure(figsize=(9, 6))
plt.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], c=wine_df['Kmeans_Label'], cmap="brg", s=40)
plt.title('After Kmeans Clustering on PCA MinMax Scaled Dataset', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

plt.figure(figsize=(9, 6))
plt.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], c=wine_data['Type'], cmap="brg", s=40)
plt.title('Original Classification without Kmeans CLustering', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

# Group data by Clusters (K=3).
# The reference table is grouped by 'Type' so it shows per-class means that
# can be compared with the per-cluster means; grouping by 'ID' would return
# every sample unchanged. 'ID' is dropped because its mean carries no meaning.
display('After Applying PCA and Kmens CLustering on Dataset', wine_df.groupby('Kmeans_Label').agg(['mean']),
        'Original Classified Dataset', wine_data.drop(columns=['ID']).groupby('Type').agg(['mean']))

In [None]:
# Cell 15: t-SNE
# Two-dimensional t-SNE embedding of the min-max scaled data. A fixed
# random_state makes the embedding reproducible between runs, in line with
# the seeded KMeans models elsewhere in this notebook.
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(norm_wine)

tsne_df = pd.DataFrame(data=X_tsne, columns=['t-SNE Comp. 1', 't-SNE Comp. 2'])

# t-SNE plot in 2D coloured by the KMeans cluster label
plt.figure(figsize=(9, 6))
plt.scatter(tsne_df.iloc[:, 0], tsne_df.iloc[:, 1], c=wine_df['Kmeans_Label'], cmap="brg", s=40)
plt.title('t-SNE plot in 2D', fontweight='bold', fontsize=18)
plt.xlabel('t-SNE Component 1', fontweight='bold', fontsize=16)
plt.ylabel('t-SNE Component 2', fontweight='bold', fontsize=16)
plt.show()

In [None]:
# Cell 16: Hierarchical Clustering Algorithm
# Draw one dendrogram of the min-max PCA scores per linkage method.
# The title styling is built once, under a name that does not shadow the
# builtin `dict` (shadowing it would break any later dict(...) call in the
# notebook session).
title_font = {'fontsize': 18, 'fontweight': 16, 'color': 'blue'}

for methods in ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']:
    plt.figure(figsize=(14, 6))
    plt.title('Visualising the Data, Method- {}'.format(methods), fontdict=title_font)
    Dendrogram1 = sch.dendrogram(sch.linkage(pca_norm_wine, method=methods, optimal_ordering=False))

In [None]:
# Cell 17: Silhouette Score method for PCA MinMax Scaled Data
# Candidate cluster counts (the silhouette score needs at least 2 clusters).
# The loop variable is named `k` so it no longer overwrites this list, which
# lets both sweeps below share one definition.
n_clusters = [2, 3, 4, 5, 6, 7, 8]

# Applying Different Linkages using Euclidean Method for distance Calculation
for k in n_clusters:
    for linkages in ["ward", "complete", "average", "single"]:
        hie_cluster1 = AgglomerativeClustering(n_clusters=k, linkage=linkages)  # by default it takes linkage 'ward'
        hie_labels1 = hie_cluster1.fit_predict(pca_norm_wine)
        silhouette_score1 = sil(pca_norm_wine, hie_labels1)
        print("For n_clusters =", k, "The average silhouette_score with linkage-", linkages, ':', silhouette_score1)
    print()

# Applying Different Linkages using Different Distance Methods
# ('ward' is left out of this sweep; the distance options are only combined
# with the other three linkages.)
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before upgrading.
for k in n_clusters:
    for linkages in ["complete", "average", "single"]:
        for affinities in ["euclidean", "l1", "l2", "manhattan", "cosine"]:
            hie_cluster1 = AgglomerativeClustering(n_clusters=k, affinity=affinities, linkage=linkages)
            hie_labels1 = hie_cluster1.fit_predict(pca_norm_wine)
            silhouette_score1 = sil(pca_norm_wine, hie_labels1)
            print("For n_clusters =", k, "The average silhouette_score with linkage-", linkages, "and Affinity-", affinities, ':', silhouette_score1)
        print()

In [None]:
# Cell 18: Silhouette Score method for PCA Standard Scaled Data
# Candidate cluster counts (the silhouette score needs at least 2 clusters).
# The loop variable is named `k` so it no longer overwrites this list, which
# lets both sweeps below share one definition.
n_clusters = [2, 3, 4, 5, 6, 7, 8]

# Applying Different Linkages using Euclidean Method for distance Calculation
for k in n_clusters:
    for linkages in ["ward", "complete", "average", "single"]:
        hie_cluster1 = AgglomerativeClustering(n_clusters=k, linkage=linkages)
        hie_labels1 = hie_cluster1.fit_predict(pca_std_wine)
        silhouette_score1 = sil(pca_std_wine, hie_labels1)
        print("For n_clusters =", k, "The average silhouette_score with linkage-", linkages, ':', silhouette_score1)
    print()

# Applying Different Linkages using Different Distance Methods
# ('ward' is left out of this sweep; the distance options are only combined
# with the other three linkages.)
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before upgrading.
for k in n_clusters:
    for linkages in ["complete", "average", "single"]:
        for affinities in ["euclidean", "l1", "l2", "manhattan", "cosine"]:
            hie_cluster2 = AgglomerativeClustering(n_clusters=k, affinity=affinities, linkage=linkages)
            hie_labels2 = hie_cluster2.fit_predict(pca_std_wine)
            silhouette_score2 = sil(pca_std_wine, hie_labels2)
            print("For n_clusters =", k, "The average silhouette_score with linkage-", linkages, "and Affinity-", affinities, ':', silhouette_score2)
        print()

In [None]:
# Cell 19: Run Hierarchical Clustering.(Agglomerative Clustering) For PCA on Standard Scaled Data
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before upgrading.
agg_clustering = AgglomerativeClustering(n_clusters=4, linkage='complete', affinity='l1')
y_pred_hie = agg_clustering.fit_predict(pca_std_wine)

# Putting Cluster labels into original dataset And analysis of the same
wine_df['Hierarchical_Labels'] = agg_clustering.labels_

# Per-cluster feature means. Evaluated mid-cell, so the notebook does not
# render the result.
wine_df.groupby('Hierarchical_Labels').agg(['mean'])

# Bar chart of how many samples fall into each cluster. The y-label is set
# once through the axes object (a preceding plt.ylabel call was immediately
# overridden by it).
fig, ax = plt.subplots(figsize=(9, 6))
wine_df.groupby(['Hierarchical_Labels']).count()['Type'].plot(kind='bar')
plt.title('Hierarchical Clustering PCA Standard Scaled Data', fontsize=18, fontweight='bold')
ax.set_xlabel('Clusters', fontweight='bold', fontsize=16)
ax.set_ylabel('ID counts', fontweight='bold', fontsize=16)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.show()

# Samples on PC1/PC2 coloured by hierarchical cluster label.
plt.figure(figsize=(9, 6))
plt.scatter(pca_std_wine.iloc[:, 0], pca_std_wine.iloc[:, 1], c=wine_df['Hierarchical_Labels'], cmap="brg", s=40)
plt.title('PCA plot in 2D', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

# Same projection again under a descriptive title, followed by the version
# coloured by the original 'Type' column for comparison.
plt.figure(figsize=(9, 6))
plt.scatter(pca_std_wine.iloc[:, 0], pca_std_wine.iloc[:, 1], c=wine_df['Hierarchical_Labels'], cmap="brg", s=40)
plt.title('After PCA Standard Scaler and Hierarchical Clustering plot in 2D', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

plt.figure(figsize=(9, 6))
plt.scatter(pca_std_wine.iloc[:, 0], pca_std_wine.iloc[:, 1], c=wine_data['Type'], cmap="brg", s=40)
plt.title('Original Classification without Hierarchical CLustering', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

# Group data by Clusters (Clusters=4).
# The reference table is grouped by 'Type' so it shows per-class means that
# can be compared with the per-cluster means; grouping by 'ID' would return
# individual samples. 'ID' is dropped because its mean carries no meaning.
display('After Applying Hierarchical Clustering on PCA Standard Scaled Dataset', wine_df.groupby('Hierarchical_Labels').agg(['mean']),
        'Original Classified Dataset', wine_data.drop(columns=['ID']).groupby('Type').agg(['mean']).head())

In [None]:
# Cell 20: Run Hierarchical Clustering.(Agglomerative Clustering) For PCA on MinMaxScaled Data
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before upgrading.
agg_clustering = AgglomerativeClustering(n_clusters=3, linkage='average', affinity='l2')
y_pred_hie = agg_clustering.fit_predict(pca_norm_wine)

# Putting Cluster labels into original dataset And analysis of the same
wine_df['Hierarchical_Labels'] = agg_clustering.labels_

# Per-cluster feature means. Evaluated mid-cell, so the notebook does not
# render the result.
wine_df.groupby('Hierarchical_Labels').agg(['mean'])

# Bar chart of how many samples fall into each cluster. The y-label is set
# once through the axes object (a preceding plt.ylabel call was immediately
# overridden by it).
fig, ax = plt.subplots(figsize=(9, 6))
wine_df.groupby(['Hierarchical_Labels']).count()['Type'].plot(kind='bar')
plt.title('Hierarchical Clustering PCA MinMax Scaled Data', fontweight='bold', fontsize=18)
ax.set_xlabel('Clusters', fontweight='bold', fontsize=16)
ax.set_ylabel('ID counts', fontweight='bold', fontsize=16)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.show()

# Samples on PC1/PC2 coloured by hierarchical cluster label.
plt.figure(figsize=(9, 6))
plt.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], c=wine_df['Hierarchical_Labels'], cmap="brg", s=40)
plt.title('PCA plot in 2D', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

# Working copy with the cluster assignment in a 'Cluster' column. The labels
# come from the agglomerative model fitted in this cell; using the KMeans
# labels here would analyse a different clustering than the one plotted above.
wine_df2 = wine_data.copy()
wine_df2['Cluster'] = agg_clustering.labels_

# Every column except the trailing 'Cluster' column gets a histogram per cluster.
aux = wine_df2.columns.tolist()

sns.set(style="darkgrid", rc={'figure.figsize': (15, 10)}, font_scale=1.2)
for column in aux[0:len(aux) - 1]:
    grid = sns.FacetGrid(wine_df2, col='Cluster')
    grid.map(plt.hist, column, color="skyblue", lw=1, edgecolor="black")

# Sorting elements based on cluster label assigned and taking average for insights.
# Each cluster gets its own uniquely named column.
cluster1 = pd.DataFrame(wine_df2.loc[wine_df2.Cluster == 0].mean(), columns=['Cluster1_Avg'])
cluster2 = pd.DataFrame(wine_df2.loc[wine_df2.Cluster == 1].mean(), columns=['Cluster2_Avg'])
cluster3 = pd.DataFrame(wine_df2.loc[wine_df2.Cluster == 2].mean(), columns=['Cluster3_Avg'])
avg_df = pd.concat([cluster1, cluster2, cluster3], axis=1)
avg_df

In [None]:
# Cell 21: Comparing Hierarchical Clusters with the Original Classified Dataset using "Type" Feature
# Same PC1/PC2 projection twice: first coloured by the hierarchical cluster
# label, then by the 'Type' column of the original data.
plt.figure(figsize=(9, 6))
plt.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], c=wine_df['Hierarchical_Labels'], cmap="brg", s=40)
plt.title('After Hierarchical Clustering on PCA MinMax Scaled Dataset', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

plt.figure(figsize=(9, 6))
plt.scatter(pca_norm_wine.iloc[:, 0], pca_norm_wine.iloc[:, 1], c=wine_data['Type'], cmap="brg", s=40)
plt.title('Original Classification without Hierarchical CLustering', fontweight='bold', fontsize=18)
plt.xlabel('PC1', fontweight='bold', fontsize=16)
plt.ylabel('PC2', fontweight='bold', fontsize=16)
plt.show()

# Group data by Clusters (K=3).
# The reference table is grouped by 'Type' so it shows per-class means that
# can be compared with the per-cluster means; grouping by 'ID' would return
# individual samples. 'ID' is dropped because its mean carries no meaning.
display('After Applying Hierarchical CLustering on  PCA MinMax Scaled Dataset', wine_df.groupby('Hierarchical_Labels').agg(['mean']),
        'Original Classified Dataset', wine_data.drop(columns=['ID']).groupby('Type').agg(['mean']).head())