In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [None]:
# Load the survey CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
survey_csv_path = 'D:\\Assignments_SCMA632\\Data\\Survey.csv'
survey_df = pd.read_csv(survey_csv_path)

In [None]:
# Keep only the columns used as clustering features.
# The slice is 0-based and end-exclusive: positions 19..45, i.e. columns 20..46 when counted from 1.
feature_columns = slice(19, 46)
sur_int = survey_df.iloc[:, feature_columns]


In [None]:
# Function to pick a cluster count from the KMeans dispersion curve
def optimal_number_of_clusters(data, max_k):
    """Fit KMeans for k = 1..max_k and pick k from the log-dispersion curve.

    For every k the function fits ``KMeans(n_clusters=k, n_init=25,
    random_state=123)`` and records the natural log of the mean Euclidean
    distance from each row of ``data`` to its nearest cluster centre.

    Note: although the returned array is called ``gaps``, the values are log
    mean nearest-centroid distances. No reference (null) distribution is
    sampled, so this is an elbow-style heuristic rather than a gap statistic
    computed against a reference dispersion.

    Parameters
    ----------
    data : array-like with a ``.shape`` attribute (e.g. DataFrame or ndarray)
        Observations in rows, features in columns.
    max_k : int
        Largest number of clusters to evaluate; must be at least 1.

    Returns
    -------
    optimal_k : int
        The k (1-based) that sits immediately *before* the largest drop in the
        curve, i.e. ``argmax(curve[k] - curve[k + 1]) + 1``. When ``max_k`` is
        1 there is nothing to compare and 1 is returned.
        NOTE(review): an elbow reading may prefer the k *after* the steepest
        drop — confirm which one is intended.
    gaps : numpy.ndarray of shape (max_k,)
        ``gaps[k - 1]`` is the log mean nearest-centroid distance for k clusters.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    gaps = np.zeros((max_k,))

    for k in range(1, max_k + 1):
        km = KMeans(n_clusters=k, n_init=25, random_state=123)
        km.fit(data)
        # Distance of every observation to its closest centre, averaged over rows.
        nearest_centre_dist = cdist(data, km.cluster_centers_, 'euclidean').min(axis=1)
        dispersion = np.sum(nearest_centre_dist) / data.shape[0]
        gaps[k - 1] = np.log(dispersion)

    # A single evaluated k leaves no consecutive pair to difference.
    if max_k == 1:
        return 1, gaps

    # Drop in the curve when moving from k to k + 1 clusters.
    gaps_diff = gaps[:-1] - gaps[1:]
    optimal_k = int(gaps_diff.argmax()) + 1

    return optimal_k, gaps

In [None]:
# Evaluate cluster counts 1..max_k and plot the per-k curve returned by the helper.
# NOTE(review): the helper returns the log of the mean distance to the nearest
# centroid; the "Gap Statistic" labels below are kept as-is — confirm the wording.
max_k = 10
optimal_k, gaps = optimal_number_of_clusters(sur_int, max_k)

ax = plt.gca()
ax.plot(range(1, max_k + 1), gaps, marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Gap Statistic')
ax.set_title('Gap Statistic vs. Number of clusters')
plt.show()

In [None]:
# Fit the final KMeans model with four clusters and keep the per-row cluster labels.
kmeans_settings = {'n_clusters': 4, 'n_init': 25, 'random_state': 123}
km_res = KMeans(**kmeans_settings).fit(sur_int)  # fit() returns the fitted estimator
labels = km_res.labels_


In [None]:
# Scatter the first two feature columns, coloured by KMeans cluster label.
plt.figure(figsize=(10, 7))
first_feature = sur_int.iloc[:, 0]
second_feature = sur_int.iloc[:, 1]
sns.scatterplot(x=first_feature, y=second_feature, hue=labels, palette='viridis')
plt.gca().set_title('KMeans Clustering')
plt.show()

In [None]:
# Build the agglomerative merge tree over the feature rows using Ward linkage.
linked = linkage(sur_int, method='ward')

# Draw the merge tree as a dendrogram on a fresh figure.
dendrogram_options = {
    'orientation': 'top',
    'distance_sort': 'descending',
    'show_leaf_counts': True,
}
plt.figure(figsize=(10, 7))
dendrogram(linked, **dendrogram_options)
plt.gca().set_title('Hierarchical Clustering Dendrogram')
plt.show()

In [None]:
# Clustered heatmap of the transposed feature table: the original columns become
# heatmap rows (left unclustered) and the original rows become heatmap columns,
# which are reordered by Ward clustering.
heatmap_data = sur_int.T
sns.clustermap(
    heatmap_data,
    method='ward',
    row_cluster=False,
    col_cluster=True,
    cmap='viridis',
)
plt.show()