# Clustering Notebook

In this notebook we explore three clustering methods (K-means, Gaussian mixture models, and DBSCAN) on the survey answer data.

In [2]:
import pandas as pd

# Load the cleaned survey data from its CSV file (relative path).
data = pd.read_csv(filepath_or_buffer='../data/cleaned_data_v2.csv')

Filter the data

In [3]:
# Read the scoring workbook; the values in its 'id' column are used below
# as the column names to pull out of `data`.
scoring = pd.read_excel('../scoring/scoring.xlsx')
survey_answer_cols = scoring['id'].to_list()

# Keep every row, but only the columns named in the scoring table.
clustering_data = data.loc[:, survey_answer_cols]

Scale the data

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation, then rescale so that every
# column ends up with mean 0 and standard deviation 1.
scaler = StandardScaler().fit(clustering_data)
data_scaled = scaler.transform(clustering_data)


K-means

In [None]:
from sklearn.cluster import KMeans

# K-means with 5 clusters; seeded for repeatability, n_init=10 initialisations.
kmeans = KMeans(n_init=10, random_state=42, n_clusters=5)
kmeans.fit(data_scaled)

# One cluster label per row of data_scaled, stored alongside the survey data.
clusters = kmeans.labels_
data['kmeans_cluster'] = clusters

GMM (Gaussian Mixture Model)

In [None]:
from sklearn.mixture import GaussianMixture

# Gaussian mixture with 5 components, seeded for repeatability.
gmm = GaussianMixture(random_state=42, n_components=5)

# Fit and label in a single call, then store the per-row component label
# alongside the survey data.
gmm_clusters = gmm.fit_predict(X=data_scaled)
data['gmm_cluster'] = gmm_clusters

DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the scaled features.
dbscan = DBSCAN(min_samples=10, eps=1.5)
dbscan.fit(data_scaled)

# One label per row; DBSCAN uses -1 for rows it treats as outliers.
dbscan_clusters = dbscan.labels_
data['dbscan_cluster'] = dbscan_clusters


Cluster Evaluation

In [None]:
# Print the size of every cluster for each method, in the order
# K-means, GMM, DBSCAN. In the DBSCAN counts, label -1 marks outliers.
for label_col in ('kmeans_cluster', 'gmm_cluster', 'dbscan_cluster'):
    print(data[label_col].value_counts())

In [None]:
from sklearn.metrics import silhouette_score


def _silhouette_or_nan(features, labels, noise_label=None):
    """Return the silhouette score of a labelling, or NaN when it is undefined.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        The feature matrix the labels were computed on. Must support
        boolean-mask row indexing when ``noise_label`` is given.
    labels : array-like of shape (n_samples,)
        Cluster label for every row of ``features``.
    noise_label : optional
        If given, rows carrying this label are dropped before scoring so that
        noise/outlier points are not treated as one more cluster.

    Returns
    -------
    float
        The silhouette score, or ``nan`` when the remaining labelling does not
        satisfy ``2 <= n_labels <= n_samples - 1`` (silhouette_score would
        raise ValueError in that case).
    """
    label_arr = pd.Series(labels).to_numpy()
    if noise_label is not None:
        keep = label_arr != noise_label
        features, label_arr = features[keep], label_arr[keep]

    n_labels = len(pd.unique(label_arr))
    if not 2 <= n_labels <= len(label_arr) - 1:
        return float('nan')
    return silhouette_score(features, label_arr)


print("K-Means Silhouette Score:", _silhouette_or_nan(data_scaled, data['kmeans_cluster']))
print("GMM Silhouette Score:", _silhouette_or_nan(data_scaled, data['gmm_cluster']))
# DBSCAN labels outliers as -1; exclude them so the score reflects only the
# actual clusters, and so an all-noise / single-cluster result prints nan
# instead of aborting the cell.
print("DBSCAN Silhouette Score:", _silhouette_or_nan(data_scaled, data['dbscan_cluster'], noise_label=-1))


Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the scaled features onto their first two principal components, for
# plotting only. random_state is pinned to the same seed the clustering cells
# use, because PCA's default 'auto' solver may select a randomized SVD and the
# figure should be reproducible on a fresh run.
pca = PCA(n_components=2, random_state=42)
data_pca = pca.fit_transform(data_scaled)

# One panel per clustering method, points coloured by assigned cluster label.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for ax, method in zip(axes, ["kmeans_cluster", "gmm_cluster", "dbscan_cluster"]):
    ax.scatter(data_pca[:, 0], data_pca[:, 1], c=data[method], cmap='viridis', alpha=0.5)
    ax.set_title(method)
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")

fig.tight_layout()
plt.show()
