# K-Means Clustering on Obesity Dataset

# Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Load and preprocess the dataset

In [None]:
# Read the obesity dataset from its CSV file.
DATASET_PATH = "C:/ML/Labwork2/dataset/ObesityDataSet_raw_and_data_sinthetic.csv"
obesity_df = pd.read_csv(DATASET_PATH)

# Turn every object-dtype (text) column into integer codes so that the whole
# frame is numeric. The single encoder is re-fitted on each column, so it only
# remembers the mapping of the last column processed.
categorical_cols = obesity_df.select_dtypes(include="object").columns
le = LabelEncoder()
for col, values in obesity_df[categorical_cols].items():
    obesity_df[col] = le.fit_transform(values)

# Standardize all columns (the encoded ones included) to zero mean and unit
# variance; `scaled_data` is the matrix the clustering steps work on.
scaler = StandardScaler()
scaler.fit(obesity_df)
scaled_data = scaler.transform(obesity_df)


# 1. Experimental Protocol

In [None]:
# Candidate cluster counts to evaluate: k = 2 .. 10.
k_values = list(range(2, 11))

# One entry per candidate k, in the same order as `k_values`.
inertias, silhouettes = [], []

for k in k_values:
    # A fixed random_state keeps the run for each k repeatable.
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_data)

    # Inertia feeds the elbow plot; the silhouette score is used to pick k.
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(scaled_data, cluster_labels))

# 2. Centroid Initialization
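We use `init='k-means++'` instead of purely random initialization: it picks initial centroids that are spread far apart from one another, which generally speeds up convergence and lowers the risk of ending in a poor local optimum.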

# 3. Analyze results

In [None]:
# Two side-by-side panels: inertia (elbow method) on the left and the
# silhouette score on the right, both plotted against the number of clusters.
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(12, 5))

elbow_ax.plot(k_values, inertias, marker='o')
elbow_ax.set_title('Elbow Method (Inertia)')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')

silhouette_ax.plot(k_values, silhouettes, marker='s', color='green')
silhouette_ax.set_title('Silhouette Score vs k')
silhouette_ax.set_xlabel('Number of Clusters (k)')
silhouette_ax.set_ylabel('Silhouette Score')

# Write the figure to disk before showing it.
fig.tight_layout()
fig.savefig("C:/ML/Labwork2/visualizations/obesity_elbow_silhouette.png")
plt.show()


# 4. Choose best k

In [None]:
from sklearn.decomposition import PCA

# Select the k with the highest silhouette score. max() returns the first
# maximal pair, so a tie is resolved in favour of the smallest k.
_, k_best = max(zip(silhouettes, k_values), key=lambda pair: pair[0])

# Refit K-Means with the chosen k and store each row's cluster id.
final_kmeans = KMeans(n_clusters=k_best, init='k-means++', random_state=42)
final_kmeans.fit(scaled_data)
obesity_df['Cluster'] = final_kmeans.labels_

# Project the standardized features onto two principal components so the
# clusters can be drawn in a 2-D scatter plot.
pca = PCA(n_components=2)
components = pca.fit_transform(scaled_data)
first_pc = components[:, 0]
second_pc = components[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=first_pc,
    y=second_pc,
    hue=obesity_df['Cluster'],
    palette='Set2',
    ax=ax,
)
ax.set_title(f'Clustering Result with k={k_best}')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.savefig("C:/ML/Labwork2/visualizations/obesity_pca_clusters.png")
plt.show()
