In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns

# Standardize data (StandardScaler rescales each feature to zero mean and
# unit variance) before clustering.
scaler = StandardScaler()
commuter_scaled = scaler.fit_transform(commuter_new)

# Run K-Means for k=1 to 10, recording the inertia of every fit (for the
# elbow plot) and the per-cluster sample counts for every k > 1.
inertias = []
cluster_sizes = {}

for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(commuter_scaled)
    inertias.append(kmeans.inertia_)
    if k > 1:
        # minlength=k guarantees one entry per requested cluster. Without it
        # bincount stops at the highest label actually assigned, so a fit
        # that leaves the last cluster(s) empty (possible with many duplicate
        # rows) would report fewer than k sizes.
        counts = np.bincount(kmeans.labels_, minlength=k)
        cluster_sizes[k] = counts


# Elbow Method Plot
# `inertias` holds one value per k, starting at k=1. Deriving the x values
# from its length (instead of repeating the sweep bounds) keeps x and y the
# same length even if the sweep range is changed.
k_values = range(1, len(inertias) + 1)

plt.figure(figsize=(8, 5))
plt.plot(k_values, inertias, marker='o', linestyle='--', color='blue')
plt.title('Elbow Method: Choose Your Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.grid(True)
plt.show()


# Display + visualize cluster sizes: print the sample counts recorded for
# each k and draw one small bar chart per k.
print("\n🔹 Number of values per cluster:")
for n_clusters, member_counts in cluster_sizes.items():
    print(f"k={n_clusters}: {member_counts}")

    # NOTE: bars are numbered from 1, whereas the K-Means labels that
    # produced these counts start at 0 (bar i corresponds to label i-1).
    bar_positions = range(1, len(member_counts) + 1)
    palette = sns.color_palette('Set3', len(member_counts))

    fig, ax = plt.subplots(figsize=(5, 3))
    ax.bar(bar_positions, member_counts, color=palette, edgecolor='black')
    ax.set_title(f"Cluster Sizes for k={n_clusters}")
    ax.set_xlabel("Cluster Label")
    ax.set_ylabel("Number of Samples")
    ax.set_xticks(bar_positions)
    fig.tight_layout()
    plt.show()


# Let user choose k: keep prompting until an integer between 2 and 10
# (inclusive) is entered; the accepted value ends up in `chosen_k`.
chosen_k = None
while chosen_k is None:
    try:
        candidate = int(input("\n👉 Enter your chosen number of clusters (2–10): "))
    except ValueError:
        # Not parseable as an integer: ask again.
        print("⚠️ Invalid input. Please enter an integer.")
        continue

    if candidate < 2 or candidate > 10:
        print("⚠️ Please enter a value between 2 and 10.")
        continue

    chosen_k = candidate


# Fit final model with the user-selected k, using the same seed and number
# of initialisations as the exploratory sweep.
kmeans_final = KMeans(n_clusters=chosen_k, n_init=10, random_state=42)
kmeans_final.fit(commuter_scaled)

# Per-sample cluster assignments and the cluster centres (in scaled space).
labels = kmeans_final.labels_
centroids = kmeans_final.cluster_centers_


# Visualization (auto-handles >2D using PCA)
# More than two features: project samples and centroids onto the first two
# principal components and report the PCA loadings. Otherwise plot the
# scaled features directly.
n_features = commuter_scaled.shape[1]

if n_features > 2:
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(commuter_scaled)
    # Centroids live in the scaled feature space, so they go through the
    # same fitted projection as the samples.
    centroids_2d = pca.transform(centroids)
    plt.figure(figsize=(7, 5))
    plt.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='tab10', s=50, alpha=0.6)
    plt.scatter(centroids_2d[:, 0], centroids_2d[:, 1],
                c='red', marker='X', s=200, label='Centroids')
    plt.title(f'K-Means Clustering (k={chosen_k}) - PCA Visualization')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.legend()
    plt.show()

    # PCA Loadings
    # Label the rows with the columns that were actually scaled. Filtering
    # with select_dtypes(np.number) can drop columns (e.g. boolean ones) that
    # still went through the scaler, which makes the index length disagree
    # with pca.components_. Fall back to generic names when the input has no
    # usable column labels.
    feature_names = getattr(commuter_new, "columns", None)
    if feature_names is None or len(feature_names) != n_features:
        feature_names = [f"Feature {i + 1}" for i in range(n_features)]

    loadings = pd.DataFrame(
        pca.components_.T,
        columns=['PCA1', 'PCA2'],
        index=feature_names
    )

    print("\n📊 PCA Loadings — Feature Contributions per Principal Component:")
    # `display` is only defined inside IPython/Jupyter; use print when this
    # runs as a plain script.
    try:
        show_table = display
    except NameError:
        show_table = print
    show_table(loadings)

    # Optional: visualize PCA loadings
    plt.figure(figsize=(6, 4))
    sns.heatmap(loadings, annot=True, cmap='coolwarm', center=0)
    plt.title('PCA Loadings Heatmap')
    plt.show()

else:
    has_second_feature = n_features == 2
    x_points = commuter_scaled[:, 0]
    x_centroids = centroids[:, 0]
    if has_second_feature:
        y_points = commuter_scaled[:, 1]
        y_centroids = centroids[:, 1]
    else:
        # Single feature: there is no column 1, so draw everything on a
        # zero baseline instead of raising IndexError.
        y_points = np.zeros_like(x_points)
        y_centroids = np.zeros_like(x_centroids)

    plt.figure(figsize=(7, 5))
    plt.scatter(x_points, y_points,
                c=labels, cmap='tab10', s=50, alpha=0.6)
    plt.scatter(x_centroids, y_centroids,
                c='red', marker='X', s=200, label='Centroids')
    plt.title(f'K-Means Clustering (k={chosen_k})')
    plt.xlabel('Feature 1')
    if has_second_feature:
        plt.ylabel('Feature 2')
    else:
        # The y axis carries no information in the one-feature case.
        plt.yticks([])
    plt.legend()
    plt.show()
