In [None]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the customer table from its CSV file (path is relative to the
# notebook's working directory).
customers_csv = "../data/Mall_Customers.csv"
data = pd.read_csv(customers_csv)

In [None]:
# Candidate cluster counts tried by the clustering loop: 2 through 6.
clusters = list(range(2, 7))

# Keep only the columns used as clustering features by dropping the
# identifier and the gender column.
data_numeric = data.drop(columns=["CustomerID", "Gender"])

# Standardize the features; the fitted scaler is kept so that values in
# scaled space can later be mapped back with `inverse_transform`.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# Project the standardized features onto two principal components and wrap
# the resulting coordinates in a labelled frame.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(data_scaled)
pca_df = pd.DataFrame(principal_components, columns=["PC1", "PC2"])

In [None]:
# Report the PCA loadings (the weight of each feature in each principal
# component). `pca` is not refit anywhere in the loop below, so these values
# are the same for every cluster count and are printed once rather than once
# per iteration.
feature_names = data_numeric.columns
for i, component in enumerate(pca.components_, start=1):
    print(f" \n Principal Component {i}:")
    for feature_name, weight in zip(feature_names, component):
        print(f"{feature_name}: {weight}")
    print("\n")

# For each candidate cluster count: fit K-Means on the scaled features, plot
# the cluster labels over the two PCA coordinates, and print the centroids.
for num_clusters in clusters:
    # Clustering runs on `data_scaled`; the PCA coordinates in `pca_df` are
    # only used to draw the points. The fixed random_state keeps runs
    # repeatable.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(data_scaled)

    # Overwritten on every pass: after the loop this column (and `kmeans`)
    # hold the result for the last entry of `clusters`.
    pca_df["Cluster"] = kmeans.labels_

    # Explicit figure/axes objects so each figure can be released after it
    # has been shown instead of accumulating across iterations.
    fig, ax = plt.subplots(figsize=(10, 6))
    points = ax.scatter(
        pca_df["PC1"],
        pca_df["PC2"],
        c=pca_df["Cluster"],
        cmap="viridis",
        edgecolor="k",
        s=100,
    )
    ax.set_title(f"PCA Clustering with {num_clusters} clusters")
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    fig.colorbar(points, ax=ax, label="Cluster")
    ax.grid(True)
    plt.show()
    plt.close(fig)

    # Map the centroids from scaled space back through the scaler so they
    # are printed in the units of the original feature columns.
    centroids = scaler.inverse_transform(kmeans.cluster_centers_)
    cluster_centroids_df = pd.DataFrame(
        centroids, columns=data_numeric.columns)
    print("Cluster Centroids:")
    print(cluster_centroids_df)

In [None]:
# Persist the `kmeans` object currently in scope. It is assigned inside the
# clustering loop, so this is the model fitted in that loop's final iteration.
# NOTE(review): the file is named "pca.pkl" but the pickled object is the
# K-Means model, not `pca` - confirm this is what consumers of the file expect.
with open("../models/pca.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)