In [2]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [3]:
# Read the cleaned shopping dataset from disk and preview its first ten rows
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path)
df_shopping.head(n=10)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0
5,0,22.0,17000,76.0
6,0,35.0,18000,6.0
7,0,23.0,18000,94.0
8,1,64.0,19000,3.0
9,0,30.0,19000,72.0


In [4]:
# Fit one K-Means model per candidate K (1 through 10) and record its inertia
k = list(range(1, 11))

# fit() returns the fitted estimator, so inertia_ can be read straight off the call
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(df_shopping).inertia_
    for n_clusters in k
]

  f"KMeans is known to have a memory leak on Windows "


In [6]:
# Pair each K with its inertia in a DataFrame, then draw the elbow curve
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow Curve",
)

In [7]:
# create a K-means function again to reuse the K-means cluster

def get_clusters(k: int, data: pd.DataFrame) -> pd.DataFrame:
    """Cluster ``data`` with K-Means and return a labelled copy.

    Parameters
    ----------
    k : int
        Number of clusters, passed to ``KMeans`` as ``n_clusters``.
    data : pandas.DataFrame
        Feature table to cluster. Every column is handed to ``KMeans.fit``
        unchanged; the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with one extra column, ``"class"``, holding the
        cluster label the fitted model assigned to each row.
    """
    # Work on a copy so the "class" column is never added to the caller's frame
    labelled = data.copy()

    # random_state is pinned to 0 so repeated runs start from the same seed
    # NOTE(review): columns are fitted without any scaling; if they are on very
    # different ranges the widest one may dominate the clustering -- confirm
    # that this is intended.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(labelled)

    # labels_ already holds the assignment for the rows that were just fitted,
    # so a separate predict() pass over the same data is not needed
    labelled["class"] = model.labels_

    return labelled

In [9]:
# Cluster the shopping data into five groups and preview the labelled rows
five_clusters = get_clusters(k=5, data=df_shopping)
five_clusters.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100),class
0,1,19.0,15000,39.0,0
1,1,21.0,15000,81.0,0
2,0,20.0,16000,6.0,0
3,0,23.0,16000,77.0,0
4,0,31.0,17000,40.0,0


In [10]:
# Cluster the shopping data into six groups and preview the labelled rows
six_clusters = get_clusters(k=6, data=df_shopping)
six_clusters.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100),class
0,1,19.0,15000,39.0,5
1,1,21.0,15000,81.0,5
2,0,20.0,16000,6.0,5
3,0,23.0,16000,77.0,5
4,0,31.0,17000,40.0,5


In [11]:
# 2D scatter of the K = 5 result: annual income against spending score,
# with one series per value of the "class" column
five_clusters.hvplot.scatter(
    x="Annual Income",
    y="Spending Score (1-100)",
    by="class",
)

In [14]:
# Plot the 3D-scatter for K = 5 with x="Age", y="Spending Score (1-100)" and z="Annual Income"
# Marker colour and marker symbol are both driven by the "class" column.
fig = px.scatter_3d(
    five_clusters,
    x="Age",
    y="Spending Score (1-100)",
    z="Annual Income",
    color="class",
    symbol="class",
    width=800,
    title="K-Means clusters (K = 5)",
)
# Position the legend at x=0, y=1 (the top-left of the plotting area)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [15]:
# 2D scatter of the K = 6 result: annual income against spending score,
# with one series per value of the "class" column
six_clusters.hvplot.scatter(
    x="Annual Income",
    y="Spending Score (1-100)",
    by="class",
)

In [16]:
# Plot the 3D-scatter for K = 6 with x="Age", y="Spending Score (1-100)" and z="Annual Income"
# Marker colour and marker symbol are both driven by the "class" column.
fig = px.scatter_3d(
    six_clusters,
    x="Age",
    y="Spending Score (1-100)",
    z="Annual Income",
    color="class",
    symbol="class",
    width=800,
    title="K-Means clusters (K = 6)",
)
# Position the legend at x=0, y=1 (the top-left of the plotting area)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [17]:
# So do we use five or six groups? This depends on what insights you can take away from the data.
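# One might conclude that six groups would be most useful because they could be broken down like so:
# (Cluster ids are arbitrary labels assigned by K-means, so check each description below against
# the K = 6 plots before relying on it; in the six_clusters.head() output above, for example,
# the lowest-income rows were assigned to class 5.)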

# Cluster 0: medium income, low annual spend
# Cluster 1: low income, low annual spend
# Cluster 2: high income, low annual spend
# Cluster 3: low income, high annual spend
# Cluster 4: medium income, high annual spend
# Cluster 5: very high income, high annual spend

# If we chose five groups instead, the breakdown would be different and would not fit as well
# with what you're looking for, which is grouping types of customers based on spending habits.
# Remember, unsupervised learning can help us make decisions about the data only up to a point;
# after that, it is up to you, the expert, to make the final call.