In [2]:
#initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [3]:
# Read the cleaned shopping CSV from the Resources folder into a DataFrame.
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path)

# Last expression of the cell: preview the first ten rows.
df_shopping.head(n=10)

Unnamed: 0,card_member,age,annual_income,spending_score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [4]:
# Scatter plot of annual income (x axis) against spending score (y axis).
df_shopping.hvplot.scatter(
    x="annual_income",
    y="spending_score",
)

In [5]:
#function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model
    
    #fitting model
    model.fit(df)
    
    #add a new class column to df_iris
    df['class'] = model.labels_

In [11]:
# Label df_shopping with five clusters (this writes a 'class' column onto the
# frame in place), then draw the income/spending scatter grouped by cluster.
test_cluster_amount(df_shopping, clusters=5)
df_shopping.hvplot.scatter(
    x="annual_income",
    y="spending_score",
    by="class",
)

In [12]:
# 3D scatter of the labelled data: income, spending score and age on the axes,
# with the 'class' column driving both marker colour and marker symbol.
# update_layout returns the same figure, so the legend is pinned to the
# top-left corner in the same expression.
fig = px.scatter_3d(
    data_frame=df_shopping,
    x="annual_income",
    y="spending_score",
    z="age",
    color="class",
    symbol="class",
    width=800,
).update_layout(legend={"x": 0, "y": 1})

fig.show()

## Elbow Curve

In [13]:
inertia = []
k = list(range(1,11))

# df_shopping may already carry a 'class' column: test_cluster_amount writes
# its labels onto the frame in place. Those labels are the output of an
# earlier clustering, not a feature, so they are excluded here; otherwise the
# inertia values would be computed on data that includes stale cluster ids.
elbow_features = df_shopping.drop(columns="class", errors="ignore")

#calculate inertia for the range of k values
for i in k:
    # random_state is fixed so the curve is reproducible between runs.
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [14]:
# Pair every tested k with its inertia in a DataFrame, then draw the elbow
# curve as a line plot with one x tick per tested k.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)

In [15]:
def get_clusters(k, data):
    """Return a copy of ``data`` with a ``class`` column of K-Means labels.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    k : int
        Number of clusters, forwarded as ``n_clusters``.
    data : pandas.DataFrame
        Data to cluster. Every column except ``class`` is passed to
        ``KMeans.fit`` as a feature (presumably all numeric -- confirm
        against the caller's data).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose ``class`` column holds the fitted model's
        ``labels_``. An existing ``class`` column is overwritten.
    """
    #create a copy of the df
    data = data.copy()

    # An incoming 'class' column holds labels from an earlier clustering, not
    # a feature; fitting on it would let the old assignment steer the new one.
    features = data.drop(columns="class", errors="ignore")

    #initialize kmeans (fixed random_state for reproducible labels)
    model = KMeans(n_clusters=k, random_state=0)

    #fit the model; labels_ then holds one cluster id per training row, so a
    #separate predict() pass over the same rows is not needed
    model.fit(features)

    #create return df with predicted clusters
    data["class"] = model.labels_

    return data

In [16]:
# Cluster a copy of the shopping data into five groups and preview the result.
five_clusters = get_clusters(k=5, data=df_shopping)
five_clusters.head(n=5)

Unnamed: 0,card_member,age,annual_income,spending_score,class
0,1,19.0,15.0,39.0,0
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,0
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,0


In [19]:
# 2D view of the five-cluster result: income vs. spending score, one series
# per value of 'class'.
five_clusters.hvplot.scatter(
    x="annual_income",
    y="spending_score",
    by="class",
)

In [21]:
# 3D view of the five-cluster result: age, spending score and annual income on
# the axes, with 'class' driving both marker colour and marker symbol.
# update_layout returns the same figure, so the legend is pinned to the
# top-left corner in the same expression.
fig = px.scatter_3d(
    data_frame=five_clusters,
    x="age",
    y="spending_score",
    z="annual_income",
    color="class",
    symbol="class",
    width=800,
).update_layout(legend={"x": 0, "y": 1})

fig.show()

In [17]:
# Cluster a copy of the shopping data into six groups and preview the result.
six_clusters = get_clusters(k=6, data=df_shopping)
six_clusters.head(n=5)

Unnamed: 0,card_member,age,annual_income,spending_score,class
0,1,19.0,15.0,39.0,3
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,3
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,3


In [20]:
# 2D view of the six-cluster result: income vs. spending score, one series
# per value of 'class'.
six_clusters.hvplot.scatter(
    x="annual_income",
    y="spending_score",
    by="class",
)

In [22]:
# 3D view of the six-cluster result: age, spending score and annual income on
# the axes, with 'class' driving both marker colour and marker symbol.
# update_layout returns the same figure, so the legend is pinned to the
# top-left corner in the same expression.
fig = px.scatter_3d(
    data_frame=six_clusters,
    x="age",
    y="spending_score",
    z="annual_income",
    color="class",
    symbol="class",
    width=800,
).update_layout(legend={"x": 0, "y": 1})

fig.show()