In [37]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

# CLEAN IRIS DATA

In [2]:
# Read the raw iris dataset from disk and preview its first rows
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Discard the "class" label column; every other column is kept unchanged
new_iris_df = iris_df.drop(columns=["class"])
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Reorder the columns: the two length measurements first, then the two widths
column_order = ["sepal_length", "petal_length", "sepal_width", "petal_width"]
new_iris_df = new_iris_df[column_order]
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [5]:
# Write the reordered feature table to CSV, leaving out the DataFrame index
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(output_file_path, index=False)

# CLEAN SHOPPING DATA

In [7]:
# Read the raw shopping dataset (decoded as ISO-8859-1) and preview its first rows
file_path = "Resources/shopping_data.csv"
shopping_df = pd.read_csv(file_path, encoding="ISO-8859-1")
shopping_df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [8]:
# List the column labels of the raw shopping data
shopping_df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [9]:
# List the dtype pandas assigned to each column
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [10]:
# Report how many null values each column contains
null_counts = shopping_df.isnull().sum()
for column, null_count in null_counts.items():
    print(f'column {column} has {null_count} null values')

column CustomerID has 0 null values
column Card Member has 2 null values
column Age has 2 null values
column Annual Income has 0 null values
column Spending Score (1-100) has 1 null values


In [11]:
# Drop every row that contains at least one null value
shopping_df = shopping_df.dropna()


In [12]:
# Count rows that are exact duplicates of an earlier row
duplicate_count = shopping_df.duplicated().sum()
print(f'Duplicate entries: {duplicate_count}')

Duplicate entries: 0


In [13]:
# Removing customer id columns
# Reassign instead of dropping in place, and tolerate an already-missing column,
# so this cell can be re-run without raising a KeyError.
shopping_df = shopping_df.drop(columns=['CustomerID'], errors='ignore')
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [14]:
# transform string column
def change_string(member):
    """Encode a card-membership answer as an integer flag.

    Returns 1 when `member` equals the string "Yes" and 0 for any other value.
    """
    return 1 if member == "Yes" else 0
    
# Replace each "Card Member" value with the flag returned by change_string.
# NOTE: not safe to re-run -- once the column holds 1/0 instead of "Yes"/"No",
# change_string returns 0 for every value.
shopping_df['Card Member'] = shopping_df['Card Member'].apply(change_string)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [15]:
# transform annual income: scale the column down by a factor of 1000
shopping_df['Annual Income'] = shopping_df['Annual Income'].div(1000)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [16]:
# Strip the spaces out of the column names of shopping_df (for ease of use)
renamed_columns = {
    'Card Member': 'CardMember',
    'Annual Income': 'AnnualIncome',
    'Spending Score (1-100)': 'SpendingScore(1-100)',
}
shopping_df = shopping_df.rename(columns=renamed_columns)
shopping_df.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [17]:
# Saving cleaned data to CSV file format (the DataFrame index is not written)
file_path = "Resources/shopping_data_cleaned.csv"
shopping_df.to_csv(file_path, index=False)


# ANALYSIS WORK ON IRIS DATA

In [20]:
# Load the iris CSV and preview the first ten rows
file_path = "Resources/new_iris_data.csv"
iris_df = pd.read_csv(file_path)
iris_df.head(10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [23]:
# Initializing model with K = 3 (since we already know there are three classes of iris plants)
# random_state is pinned to a fixed seed (5)
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [26]:
# Fitting model on every column iris_df currently holds
model.fit(iris_df)

KMeans(n_clusters=3, random_state=5)

In [28]:
# Assign each row of iris_df to one of the fitted clusters and print the labels
predictions = model.predict(iris_df)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [29]:
# Add a new "class" column to iris_df holding the fitted cluster label of each row
iris_df["class"] = model.labels_
iris_df.head()


Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width,class
0,5.1,1.4,3.5,0.2,1
1,4.9,1.4,3.0,0.2,1
2,4.7,1.3,3.2,0.2,1
3,4.6,1.5,3.1,0.2,1
4,5.0,1.4,3.6,0.2,1


# Visualizing Iris Results

In [32]:
# Create a scatterplot of iris_df: sepal length vs. sepal width, grouped by "class"
iris_df.hvplot.scatter(x="sepal_length", y="sepal_width", by="class")


In [36]:
# Plotting the clusters in 3D: three features on the axes, sepal_width as the
# marker size, and "class" driving both colour and symbol
fig = px.scatter_3d(
    iris_df, 
    x="petal_width", 
    y="sepal_length", 
    z="petal_length", 
    color="class", 
    symbol="class", 
    size="sepal_width",
    width=800)
# Position the legend at x=0, y=1
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

# Trial and Error of Finding Centroids

In [38]:
# Load the cleaned shopping data and preview the first ten rows
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [40]:
# Scatter of annual income against spending score, without any cluster grouping
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)")

In [42]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(df)

    # Add a new class column to df_iris
    df["class"] = model.labels_

In [44]:
# Cluster the shopping data into 2 groups, then plot income vs. spending score grouped by "class"
test_cluster_amount(df_shopping, 2)
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [46]:
# 3D view of the labels currently stored in df_shopping["class"]:
# income, spending score and age on the axes
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [48]:
# Re-cluster the shopping data with 3 clusters
test_cluster_amount(df_shopping, 3)

# 3D view of the clusters, rendered explicitly through fig.show()
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# 2D view of the same clusters. A notebook cell only auto-displays its final
# expression, so the hvPlot scatter must come last; anywhere earlier in the
# cell its result would be discarded and never shown.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [50]:
# Re-cluster the shopping data with 4 clusters
test_cluster_amount(df_shopping, 4)

# 3D view of the clusters, rendered explicitly through fig.show()
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# 2D view of the same clusters. A notebook cell only auto-displays its final
# expression, so the hvPlot scatter must come last; anywhere earlier in the
# cell its result would be discarded and never shown.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [51]:
# Re-cluster the shopping data with 5 clusters
test_cluster_amount(df_shopping, 5)

# 3D view of the clusters, rendered explicitly through fig.show()
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# 2D view of the same clusters. A notebook cell only auto-displays its final
# expression, so the hvPlot scatter must come last; anywhere earlier in the
# cell its result would be discarded and never shown.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [52]:
# Re-cluster the shopping data with 6 clusters
test_cluster_amount(df_shopping, 6)

# 3D view of the clusters, rendered explicitly through fig.show()
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# 2D view of the same clusters. A notebook cell only auto-displays its final
# expression, so the hvPlot scatter must come last; anywhere earlier in the
# cell its result would be discarded and never shown.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [53]:
# Re-cluster the shopping data with 7 clusters
test_cluster_amount(df_shopping, 7)

# 3D view of the clusters, rendered explicitly through fig.show()
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# 2D view of the same clusters. A notebook cell only auto-displays its final
# expression, so the hvPlot scatter must come last; anywhere earlier in the
# cell its result would be discarded and never shown.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

# Elbow Curve (using Iris data)

In [55]:
# Loading the iris CSV into a fresh DataFrame and previewing the first ten rows
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file_path)

df_iris.head(10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [57]:
# Empty list for holding inertia values, plus the candidate K values 1 through 10
inertia = []
k = list(range(1, 11))

In [58]:
# Looking for the best K
# Rebuild the list from scratch so that re-running this cell cannot append a
# second batch of values and leave `inertia` longer than `k`.
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [59]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
# (one row per candidate k; every k value is used as an x-axis tick)
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

# Use the Elbow Curve (shopping data)

In [60]:
# Reload the cleaned shopping data from disk and preview the first ten rows
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [62]:
# Start from an empty inertia list and the candidate K values 1 through 10
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_shopping)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [67]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
# (one row per candidate k; every k value is used as an x-axis tick)
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks = k, title="Elbow Curve")

In [68]:
def get_clusters(k, data):
    """Return a copy of `data` with a K-Means cluster label column added.

    Parameters
    ----------
    k : int
        Number of clusters, passed to KMeans(n_clusters=...).
    data : pandas.DataFrame
        Table to cluster; all of its columns are used as features. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with an extra "class" column holding the fitted
        cluster label of each row.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    data = data.copy()

    # Initialize and fit the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(data)

    # labels_ already holds the cluster of every fitted row, so a separate
    # predict() pass over the same data is unnecessary
    data["class"] = model.labels_

    return data


In [69]:
# Cluster a copy of the shopping data into 5 groups and preview the labelled rows
five_cluster = get_clusters(5,df_shopping)
five_cluster.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100),class
0,1,19.0,15.0,39.0,0
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,0
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,0


In [71]:
# Cluster a copy of the shopping data into 6 groups and preview the labelled rows
six_cluster = get_clusters(6,df_shopping)
six_cluster.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100),class
0,1,19.0,15.0,39.0,5
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,5
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,5


In [72]:
# Plotting the 2D-Scatter of the five-cluster result: income vs. spending score, grouped by "class"
five_cluster.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)",by='class')

In [74]:
# Plot the 3D-scatter of the five-cluster result with
# x="Age", y="SpendingScore(1-100)" and z="AnnualIncome"
fig = px.scatter_3d(
    five_cluster,
    x="Age",
    y="SpendingScore(1-100)",
    z="AnnualIncome",
    color="class",
    symbol="class",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [75]:
# Plotting the 2D-Scatter of the six-cluster result: income vs. spending score, grouped by "class"
six_cluster.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)",by='class')

In [76]:
# Plot the 3D-scatter of the six-cluster result with
# x="Age", y="SpendingScore(1-100)" and z="AnnualIncome"
fig = px.scatter_3d(
    six_cluster,
    x="Age",
    y="SpendingScore(1-100)",
    z="AnnualIncome",
    color="class",
    symbol="class",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend=dict(x=0, y=1))
fig.show()