In [127]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
import hvplot.pandas
# used when creating a dendrogram
import plotly.figure_factory as ff

# CLEAN IRIS DATA

In [128]:
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [129]:
new_iris_df = iris_df.drop(['class'], axis=1)
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [130]:
new_iris_df = new_iris_df[["sepal_length", "petal_length", "sepal_width", "petal_width"]]
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [131]:
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(output_file_path, index=False)

# CLEAN SHOPPING DATA

In [132]:
file_path = "Resources/shopping_data.csv"
shopping_df = pd.read_csv(file_path, encoding="ISO-8859-1")
shopping_df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [133]:
# Columns
shopping_df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [134]:
# List dataframe datatypes
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [135]:
# Report how many missing values each column contains
null_counts = shopping_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f'column {column} has {n_missing} null values')

column CustomerID has 0 null values
column Card Member has 2 null values
column Age has 2 null values
column Annual Income has 0 null values
column Spending Score (1-100) has 1 null values


In [136]:
# Drop null rows
shopping_df = shopping_df.dropna()


In [137]:
# Find duplicates
print (f'Duplicate entries: {shopping_df.duplicated().sum()}')

Duplicate entries: 0


In [138]:
# Removing customer id column
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError because the
# column is already gone.
shopping_df = shopping_df.drop(columns=['CustomerID'], errors='ignore')
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [139]:
# transform string column
def change_string(member):
    """Encode a card-membership answer as an integer flag.

    Returns 1 when ``member`` equals "Yes" and 0 for any other value.
    """
    return 1 if member == "Yes" else 0
    
shopping_df['Card Member'] = shopping_df['Card Member'].apply(change_string)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [140]:
# Transform annual income: divide the values by 1000
# NOTE: the column is overwritten in place, so running this cell a second time
# divides the already-rescaled values by 1000 again.
shopping_df['Annual Income'] = shopping_df['Annual Income']/1000
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [141]:
# Give the shopping_df columns space-free names (for ease of use)
compact_names = {
    'Card Member': 'CardMember',
    'Annual Income': 'AnnualIncome',
    'Spending Score (1-100)': 'SpendingScore(1-100)',
}
shopping_df = shopping_df.rename(columns=compact_names)
shopping_df.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [142]:
# Saving cleaned data to CSV file format
file_path = "Resources/shopping_data_cleaned.csv"
shopping_df.to_csv(file_path, index=False)


# ANALYSIS WORK ON IRIS DATA

In [143]:
file_path = "Resources/new_iris_data.csv"
iris_df = pd.read_csv(file_path)
iris_df.head(10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [144]:
# Initializing model with K = 3 (since we already know there are three classes of iris plants)
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [145]:
# Fitting model
model.fit(iris_df)

KMeans(n_clusters=3, random_state=5)

In [146]:
predictions = model.predict(iris_df)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [147]:
# Add a new class column to the iris_df
iris_df["class"] = model.labels_
iris_df.head()


Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width,class
0,5.1,1.4,3.5,0.2,1
1,4.9,1.4,3.0,0.2,1
2,4.7,1.3,3.2,0.2,1
3,4.6,1.5,3.1,0.2,1
4,5.0,1.4,3.6,0.2,1


# Visualizing Iris Results

In [148]:
# Create a scatterplot of iris_df
iris_df.hvplot.scatter(x="sepal_length", y="sepal_width", by="class")


In [149]:
# Plotting the clusters with three features
fig = px.scatter_3d(
    iris_df, 
    x="petal_width", 
    y="sepal_length", 
    z="petal_length", 
    color="class", 
    symbol="class", 
    size="sepal_width",
    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

# Trial and Error of Finding Centroids

In [150]:
# Load data
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [151]:
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)")

In [152]:
# Function to cluster a dataset (plotting is done by the calling cells)
def test_cluster_amount(df, clusters):
    """Cluster ``df`` with K-Means and store the labels in ``df["class"]``.

    The DataFrame is modified in place and nothing is returned.

    Parameters
    ----------
    df : DataFrame
        Data to cluster; gains (or has overwritten) a "class" column.
    clusters : int
        Number of clusters, passed to ``KMeans(n_clusters=...)``.
    """
    # A "class" column left behind by an earlier call holds the previous
    # cluster labels. Exclude it so those labels are not fed back in as a
    # feature when the function is called again on the same DataFrame.
    features = df.drop(columns=["class"], errors="ignore")

    # Fitting model
    model = KMeans(n_clusters=clusters, random_state=5)
    model.fit(features)

    # Add (or overwrite) the class column on the caller's DataFrame
    df["class"] = model.labels_

In [153]:
test_cluster_amount(df_shopping, 2)
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [154]:
fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [155]:
test_cluster_amount(df_shopping, 3)

fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Keep the 2D scatter as the cell's final expression: a notebook only renders
# a bare expression when it is the last statement, so with the 3D figure code
# after it the 2D plot was never displayed.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [156]:
test_cluster_amount(df_shopping, 4)

fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Keep the 2D scatter as the cell's final expression: a notebook only renders
# a bare expression when it is the last statement, so with the 3D figure code
# after it the 2D plot was never displayed.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [157]:
test_cluster_amount(df_shopping, 5)

fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Keep the 2D scatter as the cell's final expression: a notebook only renders
# a bare expression when it is the last statement, so with the 3D figure code
# after it the 2D plot was never displayed.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [158]:
test_cluster_amount(df_shopping, 6)

fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Keep the 2D scatter as the cell's final expression: a notebook only renders
# a bare expression when it is the last statement, so with the 3D figure code
# after it the 2D plot was never displayed.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

In [159]:
test_cluster_amount(df_shopping, 7)

fig = px.scatter_3d(
    df_shopping,
    x="AnnualIncome",
    y="SpendingScore(1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Keep the 2D scatter as the cell's final expression: a notebook only renders
# a bare expression when it is the last statement, so with the 3D figure code
# after it the 2D plot was never displayed.
df_shopping.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)", by="class")

# Elbow Curve (using Iris data)

In [160]:
# Loading data
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file_path)

df_iris.head(10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [161]:
# list for holding inertia values
inertia = []
k = list(range(1, 11))

In [162]:
# Looking for the best K
# Start from an empty list inside this cell so that re-running it does not
# append a second set of values to `inertia` (which would then no longer line
# up one-to-one with `k` when the elbow DataFrame is built).
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [163]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

# Use the Elbow Curve (shopping data)

In [164]:
# Load data
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [165]:
# Candidate cluster counts: 1 through 10
k = list(range(1, 11))

# Fit one K-Means model per candidate K and record its inertia
inertia = []
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(df_shopping)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [166]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks = k, title="Elbow Curve")

In [167]:
def get_clusters(k, data):
    """Return a copy of ``data`` with a K-Means cluster label column added.

    Parameters
    ----------
    k : int
        Number of clusters, passed to ``KMeans(n_clusters=...)``.
    data : DataFrame
        Data to cluster. It is copied, so the caller's DataFrame is not
        modified.

    Returns
    -------
    DataFrame
        The copy of ``data`` with the fitted labels stored in a "class" column.
    """
    # Work on a copy of the DataFrame
    clustered = data.copy()

    # If the input already carries a "class" column (e.g. labels from an
    # earlier clustering run), leave it out of the features; it is overwritten
    # below anyway.
    features = clustered.drop(columns=["class"], errors="ignore")

    # fit() already assigns every row to a cluster (labels_), so the separate
    # predict() call whose result was never used has been dropped.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    clustered["class"] = model.labels_
    return clustered


In [168]:
five_cluster = get_clusters(5,df_shopping)
five_cluster.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100),class
0,1,19.0,15.0,39.0,0
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,0
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,0


In [169]:
six_cluster = get_clusters(6,df_shopping)
six_cluster.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore(1-100),class
0,1,19.0,15.0,39.0,5
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,5
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,5


In [170]:
# Plotting the 2D-Scatter
five_cluster.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)",by='class')

In [171]:
# Plot the 3D-scatter with x="Age", y="SpendingScore(1-100)" and z="AnnualIncome"
fig = px.scatter_3d(
    five_cluster,
    x="Age",
    y="SpendingScore(1-100)",
    z="AnnualIncome",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [172]:
# Plotting the 2D-Scatter
six_cluster.hvplot.scatter(x="AnnualIncome", y="SpendingScore(1-100)",by='class')

In [173]:
# Plot the 3D-scatter with x="Age", y="SpendingScore(1-100)" and z="AnnualIncome"
fig = px.scatter_3d(
    six_cluster,
    x="Age",
    y="SpendingScore(1-100)",
    z="AnnualIncome",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# PCA - Principal Component Analysis

In [174]:
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file_path)
df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [175]:
# Standardize data with StandardScaler
iris_scaled = StandardScaler().fit_transform(df_iris)
print(iris_scaled[0:5])

[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


In [176]:
# Initialize PCA model
pca = PCA(n_components=2)

In [177]:
# Get two principal components for the iris data.
iris_pca = pca.fit_transform(iris_scaled)

iris_pca

array([[-2.26454173e+00,  5.05703903e-01],
       [-2.08642550e+00, -6.55404729e-01],
       [-2.36795045e+00, -3.18477311e-01],
       [-2.30419716e+00, -5.75367713e-01],
       [-2.38877749e+00,  6.74767397e-01],
       [-2.07053681e+00,  1.51854856e+00],
       [-2.44571134e+00,  7.45626750e-02],
       [-2.23384186e+00,  2.47613932e-01],
       [-2.34195768e+00, -1.09514636e+00],
       [-2.18867576e+00, -4.48629048e-01],
       [-2.16348656e+00,  1.07059558e+00],
       [-2.32737775e+00,  1.58587455e-01],
       [-2.22408272e+00, -7.09118158e-01],
       [-2.63971626e+00, -9.38281982e-01],
       [-2.19229151e+00,  1.88997851e+00],
       [-2.25146521e+00,  2.72237108e+00],
       [-2.20275048e+00,  1.51375028e+00],
       [-2.19017916e+00,  5.14304308e-01],
       [-1.89407429e+00,  1.43111071e+00],
       [-2.33994907e+00,  1.15803343e+00],
       [-1.91455639e+00,  4.30465163e-01],
       [-2.20464540e+00,  9.52457317e-01],
       [-2.77416979e+00,  4.89517027e-01],
       [-1.

In [178]:
# Transform PCA data to a DataFrame
df_iris_pca = pd.DataFrame(data=iris_pca, columns=["principal component 1", "principal component 2"])
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [179]:
# Use explained_variance_ratio to learn how much information can be attributed to each principal component
pca.explained_variance_ratio_
# NOTE: What this tells us, is that the first principal component contains 72.77%
#                            of the variance 
#     and the second contains 23.03%. 
#     Together, they contain 95.80% of the information.

array([0.72770452, 0.23030523])

In [180]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [181]:
# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model on the two principal-component columns only, so that
# re-running this cell after "class" has been added does not cluster on the
# previous labels. Fitting already assigns each row to a cluster (labels_),
# so the separate predict() call whose result was never used is dropped.
model.fit(df_iris_pca[["principal component 1", "principal component 2"]])

# Add the predicted class column
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [182]:
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

# Running Hierarchical Clustering

In [183]:
# Starting back at getting PCA so we can run hierarchical clustering.
# Bare expressions in the middle of a cell are not rendered by the notebook,
# so the intermediate previews that used to sit between these steps have been
# removed; the printed rows and the final expression are unchanged.
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file_path)

# Standardize data with StandardScaler
iris_scaled = StandardScaler().fit_transform(df_iris)
print(iris_scaled[0:5])

# Initialize PCA model
pca = PCA(n_components=2)

# Get two principal components for the iris data.
iris_pca = pca.fit_transform(iris_scaled)

# Transform PCA data to a DataFrame
df_iris_pca = pd.DataFrame(data=iris_pca, columns=["principal component 1", "principal component 2"])

# Use explained_variance_ratio to learn how much information can be attributed to each principal component
pca.explained_variance_ratio_


[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


array([0.72770452, 0.23030523])

In [184]:
# Create the dendrogram
fig = ff.create_dendrogram(df_iris_pca, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()


In [185]:
# Run the hierarchical algorithm
# NOTE: Agglomerative clustering is another name for hierarchical clustering.
# Fit on the two principal-component columns only, so that re-running this
# cell after the "class" column has been added to df_iris_pca does not cluster
# on the previous labels.
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(df_iris_pca[["principal component 1", "principal component 2"]])

In [186]:
# Add the predicted class column to df_iris_pca
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [187]:
# plot to show the results of the hierarchical clustering algorithm
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

NOTE: For this exercise, there isn't much difference between the K-means and hierarchical clustering results.


# K-means vs. Hierarchical Clustering
Hierarchical clustering seems like a fairly interesting idea, but you wonder what the differences are between K-means and hierarchical.
### The K-means algorithm is the main algorithm we used in this module. 

It is easy, runs relatively quickly, and can scale to large datasets.
This is not to say there aren't drawbacks to the K-means algorithm.
Behind the scenes, K-means is dependent on random initialization, so the outcome depends on a random seed.
With K-means, you need to have an idea of how many clusters you're looking for ahead of time, which might not always be known.
This can be an issue when the points of data are not so clearly grouped into clusters, as 
K-means works best for spherical-looking data with similar density points closely grouped together.

### With hierarchical clustering and the use of dendrograms, 
...it's easier to pick how many clusters we want without making any assumptions, since a K value does not need to be known ahead of time. However, the dendrogram might not always present as clear a choice as we would like, and it leaves the final decision up to the analyst.
With the iris dataset, we knew the K value ahead of time, so using K-means in that situation would make more sense. Hierarchical clustering might not work as well on larger datasets because it is slower at run time, and there are a lot of decisions to be made about when to merge groups of clusters.

### NOTE: 
Both clustering algorithms have their pros and cons. See the No Free Lunch theorem (https://en.wikipedia.org/wiki/No_free_lunch_theorem): no single algorithm is best for every problem, so there will always be cases where one algorithm outperforms the other, and vice versa.

