# K-means clustering
Create clusters (K-means) and visualize them.

Description of data https://www.kaggle.com/datasets/crawford/80-cereals?resource=download

In [None]:
import os

import sklearn.cluster as cluster

import pandas as pd
import plotly.express as px

%load_ext nb_black

In [None]:
# Folder (relative path) that the input CSV file is read from.
DATA_FOLDER = "data"

In [None]:
# Read the cereal dataset from the data folder into a DataFrame.
csv_path = os.path.join(DATA_FOLDER, "data_cereal.csv")
df_all = pd.read_csv(csv_path)

## Content:
* Name: Name of cereal
* mfr: Manufacturer of cereal
    * A = American Home Food Products; G = General Mills; K = Kelloggs; N = Nabisco; P = Post; Q = Quaker Oats; R = Ralston Purina;
* type:
    * cold; hot
* calories: calories per serving
* protein: grams of protein
* fat: grams of fat
* sodium: milligrams of sodium
* fiber: grams of dietary fiber
* carbo: grams of complex carbohydrates
* sugars: grams of sugars
* potass: milligrams of potassium
* vitamins: vitamins and minerals - 0, 25, or 100, indicating the typical percentage of FDA recommended
* shelf: display shelf (1, 2, or 3, counting from the floor)
* weight: weight in ounces of one serving
* cups: number of cups in one serving
* rating: a rating of the cereals (Possibly from Consumer Reports?)

In [None]:
# Preview the first rows of the dataset.
df_all.head()

In [None]:
# Print a per-column summary (dtype and non-null count) of the dataset.
df_all.info()

In [None]:
# Descriptive statistics of the dataset columns.
df_all.describe()

In [None]:
# Columns left out of the plots below; everything else goes into matrix_list.
delete_list = ["name", "mfr", "type"]
matrix_list = [column for column in df_all.columns if column not in delete_list]

In [None]:
# Pairwise scatter plots of all columns kept in matrix_list.
fig = px.scatter_matrix(data_frame=df_all, dimensions=matrix_list)
fig.show()

In [None]:
# One scatter plot per column in matrix_list, each plotted against the rating.
for column_name in matrix_list:
    fig = px.scatter(data_frame=df_all, x="rating", y=column_name)
    fig.show()

## Intro:
* minimization of inertia (within-cluster sum-of-squares criterion)
$$ \sum_{i=0}^{n} \min_{\mu_j \in C}(\|x_i-\mu_j\|^2) $$
* initialization of clusters:
    * random initial centroids 
    * k-means++ - distant initial centroids
* drawback
    * algo assumes convex, isotropic clusters, works poorly on irregular shapes of clusters
    * “curse of dimensionality” - inertia is not a normalized metric; Euclidean distances tend to become inflated in high-dimensional spaces $ \Rightarrow $ reduce the dimensionality first (e.g. with PCA)

## Trying out K-means clustering

In [None]:
from sklearn.preprocessing import scale

In [None]:
# Two independent working copies of the loaded data.
df_cluster, df_stnd = df_all.copy(), df_all.copy()

In [None]:
# Standardize every int64/float64 column of df_stnd in place;
# columns of any other dtype are left untouched.
numeric_dtypes = ["int64", "float64"]
num_cols = [
    name for name, dtype in df_stnd.dtypes.items() if dtype in numeric_dtypes
]

df_stnd[num_cols] = scale(df_stnd[num_cols])

In [None]:
# Same two features before and after standardization, for visual comparison.
for frame, plot_title in ((df_cluster, "Raw data"), (df_stnd, "Standardized data")):
    fig = px.scatter(frame, x="sugars", y="sodium", title=plot_title)
    fig.show()

In [None]:
# Apply K-means clustering with 3 clusters on the standardized features.
# A fixed seed keeps the cluster assignment reproducible between runs.
km = cluster.KMeans(n_clusters=3, random_state=0)
km = km.fit(df_stnd[["sugars", "sodium"]])

# Labels are stored as strings so plotly treats them as discrete categories.
df_stnd["cluster"] = km.labels_.astype("str")

# Mirror the labels onto the raw-data frame. Plain assignment creates the
# column on the first run and overwrites it on re-runs, so no existence or
# equality check is needed. (The previous check used `~` on a Python bool,
# which yields -1 or -2 and is therefore always truthy.)
df_cluster["cluster"] = df_stnd["cluster"]

In [None]:
# Standardized sugars vs. sodium, colored by the assigned cluster label.
fig = px.scatter(data_frame=df_stnd, x="sugars", y="sodium", color="cluster")
fig.show()

## Evaluation of clustering
* run K-means clustering with n_clusters = 1-10
* evaluate each clustering
* choose the n_clusters with best performance

In [None]:
# Fit K-means for every cluster count from 1 to max_clusters on the two
# standardized features and keep inertia, labels and centroids of each fit.
max_clusters = 10
inertia = []
# Positional list: centroids_list[j] holds the centroids for n_clusters_list[j].
centroids_list = []
n_clusters_list = list(range(1, max_clusters + 1))

for n in n_clusters_list:
    # Fixed seed so inertia, labels and centroids are reproducible across runs.
    km = cluster.KMeans(n_clusters=n, random_state=0)
    km = km.fit(df_stnd[["sugars", "sodium"]])

    print(f"Number of clusters {n}, Inertia: {km.inertia_}")
    inertia.append(km.inertia_)

    # Store the labels as strings so plotly colors them as discrete categories.
    labels_col = f"labels_n={n}"
    df_stnd[labels_col] = km.labels_.astype("str")

    centroids_list.append(km.cluster_centers_)

    scat = px.scatter(df_stnd, x="sugars", y="sodium", color=labels_col)
    scat.show()

### Elbow evaluation method

In [None]:
# Collect the inertia of every fit in a table indexed by the number of clusters.
df_eval = pd.DataFrame(
    {"inertia": inertia},
    index=pd.Index(n_clusters_list, name="clusters"),
)

# Relative drop in inertia compared with the previous row (one cluster fewer).
previous_inertia = df_eval["inertia"].shift()
df_eval["improvement"] = (previous_inertia - df_eval["inertia"]) / previous_inertia

display(df_eval)

# Elbow plot: inertia against the number of clusters.
scat = px.scatter(df_eval, x=df_eval.index, y="inertia")
scat.show()

# Relative improvement against the number of clusters.
impro = px.scatter(df_eval, x=df_eval.index, y="improvement")
impro.show()

### Silhouette Analysis
Helps to determine degree of separation between the clusters. For each sample:
* Compute the average distance from all data points in the same cluster (ai).
* Compute the average distance from all data points in the closest cluster (bi).
* Compute the silhouette coefficient:
$$ \frac{b_i-a_i}{\max(a_i,b_i)} $$

The coefficient can take values in the interval [-1, 1]. It is:
* 0 –> the sample is very close to the neighboring clusters.
* 1 –> the sample is far away from the neighboring clusters.
* -1 –> the sample is assigned to the wrong cluster.

A well-clustered dataset has an average silhouette score well above 0.5.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import silhouette_samples

In [None]:
# Silhouette analysis for a subset of the cluster counts fitted earlier.
# For each k: a silhouette plot (left) and the labeled scatter plot with
# the cluster centroids (right).

average_scores = []
san = [2, 3, 4, 5]

# Centroids matching each k in san. centroids_list is ordered like
# n_clusters_list (which starts at 1 cluster), so the position is looked up
# by value; slicing from index 0 would pair each k with the centroids of k - 1.
san_centroids_list = [centroids_list[n_clusters_list.index(k)] for k in san]

for k, centroids in zip(san, san_centroids_list):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Load the labels stored for the current n_clusters
    labels_col = f"labels_n={k}"
    labels = df_stnd[labels_col]

    # Get the per-sample silhouette coefficients
    silhouette_vals = silhouette_samples(df_stnd[["sugars", "sodium"]], labels)

    # Silhouette plot: one band of sorted horizontal bars per cluster,
    # stacked on top of each other along the y axis
    y_lower, y_upper = 0, 0

    for cluster_pos, clstr in enumerate(np.unique(labels)):
        in_cluster = (labels == clstr).to_numpy()
        cluster_silhouette_vals = np.sort(silhouette_vals[in_cluster])
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(
            range(y_lower, y_upper), cluster_silhouette_vals, edgecolor="none", height=1
        )
        # Annotate the band with a 1-based cluster number
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(cluster_pos + 1))
        y_lower += len(cluster_silhouette_vals)

    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    average_scores.append(avg_score)

    ax1.axvline(avg_score, linestyle="--", linewidth=2, color="green")
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel("Silhouette coefficient values")
    ax1.set_ylabel("Cluster labels")
    ax1.set_title("Silhouette plot for the various clusters", y=1.02)

    # Scatter plot of data colored with labels
    ax2.scatter(
        df_stnd["sugars"],
        df_stnd["sodium"],
        c=df_stnd[labels_col].values.astype(int),
    )
    ax2.scatter(centroids[:, 0], centroids[:, 1], marker="*", c="r", s=250)
    # Same limits on both axes (the y limit was previously never set because
    # set_xlim was called twice)
    ax2.set_xlim([-2, 2])
    ax2.set_ylim([-2, 2])
    ax2.set_xlabel("Sugars")
    ax2.set_ylabel("Sodium")
    ax2.set_title("Sugars-sodium clusters", y=1.02)
    ax2.set_aspect("equal")
    plt.tight_layout()
    plt.suptitle(
        f"Silhouette analysis using k = {k}, silhouette average score = {avg_score:.3f}, inertia = {df_eval['inertia'].loc[k]:.1f}",
        fontsize=16,
        fontweight="semibold",
        y=1.05,
    )

In [None]:
# Add the average silhouette score of each analysed k to the evaluation table.
# Column assignment aligns the Series on the table's index (rows whose cluster
# count is not in `san` get NaN) and overwrites the column on re-runs, whereas
# concatenating would append a duplicate column every time the cell is run.
df_eval["silhouette avg score"] = pd.Series(average_scores, index=san)

df_eval

# Check K-means method for evaluation of the clustering

## Cluster the data [Juraj]
* Using K-Means (Univariate, Multivariate)
* Evaluate quality of the clustering (inertia)
* Visualize the clustering