Create clusters (K-means) and visualize them.

Description of data https://www.kaggle.com/datasets/crawford/80-cereals?resource=download

In [None]:
import os

import sklearn.cluster as cluster

import pandas as pd
import plotly.express as px

%load_ext nb_black

In [None]:
# Folder that holds the input data files; it is joined with the CSV file name
# when the dataset is loaded.
DATA_FOLDER = "data"

In [None]:
# Load the cereal dataset from DATA_FOLDER into a single DataFrame.
cereal_csv_path = os.path.join(DATA_FOLDER, "data_cereal.csv")
df_all = pd.read_csv(cereal_csv_path)

## Content:
* name: name of the cereal
* mfr: Manufacturer of cereal
    * A = American Home Food Products; G = General Mills; K = Kelloggs; N = Nabisco; P = Post; Q = Quaker Oats; R = Ralston Purina;
* type:
    * cold; hot
* calories: calories per serving
* protein: grams of protein
* fat: grams of fat
* sodium: milligrams of sodium
* fiber: grams of dietary fiber
* carbo: grams of complex carbohydrates
* sugars: grams of sugars
* potass: milligrams of potassium
* vitamins: vitamins and minerals - 0, 25, or 100, indicating the typical percentage of the FDA-recommended daily intake
* shelf: display shelf (1, 2, or 3, counting from the floor)
* weight: weight in ounces of one serving
* cups: number of cups in one serving
* rating: a rating of the cereals (Possibly from Consumer Reports?)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df_all.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded data.
df_all.info()

In [None]:
# Summary statistics of the loaded columns, as a first look at their ranges.
df_all.describe()

In [None]:
# Descriptive columns (cereal name, manufacturer code, hot/cold type) that are
# left out of the scatter matrix.
delete_list = ["name", "mfr", "type"]
# Every remaining column, in its original order, becomes one matrix dimension.
matrix_list = [column for column in df_all.columns if column not in delete_list]

In [None]:
# Pairwise scatter plots of all columns in matrix_list, used to look for
# variable pairs worth clustering on.
fig = px.scatter_matrix(df_all, dimensions=matrix_list)
fig.show()

In [None]:
# for col in matrix_list:
#     fig = px.scatter(df_all, x="rating", y=col)
#     fig.show()

In [None]:
# Split the cereals into two clusters based on sugar content and rating.
# random_state pins the centroid initialisation so the labels (and therefore
# the colours in the scatter plot) are the same on every run; n_init is spelled
# out because its default differs between scikit-learn releases.
# NOTE(review): the two features are clustered unscaled, so the one with the
# larger numeric spread will dominate the distance — consider standardising.
kmeans = cluster.KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans = kmeans.fit(df_all[["sugars", "rating"]])

In [None]:
# kmeans.cluster_centers_

In [None]:
# Store the fitted cluster label of every row as a string column
# (presumably so Plotly colours the clusters as discrete categories rather
# than on a continuous scale — confirm).
df_all["Cluster"] = kmeans.labels_.astype("str")

In [None]:
# df_all.info()

In [None]:
# df_all["Cluster"].value_counts()

In [None]:
# Plot sugars against rating, coloured by the K-means label stored in the
# "Cluster" column.
fig = px.scatter(
    df_all,
    x="rating",
    y="sugars",
    color="Cluster",
)
fig.show()

# Intro:
* minimization of inertia (within-cluster sum-of-squares criterion)
$$ \sum_{i=1}^{n} \min_{\mu_j \in C}(\|x_i-\mu_j\|^2) $$
* initialization of centroids:
    * random initial centroids 
    * k-means++ - distant initial centroids
* drawbacks
    * the algorithm assumes convex, isotropic clusters and works poorly on elongated or irregularly shaped clusters
    * “curse of dimensionality” - inertia is not a normalized metric; in high-dimensional spaces Euclidean distances tend to become inflated $ \Rightarrow $ reduce the dimensionality first (e.g. with PCA)

## Evaluate univariate clustering
* find a method for clustering evaluation

In [None]:
# Work on a separate copy so the univariate experiments do not touch df_all.
# The constant "zero" column is the y coordinate used to draw the ratings
# along a single horizontal line in the scatter plots.
df_cluster = df_all.assign(zero=0)

In [None]:
# Elbow analysis on the single "rating" variable: fit K-means for each k,
# record the inertia and plot the resulting clusters.
clusters = 10  # exclusive upper bound: k runs from 1 to clusters - 1
inertia = []

for n in range(1, clusters):
    # A fixed random_state makes the inertia values comparable between runs;
    # n_init is explicit because its default differs between scikit-learn
    # releases.
    kmeans_single = cluster.KMeans(n_clusters=n, n_init=10, random_state=0)
    kmeans_single = kmeans_single.fit(df_cluster[["rating"]])

    print(f"Number of clusters {n}, Inertia: {kmeans_single.inertia_}")
    inertia.append(kmeans_single.inertia_)

    # Overwritten on every iteration; only used for the two plots below.
    df_cluster["cluster"] = kmeans_single.labels_.astype("str")

    # Distribution of ratings, split by cluster.
    fig = px.histogram(df_cluster, x="rating", nbins=20, color="cluster")
    fig.show()

    # The same clusters drawn as points along one line (y is the "zero" column).
    scat = px.scatter(df_cluster, x="rating", y="zero", color="cluster")
    scat.show()

In [None]:
# Tabulate the inertia measured for each number of clusters.
# Building the frame from a column dict keeps "clusters" as integers; the
# row-wise construction followed by a transpose upcasts it to float.
df_inertia = pd.DataFrame(
    {"clusters": list(range(1, clusters)), "inertia": inertia}
)

# Relative drop in inertia compared with the previous row (one cluster fewer);
# the first row has no predecessor and is therefore NaN.
df_inertia["improvement"] = (
    -df_inertia["inertia"].diff() / df_inertia["inertia"].shift()
)

df_inertia

In [None]:
# Elbow plot: inertia as a function of the number of clusters.
scat = px.scatter(df_inertia, x="clusters", y="inertia")
scat.show()

# Relative improvement in inertia gained by each additional cluster.
impro = px.scatter(df_inertia, x="clusters", y="improvement")
impro.show()

In [None]:
# Refit the univariate model with the chosen number of clusters.
n_clusters = 6

# Fit on df_cluster, the same frame that receives the labels below, and fix
# random_state so the cluster numbering is reproducible between runs. n_init
# is explicit because its default differs between scikit-learn releases.
kmeans_single = cluster.KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans_single = kmeans_single.fit(df_cluster[["rating"]])

print(f"Number of clusters {n_clusters}, Inertia: {kmeans_single.inertia_}")

df_cluster["cluster"] = kmeans_single.labels_.astype("str")

# Distribution of ratings, split by cluster.
fig = px.histogram(df_cluster, x="rating", nbins=20, color="cluster")
fig.show()

# The same clusters drawn as points along one line (y is the "zero" column).
scat = px.scatter(df_cluster, x="rating", y="zero", color="cluster")
scat.show()

In [None]:
# Show the summary statistics of each cluster separately.
for cluster_label, cluster_rows in df_cluster.groupby("cluster"):
    display(f"Cluster #{cluster_label}")
    display(cluster_rows.describe())

# Check K-means method for evaluation of the clustering

## Cluster the data [Juraj]
* Using K-Means (Univariate, Multivariate)
* Evaluate quality of the clustering (inertia)
* Visualize the clustering