In [1]:

# Import necessary libraries
import pandas as pd  # pandas offers data structures and operations for manipulating numerical tables and time series.
import numpy as np  # numpy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
from kmodes.kmodes import KModes  # KModes is used for clustering categorical variables.


In [2]:

# Create the data we need
# Define arrays for hair, eye, and skin colors to simulate categorical data of individuals.
hair_color = np.array(['blonde', 'brunette', 'red', 'black', 'brunette', 'black', 'red', 'black'])
eye_color = np.array(['amber', 'gray', 'green', 'hazel', 'amber', 'gray', 'green', 'hazel'])
skin_color = np.array(['fair', 'brown', 'brown', 'brown', 'fair', 'brown', 'fair', 'fair'])
person = ['P1','P2','P3','P4','P5','P6','P7','P8']  # Identifiers for each person.

# Combine the arrays into a pandas DataFrame for easy manipulation. Each row represents an individual's attributes.
data = pd.DataFrame({'person':person, 'hair_color':hair_color, 'eye_color':eye_color, 'skin_color':skin_color})

# Set the 'person' column as the index of the DataFrame to uniquely identify each row.
data = data.set_index('person')  # Our data is indexed on ‘person’ attribute, making it easier to reference individuals.


In [4]:

# Building the model with 3 clusters
# Initialize a KModes instance with 3 clusters, using the "random" initialization method. 
# 'n_init' specifies the number of initialization attempts. This is useful because k-modes is sensitive to initial placement of centroids.
# 'verbose=1' enables verbose output during the fitting process, useful for learning or debugging.
kmode = KModes(n_clusters=3, init = "random", n_init = 5, verbose=1)

# Fit the model to the dataset and predict the clusters for each observation. 
# This step actually performs the clustering by finding centroids and assigning data points to the nearest cluster.
clusters = kmode.fit_predict(data)

# Insert the cluster labels into the original data frame. 
# 'clusters' contains the cluster assignment for each individual, and we're adding this as a new column to the dataframe.
# The 'True' argument in 'insert' method specifies that if the column "Cluster" already exists, it will be replaced.
data.insert(0, "Cluster", clusters, True)

data

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 2, cost: 6.0
Run 1, iteration: 2/100, moves: 0, cost: 6.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 1, cost: 9.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 2, cost: 7.0
Run 3, iteration: 2/100, moves: 0, cost: 7.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 2, cost: 9.0
Run 4, iteration: 2/100, moves: 0, cost: 9.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 1, cost: 10.0
Best run was number 1


Unnamed: 0_level_0,Cluster,Cluster,hair_color,eye_color,skin_color
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,blonde,amber,fair
P2,0,1,brunette,gray,brown
P3,2,0,red,green,brown
P4,0,1,black,hazel,brown
P5,1,2,brunette,amber,fair
P6,0,1,black,gray,brown
P7,2,0,red,green,fair
P8,2,0,black,hazel,fair
