In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
from kmodes.kmodes import KModes
from sklearn.metrics import adjusted_rand_score, silhouette_score  

In [2]:
# load the dataset into a pandas DataFrame
df = pd.read_csv('dermatology.csv', delimiter='\t')

# convert 'Age' column to numeric for prepare categorization
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# categorize the 'Age' column as age is continuous
for i in range(len(df)):
    age = df.iloc[i]['Age']
    
    if age == '?':
        df.iloc[i, df.columns.get_loc('Age')] = None 
        continue
        
    category = 0
    if 18 < age <= 35:
        category = 1
    elif age <= 50:
        category = 2
    elif age <= 65:
        category = 3
    elif age <= 125:
        category = 4
    df.iloc[i, df.columns.get_loc('Age')] = category

# clean the dataset for missing and NaN values
df = df.dropna()

# print the first five entries
print(df.head())

   Erythema  Scathing  Definite Borders  Itching  Koebner   Polygonal  \
0         2         2                 0        3         0          0   
1         3         3                 3        2         1          0   
2         2         1                 2        3         1          3   
3         2         2                 2        0         0          0   
4         2         3                 2        2         2          2   

   Follicular  Oral  Knee  Scalp  ...  Disapperance  Vacuolisation  \
0           0     0     1      0  ...             0              0   
1           0     0     1      1  ...             0              0   
2           0     3     0      0  ...             0              2   
3           0     0     3      2  ...             3              0   
4           0     2     0      0  ...             2              3   

   Spongiosis  Retes  Follicular.1  Perifollicular  Inflamatory  Band-like  \
0           3      0             0               0            

In [3]:
# initialize the K-Modes model with 6 clusters
kmode = KModes(n_clusters=6, init='random', n_init=10, verbose=1)

# fit the model and predict clusters
clusters = kmode.fit_predict(df)

# insert the cluster labels into the dataset
df.insert(0, 'Cluster', clusters, True)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 134, cost: 3062.0
Run 1, iteration: 2/100, moves: 44, cost: 3025.0
Run 1, iteration: 3/100, moves: 8, cost: 3018.0
Run 1, iteration: 4/100, moves: 10, cost: 3010.0
Run 1, iteration: 5/100, moves: 2, cost: 3010.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 144, cost: 3241.0
Run 2, iteration: 2/100, moves: 29, cost: 3241.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 135, cost: 3002.0
Run 3, iteration: 2/100, moves: 38, cost: 2935.0
Run 3, iteration: 3/100, moves: 15, cost: 2919.0
Run 3, iteration: 4/100, moves: 11, cost: 2915.0
Run 3, iteration: 5/100, moves: 10, cost: 2911.0
Run 3, iteration: 6/100, moves: 0, cost: 2911.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 126, cost: 

In [4]:
# extract true labels
true_labels = df['Disease'].values

# calculate and print ARI for K-Modes
ari = adjusted_rand_score(true_labels, clusters)
print(f'Adjusted Rand Index for K-Modes: {ari}')

# calculate and print Silhouette Score for K-Modes
silhouette = silhouette_score(df.drop(['Cluster'], axis=1), clusters)
print(f'Silhouette Score for K-Modes: {silhouette}')

Adjusted Rand Index for K-Modes: 0.8777083879920334
Silhouette Score for K-Modes: 0.2550818782988355
