In [1]:
import numpy as np
import pandas as pd

from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score

from src import akeneo, akeneo_clustering, clustering

In [2]:
# Build the Akeneo data cache used by every later cell.
# NOTE(review): going by the helper's name, the cache location/credentials
# are presumably taken from environment variables — confirm in src.akeneo.
cache = akeneo.create_cache_from_env()

In [3]:
# Overview of the product families held in the cache (one row per family,
# with its code, labels, attributes and attribute requirements).
pd.DataFrame(cache.families)

Unnamed: 0,code,labels,attributes,attribute_requirements
0,mobile_phone_cases,"{'en_US': 'Mobile Phone Cases', 'en_GB': 'Mobi...","[ean, icecat_12935, icecat_1464, icecat_15767,...","{'default': ['ean', 'icecat_6767', 'icecat_815..."
1,smartphones,"{'en_US': 'Smartphones', 'en_GB': 'Smartphones...","[ean, icecat_10035, icecat_10101, icecat_10102...","{'default': ['ean', 'icecat_11379', 'icecat_12..."


In [4]:
# Parse the products of the "mobile_phone_cases" family, restricted to
# numerical and categorical attribute types.
attribute_types = (
    akeneo_clustering.TYPES_NUMERICAL + akeneo_clustering.TYPES_CATEGORICAL
)
products = akeneo_clustering.parse_products(
    cache,
    product_family="mobile_phone_cases",
    attribute_types=attribute_types,
)

In [5]:
# Tabular view of the parsed products with the columns ordered by name.
products_df = pd.DataFrame(products)
products_df = products_df.sort_index(axis="columns")
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   __categories__      80 non-null     object 
 1   __family__          80 non-null     object 
 2   __id__              80 non-null     object 
 3   icecat_1464         24 non-null     float64
 4   icecat_15767        8 non-null      object 
 5   icecat_1649         24 non-null     float64
 6   icecat_1650         24 non-null     float64
 7   icecat_26241        1 non-null      object 
 8   icecat_27575_fixed  27 non-null     object 
 9   icecat_38673        6 non-null      object 
 10  icecat_4463_fixed   1 non-null      object 
 11  icecat_4860         58 non-null     object 
 12  icecat_6767         76 non-null     float64
 13  icecat_8006         51 non-null     object 
 14  icecat_8156         79 non-null     object 
 15  icecat_8411         76 non-null     object 
 16  icecat_877

In [6]:
# Keep only the products that have a value for icecat_1649, then drop every
# column that still contains a missing value among the remaining rows.
products_filtered_df = (
    products_df
    .dropna(subset=["icecat_1649"])
    .dropna(axis="columns")
)
products_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 28 to 59
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   __categories__  24 non-null     object 
 1   __family__      24 non-null     object 
 2   __id__          24 non-null     object 
 3   icecat_1464     24 non-null     float64
 4   icecat_1649     24 non-null     float64
 5   icecat_1650     24 non-null     float64
 6   icecat_4860     24 non-null     object 
 7   icecat_6767     24 non-null     float64
 8   icecat_8156     24 non-null     object 
 9   icecat_8411     24 non-null     object 
 10  icecat_8778     24 non-null     object 
 11  icecat_94       24 non-null     float64
dtypes: float64(5), object(7)
memory usage: 2.4+ KB


In [7]:
# Attribute columns of products_filtered_df with a float64 dtype.
cols_num = [
    "icecat_1464",
    "icecat_1649",
    "icecat_1650",
    "icecat_6767",
    "icecat_94",
]

# Attribute columns of products_filtered_df with an object dtype
# (the __categories__/__family__/__id__ metadata columns are excluded).
cols_cat = [
    "icecat_4860",
    "icecat_8156",
    "icecat_8411",
    "icecat_8778",
]

In [8]:
# Expected labelling: the first entry of each product's category list.
labels_want = (
    products_filtered_df["__categories__"]
    .map(lambda categories: categories[0])
    .to_numpy()
)
labels_want

array(['s20', 's20', 's20', 's20', 's20', 's20', 's20', 's20_ultra',
       's20_ultra', 's20_ultra', 's20_ultra', 's20_ultra', 's20_ultra',
       's20_ultra', 's20_ultra', 's20_ultra', 's20_ultra', 's20_plus',
       's20_plus', 's20_plus', 's20_plus', 's20_plus', 's20_plus',
       's21_fe'], dtype=object)

In [9]:
# Distinct expected labels: keep the first occurrence of each label, indexed
# by its position in labels_want.
labels_unique = pd.Series(labels_want).loc[lambda labels: ~labels.duplicated()]
# Number of clusters requested from every algorithm below.
k = labels_unique.count()
labels_unique

0           s20
7     s20_ultra
17     s20_plus
23       s21_fe
dtype: object

In [10]:
# Three views of the filtered products: numerical only, categorical only and
# both combined (numerical columns first, then categorical ones).
prod_num, prod_cat, prod_mix = (
    products_filtered_df[columns]
    for columns in (cols_num, cols_cat, cols_num + cols_cat)
)

# Project-specific datasets for the clustering implementations in `src`.
dataset_num, dataset_cat, dataset_mix = (
    akeneo_clustering.dataset_from_records(frame.to_dict("records"))
    for frame in (prod_num, prod_cat, prod_mix)
)

# Plain arrays for the reference implementations (scikit-learn, kmodes).
numpy_num, numpy_cat, numpy_mix = (
    frame.to_numpy() for frame in (prod_num, prod_cat, prod_mix)
)

In [11]:
# One proximity matrix per dataset; calc_metrics passes these to the
# silhouette computation as precomputed input.
prox_mat_num, prox_mat_cat, prox_mat_mix = map(
    akeneo_clustering.calc_proximity_matrix,
    (dataset_num, dataset_cat, dataset_mix),
)

In [12]:
def calc_metrics(cluster_func, proximity_matrix, n_runs=10, labels_true=None):
    """Evaluate a clustering algorithm for stability, quality and correctness.

    Parameters
    ----------
    cluster_func : callable
        Called as ``cluster_func(seed)`` with an integer seed; must return
        one cluster label per sample.
    proximity_matrix : array-like
        Passed to ``silhouette_score`` with ``metric="precomputed"``.
        NOTE(review): that metric expects pairwise distances — confirm that
        ``akeneo_clustering.calc_proximity_matrix`` returns distances.
    n_runs : int, default 10
        Total number of seeded runs (seeds ``0 .. n_runs - 1``). Run 0 is the
        reference labelling; every other run is compared against it.
    labels_true : array-like, optional
        Expected labels. Defaults to the notebook-level ``labels_want``.

    Returns
    -------
    dict
        ``"Stabilität"`` (stability): mean adjusted Rand index between the
        reference run and each other run.
        ``"Qualität"`` (quality): silhouette score of the reference run.
        ``"Korrektheit"`` (correctness): adjusted Rand index between
        ``labels_true`` and the reference run.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 2, because stability needs at least one
        run to compare against the reference.
    """
    if n_runs < 2:
        raise ValueError("n_runs must be at least 2 to measure stability")
    if labels_true is None:
        # Backward-compatible default: fall back to the notebook global.
        labels_true = labels_want

    # Reference labelling, produced with seed 0.
    labels = cluster_func(0)

    # Agreement of every re-seeded run with the reference labelling.
    stabilities = [
        adjusted_rand_score(labels, cluster_func(seed))
        for seed in range(1, n_runs)
    ]

    return {
        "Stabilität": np.mean(stabilities),
        "Qualität": silhouette_score(proximity_matrix, labels, metric="precomputed"),
        "Korrektheit": adjusted_rand_score(labels_true, labels),
    }

In [13]:
# Benchmark on the numerical attributes only. Each entry pairs a display name
# with a function that maps a random seed to cluster labels.
algos = [
    (
        "Bi-KMeans HD",
        lambda seed: clustering.BisectingKMeans(
            dataset_num, akeneo_clustering.Centroid, random_state=seed
        ).labels_flat(k),
    ),
    (
        "KMeans HD",
        lambda seed: clustering.KMeans(
            dataset_num, akeneo_clustering.Centroid, k, random_state=seed
        ).labels,
    ),
    (
        "KMeans SKLearn",
        lambda seed: KMeans(k, init="random", random_state=seed).fit_predict(numpy_num),
    ),
]
tmp = {
    algo_name: calc_metrics(run_with_seed, prox_mat_num)
    for algo_name, run_with_seed in algos
}
pd.DataFrame(tmp)

Unnamed: 0,Bi-KMeans HD,KMeans HD,KMeans SKLearn
Stabilität,1.0,0.961884,0.975408
Qualität,0.629768,0.647547,0.629768
Korrektheit,0.499604,0.536384,0.499604


In [14]:
# Benchmark on the categorical attributes only. Each entry pairs a display
# name with a function that maps a random seed to cluster labels.
algos = [
    (
        "Bi-KMeans HD",
        lambda seed: clustering.BisectingKMeans(
            dataset_cat, akeneo_clustering.Centroid, random_state=seed
        ).labels_flat(k),
    ),
    (
        "KMeans HD",
        lambda seed: clustering.KMeans(
            dataset_cat, akeneo_clustering.Centroid, k, random_state=seed
        ).labels,
    ),
    (
        "nicodv/KModes",
        lambda seed: KModes(k, init="random", random_state=seed).fit_predict(numpy_cat),
    ),
]
tmp = {
    algo_name: calc_metrics(run_with_seed, prox_mat_cat)
    for algo_name, run_with_seed in algos
}
pd.DataFrame(tmp)

Unnamed: 0,Bi-KMeans HD,KMeans HD,nicodv/KModes
Stabilität,0.977108,0.565926,1.0
Qualität,0.958333,-0.327778,0.958333
Korrektheit,0.083817,0.06406,0.083817


In [15]:
# Benchmark on numerical and categorical attributes combined.
# numpy_mix holds the numerical columns first and the categorical columns
# after them, so the categorical column indices are n_num .. n_num + n_cat - 1.
n_num = len(cols_num)
n_cat = len(cols_cat)

algos = [
    (
        "Bi-KMeans HD",
        lambda seed: clustering.BisectingKMeans(
            dataset_mix, akeneo_clustering.Centroid, random_state=seed
        ).labels_flat(k),
    ),
    (
        "KMeans HD",
        lambda seed: clustering.KMeans(
            dataset_mix, akeneo_clustering.Centroid, k, random_state=seed
        ).labels,
    ),
    (
        "nicodv/KPrototypes",
        lambda seed: KPrototypes(k, init="random", random_state=seed).fit_predict(
            numpy_mix, categorical=list(range(n_num, n_num + n_cat))
        ),
    ),
]
tmp = {
    algo_name: calc_metrics(run_with_seed, prox_mat_mix)
    for algo_name, run_with_seed in algos
}
pd.DataFrame(tmp)

Unnamed: 0,Bi-KMeans HD,KMeans HD,nicodv/KPrototypes
Stabilität,0.985825,1.0,0.964562
Qualität,0.714892,0.714892,0.714892
Korrektheit,0.406219,0.406219,0.406219
