In [1]:
import itertools

import numpy as np
import pandas as pd

from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans
from sklearn.metrics import rand_score, silhouette_score

from src import akeneo, akeneo_clustering, clustering

In [2]:
# Build the cache object from which the cells below read families, products and attributes.
# NOTE(review): presumably configured through environment variables — confirm in src.akeneo.
cache = akeneo.create_cache_from_env()

In [3]:
# Render the families held in the cache as a table (one row per family).
pd.DataFrame(cache.families)

Unnamed: 0,code,labels,attributes,attribute_requirements
0,mobile_phone_cases,"{'en_US': 'Mobile Phone Cases', 'en_GB': 'Mobi...","[ean, icecat_12935, icecat_1464, icecat_15767,...","{'default': ['ean', 'icecat_6767', 'icecat_815..."
1,smartphones,"{'en_US': 'Smartphones', 'en_GB': 'Smartphones...","[ean, icecat_10035, icecat_10101, icecat_10102...","{'default': ['ean', 'icecat_11379', 'icecat_12..."


In [4]:
# Lazily select the products of the phone-case family; the generator is consumed by list() below.
prods = (prod for prod in cache.products if prod.family == "mobile_phone_cases")

# NOTE(review): "default", "en_US" and "USD" look like channel, locale and currency —
# confirm against akeneo_clustering.parse_products.
products = akeneo_clustering.parse_products(
    cache,
    list(prods),
    "default",
    "en_US",
    "USD",
)

In [5]:
# Tabulate the parsed products, keep the phone-case rows and drop columns
# that hold no value for any of them.
products_df = (
    pd.DataFrame(products)
    .loc[lambda frame: frame["__family__"] == "mobile_phone_cases"]
    .dropna(axis="columns", how="all")
)
products_df

Unnamed: 0,__id__,__family__,__categories__,icecat_8156,icecat_8411,icecat_8778,icecat_27575_fixed,icecat_image_0,ean,icecat_name,...,icecat_94,icecat_1464,icecat_1649,icecat_1650,icecat_26241,icecat_9689,icecat_15767,icecat_38673,icecat_4463_fixed,icecat_4463
0,104889,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,0/a/9/1/0a91f00d0f19027c87a3d5cfa12ca2b3ed6f16...,8718066381026,104889,...,,,,,,,,,,
1,MP-104927,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,2/6/c/9/26c9db2234b5453445df17e2b5595abc38402c...,8718066381170,MP-104927,...,,,,,,,,,,
2,MP-104925,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,0/e/f/3/0ef3327abba69987cb51115bbd6cae26555339...,8718066381156,MP-104925,...,,,,,,,,,,
3,MP-108422,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,e/c/1/9/ec19013b1c5c35f111bfc0e778a6f4727d321c...,8718066383501,MP-108422,...,,,,,,,,,,
4,MP-104926,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,d/3/8/e/d38e2b463817add5225cb6c41bf8bbfc4971df...,8718066381163,MP-104926,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0237,mobile_phone_cases,[s20_ultra],348496968,348496965,367987544,not_supported,0/a/b/f/0abf291e57ef6cd52173ca94202aa37393c262...,5711724002373,0237,...,,,,,,,True,True,,
76,0375,mobile_phone_cases,[s22],348496968,,,,2/b/1/a/2b1a10146ac652606b453e27d37d0a74e83c5b...,5711724003752,0375,...,,,,,,,True,,,
77,0376,mobile_phone_cases,[s22_ultra],348496968,,,,0/2/4/3/02435c18bdaf8c18afe28e50f4217f2a5132b2...,5711724003769,0376,...,,,,,,,True,,,
78,0371,mobile_phone_cases,[s22],348496968,,,,9/5/f/6/95f6a73371d12db1e9bbe401853abaaa3506de...,5711724003714,0371,...,,,,,,,True,True,,


In [6]:
# Attribute types that are usable as clustering features, split by kind.
t = akeneo.AttributeType
types_numerical = [t.DATE, t.METRIC, t.NUMBER, t.PRICE]
types_categorical = [t.BOOL, t.SELECT_SINGLE, t.REFERENCE_SINGLE]

# Only attributes that actually occur as a column of the product table are of interest.
attr_codes = products_df.columns

# Keep an attribute when it is present in the products, is not in the "faulty"
# group and has one of the supported numerical or categorical types.
attr_df = pd.DataFrame(cache.attributes).loc[
    lambda frame: (
        frame["code"].isin(attr_codes)
        & frame["group"].ne("faulty")
        & frame["type"].isin(types_numerical + types_categorical)
    )
]
attr_df.head(3)

Unnamed: 0,code,labels,type,localizable,scopable,unique,group,group_labels,sort_order,allowed_extensions,...,max_file_size,metric_family,minimum_input_length,negative_allowed,number_min,number_max,reference_data_name,validation_rule,validation_regexp,wysiwyg_enabled
39,icecat_1464,"{'de_DE': 'Höhe', 'en_GB': 'Height', 'en_US': ...",AttributeType.METRIC,True,True,False,weight_and_dimensions,"{'en_US': 'Weight & dimensions', 'en_GB': 'Wei...",1464,[],...,,Length,,False,,,,,,
53,icecat_15767,"{'de_DE': 'Einfache Anwendung', 'en_GB': 'Easy...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",15767,[],...,,,,,,,,,,
60,icecat_1649,"{'de_DE': 'Breite', 'en_GB': 'Width', 'en_US':...",AttributeType.METRIC,True,True,False,weight_and_dimensions,"{'en_US': 'Weight & dimensions', 'en_GB': 'Wei...",1649,[],...,,Length,,False,,,,,,


In [7]:
# The KEYS columns come first, followed by the codes of all attributes kept in attr_df.
cols = [*akeneo_clustering.KEYS, *attr_df["code"].to_list()]

# Narrow the product table to exactly those columns and inspect how well they are filled.
prod_df = products_df.loc[:, cols]
prod_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 79
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   __id__              80 non-null     object 
 1   __family__          80 non-null     object 
 2   __categories__      80 non-null     object 
 3   icecat_1464         24 non-null     float64
 4   icecat_15767        8 non-null      object 
 5   icecat_1649         24 non-null     float64
 6   icecat_1650         24 non-null     float64
 7   icecat_26241        1 non-null      object 
 8   icecat_27575_fixed  27 non-null     object 
 9   icecat_38673        6 non-null      object 
 10  icecat_4463_fixed   1 non-null      object 
 11  icecat_4860         58 non-null     object 
 12  icecat_6767         76 non-null     float64
 13  icecat_8006         51 non-null     object 
 14  icecat_8156         79 non-null     object 
 15  icecat_8411         76 non-null     object 
 16  icecat_877

In [8]:
# Keep only the products that have a value for icecat_1649, then drop every
# column that still contains a missing value for the remaining rows.
prod_df = (
    prod_df
    .loc[prod_df["icecat_1649"].notna()]
    .dropna(axis="columns")
)
prod_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 28 to 59
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   __id__          24 non-null     object 
 1   __family__      24 non-null     object 
 2   __categories__  24 non-null     object 
 3   icecat_1464     24 non-null     float64
 4   icecat_1649     24 non-null     float64
 5   icecat_1650     24 non-null     float64
 6   icecat_4860     24 non-null     object 
 7   icecat_6767     24 non-null     float64
 8   icecat_8156     24 non-null     object 
 9   icecat_8411     24 non-null     object 
 10  icecat_8778     24 non-null     object 
 11  icecat_94       24 non-null     float64
dtypes: float64(5), object(7)
memory usage: 2.4+ KB


In [9]:
# Hand-picked feature columns of prod_df, split by how they are clustered below.
# Numerical features: the float64 columns listed by prod_df.info().
cols_num = ["icecat_1464", "icecat_1649", "icecat_1650", "icecat_6767", "icecat_94"]
# Categorical features: the remaining (object-typed) attribute columns.
cols_cat = ["icecat_4860", "icecat_8156", "icecat_8411", "icecat_8778"]

In [10]:
# Reference labels for the evaluation: the first entry of each product's category list.
labels_want = np.array(
    [categories[0] for categories in prod_df["__categories__"]],
    dtype=object,
)
labels_want

array(['s20', 's20', 's20', 's20', 's20', 's20', 's20', 's20_ultra',
       's20_ultra', 's20_ultra', 's20_ultra', 's20_ultra', 's20_ultra',
       's20_ultra', 's20_ultra', 's20_ultra', 's20_ultra', 's20_plus',
       's20_plus', 's20_plus', 's20_plus', 's20_plus', 's20_plus',
       's21_fe'], dtype=object)

In [11]:
# Number of clusters requested from every algorithm in the cells below.
# NOTE(review): 4 equals the number of distinct reference labels listed by the
# expression below — keep the two in sync if the selected products change.
k = 4
pd.Series(labels_want).drop_duplicates()

0           s20
7     s20_ultra
17     s20_plus
23       s21_fe
dtype: object

In [12]:
# Each feature set is prepared in three forms: the DataFrame slice, the project's
# dataset representation and a plain numpy array for the third-party estimators.

# Numerical features only.
prod_num = prod_df[cols_num]
dataset_num = akeneo_clustering.dataset_from_records(prod_num.to_dict(orient="records"))
numpy_num = prod_num.to_numpy()

# Categorical features only.
prod_cat = prod_df[cols_cat]
dataset_cat = akeneo_clustering.dataset_from_records(prod_cat.to_dict(orient="records"))
numpy_cat = prod_cat.to_numpy()

# Mixed: numerical columns first, categorical columns after them.
prod_mix = prod_df[[*cols_num, *cols_cat]]
dataset_mix = akeneo_clustering.dataset_from_records(prod_mix.to_dict(orient="records"))
numpy_mix = prod_mix.to_numpy()

In [13]:
# One proximity matrix per feature set; each is later handed to calc_metrics,
# which passes it to silhouette_score.
prox_mat_num = akeneo_clustering.calc_proximity_matrix(dataset_num)
prox_mat_cat = akeneo_clustering.calc_proximity_matrix(dataset_cat)
prox_mat_mix = akeneo_clustering.calc_proximity_matrix(dataset_mix)

In [14]:
def calc_metrics(
    cluster_func,
    proximity_matrix,
    labels_true=None,
    n_runs=10,
    silhouette_metric="euclidean",
):
    """Run a clustering repeatedly and summarise stability, quality and correctness.

    Parameters
    ----------
    cluster_func : callable
        Called as ``cluster_func(seed)`` for every ``seed`` in ``range(n_runs)``;
        must return one cluster label per sample.
    proximity_matrix : array-like
        Passed unchanged as the first argument of ``silhouette_score``.
    labels_true : array-like, optional
        Reference labels for the correctness score. When omitted, the
        notebook-level ``labels_want`` is used.
    n_runs : int, default 10
        Number of differently seeded runs to evaluate.
    silhouette_metric : str, default "euclidean"
        Forwarded as ``metric`` to ``silhouette_score``. The default is
        scikit-learn's own default and therefore reproduces the previous results.

    Returns
    -------
    dict
        ``"Stabilität"``: mean Rand index over all pairs of runs
        (NaN when there is only one run),
        ``"Qualität"``: mean silhouette score of the runs,
        ``"Korrektheit"``: mean Rand index between each run and ``labels_true``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    if labels_true is None:
        # Fall back to the notebook global so existing two-argument calls keep working.
        labels_true = labels_want

    # One labelling per seed; the seed is the only thing that varies between runs.
    runs = [cluster_func(seed) for seed in range(n_runs)]

    # NOTE(review): with metric="euclidean" every row of proximity_matrix is treated
    # as a feature vector. If the matrix holds pairwise distances, "precomputed" is
    # probably what is meant — confirm against akeneo_clustering.calc_proximity_matrix.
    silhouettes = [
        silhouette_score(proximity_matrix, labels, metric=silhouette_metric)
        for labels in runs
    ]
    accurates = [rand_score(labels_true, labels) for labels in runs]

    # Stability: agreement between every unordered pair of runs.
    stabilities = [
        rand_score(first, second)
        for first, second in itertools.combinations(runs, 2)
    ]

    return {
        # A single run has no pairs to compare, so its stability is undefined.
        "Stabilität": np.mean(stabilities) if stabilities else float("nan"),
        "Qualität": np.mean(silhouettes),
        "Korrektheit": np.mean(accurates),
    }

In [15]:
algos = [
    ("KMeans", lambda x: clustering.KMeans(dataset_num, akeneo_clustering.Centroid, k, random_state=x).labels),
    ("Bi-KMeans", lambda x: clustering.BisectingKMeans(dataset_num, akeneo_clustering.Centroid, random_state=x).labels_flat(k)),
    ("SKLearn", lambda x: KMeans(k, random_state=x).fit_predict(numpy_num)),
]
tmp = {}
for name, cluster_func in algos:
    tmp[name] = calc_metrics(cluster_func, prox_mat_num)
pd.DataFrame(tmp)

Unnamed: 0,KMeans,Bi-KMeans,SKLearn
Stabilität,0.974638,1.0,1.0
Qualität,0.702016,0.661081,0.661081
Korrektheit,0.817029,0.800725,0.800725


In [16]:
algos = [
    ("KMeans", lambda x: clustering.KMeans(dataset_cat, akeneo_clustering.Centroid, k, random_state=x).labels),
    ("Bi-KMeans", lambda x: clustering.BisectingKMeans(dataset_cat, akeneo_clustering.Centroid, random_state=x).labels_flat(k)),
    ("KModes", lambda x: KModes(k, random_state=x).fit_predict(numpy_cat)),
]
tmp = {}
for name, cluster_func in algos:
    tmp[name] = calc_metrics(cluster_func, prox_mat_cat)
pd.DataFrame(tmp)

Unnamed: 0,KMeans,Bi-KMeans,KModes
Stabilität,0.896457,0.97971,1.0
Qualität,0.264437,0.83628,0.958333
Korrektheit,0.548551,0.563043,0.565217


In [17]:
# Column counts, used to tell KPrototypes which columns of numpy_mix are categorical.
n_num = len(cols_num)
n_cat = len(cols_cat)

# Mixed features: the two project implementations against KPrototypes.
# Every entry maps a display name to a function that clusters with the given seed.
algos = [
    (
        "KMeans",
        lambda seed: clustering.KMeans(
            dataset_mix, akeneo_clustering.Centroid, k, random_state=seed
        ).labels,
    ),
    (
        "Bi-KMeans",
        lambda seed: clustering.BisectingKMeans(
            dataset_mix, akeneo_clustering.Centroid, random_state=seed
        ).labels_flat(k),
    ),
    (
        "KPrototypes",
        lambda seed: KPrototypes(n_clusters=k, random_state=seed).fit_predict(
            numpy_mix,
            # Indices n_num .. n_num + n_cat - 1 are declared categorical.
            categorical=list(range(n_num, n_num + n_cat)),
        ),
    ),
]

# Collect one column of metrics per algorithm and show them side by side.
tmp = {}
for name, cluster_func in algos:
    tmp[name] = calc_metrics(cluster_func, prox_mat_mix)
pd.DataFrame(tmp)

Unnamed: 0,KMeans,Bi-KMeans,KPrototypes
Stabilität,1.0,0.990982,0.979066
Qualität,0.698136,0.688147,0.661703
Korrektheit,0.76087,0.763043,0.77029
