In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import adjusted_rand_score, silhouette_score

from src import akeneo, akeneo_clustering as ac, clustering

In [2]:
# Build the Akeneo cache object that later cells read products, attributes and
# families from.
# NOTE(review): judging by the helper's name, its configuration presumably
# comes from environment variables — confirm in src.akeneo.
cache = akeneo.create_cache_from_env()

In [3]:
# Parse products out of the cache, passing the "mobile_phone_cases" family code
# and the concatenation of the numerical, categorical and multi attribute types.
products = ac.parse_products(
    cache,
    product_family="mobile_phone_cases",
    attribute_types=ac.TYPES_NUMERICAL + ac.TYPES_CATEGORICAL + ac.TYPES_MULTI,
)

In [4]:
# Tabular view of the parsed products; later cells select attribute columns
# from this frame by attribute code.
products_df = pd.DataFrame(products)
products_df

Unnamed: 0,__id__,__family__,__categories__,icecat_8156,icecat_8411,icecat_8778,icecat_27575_fixed,icecat_4860,icecat_8006,icecat_6767,icecat_898_fixed,icecat_94,icecat_1464,icecat_1649,icecat_1650,icecat_26241,icecat_9689,icecat_15767,icecat_38673,icecat_4463_fixed
0,104889,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,False,False,0.0,,,,,,,,,,
1,MP-104927,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,False,False,0.0,"{thermoplastic_polyurethane_tpu, silicone}",,,,,,,,,
2,MP-104925,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,False,False,0.0,"{thermoplastic_polyurethane_tpu, silicone}",,,,,,,,,
3,MP-108422,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,False,False,0.0,"{thermoplastic_polyurethane_tpu, silicone}",,,,,,,,,
4,MP-104926,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,False,False,0.0,"{thermoplastic_polyurethane_tpu, silicone}",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0237,mobile_phone_cases,[s20_ultra],348496968,348496965,367987544,not_supported,,,1.0,"{tempered_glass, thermoplastic_polyurethane_tpu}",,,,,,,True,True,
76,0375,mobile_phone_cases,[s22],348496968,,,,,,,,,,,,,,True,,
77,0376,mobile_phone_cases,[s22_ultra],348496968,,,,,,,,,,,,,,True,,
78,0371,mobile_phone_cases,[s22],348496968,,,,,,,"{thermoplastic_polyurethane_tpu, polycarbonate...",,,,,,,True,True,


In [5]:
# Expected "series" label per product: the first category with everything from
# the first underscore onwards removed (e.g. "s20_ultra" -> "s20").
series_per_product = [
    categories[0].split("_")[0] for categories in products_df["__categories__"]
]
labels_want_series = np.array(series_per_product, dtype=object)

# Number of clusters requested when clustering by series
# (three distinct series appear in the counts below).
k_series = 3

pd.Series(labels_want_series).value_counts()

s20    45
s21    31
s22     4
dtype: int64

In [6]:
# Expected "model" label per product: the first category, unmodified
# (e.g. "s20_ultra").
model_per_product = [categories[0] for categories in products_df["__categories__"]]
labels_want_models = np.array(model_per_product, dtype=object)

# Number of clusters requested when clustering by model
# (eleven distinct models appear in the counts below).
k_models = 11

pd.Series(labels_want_models).value_counts()

s20          18
s20_plus     14
s21          11
s20_ultra    11
s21_plus     10
s21_fe        5
s21_ultra     5
s20_fe        2
s22           2
s22_ultra     1
s22_plus      1
dtype: int64

In [7]:
# Attribute metadata from the cache, limited to the attributes whose code
# appears as a column of products_df.
attr_df = (
    pd.DataFrame(cache.attributes)
    .loc[lambda frame: frame["code"].isin(products_df.columns)]
)
attr_df

Unnamed: 0,code,labels,type,localizable,scopable,unique,group,group_labels,sort_order,allowed_extensions,...,max_file_size,metric_family,minimum_input_length,negative_allowed,number_min,number_max,reference_data_name,validation_rule,validation_regexp,wysiwyg_enabled
39,icecat_1464,"{'de_DE': 'Höhe', 'en_GB': 'Height', 'en_US': ...",AttributeType.METRIC,True,True,False,weight_and_dimensions,"{'en_US': 'Weight & dimensions', 'en_GB': 'Wei...",1464,[],...,,Length,,False,,,,,,
53,icecat_15767,"{'de_DE': 'Einfache Anwendung', 'en_GB': 'Easy...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",15767,[],...,,,,,,,,,,
60,icecat_1649,"{'de_DE': 'Breite', 'en_GB': 'Width', 'en_US':...",AttributeType.METRIC,True,True,False,weight_and_dimensions,"{'en_US': 'Weight & dimensions', 'en_GB': 'Wei...",1649,[],...,,Length,,False,,,,,,
61,icecat_1650,"{'de_DE': 'Tiefe', 'en_GB': 'Depth', 'en_US': ...",AttributeType.METRIC,True,True,False,weight_and_dimensions,"{'en_US': 'Weight & dimensions', 'en_GB': 'Wei...",1650,[],...,,Length,,False,,,,,,
137,icecat_26241,"{'de_DE': 'Kartentasche', 'en_GB': 'Card pocke...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",26241,[],...,,,,,,,,,,
144,icecat_27575_fixed,"{'de_DE': 'Verschluss', 'en_GB': 'Closure', 'e...",AttributeType.SELECT_SINGLE,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",27575,[],...,,,,,,,,,,
233,icecat_38673,"{'de_DE': 'Fettabweisende Beschichtung', 'en_G...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",38673,[],...,,,,,,,,,,
339,icecat_4463_fixed,"{'de_DE': 'Verpackungsart', 'en_GB': 'Package ...",AttributeType.SELECT_SINGLE,True,True,False,packaging_data,"{'en_US': 'Packaging data', 'en_GB': 'Packagin...",4463,[],...,,,,,,,,,,
349,icecat_4860,"{'de_DE': 'Desktop-Ständer', 'en_GB': 'Desktop...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",4860,[],...,,,,,,,,,,
365,icecat_6767,"{'de_DE': 'Maximale Bildschirmgröße', 'en_GB':...",AttributeType.METRIC,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",6767,[],...,,Angle,,False,,,,,,


In [8]:
# Attribute codes listed under the "default" key of the first cached family's
# attribute requirements.
# NOTE(review): families[0] is taken positionally; this presumably is the
# "mobile_phone_cases" family — confirm when the cache holds several families.
required_by_family = cache.families[0].attribute_requirements["default"]

# Keep only the required codes that are present in attr_df (attr_df row order).
is_required = attr_df["code"].isin(required_by_family)
attr_codes_req = attr_df.loc[is_required, "code"].to_list()
attr_codes_req

['icecat_6767', 'icecat_8156', 'icecat_8778', 'icecat_898_fixed']

In [9]:
# Metadata rows of the required attributes only.
attr_df.loc[attr_df["code"].isin(attr_codes_req)]

Unnamed: 0,code,labels,type,localizable,scopable,unique,group,group_labels,sort_order,allowed_extensions,...,max_file_size,metric_family,minimum_input_length,negative_allowed,number_min,number_max,reference_data_name,validation_rule,validation_regexp,wysiwyg_enabled
365,icecat_6767,"{'de_DE': 'Maximale Bildschirmgröße', 'en_GB':...",AttributeType.METRIC,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",6767,[],...,,Angle,,False,,,,,,
405,icecat_8156,"{'de_DE': 'Markenkompatibilität', 'en_GB': 'Br...",AttributeType.SELECT_SINGLE,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",8156,[],...,,,,,,,,,,
422,icecat_8778,"{'de_DE': 'Etui-Typ', 'en_GB': 'Case type', 'e...",AttributeType.SELECT_SINGLE,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",8778,[],...,,,,,,,,,,
426,icecat_898_fixed,"{'de_DE': 'Material', 'en_GB': 'Material', 'en...",AttributeType.SELECT_MULTI,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",898,[],...,,,,,,,,,,


In [10]:
# Required attribute codes split by kind, written out by hand. Per the metadata
# table above: icecat_6767 is METRIC, icecat_8156 / icecat_8778 are
# SELECT_SINGLE and icecat_898_fixed is SELECT_MULTI.
attr_codes_req_num = ["icecat_6767"]
attr_codes_req_cat = ["icecat_8156", "icecat_8778"]
attr_codes_req_mul = ["icecat_898_fixed"]

In [11]:
# Every attribute code available for this product family.
attr_codes_all = attr_df["code"].to_list()

# Row masks selecting attributes by the type groups defined in `ac`.
is_numerical = attr_df["type"].isin(ac.TYPES_NUMERICAL)
is_categorical = attr_df["type"].isin(ac.TYPES_CATEGORICAL)
is_multi = attr_df["type"].isin(ac.TYPES_MULTI)

attr_codes_all_num = attr_df.loc[is_numerical, "code"].to_list()
attr_codes_all_cat = attr_df.loc[is_categorical, "code"].to_list()
attr_codes_all_mul = attr_df.loc[is_multi, "code"].to_list()

# One row per list, for a compact overview.
pd.DataFrame(
    {
        "attr_codes_all": [attr_codes_all],
        "attr_codes_all_num": [attr_codes_all_num],
        "attr_codes_all_cat": [attr_codes_all_cat],
        "attr_codes_all_mul": [attr_codes_all_mul],
    }
).T

Unnamed: 0,0
attr_codes_all,"[icecat_1464, icecat_15767, icecat_1649, iceca..."
attr_codes_all_num,"[icecat_1464, icecat_1649, icecat_1650, icecat..."
attr_codes_all_cat,"[icecat_15767, icecat_26241, icecat_27575_fixe..."
attr_codes_all_mul,[icecat_898_fixed]


In [12]:
def cluster_dataset(dataset, random_state) -> clustering.BisectingKMeans:
    """Construct a ``clustering.BisectingKMeans`` for ``dataset``.

    The clusterer is created with ``ac.Centroid`` as its second positional
    argument and ``random_state`` forwarded as a keyword argument.
    """
    bisecting_kmeans = clustering.BisectingKMeans(
        dataset,
        ac.Centroid,
        random_state=random_state,
    )
    return bisecting_kmeans


def calc_metrics(dataset, n_stability_runs: int = 9):
    """Cluster ``dataset`` and score the result for stability, quality and correctness.

    A reference clustering is built with ``cluster_dataset(dataset, 0)`` and
    flattened into ``k_series`` and ``k_models`` clusters. It is then compared
    against ``n_stability_runs`` further clusterings (random states
    ``1 .. n_stability_runs``) and against the expected labels.

    The function reads the notebook-level globals ``k_series``, ``k_models``,
    ``labels_want_series`` and ``labels_want_models``; these must be defined
    before it is called.

    Args:
        dataset: Dataset accepted by ``ac.calc_proximity_matrix`` and
            ``cluster_dataset``.
        n_stability_runs: Number of additional clusterings used for the
            stability estimate. Defaults to 9, the previously hard-coded value.

    Returns:
        dict keyed by ``(metric, granularity)`` tuples:
        ``"Stabilität"`` is the mean adjusted Rand score between the reference
        labels and each re-run, ``"Qualität"`` is the silhouette score on the
        precomputed proximity matrix, and ``"Korrektheit"`` is the adjusted
        Rand score against the expected labels. Each is reported for
        ``"Serie"`` and ``"Model"``.

    Raises:
        ValueError: If ``n_stability_runs`` is smaller than 1 (the mean over
            zero runs would be NaN).
    """
    if n_stability_runs < 1:
        raise ValueError("n_stability_runs must be at least 1")

    # NOTE(review): silhouette_score(metric="precomputed") expects pairwise
    # distances; this assumes calc_proximity_matrix returns dissimilarities —
    # confirm in src.akeneo_clustering.
    proximity_matrix = ac.calc_proximity_matrix(dataset)

    # Reference clustering that every re-run is compared against.
    bi_kmeans = cluster_dataset(dataset, 0)
    labels_series = bi_kmeans.labels_flat(k_series)
    labels_models = bi_kmeans.labels_flat(k_models)

    stabilities_series = []
    stabilities_models = []
    # Random states start at 1 because 0 is used by the reference clustering.
    for random_state in range(1, n_stability_runs + 1):
        bi_kmeans_rerun = cluster_dataset(dataset, random_state)
        stabilities_series.append(
            adjusted_rand_score(labels_series, bi_kmeans_rerun.labels_flat(k_series))
        )
        stabilities_models.append(
            adjusted_rand_score(labels_models, bi_kmeans_rerun.labels_flat(k_models))
        )

    return {
        ("Stabilität", "Serie"): np.mean(stabilities_series),
        ("Stabilität", "Model"): np.mean(stabilities_models),
        ("Qualität", "Serie"): silhouette_score(proximity_matrix, labels_series, metric="precomputed"),
        ("Qualität", "Model"): silhouette_score(proximity_matrix, labels_models, metric="precomputed"),
        ("Korrektheit", "Serie"): adjusted_rand_score(labels_want_series, labels_series),
        ("Korrektheit", "Model"): adjusted_rand_score(labels_want_models, labels_models),
    }

In [13]:
def _dataset_for(attr_codes):
    """Build a clustering dataset from the given products_df columns."""
    records = products_df[attr_codes].to_dict("records")
    return ac.dataset_from_records(records)


# All attributes vs. only the required ones.
data_all = _dataset_for(attr_codes_all)
data_req = _dataset_for(attr_codes_req)

# Numerical attributes only.
data_all_num = _dataset_for(attr_codes_all_num)
data_req_num = _dataset_for(attr_codes_req_num)

# Categorical attributes only.
data_all_cat = _dataset_for(attr_codes_all_cat)
data_req_cat = _dataset_for(attr_codes_req_cat)

# Multi-valued attributes only.
data_mul = _dataset_for(attr_codes_all_mul)

# Numerical and categorical attributes combined.
data_all_num_cat = _dataset_for(attr_codes_all_num + attr_codes_all_cat)
data_req_num_cat = _dataset_for(attr_codes_req_num + attr_codes_req_cat)

In [14]:
# (column label, dataset) pairs; the label is an (attribute kinds, selection)
# tuple, so the resulting frame gets two-level columns.
cases = [
    (("numerical", "all"), data_all_num),
    (("numerical", "required"), data_req_num),
    (("categorical", "all"), data_all_cat),
    (("categorical", "required"), data_req_cat),
    (("multi", "all"), data_mul),
    (("num+cat", "all"), data_all_num_cat),
    (("num+cat", "required"), data_req_num_cat),
    (("num+cat+mul", "all"), data_all),
    (("num+cat+mul", "required"), data_req),
]

# Evaluate every case once, preserving the order of `cases`.
result = {case_label: calc_metrics(case_data) for case_label, case_data in cases}
pd.DataFrame(result)

Unnamed: 0_level_0,Unnamed: 1_level_0,numerical,numerical,categorical,categorical,multi,num+cat,num+cat,num+cat+mul,num+cat+mul
Unnamed: 0_level_1,Unnamed: 1_level_1,all,required,all,required,all,all,required,all,required
Stabilität,Serie,0.954684,0.818024,1.0,0.850738,0.736806,1.0,0.927918,0.993787,0.940299
Stabilität,Model,0.884045,0.953707,0.927454,0.572355,0.707913,0.977562,0.954156,0.92426,0.906319
Qualität,Serie,0.610826,0.586937,0.350063,0.95625,0.404266,0.422325,0.683163,0.414713,0.351281
Qualität,Model,0.739396,0.95,0.887166,-0.6375,0.505937,0.647837,0.971481,0.443559,0.464924
Korrektheit,Serie,0.146734,-0.006015,0.083232,0.119365,0.051856,0.362091,0.096212,0.264894,0.019042
Korrektheit,Model,0.327951,0.625566,0.123571,0.037584,0.013523,0.30036,0.551295,0.309848,0.202768


In [15]:
# Map each attribute code to "<en_US label> – <code>" for readable column names.
col_mapping = {
    attr_code: f"{attr_labels['en_US']} – {attr_code}"
    for attr_code, attr_labels in zip(attr_df["code"], attr_df["labels"])
}

# Fill level per column, shown with the readable names.
products_df.rename(columns=col_mapping).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 20 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   __id__                                80 non-null     object 
 1   __family__                            80 non-null     object 
 2   __categories__                        80 non-null     object 
 3   Brand compatibility – icecat_8156     79 non-null     object 
 4   Surface coloration – icecat_8411      76 non-null     object 
 5   Case type – icecat_8778               76 non-null     object 
 6   Closure – icecat_27575_fixed          27 non-null     object 
 7   Desktop stand – icecat_4860           58 non-null     object 
 8   Built-in battery – icecat_8006        51 non-null     object 
 9   Maximum screen size – icecat_6767     76 non-null     float64
 10  Material – icecat_898_fixed           57 non-null     object 
 11  Weight – icecat_94   

In [16]:
# Attributes with at least 76 of 80 non-null values in the .info() listing above.
attr_codes_filled = ["icecat_8156", "icecat_8411", "icecat_8778", "icecat_6767"]

# NOTE(review): presumably the mostly-empty attributes; the .info() listing is
# truncated before these rows, so confirm their non-null counts.
attr_codes_null = ["icecat_26241", "icecat_9689", "icecat_15767", "icecat_38673", "icecat_4463_fixed"]
attr_codes_not_null = [code for code in attr_codes_all if code not in attr_codes_null]

data_filled = ac.dataset_from_records(products_df[attr_codes_filled].to_dict("records"))
data_not_null = ac.dataset_from_records(products_df[attr_codes_not_null].to_dict("records"))

pd.DataFrame(
    {
        "all attributes": calc_metrics(data_all),
        "filled": calc_metrics(data_filled),
        "not many null": calc_metrics(data_not_null),
    }
)

Unnamed: 0,Unnamed: 1,all attributes,filled,not many null
Stabilität,Serie,0.993787,0.927918,1.0
Stabilität,Model,0.92426,0.946659,0.938388
Qualität,Serie,0.414713,0.665771,0.480697
Qualität,Model,0.443559,0.310053,0.427119
Korrektheit,Serie,0.264894,0.096212,0.216965
Korrektheit,Model,0.309848,0.535888,0.351921


In [21]:
# Metrics when clustering on each attribute in isolation (one column per code).
result = {
    attr_code: calc_metrics(
        ac.dataset_from_records(products_df[[attr_code]].to_dict("records"))
    )
    for attr_code in attr_codes_all
}
pd.DataFrame(result)

Unnamed: 0,Unnamed: 1,icecat_1464,icecat_15767,icecat_1649,icecat_1650,icecat_26241,icecat_27575_fixed,icecat_38673,icecat_4463_fixed,icecat_4860,icecat_6767,icecat_8006,icecat_8156,icecat_8411,icecat_8778,icecat_898_fixed,icecat_94,icecat_9689
Stabilität,Serie,-0.0353,0.162243,-0.035902,-0.034551,0.031127,0.933776,0.156024,0.031127,0.876314,0.818024,0.038292,0.031127,0.154746,0.855856,0.736806,-0.032537,0.031127
Stabilität,Model,0.403552,0.251784,0.557636,0.603565,0.05742,0.610998,0.163093,0.05742,0.911393,0.953707,0.723431,0.069519,0.38703,0.655805,0.707913,0.541563,0.05742
Qualität,Serie,0.056926,0.1,0.053019,0.054779,0.0,0.325,0.075,0.0,0.690761,0.586937,-0.625,-0.9625,-0.925,0.9375,0.404266,0.053603,0.0
Qualität,Model,0.091891,0.1,0.196148,0.236639,0.0,0.325,0.075,0.0,0.6925,0.95,0.0,0.0,0.0,0.075,0.505937,0.251427,0.0
Korrektheit,Serie,0.046928,0.127416,0.046928,0.046928,0.046928,-0.000463,0.049433,0.046928,0.303478,-0.006015,0.046928,0.046928,0.046928,0.113185,0.051856,0.046928,0.046928
Korrektheit,Model,0.091357,0.015021,0.081224,0.041269,0.021213,0.01271,-0.000115,0.021213,0.112829,0.625566,0.075133,0.018476,0.030335,0.032575,0.013523,0.041269,0.021213


In [25]:
# Hand-picked attribute codes that the following cells inspect more closely
# (their metrics, their metadata, and a clustering on the combination).
attr_codes_relevant = ["icecat_4860", "icecat_6767", "icecat_15767", "icecat_8778"]

In [28]:
# Per-attribute metrics for the hand-picked attributes, with readable column
# names. `result` must be the per-attribute dict (keyed by attribute code).
pd.DataFrame(result).loc[:, attr_codes_relevant].rename(columns=col_mapping)

Unnamed: 0,Unnamed: 1,Desktop stand – icecat_4860,Maximum screen size – icecat_6767,Easy to apply – icecat_15767,Case type – icecat_8778
Stabilität,Serie,0.876314,0.818024,0.162243,0.855856
Stabilität,Model,0.911393,0.953707,0.251784,0.655805
Qualität,Serie,0.690761,0.586937,0.1,0.9375
Qualität,Model,0.6925,0.95,0.1,0.075
Korrektheit,Serie,0.303478,-0.006015,0.127416,0.113185
Korrektheit,Model,0.112829,0.625566,0.015021,0.032575


In [31]:
# Print the options of every cached attribute whose code is "icecat_8778".
matching_attrs = (
    candidate for candidate in cache.attributes if candidate.code == "icecat_8778"
)
for attr in matching_attrs:
    print(attr.options)

[AttributeOption(code='348496963', labels={'en_US': 'Folio', 'en_GB': 'Folio', 'de_DE': 'Folio'}, attribute='icecat_8778', sort_order=0), AttributeOption(code='350637977', labels={'en_US': 'Flip case', 'en_GB': 'Flip case', 'de_DE': 'Flip case'}, attribute='icecat_8778', sort_order=2), AttributeOption(code='367987544', labels={'en_US': 'Cover', 'en_GB': 'Cover', 'de_DE': 'Cover'}, attribute='icecat_8778', sort_order=1)]


In [27]:
# Metadata rows of the hand-picked attributes.
attr_df.loc[attr_df["code"].isin(attr_codes_relevant)]

Unnamed: 0,code,labels,type,localizable,scopable,unique,group,group_labels,sort_order,allowed_extensions,...,max_file_size,metric_family,minimum_input_length,negative_allowed,number_min,number_max,reference_data_name,validation_rule,validation_regexp,wysiwyg_enabled
53,icecat_15767,"{'de_DE': 'Einfache Anwendung', 'en_GB': 'Easy...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",15767,[],...,,,,,,,,,,
349,icecat_4860,"{'de_DE': 'Desktop-Ständer', 'en_GB': 'Desktop...",AttributeType.BOOL,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",4860,[],...,,,,,,,,,,
365,icecat_6767,"{'de_DE': 'Maximale Bildschirmgröße', 'en_GB':...",AttributeType.METRIC,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",6767,[],...,,Angle,,False,,,,,,
422,icecat_8778,"{'de_DE': 'Etui-Typ', 'en_GB': 'Case type', 'e...",AttributeType.SELECT_SINGLE,True,True,False,features,"{'en_US': 'Features', 'en_GB': 'Features', 'de...",8778,[],...,,,,,,,,,,


In [29]:
# Metrics when clustering on all hand-picked attributes together.
relevant_records = products_df[attr_codes_relevant].to_dict("records")
dataset = ac.dataset_from_records(relevant_records)
calc_metrics(dataset)

{('Stabilität', 'Serie'): 0.6576955477535401,
 ('Stabilität', 'Model'): 0.85305580747738,
 ('Qualität', 'Serie'): 0.4209270169839284,
 ('Qualität', 'Model'): 0.7410499866039816,
 ('Korrektheit', 'Serie'): -0.02727411022928831,
 ('Korrektheit', 'Model'): 0.4684579339249058}

In [30]:
# Metrics when clustering on just icecat_6767 and icecat_4860.
pair_codes = ["icecat_6767", "icecat_4860"]
pair_records = products_df[pair_codes].to_dict("records")
dataset = ac.dataset_from_records(pair_records)
calc_metrics(dataset)

{('Stabilität', 'Serie'): 0.7609633137836843,
 ('Stabilität', 'Model'): 0.7892641435898878,
 ('Qualität', 'Serie'): 0.5576598594600688,
 ('Qualität', 'Model'): 0.5610081094508261,
 ('Korrektheit', 'Serie'): 0.11558408182093417,
 ('Korrektheit', 'Model'): 0.5212002231021134}