In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import adjusted_rand_score, silhouette_score

from src import akeneo, akeneo_clustering as ac, clustering, config

In [2]:
cache = akeneo.create_cache_from_env()

In [3]:
data_dir = config.dir_data / "clustering-results" / "3-both"
data_dir.mkdir(parents=True, exist_ok=True)

In [4]:
channel = "default"
lang = "en_US"

In [5]:
products = ac.parse_products(
    cache,
    attribute_types=ac.TYPES_NUMERICAL + ac.TYPES_CATEGORICAL + ac.TYPES_MULTI + ac.TYPES_TEXT,
)
products_multi_as_single = ac.transform_multi_to_single_cat(products)

products_df = pd.DataFrame(products)
products_multi_as_single_df = pd.DataFrame(products_multi_as_single)

In [6]:
duplicates: dict[str, list[int]] = {}
for index, categories in products_df.to_dict()[ac.KEY_CATEGORIES].items():
    category = categories[0]
    if category[:4] != "dup_":
        continue

    if category not in duplicates:
        duplicates[category] = []
    duplicates[category].append(index)
duplicates

{'dup_s20': [81, 82],
 'dup_s20_ultra': [89, 90],
 'dup_s21_128': [98, 99],
 'dup_s21_256': [100, 101],
 'dup_s21_plus': [104, 105],
 'dup_s21_ultra': [112, 113]}

In [7]:
labels_families = products_df[ac.KEY_FAMILY].to_numpy()
k_families = len(set(labels_families))
pd.Series(labels_families).value_counts(), f"k = {k_families}"

(mobile_phone_cases    80
 smartphones           42
 dtype: int64,
 'k = 2')

In [8]:
labels_generations = []
for _, family, categories in products_df[[ac.KEY_FAMILY, ac.KEY_CATEGORIES]].itertuples():
    labels_generations.append(f"{family[0]}_{categories[-1].split('_')[0]}")
labels_generations = np.array(labels_generations)

k_generations = len(set(labels_generations))
pd.Series(labels_generations).value_counts(), f"k = {k_generations}"

(m_s20    45
 m_s21    31
 s_s21    21
 s_s20    17
 m_s22     4
 s_s22     4
 dtype: int64,
 'k = 6')

In [9]:
labels_models = []
for _, family, categories in products_df[[ac.KEY_FAMILY, ac.KEY_CATEGORIES]].itertuples():
    labels_models.append(f"{family[0]}_{categories[-1]}")
labels_models = np.array(labels_models)

k_models = len(set(labels_models))
pd.Series(labels_models).value_counts(), f"k = {k_models}"

(m_s20          18
 m_s20_plus     14
 m_s21          11
 m_s20_ultra    11
 m_s21_plus     10
 s_s21           7
 s_s21_ultra     6
 s_s20_plus      5
 s_s20           5
 m_s21_ultra     5
 m_s21_fe        5
 s_s20_ultra     4
 s_s21_plus      4
 s_s21_fe        4
 s_s20_fe        3
 m_s22           2
 m_s20_fe        2
 s_s22           2
 m_s22_ultra     1
 m_s22_plus      1
 s_s22_plus      1
 s_s22_ultra     1
 dtype: int64,
 'k = 22')

In [10]:
labels_generations_shared = products_df[ac.KEY_CATEGORIES].map(lambda x: x[-1].split("_")[0]).to_numpy()
k_generations_shared = len(set(labels_generations_shared))
pd.Series(labels_generations_shared).value_counts(), f"k = {k_generations_shared}"

(s20    62
 s21    52
 s22     8
 dtype: int64,
 'k = 3')

In [11]:
labels_models_shared = products_df[ac.KEY_CATEGORIES].map(lambda x: x[-1]).to_numpy()
k_models_shared = len(set(labels_models_shared))
pd.Series(labels_models_shared).value_counts(), f"k = {k_models_shared}"

(s20          23
 s20_plus     19
 s21          18
 s20_ultra    15
 s21_plus     14
 s21_ultra    11
 s21_fe        9
 s20_fe        5
 s22           4
 s22_ultra     2
 s22_plus      2
 dtype: int64,
 'k = 11')

In [12]:
attr_metrics = {
    "code": [],
    "name": [],
    "type": [],
    "typeclass": [],
    "required-cases": [],
    "required-phones": [],
    "non-null": [],
    "unique": [],
}

attr_cases = list(filter(lambda f: f.code == "mobile_phone_cases", cache.families))[0].attributes
attr_cases_req = list(filter(lambda f: f.code == "mobile_phone_cases", cache.families))[0].attribute_requirements[channel]
attr_phones = list(filter(lambda f: f.code == "smartphones", cache.families))[0].attributes
attr_phones_req = list(filter(lambda f: f.code == "smartphones", cache.families))[0].attribute_requirements[channel]

for attr_code, attr in akeneo.Attribute.to_dict(cache.attributes).items():
    if attr_code not in products_df.columns:
        continue

    typeclass = (
        "numerical"
        if attr.type in ac.TYPES_NUMERICAL
        else "categorical"
        if attr.type in ac.TYPES_CATEGORICAL
        else "multi"
        if attr.type in ac.TYPES_MULTI
        else "string"
        if attr.type in ac.TYPES_TEXT
        else "unknown"
    )

    attr_metrics["code"].append(attr_code)
    attr_metrics["name"].append(attr.labels[lang])
    attr_metrics["type"].append(attr.type)
    attr_metrics["typeclass"].append(typeclass)
    attr_metrics["required-cases"].append(None if attr_code not in attr_cases else (attr_code in attr_cases_req))
    attr_metrics["required-phones"].append(None if attr_code not in attr_phones else (attr_code in attr_phones_req))
    attr_metrics["non-null"].append(products_df[attr_code].count())
    attr_metrics["unique"].append(products_df[attr_code].drop_duplicates().count())

attr_df = pd.DataFrame(attr_metrics)
attr_df.sort_values(["non-null", "unique"], ascending=[False, False])

Unnamed: 0,code,name,type,typeclass,required-cases,required-phones,non-null,unique
206,icecat_summary_short,Short Summary,AttributeType.TEXT,string,True,True,122,118
207,icecat_title,Title,AttributeType.TEXT,string,True,True,122,118
205,icecat_name,Name,AttributeType.TEXT,string,True,True,122,82
203,icecat_brand,Brand,AttributeType.TEXT,string,True,True,122,4
204,icecat_description_short,Short Description,AttributeType.TEXT,string,True,True,113,75
...,...,...,...,...,...,...,...,...
115,icecat_38332,Heart rate sensor,AttributeType.BOOL,categorical,,False,1,1
139,icecat_4463_fixed,Package type,AttributeType.SELECT_SINGLE,categorical,False,,1,1
142,icecat_4857,Battery weight,AttributeType.METRIC,numerical,,False,1,1
162,icecat_762,Package weight,AttributeType.METRIC,numerical,,False,1,1


In [13]:
attr_shared_df = attr_df[
    ~attr_df["required-cases"].isna() &
    ~attr_df["required-phones"].isna()
]
attr_shared_df

Unnamed: 0,code,name,type,typeclass,required-cases,required-phones,non-null,unique
17,icecat_1464,Height,AttributeType.METRIC,numerical,False,False,64,22
30,icecat_1649,Width,AttributeType.METRIC,numerical,False,False,64,17
31,icecat_1650,Depth,AttributeType.METRIC,numerical,False,False,64,18
196,icecat_94,Weight,AttributeType.METRIC,numerical,False,True,64,22
203,icecat_brand,Brand,AttributeType.TEXT,string,True,True,122,4
204,icecat_description_short,Short Description,AttributeType.TEXT,string,True,True,113,75
205,icecat_name,Name,AttributeType.TEXT,string,True,True,122,82
206,icecat_summary_short,Short Summary,AttributeType.TEXT,string,True,True,122,118
207,icecat_title,Title,AttributeType.TEXT,string,True,True,122,118


In [14]:
def _calc_duplicates_match(labels) -> float:
    result = 0.0
    n_duplicates = 0
    for _, indexes in duplicates.items():
        y1 = labels[indexes[0]]
        y2 = labels[indexes[1]]

        shared_clusters = len(y1.intersection(y2))
        all_clusters = len(y1.union(y2))

        result += (shared_clusters+1) / all_clusters
        n_duplicates += 1
    
    return result / n_duplicates

def calc_metrics(attr_codes, multi_as_single=False, attr_to_overweight: list[str] = None, factor = 2):
    dataset = (
        ac.dataset_from_records(products_df[attr_codes].to_dict("records"))
        if not multi_as_single
        else ac.dataset_from_records(products_multi_as_single_df[attr_codes].to_dict("records"))
    )
    prox_matrix = ac.calc_proximity_matrix(dataset)

    if attr_to_overweight is not None:
        dataset = ac.overweight_attributes(dataset, attr_to_overweight, factor)

    bik = clustering.BisectingKMeans(dataset, ac.Centroid, random_state=0)
    biks = [
        clustering.BisectingKMeans(dataset, ac.Centroid, random_state=i)
        for i in range(1, 11)
    ]

    stabilities = []
    for k in range(2, len(dataset) + 1):
        values = []
        for bikk in biks:
            values.append(
                adjusted_rand_score(
                    bik.labels_flat(k),
                    bikk.labels_flat(k),
                )
            )
        stabilities.append(np.array(values).mean())
    stabilities = np.array(stabilities)

    qualities = []
    for k in range(2, len(dataset)):
        qualities.append(silhouette_score(prox_matrix, bik.labels_flat(k), metric="precomputed"))
    qualities = np.array(qualities)

    return {
        "stabilities": stabilities,
        "qualities": qualities,
        "match_fam": adjusted_rand_score(labels_families, bik.labels_flat(k_families)),
        "match_gen": adjusted_rand_score(labels_generations, bik.labels_flat(k_generations)),
        "match_mod": adjusted_rand_score(labels_models, bik.labels_flat(k_models)),
        "match_gen_shared": adjusted_rand_score(labels_generations_shared, bik.labels_flat(k_generations_shared)),
        "match_mod_shared": adjusted_rand_score(labels_models_shared, bik.labels_flat(k_models_shared)),
        "match_dup": _calc_duplicates_match(bik.labels)
    }

In [15]:
def metrics_to_table(metric):
    result = {
        "Stability": metric["stabilities"].mean(),
        "Quality": metric["qualities"].mean(),
        "Match Families": metric["match_fam"],
        "Match Generations": metric["match_gen"],
        "Match Models": metric["match_mod"],
        "Match Generations Shared": metric["match_gen_shared"],
        "Match Models Shared": metric["match_mod_shared"],
        "Match Duplicates": metric["match_dup"],
    }
    return result

In [18]:
def attr_metrics_make():
    attr_metrics = {}
    for attr_code in attr_df["code"].to_list():
        m = calc_metrics([attr_code])
        attr_metrics[attr_code] = metrics_to_table(m)
    return attr_metrics
attr_metrics = config.load_or_create(data_dir / "attr-metrics.pkl", attr_metrics_make)

In [19]:
attr_metrics_df = pd.DataFrame(attr_metrics).transpose()
attr_metrics_df = attr_metrics_df.round(3) * 100
attr_metrics_df = pd.merge(attr_df, attr_metrics_df.reset_index().rename({"index": "code"}, axis=1), on="code")
attr_metrics_df

Unnamed: 0,code,name,type,typeclass,required-cases,required-phones,non-null,unique,Stability,Quality,Match Families,Match Generations,Match Models,Match Generations Shared,Match Models Shared,Match Duplicates
0,icecat_10157,Near Field Communication (NFC),AttributeType.BOOL,categorical,,False,40,1,40.9,23.0,93.4,41.1,9.4,2.0,0.6,75.0
1,icecat_1024_fixed,Speakers,AttributeType.SELECT_SINGLE,categorical,,False,36,1,35.3,21.7,80.9,34.0,9.4,1.0,0.9,75.0
2,icecat_10399,Stylus included,AttributeType.BOOL,categorical,,False,1,1,3.0,-3.2,1.5,1.1,0.6,0.2,1.2,66.7
3,icecat_10935,Face recognition,AttributeType.BOOL,categorical,,False,34,1,32.1,21.0,74.9,31.6,7.8,-1.0,-0.0,75.0
4,icecat_11379,Processor frequency,AttributeType.METRIC,numerical,,True,38,5,56.4,26.5,87.1,42.0,11.8,0.6,2.8,72.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,icecat_brand,Brand,AttributeType.TEXT,string,True,True,122,4,35.9,29.5,20.1,4.1,1.7,1.6,0.7,75.0
204,icecat_description_short,Short Description,AttributeType.TEXT,string,True,True,113,75,79.4,42.3,6.2,49.2,26.9,4.8,18.0,90.6
205,icecat_name,Name,AttributeType.TEXT,string,True,True,122,82,63.5,39.0,6.2,7.0,10.0,2.4,0.2,76.1
206,icecat_summary_short,Short Summary,AttributeType.TEXT,string,True,True,122,118,79.6,18.3,-0.8,71.1,55.0,2.7,24.4,95.8


In [30]:
case_to_attr = {
    "numerical": attr_df[attr_df["typeclass"]=="numerical"]["code"].to_list(),
    "categorical": attr_df[attr_df["typeclass"]=="categorical"]["code"].to_list(),
    "multi": attr_df[attr_df["typeclass"]=="multi"]["code"].to_list(),
    "string": attr_df[attr_df["typeclass"]=="string"]["code"].to_list(),
    "num+cat": attr_df[attr_df["typeclass"].isin(["numerical", "categorical"])]["code"].to_list(),
    "num+cat+mul": attr_df[attr_df["typeclass"].isin(["numerical", "categorical", "multi"])]["code"].to_list(),
    "num+cat+str": attr_df[attr_df["typeclass"].isin(["numerical", "categorical", "string"])]["code"].to_list(),
    "num+cat+mul+str": attr_df[attr_df["typeclass"].isin(["numerical", "categorical", "multi", "string"])]["code"].to_list(),
    "only shared": attr_shared_df["code"].to_list(),
}
pd.DataFrame([(key, value) for key, value in case_to_attr.items()])

Unnamed: 0,0,1
0,numerical,"[icecat_11379, icecat_12435_fixed, icecat_1243..."
1,categorical,"[icecat_10157, icecat_1024_fixed, icecat_10399..."
2,multi,"[icecat_13248_fixed, icecat_14695_fixed, iceca..."
3,string,"[icecat_12434, icecat_1585, icecat_1597, iceca..."
4,num+cat,"[icecat_10157, icecat_1024_fixed, icecat_10399..."
5,num+cat+mul,"[icecat_10157, icecat_1024_fixed, icecat_10399..."
6,num+cat+str,"[icecat_10157, icecat_1024_fixed, icecat_10399..."
7,num+cat+mul+str,"[icecat_10157, icecat_1024_fixed, icecat_10399..."
8,only shared,"[icecat_1464, icecat_1649, icecat_1650, icecat..."


In [33]:
attr_codes_intuitive_cases = ["icecat_6767", "icecat_title", "icecat_1464", "icecat_1649", "icecat_1650"]
attr_codes_intuitive_phones = [
    "icecat_13246_fixed",
    "icecat_944",
    "icecat_1464",
    "icecat_1649",
    "icecat_1650",
    "icecat_36912_fixed",
    "icecat_36910_fixed",
    "icecat_12437_fixed",
    "icecat_12435_fixed",
    "icecat_title",
    "icecat_brand",
    "icecat_3233",
    "icecat_40629",
    "icecat_75_fixed",
]
attr_codes_intuitive = list(set(attr_codes_intuitive_cases+attr_codes_intuitive_phones))
attr_codes_intuitive

['icecat_36912_fixed',
 'icecat_title',
 'icecat_brand',
 'icecat_944',
 'icecat_6767',
 'icecat_1649',
 'icecat_1464',
 'icecat_3233',
 'icecat_36910_fixed',
 'icecat_13246_fixed',
 'icecat_1650',
 'icecat_12435_fixed',
 'icecat_75_fixed',
 'icecat_12437_fixed',
 'icecat_40629']

In [34]:
def metrics_make():
    all_attr_codes = attr_df["code"].to_list()
    num_cat_attr_codes = attr_df[attr_df["typeclass"].isin(["numerical", "categorical"])]["code"].to_list()
    return {
        **{
            key: calc_metrics(value)
            for key, value in case_to_attr.items()
        },
        "only intuitive": calc_metrics(attr_codes_intuitive),
        "all & intuitive x2": calc_metrics(all_attr_codes, attr_to_overweight=attr_codes_intuitive, factor=2),
        "all & intuitive x3": calc_metrics(all_attr_codes, attr_to_overweight=attr_codes_intuitive, factor=3),
        "num+cat & intuitive x2": calc_metrics(num_cat_attr_codes, attr_to_overweight=attr_codes_intuitive, factor=2),
        "num+cat & intuitive x3": calc_metrics(num_cat_attr_codes, attr_to_overweight=attr_codes_intuitive, factor=3),
    }
metrics = config.load_or_create(data_dir / "metrics.pkl", metrics_make)

In [35]:
pd.DataFrame({n: metrics_to_table(m) for n, m in metrics.items()}).transpose().round(3) * 100

Unnamed: 0,Stability,Quality,Match Families,Match Generations,Match Models,Match Generations Shared,Match Models Shared,Match Duplicates
numerical,89.5,47.0,100.0,32.4,57.7,5.6,25.1,97.2
categorical,77.3,48.3,100.0,31.2,23.9,7.3,12.8,82.5
multi,83.2,48.5,-4.0,27.3,17.2,0.2,6.7,94.4
string,85.9,27.7,87.1,35.7,38.0,1.9,16.6,100.0
num+cat,96.4,50.0,100.0,47.7,40.3,3.3,16.4,96.7
num+cat+mul,92.5,45.2,100.0,40.7,39.6,6.5,16.7,96.7
num+cat+str,94.2,32.9,100.0,34.4,38.1,0.6,15.4,97.2
num+cat+mul+str,93.7,32.7,100.0,37.4,38.0,1.1,18.0,97.2
only shared,94.6,28.1,20.1,26.5,32.6,0.6,13.9,97.2
only intuitive,93.7,35.3,36.3,31.3,46.0,5.6,22.8,95.8


In [36]:
metrics_to_table(calc_metrics(attr_df[attr_df["typeclass"].isin(["numerical", "categorical"])]["code"].to_list()+["icecat_title"]))

{'Stability': 0.9627164685466899,
 'Quality': 0.35794956170862935,
 'Match Families': 1.0,
 'Match Generations': 0.45527654810792817,
 'Match Models': 0.41463855402106825,
 'Match Generations Shared': 0.033275673649832756,
 'Match Models Shared': 0.16785011796107213,
 'Match Duplicates': 0.9722222222222223}