In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

from src import akeneo, akeneo_clustering as ac, clustering

In [2]:
cache = akeneo.create_cache_from_env()

In [4]:
attr_df = pd.DataFrame(cache.attributes)
attr_df["typeclass"] = attr_df["type"].map(ac.map_to_attribute_kind)

In [6]:
attr_df[
    (attr_df["group"] != "faulty") &
    #attr_df["code"].isin(cache.families[0].attributes) &
    attr_df["code"].isin(cache.families[0].attribute_requirements["default"])
][["code", "typeclass"]].groupby("typeclass").count()

Unnamed: 0_level_0,code
typeclass,Unnamed: 1_level_1
categorical,2
multi-categorical,1
numerical,1
other,5
string,5


In [3]:
products_df = pd.DataFrame(ac.parse_products(cache))
products_df

Unnamed: 0,__id__,__family__,__categories__,icecat_8156,icecat_8411,icecat_8778,icecat_27575_fixed,icecat_image_0,icecat_name,icecat_brand,...,icecat_1733,icecat_9040,icecat_28715,icecat_42370_fixed,icecat_42371_fixed,icecat_42372_fixed,icecat_1535,icecat_4857,icecat_10399,icecat_38332
0,104889,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,0/a/9/1/0a91f00d0f19027c87a3d5cfa12ca2b3ed6f16...,{104889},{mobipart},...,,,,,,,,,,
1,MP-104927,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,2/6/c/9/26c9db2234b5453445df17e2b5595abc38402c...,{mp-104927},{mobipart},...,,,,,,,,,,
2,MP-104925,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,0/e/f/3/0ef3327abba69987cb51115bbd6cae26555339...,{mp-104925},{mobipart},...,,,,,,,,,,
3,MP-108422,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,e/c/1/9/ec19013b1c5c35f111bfc0e778a6f4727d321c...,{mp-108422},{mobipart},...,,,,,,,,,,
4,MP-104926,mobile_phone_cases,[s20],348496968,348496965,367987544,not_supported,d/3/8/e/d38e2b463817add5225cb6c41bf8bbfc4971df...,{mp-104926},{mobipart},...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,SM-G990BZAGEUD,smartphones,[s21_fe],,,,,e/6/4/e/e64edd88b4502b9eaa2ff6c96ffc23611c05e2...,{sm-g990b},{samsung},...,,,,,,,,,,
118,SM-S901BZADEUE,smartphones,[s22],,,,,e/a/d/a/eada1f1d64c074e80a2d8dd9b80146de4a142d...,{sm-s901b/ds},{samsung},...,,,,,,,,,,
119,SM-S901BZKDEUE,smartphones,[s22],,,,,c/d/6/a/cd6a60a8bc5b3f09c8630f03b3686590cfd619...,{sm-s901b},{samsung},...,True,True,True,0.0,0.0,0.0,,,,
120,SM-S906BZGDEUE,smartphones,[s22_plus],,,,,3/c/e/4/3ce4cbb1cf481ec0d10f682d146b0e9f51ddcd...,{sm-s906b},{samsung},...,True,True,True,0.0,0.0,0.0,,,,


In [5]:
dataset = ac.dataset_from_records(products_df.to_dict("records"))
bik = clustering.BisectingKMeans(dataset, ac.Centroid)

In [7]:
labels_k_11 = bik.labels_flat(11)

In [20]:
products = ac.parse_products(cache)
ac.transform_multi_to_single_cat(products)

[{'__id__': '104889',
  '__family__': 'mobile_phone_cases',
  '__categories__': ['s20'],
  'icecat_8156': '348496968',
  'icecat_8411': '348496965',
  'icecat_8778': '367987544',
  'icecat_27575_fixed': 'not_supported',
  'icecat_image_0': '0/a/9/1/0a91f00d0f19027c87a3d5cfa12ca2b3ed6f16b0_80687897_2308647073.jpg',
  'icecat_name': '104889',
  'icecat_brand': 'mobipart',
  'icecat_title': '104889,6.2,black,case,cover,mobil,mobipart,phone',
  'icecat_summary_short': '104889,6.2,black,cover,galaxi,mobipart,s20,samsung',
  'icecat_description_short': 'black,case,galaxi,grip,mobipart,rug,s20,samsung,tough',
  'icecat_890': 'Galaxy S20',
  'icecat_summary': 'Mobiparts 104889. Case type: Cover, Brand compatibility: Samsung, Compatibility: Galaxy S20, Maximum screen size: 6.2", Surface coloration: Monotone, Product color: Black',
  'icecat_description': 'Features: Mobiparts Rugged collection Maximum drop- and impact protection Drop tested impact protection Two-layered design Soft and flexible 

In [None]:
pd.factorize(products_df["icecat_brand"].map(lambda x: ",".join(list(x))))

In [17]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Columns: 215 entries, __id__ to icecat_38332
dtypes: float64(58), object(157)
memory usage: 205.0+ KB


In [16]:
le = LabelEncoder()

le.fit_transform(products_df["icecat_brand"].map(lambda x: ",".join(list(x))))

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [10]:
mutual_info_classif(products_df, labels_k_11)

ValueError: could not convert string to float: 'MP-104927'