# Icecat Taxonomy

Icecat defines a taxonomy per product category. The dataset should contain products from the category "Mobile Phone Cases" and "Smartphones". So, a look on the Icecat Taxonomy for both categories has to be taken now.

Both taxonomies were downloaded from Icecat as Excel files to the data directory.


In [1]:
import pandas as pd

from src import config

## Loading the Taxonomy Files


In [2]:
dir_data_src = config.dir_data / "icecat-taxonomy"
dir_data_result = config.dir_data / "dataset"

In [3]:
taxonomy_cases = pd.read_excel(dir_data_src / "mobile_phone_cases.xlsx")
taxonomy_cases

Unnamed: 0,Catid,UNSPSC,Global feature group ID,Feature group,Feature_id,Feature,Is mandatory,Is searchable,Type,Restricted values,Measure sign
0,2779,43191691,14,Weight & dimensions,1649,Width,0,0,numerical,,mm
1,2779,43191691,14,Weight & dimensions,1650,Depth,0,0,numerical,,mm
2,2779,43191691,14,Weight & dimensions,1464,Height,0,0,numerical,,mm
3,2779,43191691,14,Weight & dimensions,94,Weight,0,0,numerical,,g
4,2779,43191691,14,Weight & dimensions,2646,Interior dimensions (W x D x H),0,0,3d,,mm
...,...,...,...,...,...,...,...,...,...,...,...
63,2779,43191691,185,Features,1766,Product colour,1,1,multi_dropdown,Aluminium\nAnthracite\nAssorted colours\nBeige...,
64,2779,43191691,185,Features,898,Material,1,1,multi_dropdown,Aramide\nABS synthetics\nABS\nAcrylic\nEgg she...,
65,2779,43191691,185,Features,8156,Brand compatibility,1,1,dropdown,Apple\nSamsung\nSony\nHTC\nLG\nUniversal\nNoki...,
66,2779,43191691,185,Features,6767,Maximum screen size,1,1,numerical,,""""


In [4]:
taxonomy_phones = pd.read_excel(dir_data_src / "smartphones.xlsx")
taxonomy_phones

Unnamed: 0,Catid,UNSPSC,Global feature group ID,Feature group,Feature_id,Feature,Is mandatory,Is searchable,Type,Restricted values,Measure sign
0,1893,43191528,14,Weight & dimensions,1649,Width,0,0,numerical,,mm
1,1893,43191528,14,Weight & dimensions,1650,Depth,0,0,numerical,,mm
2,1893,43191528,14,Weight & dimensions,1464,Height,0,0,numerical,,mm
3,1893,43191528,47,Storage,730,Compatible memory cards,1,0,multi_dropdown,CF\nCF+\nCF Type II\nexpressP2\nMemory Stick (...,
4,1893,43191528,47,Storage,7021,Maximum memory card size,0,0,numerical,,GB
...,...,...,...,...,...,...,...,...,...,...,...
349,1893,43191528,54,Navigation,26425,Position location,1,1,y_n,,
350,1893,43191528,95,Camera,27587,Rear camera type,1,1,dropdown,Single camera\nDual camera\nTriple camera\nQua...,
351,1893,43191528,47,Storage,7861,RAM capacity,1,1,numerical,,GB
352,1893,43191528,90,Software,75,Platform,1,1,dropdown,Android\nAsha\nBada\nBlackBerry\nFirefox\niOS\...,


## Analysis of the Columns


In [5]:
pd.merge(taxonomy_cases, taxonomy_phones, how="outer", on="Feature_id")

Unnamed: 0,Catid_x,UNSPSC_x,Global feature group ID_x,Feature group_x,Feature_id,Feature_x,Is mandatory_x,Is searchable_x,Type_x,Restricted values_x,...,Catid_y,UNSPSC_y,Global feature group ID_y,Feature group_y,Feature_y,Is mandatory_y,Is searchable_y,Type_y,Restricted values_y,Measure sign_y
0,2779.0,43191691.0,14.0,Weight & dimensions,1649,Width,0.0,0.0,numerical,,...,1893.0,43191528.0,14.0,Weight & dimensions,Width,0.0,0.0,numerical,,mm
1,2779.0,43191691.0,14.0,Weight & dimensions,1650,Depth,0.0,0.0,numerical,,...,1893.0,43191528.0,14.0,Weight & dimensions,Depth,0.0,0.0,numerical,,mm
2,2779.0,43191691.0,14.0,Weight & dimensions,1464,Height,0.0,0.0,numerical,,...,1893.0,43191528.0,14.0,Weight & dimensions,Height,0.0,0.0,numerical,,mm
3,2779.0,43191691.0,14.0,Weight & dimensions,94,Weight,0.0,0.0,numerical,,...,1893.0,43191528.0,14.0,Weight & dimensions,Weight,1.0,1.0,numerical,,g
4,2779.0,43191691.0,14.0,Weight & dimensions,2646,Interior dimensions (W x D x H),0.0,0.0,3d,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,,,,,26425,,,,,,...,1893.0,43191528.0,54.0,Navigation,Position location,1.0,1.0,y_n,,
386,,,,,27587,,,,,,...,1893.0,43191528.0,95.0,Camera,Rear camera type,1.0,1.0,dropdown,Single camera\nDual camera\nTriple camera\nQua...,
387,,,,,7861,,,,,,...,1893.0,43191528.0,47.0,Storage,RAM capacity,1.0,1.0,numerical,,GB
388,,,,,75,,,,,,...,1893.0,43191528.0,90.0,Software,Platform,1.0,1.0,dropdown,Android\nAsha\nBada\nBlackBerry\nFirefox\niOS\...,


In [6]:
cols = ["Feature", "Feature group", "Type", "Restricted values", "Measure sign"]
num_of_entries = []

for col in cols:
    merged_df = pd.merge(taxonomy_cases, taxonomy_phones, "outer", ["Feature_id", col])
    num_of_entries.append(len(merged_df))

pd.DataFrame({"merge on": cols, "num of entries": num_of_entries})


Unnamed: 0,merge on,num of entries
0,Feature,390
1,Feature group,403
2,Type,391
3,Restricted values,391
4,Measure sign,390


In [7]:
cols = ["Feature_id", "Feature", "Feature group"]

merged_df = pd.merge(
    taxonomy_cases[cols],
    taxonomy_phones[cols],
    "outer",
    ["Feature_id", "Feature"],
    suffixes=["_cases", "_phones"],
).dropna()

merged_df[merged_df["Feature group_cases"] != merged_df["Feature group_phones"]]

Unnamed: 0,Feature_id,Feature,Feature group_cases,Feature group_phones
6,8801,Screen protector,Packaging data,Packaging content
7,3808,Package width,Packaging data,Weight & dimensions
8,3806,Package depth,Packaging data,Weight & dimensions
9,3807,Package height,Packaging data,Weight & dimensions
10,762,Package weight,Packaging data,Weight & dimensions
12,8072,Protection features,Features,Design
14,909,Battery capacity,Features,Battery
18,3294,Country of origin,Features,Performance
30,7900,International Protection (IP) code,Features,Design
33,12148,Wireless charging,Features,Performance


In [8]:
cols = ["Feature_id", "Feature", "Type"]

merged_df = pd.merge(
    taxonomy_cases[cols],
    taxonomy_phones[cols],
    "outer",
    ["Feature_id", "Feature"],
    suffixes=["_cases", "_phones"],
).dropna()

merged_df[merged_df["Type_cases"] != merged_df["Type_phones"]]

Unnamed: 0,Feature_id,Feature,Type_cases,Type_phones
12,8072,Protection features,dropdown,multi_dropdown


## Compile Result


### Attribute Groups

In [9]:
groups_df = pd.merge(
    taxonomy_cases["Feature group"],
    taxonomy_phones["Feature group"],
    "outer",
).drop_duplicates().rename({"Feature group": "locale_en"}, axis=1)

groups_df["code"] = (
    groups_df["locale_en"].str.replace("&", "and").str.replace(" ", "_").str.lower()
)

groups_df = groups_df.sort_index(axis=1).sort_values("code")
groups_df

Unnamed: 0,code,locale_en
524,battery,Battery
392,camera,Camera
676,certificates,Certificates
691,design,Design
608,display,Display
86,features,Features
116,logistics_data,Logistics data
587,messaging,Messaging
546,multimedia,Multimedia
515,navigation,Navigation


In [10]:
groups_df.to_csv(dir_data_result / "attribute-groups.csv", index=False)

### Attributes & Families

In [11]:
cols = ["Feature_id", "Feature", "Feature group", "Type", "Is mandatory"]

merged_df = pd.merge(
    taxonomy_cases[cols],
    taxonomy_phones[cols],
    "outer",
    cols[0:-1],
).drop_duplicates(cols[0], keep="last") # prefer group and type from `taxonomy_phones`

merged_df

Unnamed: 0,Feature_id,Feature,Feature group,Type,Is mandatory_x,Is mandatory_y
0,1649,Width,Weight & dimensions,numerical,0.0,0.0
1,1650,Depth,Weight & dimensions,numerical,0.0,0.0
2,1464,Height,Weight & dimensions,numerical,0.0,0.0
3,94,Weight,Weight & dimensions,numerical,0.0,1.0
4,2646,Interior dimensions (W x D x H),Weight & dimensions,3d,0.0,
...,...,...,...,...,...,...
398,26425,Position location,Navigation,y_n,,1.0
399,27587,Rear camera type,Camera,dropdown,,1.0
400,7861,RAM capacity,Storage,numerical,,1.0
401,75,Platform,Software,dropdown,,1.0


In [12]:
(merged_df["Feature_id"] < 2).drop_duplicates()

0    False
Name: Feature_id, dtype: bool

In [13]:
map_col_names = {
    "Feature_id": "code",
    "Feature": "locale_en",
    "Feature group": "group",
    "Type": "type",
    "Is mandatory_x": "mobile_phone_cases",
    "Is mandatory_y": "smartphones",
}
map_values = {
    0.0: "optional",
    1.0: "required",
}

renamed_df = merged_df.rename(map_col_names, axis=1).replace(map_values)
renamed_df["group"] = (
    renamed_df["group"].str.replace("&", "and").str.replace(" ", "_").str.lower()
)

attributes_df = renamed_df.sort_values("code")
attributes_df

Unnamed: 0,code,locale_en,group,type,mobile_phone_cases,smartphones
102,34,Contrast ratio (typical),display,contrast ratio,,optional
91,47,Processor model,processor,dropdown,,required
112,48,Digital zoom,camera,numerical,,optional
212,74,Optical zoom,camera,numerical,,optional
401,75,Platform,software,dropdown,,required
...,...,...,...,...,...,...
382,44158,RAM expansion (max),storage,numerical,,optional
367,44188,AI scene recognition,camera,y_n,,optional
385,44243,Fourth rear camera pixel size,camera,numerical,,optional
386,44244,Fourth rear camera field of view (FOV) angle,camera,numerical,,optional


In [14]:
attributes_df.to_csv(dir_data_result / "attributes-icecat.csv", index=False)