In [28]:
import pandas as pd

import sys
import os
import xml.etree.ElementTree as ET
from pathlib import Path


def get_child_to_parent(
    categories_filename='/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml',
    root_category_id='cat00000',
):
    """Map every non-root category id to its direct parent category id.

    Each child element of the XML root is expected to carry a ``path``
    element whose children each have an ``id``. The last id on the path is
    taken as the category itself and the one before it as its parent.

    Args:
        categories_filename: Path of the category XML file to parse.
        root_category_id: Id of the root category; it has no parent and is
            therefore left out of the result.

    Returns:
        A DataFrame indexed by ``category`` with a single ``parent`` column.
        Rows appear in the order the categories occur in the file.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: if the file cannot be
            read or is not well-formed XML (raised by ``ET.parse``).
    """
    root = ET.parse(categories_filename).getroot()

    categories = []
    parents = []
    for child in root:
        cat_path = child.find('path')
        if cat_path is None:
            # Entry without a path: nothing to derive a parent from.
            continue
        cat_path_ids = [cat.find('id').text for cat in cat_path]
        if len(cat_path_ids) < 2:
            # A path needs at least [parent, leaf]. This skips the root
            # (path of length one) and any malformed single-entry path
            # instead of failing on cat_path_ids[-2].
            continue
        leaf_id = cat_path_ids[-1]
        if leaf_id != root_category_id:
            categories.append(leaf_id)
            parents.append(cat_path_ids[-2])

    parents_df = pd.DataFrame({'category': categories, 'parent': parents})
    return parents_df.set_index('category')


def get_cat_lookup(
    max_depth=10,
    categories_filename='/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml',
):
    """Build a dict from category id to a readable ``' > '``-joined name path.

    For every child element of the XML root, the ids and names on its
    ``path`` element are read. The id of the last path entry becomes the key
    and the names of the path entries, starting from the first one, are
    joined with ``' > '`` to form the value.

    Args:
        max_depth: Maximum number of path levels to include in the string.
            A value of zero or less keeps the full path.
        categories_filename: Path of the category XML file to parse.

    Returns:
        dict mapping the last id on each path to its joined name string.
        If the same id occurs more than once, the last occurrence wins.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: if the file cannot be
            read or is not well-formed XML (raised by ``ET.parse``).
    """
    root = ET.parse(categories_filename).getroot()
    catDict = {}
    for child in root:
        catPath = child.find('path')
        # Explicit None/len checks: an entry with no (or an empty) path has
        # no leaf id to key on, so it is skipped rather than crashing on [-1].
        if catPath is None or len(catPath) == 0:
            continue
        leafCat = catPath[-1].find('id').text
        levels = list(catPath)
        if max_depth > 0:
            # Keep only the first max_depth levels, counted from the top.
            levels = levels[:max_depth]
        catDict[leafCat] = ' > '.join(cat.find('name').text for cat in levels)
    return catDict
            
# Build both lookups once; later cells reuse them.
# child_to_parent: DataFrame indexed by category id with a 'parent' column.
# cat_lookup: dict of category id -> ' > '-joined category-name path.
child_to_parent = get_child_to_parent()
cat_lookup = get_cat_lookup()


In [29]:
# Fixed seed so the sampled rows, and every statistic derived from them,
# are identical after Restart Kernel -> Run All.
SAMPLE_SEED = 42

# Work on a random 10,000-row sample of the training queries.
queries = pd.read_csv('/workspace/datasets/train.csv').sample(10_000, random_state=SAMPLE_SEED)

# Attach the readable category path; categories missing from cat_lookup map to NaN.
queries['path'] = queries['category'].map(cat_lookup)
queries = queries.set_index('category').drop(['sku', 'user', 'query_time'], axis=1)

# Keep only rows whose category was found in the lookup.
queries = queries[~queries.path.isna()].copy()

# Number of levels in the path. Split on the exact separator written by
# get_cat_lookup (' > ') so a bare '>' inside a category name is not counted.
queries['path_length'] = queries['path'].map(lambda x: len(x.split(' > ')))

# Number of sampled queries per category, broadcast back onto each row.
sizes = queries.groupby('category').size()
queries['leaf_counts'] = queries.index.map(sizes)

queries.sample(10, random_state=SAMPLE_SEED)

Unnamed: 0_level_0,query,click_time,path,path_length,leaf_counts
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abcat0706002,OnlineMidnightSale_Gaming,2011-09-13 21:07:45.029,Best Buy > Video Games > Wii > Wii Games,4,59
abcat0301014,Gps,2011-10-29 03:00:02.973,"Best Buy > Car, Marine & GPS > GPS Navigation ...",4,60
pcmcat209000050008,LaborDay_CameraCamcorder_20110902,2011-09-03 07:35:42.635,Best Buy > Computers & Tablets > Tablets & iPa...,4,367
abcat0101001,55 LCD HDTV,2011-10-18 19:37:55.518,Best Buy > TV & Home Theater > TVs > All Flat-...,4,431
pcmcat168400050038,Zune,2011-10-05 15:47:12.509,Best Buy > Movies & Music > Digital Music > Zu...,4,2
cat02015,high school musical,2011-10-19 23:35:16.395,Best Buy > Movies & Music > Movies & TV Shows,3,963
cat02009,s video cable,2011-09-21 22:47:10.493,Best Buy > Movies & Music > Music > Pop,4,129
abcat0101001,led tvs,2011-10-20 10:44:49.383,Best Buy > TV & Home Theater > TVs > All Flat-...,4,431
abcat0913004,safes,2011-10-18 14:57:35.979,Best Buy > Home > Home Security & Safety > Safes,4,1
abcat0715016,Ds lite,2011-08-16 16:14:41.929,Best Buy > Video Games > Nintendo DS > Nintend...,4,18


In [39]:
def get_node_parent(cat_id, mapping_df, category_sizes_mapping=None):
    """Return the parent id of ``cat_id``, or ``'IsParent'`` if it has none.

    Args:
        cat_id: Category id to look up in the index of ``mapping_df``.
        mapping_df: DataFrame indexed by category id with a ``parent``
            column.
        category_sizes_mapping: Currently unused. It is optional so that the
            function can be called with just ``(cat_id, mapping_df)``; it is
            kept in the signature so three-argument calls continue to work.

    Returns:
        The ``parent`` value stored for ``cat_id``, or the sentinel string
        ``'IsParent'`` when ``cat_id`` is not present in the index.
    """
    try:
        row = mapping_df.loc[cat_id]
    except KeyError:
        # Not in the child -> parent index, so no parent is known for this id
        # (this is also what happens when the sentinel itself is looked up).
        return 'IsParent'
    return row.parent

# degree_1_parent: parent of each row's category (the frame's index), looked up in child_to_parent.
# degree_2_parent: the same lookup applied to degree_1_parent, i.e. the grandparent.
# Ids that are not in child_to_parent's index (including the 'IsParent' sentinel itself) yield 'IsParent'.
# NOTE(review): get_node_parent is called with two arguments here, so its
# third parameter (category_sizes_mapping) must be optional for these lines to run.
queries['degree_1_parent'] = queries.index.map(lambda x: get_node_parent(x, child_to_parent) )
queries['degree_2_parent'] = queries.degree_1_parent.map(lambda x: get_node_parent(x, child_to_parent))

In [43]:
# Number of sampled queries per category (from queries.groupby('category').size()).
sizes

category
abcat0101001          431
abcat0101002            5
abcat0101005            5
abcat0102003           40
abcat0102005            9
                     ... 
pcmcat254000050005     16
pcmcat254000050007      1
pcmcat254000050008      1
pcmcat254000050009      2
pcmcat254300050006      1
Length: 809, dtype: int64

In [27]:
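# How many categories have no descendants?
# TODO: this is not computed yet. The output below appears to be the
# child_to_parent frame (category -> parent), not a count of leaf categories.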


Unnamed: 0_level_0,parent
category,Unnamed: 1_level_1
abcat0010000,cat00000
abcat0011000,abcat0010000
abcat0011001,abcat0011000
abcat0011002,abcat0011000
abcat0011003,abcat0011000
...,...
pcmcat97200050013,cat15205
pcmcat97200050015,cat15063
pcmcat99000050001,pcmcat50000050006
pcmcat99000050002,pcmcat99000050001


In [4]:
# !head /workspace/datasets/train.csv\
#     | cut -d',' -f3 | \
#     python leavesToPaths.py --max_depth 6

In [None]:
# !grep touchpad /workspace/datasets/train.csv\
#     | cut -d',' -f3 | \
#     python leavesToPaths.py --max_depth 4 | sort | uniq -c | sort -nr | head