# Testing Rolling Up

In [1]:
import pandas as pd

In [6]:
data = [{'category': 'a', 'data': 3}, {'category': 'b', 'data': 3}, {'category': 'c', 'data': 1}, {'category': 'd', 'data': 3}, {'category': 'a', 'data': 2}]
df = pd.DataFrame(data)

In [7]:
df

Unnamed: 0,category,data
0,a,3
1,b,3
2,c,1
3,d,3
4,a,2


In [21]:
replacements = [{'category': 'a', 'parent': 'e'}, {'category': 'c', 'parent': 'd'}]
for replacement in replacements:
    df.loc[df.category == replacement['category'], 'category'] = replacement['parent']

In [22]:
df

Unnamed: 0,category,data
0,e,3
1,b,3
2,d,1
3,d,3
4,e,2


In [25]:
len(df['category'].value_counts()[lambda x: x < 1].index)

0

# Looking at category DF

In [26]:
import xml.etree.ElementTree as ET
categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'
root_category_id = 'cat00000'
tree = ET.parse(categories_file_name)
root = tree.getroot()

In [27]:
# Parse the category XML file to map each category id to its parent category id in a dataframe.
categories = []
parents = []
for child in root:
    id = child.find('id').text
    cat_path = child.find('path')
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame(list(zip(categories, parents)), columns =['category', 'parent'])

In [28]:
parents_df

Unnamed: 0,category,parent
0,abcat0010000,cat00000
1,abcat0011000,abcat0010000
2,abcat0011001,abcat0011000
3,abcat0011002,abcat0011000
4,abcat0011003,abcat0011000
...,...,...
4634,pcmcat97200050013,cat15205
4635,pcmcat97200050015,cat15063
4636,pcmcat99000050001,pcmcat50000050006
4637,pcmcat99000050002,pcmcat99000050001


In [66]:
parents_df[parents_df.category == 'abcat0011003'].parent.iloc[0]

'abcat0011000'

# Normalizing Test

In [81]:
import re
in_data = [{'category': 'a', 'data': 'test'}, {'category': 'b', 'data': 'test2'}, {'category': 'c', 'data': 'test3'}, {'category': 'd', 'data': 'test45'}, {'category': 'a', 'data': 'test.5'}]
df1 = pd.DataFrame(in_data)

In [100]:
def normalize_query(row):
    query = row.data
    query = query.lower()
    query = re.sub(r'[^\w\s]', ' ', query)
    row.data = query
    return row

In [101]:
df1.apply(normalize_query, axis=1)

Unnamed: 0,category,data
0,a,test
1,b,test2
2,c,test3
3,d,test45
4,a,test 5


# Results

In [10]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

In [106]:
from collections import defaultdict
base_file = '/workspace/datasets/labeled_query_data_mq_{}.shuffled'
min_queries = [1, 100, 1000]
for min_query in min_queries:
    labels = defaultdict(int)
    with open(base_file.format(min_query), 'r') as f:
        lines = f.readlines()
        for line in lines:
            labels[line.split(' ')[0]] += 1
    print(f'Min Queries: {min_query} Categories {len(labels.keys())}')

Min Queries: 1 Categories 1486
Min Queries: 100 Categories 866
Min Queries: 1000 Categories 374


| Minumum Queries | Number of Categories|
|:--------- | :-------------------|
| 1 | 1483 |
| 100 | 866 |
| 1000 | 374 |

In [2]:
store = pd.HDFStore('/workspace/datasets/training_results.h5')
store.df

Unnamed: 0,Minimum Queries,Epochs,Learning Rate,wordNgrams,Recs@1,P@1,R@1,Recs@3,P@3,R@3,Recs@5,P@5,R@5
0,1,5,0.40,1,49818,0.511542,0.511542,49818,0.228773,0.686318,49818,0.149882,0.749408
1,1,5,0.40,2,49818,0.514071,0.514071,49818,0.229803,0.689409,49818,0.150536,0.752680
2,1,5,0.35,1,49818,0.515838,0.515838,49818,0.230071,0.690212,49818,0.150745,0.753724
3,1,5,0.35,2,49818,0.510639,0.510639,49818,0.228806,0.686419,49818,0.149601,0.748003
4,1,5,0.25,1,49818,0.510097,0.510097,49818,0.227180,0.681541,49818,0.148488,0.742442
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1000,25,0.25,2,50000,0.517280,0.517280,50000,0.234553,0.703660,50000,0.154100,0.770500
146,1000,25,0.15,1,50000,0.519560,0.519560,50000,0.234260,0.702780,50000,0.153024,0.765120
147,1000,25,0.15,2,50000,0.520620,0.520620,50000,0.234827,0.704480,50000,0.153952,0.769760
148,1000,25,0.05,1,50000,0.520160,0.520160,50000,0.234107,0.702320,50000,0.153324,0.766620


In [9]:
df = store.df
best_at_1 = df.sort_values('P@1', ascending=False).iloc[0]
best_at_3 = df.sort_values('P@3', ascending=False).iloc[0]
best_at_5 = df.sort_values('P@5', ascending=False).iloc[0]
df = pd.DataFrame([best_at_1, best_at_3, best_at_5])
df

Unnamed: 0,Minimum Queries,Epochs,Learning Rate,wordNgrams,Recs@1,P@1,R@1,Recs@3,P@3,R@3,Recs@5,P@5,R@5
115,1000.0,10.0,0.25,2.0,50000.0,0.5238,0.5238,50000.0,0.23504,0.70512,50000.0,0.154352,0.77176
125,1000.0,15.0,0.25,2.0,50000.0,0.51912,0.51912,50000.0,0.235707,0.70712,50000.0,0.15438,0.7719
125,1000.0,15.0,0.25,2.0,50000.0,0.51912,0.51912,50000.0,0.235707,0.70712,50000.0,0.15438,0.7719


In [10]:
store.close()

# Conclusion
I am going to go with the minimum queries of 1000, with 15 epochs, a learning rate of .25, and wordNgrams of 2. This gives the best at 3 and 5, as well as only being a little worse than the best at 1. 