In [204]:
import pandas as pd
import json
import hashlib
from tqdm import tqdm
import xlsxwriter
from xlsxwriter.utility import xl_rowcol_to_cell


In [205]:
from collections import OrderedDict
from copy import deepcopy 

def sortOD(od):
    res = OrderedDict()
    for k, v in sorted(od.items()):
        if isinstance(v, dict):
            res[k] = sortOD(v)
        else:
            res[k] = deepcopy(v)
    return res

In [206]:
l2s = []
with open('2023_q1_top_25_l2s.txt', 'r') as f:
    for l in f:
        if len(l.replace('\n', '').strip()) > 0:
            l2s.append(l.replace('\n', '').strip())

In [207]:
df_meta = pd.read_csv("Initial Attribute Definition for First Release - UPDATED SHEET .csv")

In [208]:
df_meta_25l2s = df_meta[df_meta.category.apply(lambda x: any([x.startswith(i) for i in l2s]))]

In [209]:
len(set(df_meta_25l2s.category.apply(lambda x: ' > '.join(x.split(' > ')[:2]))))

25

In [210]:
df_meta_25l2s['L2'] = df_meta_25l2s.category.apply(lambda x: ' > '.join(x.split(' > ')[:2]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta_25l2s['L2'] = df_meta_25l2s.category.apply(lambda x: ' > '.join(x.split(' > ')[:2]))


In [211]:
df_meta_25l2s.groupby('L2').agg({
    'attribute_field': lambda x: len(set([i for i in x]))
})

Unnamed: 0_level_0,attribute_field
L2,Unnamed: 1_level_1
Cellphones & Telecommunications > Mobile Phone Accessories,38
Cellphones & Telecommunications > Phone Bags & Cases,14
Consumer Electronics > Earphones & Headphones,25
"Home & Garden > Arts, Crafts & Sewing",43
Home & Garden > Festive & Party Supplies,52
Home & Garden > Garden Supplies,41
Home & Garden > Home Decor,69
Home & Garden > Home Storage & Organization,35
Home & Garden > Home Textile,51
Home & Garden > Household Merchandise,33


In [212]:
df_meta_25l2s_nofreetext = df_meta_25l2s[df_meta_25l2s['entry mode'] != 'free_text']

In [213]:
df_meta_25l2s_nofreetext.groupby('L2').agg({
    'attribute_field': lambda x: len(set([i for i in x]))
})

Unnamed: 0_level_0,attribute_field
L2,Unnamed: 1_level_1
Cellphones & Telecommunications > Mobile Phone Accessories,32
Cellphones & Telecommunications > Phone Bags & Cases,13
Consumer Electronics > Earphones & Headphones,24
"Home & Garden > Arts, Crafts & Sewing",24
Home & Garden > Festive & Party Supplies,40
Home & Garden > Garden Supplies,25
Home & Garden > Home Decor,52
Home & Garden > Home Storage & Organization,15
Home & Garden > Home Textile,37
Home & Garden > Household Merchandise,21


In [214]:
df_query = pd.read_csv('../query_attr_extract_label/allstratified_sample_4806_query.csv')
df_product = pd.read_csv('../wish_attr_extract_label/sdt887_product_attribution_data_one_listing_per_leaf_node__20230111.csv')

In [215]:
meta_dict = {}
for i in set(df_meta.category):
    meta_dict[i] = df_meta[(df_meta.category == i) & (df_meta['entry mode'] != 'free_text')].sort_values(
        "attribute_field"
    ).to_dict('records')

# product

In [216]:
recs = []
for i in tqdm(df_product.to_dict('records')):
    if i['category_path'] in meta_dict:
        for j in meta_dict[i['category_path']]:
            rec = sortOD({
                "category_path": i['category_path'],
                "product_id": i['product_id'],
                "title": i['title'],
                "product_description": i['product_description'],
                "main_image_url": i['main_image_url'],
                "attribute_field": j['attribute_field'],
                "attribute_value": j['category_attributevalue'],
                "entry_mode": j['entry mode'],
                "max_multi_select": str(j['max_multi_select']),
                "attribute_description": j['description']
            })
            task_json = json.dumps(rec).encode('utf-8')
            task_hash = hashlib.md5(task_json).hexdigest()
            task_id = f"product_attribution_{task_hash}"
            rec['task_id'] = task_id
            recs.append(rec)

100%|██████████| 5033/5033 [00:00<00:00, 7531.98it/s]


In [217]:
tmp_df = pd.DataFrame(recs)

In [218]:
assert len(tmp_df) == len(set(tmp_df.task_id))

In [219]:
len(set(tmp_df['product_id'])) / len(df_product)

0.37711106695807667

In [220]:
len(tmp_df) / len(set(tmp_df['product_id']))

8.133298208640674

In [221]:

attr_val_set_to_cell_range = {}

workbook = xlsxwriter.Workbook('../wish_attr_extract_label/offshore_excel/sdt887_product_attribution_data_one_listing_per_leaf_node__20230111_25l2subset_offshoreexcel_20230119.xlsx')
worksheet = workbook.add_worksheet('attribute_val')
worksheet2 = workbook.add_worksheet('attribute_definition')

r = 0
for i in set(tmp_df['attribute_value']):
    c = 0
    start_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    for j in eval(i):
        worksheet2.write(r, c, j)
        end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
        c += 1
    attr_val_set_to_cell_range[i] = (start_cell, end_cell)
    r += 1


cols = ['task_id', 'title', 'product_description', 'main_image_url', 
    'max_multi_select', 'entry_mode', 'attribute_field', 'attribute_value']

r = 0
for c in range(len(cols)):
    worksheet.write(r, c, cols[c])

r += 1
for i in tqdm(tmp_df.to_dict('records')):
    for c in range(len(cols)):
        if cols[c] != 'attribute_value':
            worksheet.write(r, c, i[cols[c]])
        else:
            start_cell, end_cell = attr_val_set_to_cell_range[i[cols[c]]]
            worksheet.data_validation(r, c, r, c, {
                'validate': 'list',
                'source': f'=attribute_definition!{start_cell}:{end_cell}',
                'input_message': i['attribute_description']
            })
    r += 1
workbook.close()

100%|██████████| 15437/15437 [00:01<00:00, 9128.72it/s]


# query

In [222]:
df_query

Unnamed: 0,label_ordering,query,sample_method,top_query_classification_taxonomy
0,2308,hyundai elantra accessories,head,Automobiles & Motorcycles > Car Electronics > ...
1,2310,peine para perros,head,Home & Garden > Pet Products > Dog Grooming > ...
2,2316,mens boots clearance,head,Shoes > Men's Shoes > Men's Boots > Basic Boots
3,2318,garbage pail kids,head,Toys & Hobbies > Puzzles & Games > Games > Car...
4,2320,luces de piscina,uniform,Home Improvement > Lights & Lighting > Outdoor...
...,...,...,...,...
4801,99925,spray gun paint,head,Home Improvement > Painting Supplies & Wall Tr...
4802,99947,kurta set for women,head,Novelty & Special Use > Traditional & Cultural...
4803,99964,strap on harnesses,head,Mother & Kids > Activity & Gear > Harnesses & ...
4804,99972,coleira para gato,head,Home & Garden > Pet Products > Cat Supplies > ...


In [223]:
recs = []
for i in tqdm(df_query.to_dict('records')):
    path_i_tup = tuple(i['top_query_classification_taxonomy'].split(' > '))
    for path in meta_dict:
        if tuple(path.split(' > '))[:len(path_i_tup)] == path_i_tup:
            for j in meta_dict[path]:
                rec = sortOD({
                    "category_path": i['top_query_classification_taxonomy'],
                    "query": i['query'],
                    "attribute_field": j['attribute_field'],
                    "attribute_value": j['category_attributevalue'],
                    "entry_mode": j['entry mode'],
                    "max_multi_select": str(j['max_multi_select']),
                    "attribute_description": j['description']
                })
                task_json = json.dumps(rec).encode('utf-8')
                task_hash = hashlib.md5(task_json).hexdigest()
                task_id = f"product_attribution_{task_hash}"
                rec['task_id'] = task_id
                recs.append(rec)

100%|██████████| 4806/4806 [00:17<00:00, 268.06it/s]


In [224]:
tmp_df = pd.DataFrame(recs)

In [225]:
tmp_df = tmp_df.drop_duplicates('task_id')

In [226]:
len(set(tmp_df['query'])) / len(df_query)

0.5740740740740741

In [227]:
len(tmp_df) / len(set(tmp_df['query']))

11.439289597680318

In [228]:

attr_val_set_to_cell_range = {}

workbook = xlsxwriter.Workbook('../query_attr_extract_label/offshore_excel/allstratified_sample_4806_query_25l2subset_offshoreexcel_20230119.xlsx')
worksheet = workbook.add_worksheet('attribute_val')
worksheet2 = workbook.add_worksheet('attribute_definition')

r = 0
for i in set(tmp_df['attribute_value']):
    c = 0
    start_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    for j in eval(i):
        worksheet2.write(r, c, j)
        end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
        c += 1
    attr_val_set_to_cell_range[i] = (start_cell, end_cell)
    r += 1

cols = ['task_id', 'query', 
    'max_multi_select', 'entry_mode', 'attribute_field', 'attribute_value']

r = 0
for c in range(len(cols)):
    worksheet.write(r, c, cols[c])

r += 1
for i in tqdm(tmp_df.to_dict('records')):
    for c in range(len(cols)):
        if cols[c] != 'attribute_value':
            worksheet.write(r, c, i[cols[c]])
        else:
            start_cell, end_cell = attr_val_set_to_cell_range[i[cols[c]]]
            worksheet.data_validation(r, c, r, c, {
                'validate': 'list',
                'source': f'=attribute_definition!{start_cell}:{end_cell}',
                'input_message': i['attribute_description']
            })
    r += 1
workbook.close()

100%|██████████| 31561/31561 [00:02<00:00, 13919.07it/s]
