In [1]:
import pandas as pd
import json
import hashlib
from tqdm import tqdm
import xlsxwriter
from xlsxwriter.utility import xl_rowcol_to_cell
import dvc.api

In [2]:
from collections import OrderedDict
from copy import deepcopy 

def sortOD(od):
    res = OrderedDict()
    for k, v in sorted(od.items()):
        if isinstance(v, dict):
            res[k] = sortOD(v)
        else:
            res[k] = deepcopy(v)
    return res

In [3]:
l2s = []
with open('2023_q1_top_25_l2s.txt', 'r') as f:
    for l in f:
        if len(l.replace('\n', '').strip()) > 0:
            l2s.append(l.replace('\n', '').strip())

In [4]:
df_meta = pd.read_csv("Initial Attribute Definition for First Release - UPDATED SHEET .csv")

In [5]:
df_meta_25l2s = df_meta[df_meta.category.apply(lambda x: any([x.startswith(i) for i in l2s]))]

In [6]:
len(set(df_meta_25l2s.category.apply(lambda x: ' > '.join(x.split(' > ')[:2]))))

25

In [7]:
df_meta_25l2s['L2'] = df_meta_25l2s.category.apply(lambda x: ' > '.join(x.split(' > ')[:2]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta_25l2s['L2'] = df_meta_25l2s.category.apply(lambda x: ' > '.join(x.split(' > ')[:2]))


In [8]:
df_meta_25l2s_nofreetext = df_meta_25l2s[df_meta_25l2s['entry mode'] != 'free_text']

In [9]:
meta_dict = {}
for i in set(df_meta.category):
    meta_dict[i] = df_meta[(df_meta.category == i) & (df_meta['entry mode'] != 'free_text')].sort_values(
        "attribute_field"
    ).to_dict('records')

In [10]:
df_query = pd.read_csv(dvc.api.get_url(
    'modelling/notebooks/query_attr_extract_appen_label/appen_query_attribution_batch1.csv',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git'
))

In [11]:
df_product = pd.read_csv(dvc.api.get_url(
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch1.csv',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git'
))

In [12]:
df_query_appen = pd.read_csv(dvc.api.get_url(
    'datasets/data/query_attr_extract_label/appen/output_batch_correct_v2/feedback/query_attribution_02.03.23_valid_units_02.21.23.csv',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git'
))

df_product_appen = pd.read_csv(dvc.api.get_url(
    'datasets/data/wish_attr_extract_label/appen/output_batch_correct_v2/feedback/product_attribution_02.03.23_valid_units_02.21.23.csv',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git'
))

In [20]:
product_label_orderings = set(df_product_appen.sort_values('Label_Ordering').tail(100)['Label_Ordering'])
query_label_orderings = set(df_query_appen.sort_values('Label_Ordering').tail(100)['Label_Ordering'])

In [22]:
df_query = df_query[df_query.label_ordering.apply(lambda x: int(x) in query_label_orderings)]

In [26]:
def tmp(x):
    try:
        return int(x) in product_label_orderings
    except:
        return False
df_product = df_product[df_product.label_ordering.apply(tmp)]

In [27]:
len(df_query), len(df_product)

(100, 100)

# product

In [28]:
recs = []
for i in tqdm(df_product.to_dict('records')):
    if i['category_path'] in meta_dict:
        for j in meta_dict[i['category_path']]:
            rec = sortOD({
                "category_path": i['category_path'],
                "product_id": i['product_id'],
                "title": i['title'],
                "product_description": i['product_description'],
                "main_image_url": i['main_image_url'],
                "attribute_field": j['attribute_field'],
                "attribute_value": j['category_attributevalue'],
                "entry_mode": j['entry mode'],
                "max_multi_select": str(j['max_multi_select']),
                "attribute_description": j['description']
            })
            task_json = json.dumps(rec).encode('utf-8')
            task_hash = hashlib.md5(task_json).hexdigest()
            task_id = f"product_attribution_{task_hash}"
            rec['task_id'] = task_id
            recs.append(rec)

100%|██████████| 100/100 [00:00<00:00, 1601.86it/s]


In [29]:
tmp_df = pd.DataFrame(recs)

In [30]:
assert len(tmp_df) == len(set(tmp_df.task_id))

In [31]:
len(set(tmp_df['product_id'])) / len(df_product)

1.0

In [32]:
len(tmp_df) / len(set(tmp_df['product_id']))

10.57

In [33]:
attr_val_set_to_cell_range = {}

workbook = xlsxwriter.Workbook('offshore_review_appen/product_attribute_extraction_offshore_batch2.xlsx')
worksheet = workbook.add_worksheet('attribute_val')
worksheet2 = workbook.add_worksheet('attribute_definition')

r = 0
for i in set(tmp_df['attribute_value']):
    c = 0
    start_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    for j in eval(i):
        worksheet2.write(r, c, j)
        end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
        c += 1
    attr_val_set_to_cell_range[i] = (start_cell, end_cell)
    r += 1


cols = ['task_id', 'category_path', 'title', 'product_description', 'main_image_url', 
    'max_multi_select', 'entry_mode', 'attribute_field', 'attribute_value']

r = 0
for c in range(len(cols)):
    worksheet.write(r, c, cols[c])

worksheet.write(r, len(cols), 'custom_value')
worksheet.write(r, len(cols) + 1, 'comments')

r += 1
for i in tqdm(tmp_df.to_dict('records')):
    for c in range(len(cols)):
        if cols[c] != 'attribute_value':
            worksheet.write(r, c, str(i[cols[c]]))
        else:
            start_cell, end_cell = attr_val_set_to_cell_range[i[cols[c]]]
            worksheet.data_validation(r, c, r, c, {
                'validate': 'list',
                'source': f'=attribute_definition!{start_cell}:{end_cell}',
                'input_message': i['attribute_description']
            })
    r += 1

workbook.close()

100%|██████████| 1057/1057 [00:00<00:00, 9990.20it/s] 


# query

In [34]:
recs = []
for i in tqdm(df_query.to_dict('records')):
    path_i_tup = tuple(i['top_query_classification_taxonomy'].split(' > '))
    for path in meta_dict:
        if tuple(path.split(' > '))[:len(path_i_tup)] == path_i_tup:
            for j in meta_dict[path]:
                rec = sortOD({
                    "category_path": i['top_query_classification_taxonomy'],
                    "query": i['query'],
                    "attribute_field": j['attribute_field'],
                    "attribute_value": j['category_attributevalue'],
                    "entry_mode": j['entry mode'],
                    "max_multi_select": str(j['max_multi_select']),
                    "attribute_description": j['description']
                })
                task_json = json.dumps(rec).encode('utf-8')
                task_hash = hashlib.md5(task_json).hexdigest()
                task_id = f"query_attribution_{task_hash}"
                rec['task_id'] = task_id
                recs.append(rec)

100%|██████████| 100/100 [00:00<00:00, 288.71it/s]


In [35]:
tmp_df = pd.DataFrame(recs)

In [36]:
tmp_df = tmp_df.drop_duplicates('task_id')

In [37]:
len(set(tmp_df['query'])) / len(df_query)

1.0

In [38]:
len(tmp_df) / len(set(tmp_df['query']))

11.43

In [39]:

attr_val_set_to_cell_range = {}

workbook = xlsxwriter.Workbook('offshore_review_appen/query_attribute_extraction_offshore_batch2.xlsx')
worksheet = workbook.add_worksheet('attribute_val')
worksheet2 = workbook.add_worksheet('attribute_definition')

r = 0
for i in set(tmp_df['attribute_value']):
    c = 0
    start_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
    for j in eval(i):
        worksheet2.write(r, c, j)
        end_cell = xl_rowcol_to_cell(r, c, row_abs=True, col_abs=True)
        c += 1
    attr_val_set_to_cell_range[i] = (start_cell, end_cell)
    r += 1

cols = ['task_id', 'category_path', 'query', 
    'max_multi_select', 'entry_mode', 'attribute_field', 'attribute_value']

r = 0
for c in range(len(cols)):
    worksheet.write(r, c, cols[c])

worksheet.write(r, len(cols), 'custom_value')
worksheet.write(r, len(cols) + 1, 'comments')

r += 1
for i in tqdm(tmp_df.to_dict('records')):
    for c in range(len(cols)):
        if cols[c] != 'attribute_value':
            worksheet.write(r, c, i[cols[c]])
        else:
            start_cell, end_cell = attr_val_set_to_cell_range[i[cols[c]]]
            worksheet.data_validation(r, c, r, c, {
                'validate': 'list',
                'source': f'=attribute_definition!{start_cell}:{end_cell}',
                'input_message': i['attribute_description']
            })
    r += 1
workbook.close()

100%|██████████| 1143/1143 [00:00<00:00, 14975.87it/s]
