In [2]:
import ast
from collections import defaultdict
from functools import reduce
from itertools import chain

import dvc.api
import numpy as np
import pandas as pd
from tqdm import tqdm

# Get original data

In [3]:
# Resolve each processed Appen input batch through DVC and stack them into one frame.
# Only batch1 and batch2_highimpression are loaded; the plain batch2 file
# (appen_product_attribution_batch2.csv) is deliberately not part of the input.
input_repo = 'git@github.com:ContextLogic/multitask-llm-rnd.git'
input_batch_paths = [
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch1.csv',
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2_highimpression.csv',
]
df_original = pd.concat([
    pd.read_csv(dvc.api.get_url(batch_path, repo=input_repo))
    for batch_path in input_batch_paths
])

In [4]:
# Row count of the stacked input batches
df_original.shape[0]

62142

In [5]:
# Rows without a label_ordering value cannot be joined to the Appen outputs, so drop them
required_columns = ['label_ordering']
df_original = df_original.dropna(subset=required_columns)

In [6]:
# Row count after dropping rows with a missing label_ordering
df_original.shape[0]

62137

In [7]:
# Cast the join key to int now that it contains no missing values
df_original = df_original.astype({'label_ordering': int})

# Get attributes

In [8]:
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')


def _merge_value_lists(serialized_lists):
    """Parse every stringified literal in ``serialized_lists`` and concatenate the results.

    Each element is expected to be the text of a Python sequence literal.
    ``ast.literal_eval`` is used instead of ``eval`` so that CSV cell contents
    are only ever parsed as literals and can never execute code.

    Returns:
        A single flat list with the items of all parsed sequences, in order.
    """
    # chain.from_iterable flattens in linear time; the previous pairwise
    # list concatenation re-copied the accumulated list at every step.
    return list(chain.from_iterable(ast.literal_eval(item) for item in serialized_lists))


# One row per attribute_field, holding the merged values of all its rows
df_attributes_group = (
    df_attributes
    .groupby('attribute_field')
    .agg({'category_attributevalue': _merge_value_lists})
    .reset_index()
)

In [9]:
# (number of attribute fields, total number of merged values across all fields)
values_per_attribute = df_attributes_group['category_attributevalue'].apply(len)
(len(df_attributes_group), values_per_attribute.sum())

(230, 516378)

In [10]:
# Set of attribute names, used for membership checks when validating rater output
attributes = {*df_attributes_group['attribute_field']}

In [11]:
# Distinct category paths listed in the attribute definition file
category_paths = {*df_attributes['category']}

In [12]:
# For every category path, emit each of its ' > '-joined prefixes
# (first level, first two levels, ..., the full path) and add them to the set.
more_paths = [
    ' > '.join(levels[:depth])
    for levels in (path.split(' > ') for path in category_paths)
    for depth in range(1, len(levels) + 1)
]
category_paths.update(more_paths)

# Existing Appen data

In [13]:
# Appen output files, in the order in which they are later concatenated
appen_output_paths = [
    'appen/output_batch_correct_v3/product_attribution_till_030923_valid_units.csv',
    'appen/output_batch_correct_v4/f2072154_f2075355_post_03.13.23_cleaned_invalid.csv',
    'appen/output_batch_correct_v4/f2072154_f2075355_post_03.13.23_cleaned_valid-empty units.csv',
    'appen/output_batch_correct_v4/Product Attribution_03.10.23_Batch2_highimpression.csv',
    'appen/output_batch_correct_v4/product_attribution_03.03.23_invalid_units_03.02.23_Rework.csv',
]
df1, df2, df3, df4, df5 = map(pd.read_csv, appen_output_paths)

In [14]:
# Row count of each Appen output file
tuple(len(frame) for frame in (df1, df2, df3, df4, df5))

(38076, 866, 19678, 10808, 419)

In [15]:
# Stack all Appen outputs; the order matters for the keep='first' de-duplication that follows
appen_frames = [df1, df2, df3, df4, df5]
df = pd.concat(appen_frames)

In [16]:
# Keep one row per label_ordering: the first occurrence in concatenation order
df_uniq = df.drop_duplicates(subset=['label_ordering'], keep='first')

In [17]:
# (rows before de-duplication, rows after)
(df.shape[0], df_uniq.shape[0])

(69847, 54314)

In [18]:
# Attach the title from the original input as 'title_original2' so it can be
# compared against (and used to backfill) the titles in the Appen output.
original_titles = df_original[['label_ordering', 'title']].rename(
    columns={'title': 'title_original2'}
)
df_uniq_merge = df_uniq.merge(original_titles, on='label_ordering', how='left')

In [19]:
# Row count after the left merge with the original titles
df_uniq_merge.shape[0]

54314

In [20]:
# Rows for which the merge found no title in the original input
no_original_title = df_uniq_merge['title_original2'].isna()
df_uniq_merge[no_original_title]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output,title_original2


In [21]:
# Rows whose Appen title differs from the original input title once both are stripped
appen_titles = df_uniq_merge['title'].apply(lambda text: text.strip())
input_titles = df_uniq_merge['title_original2'].apply(lambda text: text.strip())
df_uniq_merge[appen_titles != input_titles]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output,title_original2
46846,162392,text_and_img,5cdd05b52e33a70fe773d613,Lixada Breathable Fishing Life Vest 209lb Bu...,This fishing Life jacket looks more like an ou...,https://canary.contestimg.wish.com/api/webimag...,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing > Fishing Apparel > Fishing V...,,,,True,,,,,​Lixada Breathable Fishing Life Vest 209lb B...


In [22]:
# Backfill a missing title_original from the merged input title, then drop the helper column
title_missing = df_uniq_merge['title_original'].isna()
df_uniq_merge.loc[title_missing, 'title_original'] = df_uniq_merge.loc[title_missing, 'title_original2']
del df_uniq_merge['title_original2']

In [23]:
# Rebuild final_output as the three rater outputs joined by newlines;
# a rater with no output contributes an empty string.
rater_outputs = [
    df_uniq_merge[column].fillna('')
    for column in ('rater1_output', 'rater2_output', 'rater3_output')
]
combined_output = rater_outputs[0]
for rater_output in rater_outputs[1:]:
    combined_output = combined_output + '\n' + rater_output
df_uniq_merge['final_output'] = combined_output

In [25]:
def _parse_labels(final_output):
    """Return the sorted, de-duplicated, non-empty lines of a unit's ``final_output``.

    A non-string NaN value is treated as "no labels" and yields an empty list.
    """
    if (not isinstance(final_output, str)) and np.isnan(final_output):
        return []
    return [label for label in sorted(set(final_output.split('\n'))) if len(label) > 0]


def _attribute_name(label):
    """Return the second-to-last ' > ' segment of ``label``, or None if it has fewer than two segments."""
    segments = label.split(' > ')
    return segments[-2] if len(segments) >= 2 else None


correct = []       # units in which every label has a known attribute name (includes empty units)
errors = []        # units with at least one malformed or unknown label
errors_fixed = []  # the error units, with 'final_output_corrected' added
nonempty = []      # correct units with at least one label
empty = []         # units with no labels at all

for record in df_uniq_merge.to_dict('records'):
    labels = _parse_labels(record['final_output'])
    if len(labels) == 0:
        empty.append(record)
    is_correct = True
    for label in labels:
        name = _attribute_name(label)
        if name is None:
            record['explanation'] = f"{label} has no attribute name"
            errors.append(record)
            is_correct = False
            break
        if name not in attributes:
            record['explanation'] = f"{label} has invalid attribute name, because {name} is not a valid attribute name"
            errors.append(record)
            is_correct = False
            break
    if is_correct:
        correct.append(record)
        if len(labels) > 0:
            nonempty.append(record)

for record in errors:
    labels = _parse_labels(record['final_output'])
    corrected_labels = []
    for label in labels:
        # A label without a ' > ' separator has no attribute segment; it is
        # dropped here rather than indexed, which would raise IndexError.
        name = _attribute_name(label)
        if name is not None and name in attributes:
            corrected_labels.append(label)
    # The dict is mutated in place, so the very same objects held in `errors`
    # also carry the 'final_output_corrected' key from here on.
    record['final_output_corrected'] = '\n'.join(corrected_labels)
    errors_fixed.append(record)

len(df), len(correct), len(errors), len(errors_fixed), len(empty), len(nonempty), len(correct) + len(errors)

(69847, 51025, 3289, 3289, 2771, 48254, 54314)

In [63]:
# (share of correct units that are empty, share that are non-empty, error rate)
# NOTE(review): the error rate divides by len(df), i.e. the row count before
# de-duplication, while `errors` is built from the de-duplicated units - confirm intent.
n_correct = len(correct)
(len(empty) / n_correct, len(nonempty) / n_correct, len(errors) / len(df))

(0.05430671239588437, 0.9456932876041156, 0.047088636591406934)

In [26]:
# Turn the record lists into DataFrames
df_correct, df_error, df_error_fixed = (
    pd.DataFrame(records) for records in (correct, errors, errors_fixed)
)

In [27]:
# Split of the correct units into those with labels and those without
df_nonempty, df_empty = (pd.DataFrame(records) for records in (nonempty, empty))

In [28]:
# Empty and non-empty units must together account for every correct unit
n_partitioned = len(df_empty) + len(df_nonempty)
assert len(df_correct) == n_partitioned

In [29]:
# Check whether rater3_no_attributes is truthy for every empty unit
df_empty['rater3_no_attributes'].all()

True

In [30]:
# Sizes of the three output splits: (empty, non-empty, error)
tuple(frame.shape[0] for frame in (df_empty, df_nonempty, df_error))

(2771, 48254, 3289)

In [1]:
# Fraction of de-duplicated units that have no labels at all; computed from the
# frames rather than hard-coded counts so it stays correct when the data changes.
len(df_empty) / len(df_uniq_merge)

0.05101815369886217

In [72]:
# Write the valid, empty and invalid splits to the v5 output directory
v5_split_outputs = {
    'appen/output_batch_correct_v5/product_attribution_till_032423_valid_units.csv': df_nonempty,
    'appen/output_batch_correct_v5/product_attribution_till_032423_empty_units.csv': df_empty,
    'appen/output_batch_correct_v5/product_attribution_till_032423_invalid_units.csv': df_error,
}
for csv_path, split_frame in v5_split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [31]:
# Write the invalid units together with their corrected output
fixed_units_path = 'appen/output_batch_correct_v5/product_attribution_till_032423_invalid_units_fixed.csv'
df_error_fixed.to_csv(fixed_units_path, index=False)

In [86]:
# Inspect one randomly chosen empty unit
df_empty.sample(n=1).to_dict()

{'label_ordering': {2398: 20173},
 'sample_method': {2398: 'only_text'},
 'product_id': {2398: '60d8185730aeaa187b11d195'},
 'title': {2398: 'Charles A. Lindbergh /N(1902-1974). American Aviator. Front Page Of The New York Times, 2 March 1932, Featuring An Article About The Kidnapping Of The Lindbergh Baby. Poster Print by Granger Collection - Item # VARGRC0051801'},
 'product_description': {2398: 'Charles A. Lindbergh /N(1902-1974). American Aviator. Front Page Of The New York Times, 2 March 1932, Featuring An Article About The Kidnapping Of The Lindbergh Baby. Poster Print by Granger Collection - Item # VARGRC0051801 would make the perfect addition to your home or office or gift recipient.  This Poster Print is ready for hanging or framing and ships in an oversized tube for maximum protection.'},
 'main_image_url': {2398: nan},
 'product_category': {2398: 'Home & Garden > Home Decor > Painting & Calligraphy'},
 'l2_category': {2398: 'Home & Garden > Home Decor'},
 'rater1_output': {2

In [83]:
# Inspect one randomly chosen non-empty (valid) unit
df_nonempty.sample(n=1).to_dict()

{'label_ordering': {2963: 3912},
 'sample_method': {2963: 'text_and_img'},
 'product_id': {2963: '6122372339ba3364a8d1ec9b'},
 'title': {2963: "Fashion Men's 925 Silver Classic Cubic Zircon Wide Ring Fashion Jewelry Gifts (Size: US Size 4-11)"},
 'product_description': {2963: "100% new and high quality\nStyle: Classic Cubic Zircon Wide Ring\nMaterial: 925 Silver, AAA Zircon\nColor: Silver\nSize: US Size 4-11\n\nPackage included:\n1 x Men's Ring"},
 'main_image_url': {2963: 'https://canary.contestimg.wish.com/api/webimage/6122372339ba3364a8d1ec9b-0-large.jpg'},
 'product_category': {2963: 'Jewelry & Accessories > Rings'},
 'l2_category': {2963: 'Jewelry & Accessories > Rings'},
 'rater1_output': {2963: 'Jewelry & Accessories > Rings > Materials > Silver\nJewelry & Accessories > Rings > Primary Color > Silver\nJewelry & Accessories > Rings > Size > 4\nJewelry & Accessories > Rings > Size > 5\nJewelry & Accessories > Rings > Size > 6\nJewelry & Accessories > Rings > Size > 7\nJewelry & Ac

In [87]:
# Inspect one randomly chosen invalid unit
df_error.sample(n=1).to_dict()

{'label_ordering': {356: 2761},
 'sample_method': {356: 'only_text'},
 'product_id': {356: '60b5ecd0b0d7864a7ee1fd97'},
 'title': {356: '2021 New 8D HiFi Bluetooth 5.0 CVC8.0 Noise Reduction Stereo Wireless TWS Bluetooth Headset LED Display Headset Waterproof Dual Headphones with Power Bank Charging Case'},
 'product_description': {356: 'Feature describe:\n1.LED digital power display. Add a new power display screen, charging cabin, headphone electricity at a glance, real-time perception of electricity.\n2.Automatic pairing.Long by two headphones main buttons at the same time, until the headphones red and blue lights flashing at the same time and then loosen the button, Quickly click on the left\nEarphone twice, Two ear opportunity to connect automatically. When one of the Headphones stops blinking, the pairing is complete.\n3.Humanized operation design, according to ergonomics, adopts semi-in-ear design, three-point support is more suitable for the auricle, making the ear more comforta

In [32]:
# Inspect one randomly chosen invalid unit after correction
df_error_fixed.sample(n=1).to_dict()

{'label_ordering': {1820: 16134},
 'sample_method': {1820: 'only_text'},
 'product_id': {1820: '61510cfd895dbcc89ad70773'},
 'title': {1820: '12pcs/set Christmas Tree Wooden Pendant Snowflake Angel Heart Shape Hanging Ornaments for Party Decorations Xmas Gifts'},
 'product_description': {1820: "Quantity:12pcs/set\nMaterial:wooden\nSize:as the picture show\nNotice:\n1.Due to the light and screen difference, the item's color may be slightly different from the pictures.\n2.Please allow slight differences due to manual measurement."},
 'main_image_url': {1820: nan},
 'product_category': {1820: 'Home & Garden > Festive & Party Supplies > Christmas > Pendant & Drop Ornaments'},
 'l2_category': {1820: 'Home & Garden > Festive & Party Supplies'},
 'rater1_output': {1820: 'Home & Garden > Festive & Party Supplies > Christmas > Ball Ornaments'},
 'rater2_output': {1820: 'Home & Garden > Festive & Party Supplies > Christmas > Pendant & Drop Ornaments > Alpha Size > One Size\nHome & Garden > Festi

In [78]:
# Original input rows whose label_ordering never appears in the Appen outputs
labelled_ids = set(df_uniq_merge.label_ordering)
leftover = set(df_original.label_ordering) - labelled_ids
df_leftover = df_original[df_original['label_ordering'].isin(leftover)]

In [81]:
# Number of input rows still without Appen output
df_leftover.shape[0]

7823

In [80]:
# Write the input rows that have no Appen output yet
missed_units_path = 'appen/output_batch_correct_v5/product_attribution_missed_032423.csv'
df_leftover.to_csv(missed_units_path, index=False)