In [6]:
import pandas as pd
from collections import defaultdict
import dvc.api
from tqdm import tqdm
import numpy as np
from functools import reduce

# get original data

In [7]:
# Resolve every processed Appen input batch through DVC and stack them into a
# single frame. Row indices of the individual files are kept as-is.
input_batch_repo = 'git@github.com:ContextLogic/multitask-llm-rnd.git'
input_batch_dir = 'datasets/data/wish_attr_extract_label/appen/input_batch_processed/'
input_batch_files = [
    'appen_product_attribution_batch1.csv',
    'appen_product_attribution_batch2_highimpression.csv',
    'appen_product_attribution_batch2.csv',
    'appen_product_attribution_batch3.csv',
    'appen_product_attribution_batch4.csv',
]

input_batches = []
for batch_file in input_batch_files:
    batch_url = dvc.api.get_url(input_batch_dir + batch_file, repo=input_batch_repo)
    input_batches.append(pd.read_csv(batch_url))
df_original = pd.concat(input_batches)

In [8]:
len(df_original)

241287

In [9]:
df_original = df_original.dropna(subset=['label_ordering'])

In [10]:
len(df_original)

241282

In [11]:
df_original['label_ordering'] = df_original['label_ordering'].astype(int)

# get attributes

In [12]:
# Attribute definitions; each `category_attributevalue` cell is parsed below and
# is presumably the repr of a Python list -- TODO confirm against the CSV.
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')


def _flatten_value_lists(cells):
    """Parse every cell of a group and return all parsed values as one flat list.

    Values keep the order of the cells and the order within each cell. A single
    flattening pass replaces the pairwise `x + y` reduction, which re-copied the
    accumulated list for every cell.
    """
    # NOTE(review): eval() executes whatever text the CSV cell contains. This is
    # only acceptable for a trusted, repo-local file; ast.literal_eval would be
    # the safer parser if the cells are plain literals -- verify before swapping.
    return [value for cell in cells for value in eval(cell)]


# One row per attribute_field, holding the concatenated values of all its rows.
df_attributes_group = df_attributes.groupby('attribute_field').agg({
    'category_attributevalue': _flatten_value_lists
}).reset_index()

In [13]:
len(df_attributes_group), df_attributes_group.category_attributevalue.apply(len).sum()

(230, 516378)

In [14]:
attributes = set(df_attributes_group['attribute_field'])

In [15]:
category_paths = set(df_attributes['category'])

In [16]:
# Add every ancestor prefix of each category path to category_paths,
# e.g. 'A > B > C' also contributes 'A' and 'A > B'.
more_paths = []
for full_path in category_paths:
    levels = full_path.split(' > ')
    for depth in range(1, len(levels) + 1):
        more_paths.append(' > '.join(levels[:depth]))
category_paths.update(more_paths)

# existing appen data

In [17]:
def _read_appen_output(version, file_name):
    """Read one CSV from the appen/output_batch_correct_<version>/ directory."""
    return pd.read_csv(f'appen/output_batch_correct_{version}/{file_name}')


df0 = _read_appen_output('v8', 'Product Att_03.31.23_2746units.csv')
df1 = _read_appen_output('v5', 'product_attribution_till_032423_valid_units.csv')
df2 = _read_appen_output('v6', 'product_attribution_just_032723_valid_units.csv')
df3 = _read_appen_output('v8', 'Product Att_03.2723_empty_missed_rework_6773_valid.csv')
df4 = _read_appen_output('v8', 'Product Att_03.31.23_72units.csv')
# df5 stacks the four Batch3/Batch4 files into a single frame.
df5 = pd.concat([
    _read_appen_output('v8', batch_name)
    for batch_name in (
        'Product Att_03.30.23_Batch3A_valid.csv',
        'Product Att_03.30.23_Batch3B_valid.csv',
        'Product Att_03.31.23_Batch4A.csv',
        'Product Att_03.31.23_Batch4B.csv',
    )
])
df6 = _read_appen_output('v7', 'product_attribution_missed_032423_just_032723_invalid_units_fixed.csv')
df7 = _read_appen_output('v6', 'product_attribution_just_032723_invalid_units_fixed.csv')
df8 = _read_appen_output('v5', 'product_attribution_till_032423_invalid_units_fixed.csv')

In [18]:
# Tag each frame (in place) with the name of the file(s) it was read from.
# df5 was built from four files, so its tag is their names joined with '|'.
source_name_by_frame = [
    (df0, 'Product Att_03.31.23_2746units.csv'),
    (df1, 'product_attribution_till_032423_valid_units.csv'),
    (df2, 'product_attribution_just_032723_valid_units.csv'),
    (df3, 'Product Att_03.2723_empty_missed_rework_6773_valid.csv'),
    (df4, 'Product Att_03.31.23_72units.csv'),
    (df5, 'Product Att_03.30.23_Batch3A_valid.csv|Product Att_03.30.23_Batch3B_valid.csv|Product Att_03.31.23_Batch4A.csv|Product Att_03.31.23_Batch4B.csv'),
    (df6, 'product_attribution_missed_032423_just_032723_invalid_units_fixed.csv'),
    (df7, 'product_attribution_just_032723_invalid_units_fixed.csv'),
    (df8, 'product_attribution_till_032423_invalid_units_fixed.csv'),
]
for frame, source_name in source_name_by_frame:
    frame['file_name'] = source_name

In [19]:
len(df0), len(df1), len(df2), len(df3), len(df4), len(df5), len(df6), len(df7), len(df8)

(2746, 48254, 43525, 6773, 72, 127349, 7789, 912, 3289)

In [20]:
df = pd.concat([df0, df1, df2, df3, df4, df5, df6, df7, df8])

In [23]:
len(df), len(set(df['label_ordering'])), len(set(df['label_ordering']))

(240709, 240673, 240673)

In [24]:
set(df['label_ordering']) - set(df_original['label_ordering'])

set()

In [25]:
len(set(df_original['label_ordering']) - set(df['label_ordering']))

609

In [26]:
df_uniq = df.drop_duplicates('label_ordering', keep='first')

In [27]:
len(df), len(df_uniq)

(240709, 240673)

In [28]:
set(df_uniq)

{'category_path',
 'explanation',
 'file_name',
 'final_output',
 'final_output_corrected',
 'l2_category',
 'label_ordering',
 'main_image_url',
 'product_category',
 'product_description',
 'product_description_original',
 'product_id',
 'rater1_no_attributes',
 'rater1_output',
 'rater1_unlisted_value',
 'rater2_no_attributes',
 'rater2_output',
 'rater2_unlisted_value',
 'rater3_no_attributes',
 'rater3_output',
 'rater3_unlisted_value',
 'sample_method',
 'title',
 'title_original'}

In [29]:
# Bring the text and image columns of the original input alongside the delivered
# ones (suffixed `_original2`) so the two versions can be compared per unit.
original_text_cols = df_original[['label_ordering', 'title', 'product_description', 'main_image_url']]
original_text_cols = original_text_cols.rename(columns={
    'title': 'title_original2',
    'product_description': 'product_description_original2',
    'main_image_url': 'main_image_url_original2',
})
df_uniq_merge = pd.merge(df_uniq, original_text_cols, on='label_ordering', how='left')

In [30]:
len(df_uniq_merge)

240673

In [31]:
df_uniq_merge[df_uniq_merge.title.apply(lambda x: str(x).strip()) != df_uniq_merge.title_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_unlisted_value,...,file_name,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
44357,162392,text_and_img,5cdd05b52e33a70fe773d613,Lixada Breathable Fishing Life Vest 209lb Bu...,This fishing Life jacket looks more like an ou...,https://canary.contestimg.wish.com/api/webimag...,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing,,,...,product_attribution_till_032423_valid_units.csv,​Lixada Breathable Fishing Life Vest 209lb B...,Sports > Fishing > Fishing Apparel > Fishing V...,,,,,​Lixada Breathable Fishing Life Vest 209lb B...,This fishing Life jacket looks more like an ou...,https://canary.contestimg.wish.com/api/webimag...
110774,118572,only_text,60653b38ddc105a55285a859,American Europe Women's Fashion Jewelry Antiqu...,Stone: Moonstone\nColor: silver\nSize (US): 5 ...,,Jewelry & Accessories > Rings,Jewelry & Accessories > Rings,,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,​​American Europe Women's Fashion Jewelry Anti...,Stone: Moonstone\nColor: silver\nSize (US): 5 ...,
111146,119230,text_and_img,610105dfe1fcdddd74dffd8d,Apple AirPods with Wireless Charging Case MRXJ...,Feature:\nLatest and best AirPod\n\nLighting c...,https://canary.contestimg.wish.com/api/webimag...,Consumer Electronics > Earphones & Headphones ...,Consumer Electronics > Earphones & Headphones,True,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,Apple AirPods with Wireless Charging Case ‎MRX...,Feature:\nLatest and best AirPod\n\nLighting c...,https://canary.contestimg.wish.com/api/webimag...
111559,119970,text_and_img,60ea51ce0d9cc11e19e1be19,Blue Wolf Dream Catcher Chest Pack Unisex Slin...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,https://canary.contestimg.wish.com/api/webimag...,Luggage & Bags > Men's Bags > Backpacks,Luggage & Bags > Men's Bags,True,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,Blue Wolf Dream Catcher Chest Pack Unisex Slin...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,https://canary.contestimg.wish.com/api/webimag...
113512,123837,only_text,60fe204548a7aecbeb1c2231,7x Leather Craft Half-Round Strap Belt End Pun...,"Condition: New: A brand-new, unused, unopened ...",,"Home & Garden > Arts, Crafts & Sewing > Leathe...","Home & Garden > Arts, Crafts & Sewing",True,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,​ 7x Leather Craft Half-Round Strap Belt End P...,"Condition: New: A brand-new, unused, unopened ...",
117489,130728,only_text,60e94199fa08d2604699c7a3,Hexagram Chest Pack Unisex Sling Backpack Casu...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,,Luggage & Bags > Men's Bags > Backpacks,Luggage & Bags > Men's Bags,,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,Hexagram Chest Pack Unisex Sling Backpack Casu...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,
120288,135885,only_text,60fbe4f2bcc4b88c38565afd,Fear Nothing' Hip Hop Men's Fashion 925 Sterli...,"Hello there! Welcome to my store, if you like ...",,Jewelry & Accessories > Rings,Jewelry & Accessories > Rings,,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,'Fear Nothing' Hip Hop Men's Fashion 925 Sterl...,"Hello there! Welcome to my store, if you like ...",
123396,141286,text_and_img,607042d6a68cb218c3bcae06,Freedom or Death' Mens Punk Gothic Skull Ring,Welcome to Our Store!\nOur Products are 100% B...,https://canary.contestimg.wish.com/api/webimag...,Jewelry & Accessories > Rings,Jewelry & Accessories > Rings,True,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,'Freedom or Death' Mens Punk Gothic Skull Ring,Welcome to Our Store!\nOur Products are 100% B...,https://canary.contestimg.wish.com/api/webimag...
124388,143016,text_and_img,611a75c952e8fa7c477130bf,Roses Are Red Doritos Are Savory The Us zfd Ph...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,https://canary.contestimg.wish.com/api/webimag...,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,True,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,Roses Are Red Doritos Are Savory The Us zfd Ph...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,https://canary.contestimg.wish.com/api/webimag...
126035,145650,only_text,6118adac154b7afd9fc2709a,Thin Red Line Union Jack Uk Flag ysf Phone Cas...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,Thin Red Line Union Jack Uk Flag ysf Phone Cas...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,


In [32]:
df_uniq_merge[df_uniq_merge.product_description.apply(lambda x: str(x).strip()) != \
              df_uniq_merge.product_description_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_unlisted_value,...,file_name,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
18485,21825,text_and_img,60a3c6ebdf61554e88b12b99,Hot 1pcs Fishing Lures 6.5cm/12g Topwater Popp...,3D eyes make it perfect tool for fishing lover...,https://canary.contestimg.wish.com/api/webimag...,Sports > Fishing > Fishing Lures,Sports > Fishing,,,...,product_attribution_till_032423_valid_units.csv,Hot 1pcs Fishing Lures 6.5cm/12g Topwater Popp...,Sports > Fishing > Fishing Lures > Primary Col...,,,,,Hot 1pcs Fishing Lures 6.5cm/12g Topwater Popp...,​3D eyes make it perfect tool for fishing love...,https://canary.contestimg.wish.com/api/webimag...
20468,24788,text_and_img,6166bc02a4a357b4c4838251,A Little Pumpkin Is On The Way Baby Shower: Cu...,A Little Pumpkin Is On The Way Baby Shower: Cu...,https://canary.contestimg.wish.com/api/webimag...,Home & Garden > Festive & Party Supplies > Eve...,Home & Garden > Festive & Party Supplies,,,...,product_attribution_till_032423_valid_units.csv,A Little Pumpkin Is On The Way Baby Shower: Cu...,Home & Garden > Festive & Party Supplies > Eve...,,,,,A Little Pumpkin Is On The Way Baby Shower: Cu...,\nA Little Pumpkin Is On The Way Baby Shower: ...,https://canary.contestimg.wish.com/api/webimag...
20862,25370,only_text,6103789f2746540088c592d5,"Dream Color LED Corner Floor Lamp,Bluetooth AP...",【Immersive Ambient Lighting】Dream Color color-...,,Home Improvement > Lights & Lighting > Lamps &...,Home Improvement > Lights & Lighting,,,...,product_attribution_till_032423_valid_units.csv,"Dream Color LED Corner Floor Lamp,Bluetooth AP...",Home Improvement > Lights & Lighting > Lamps &...,,,,,"Dream Color LED Corner Floor Lamp,Bluetooth AP...",【Immersive Ambient Lighting】Dream Color color-...,
20888,25412,text_and_img,61ac17c6bd3328999ac83f50,Alantyer Austrian Crystal Teardrop Women Neckl...,Brand: Alantyer \n MPN: alantyer \n \nAlantyer...,https://canary.contestimg.wish.com/api/webimag...,Jewelry & Accessories > Fine Jewelry > Necklaces,Jewelry & Accessories > Fine Jewelry,True,,...,product_attribution_till_032423_valid_units.csv,Alantyer Austrian Crystal Teardrop Women Neckl...,\nJewelry & Accessories > Fine Jewelry > Neckl...,,,,,Alantyer Austrian Crystal Teardrop Women Neckl...,Brand: Alantyer \n MPN: alantyer \n \nAlantyer...,https://canary.contestimg.wish.com/api/webimag...
22049,27152,only_text,612fe0b0372f994504721c5d,"American Flag String Lights,6.5ft×3.3ft Waterp...","American Flag String Lights,6.5ft×3.3ft Waterp...",,Home & Garden > Garden Supplies > Yard & Garde...,Home & Garden > Garden Supplies,,,...,product_attribution_till_032423_valid_units.csv,"American Flag String Lights,6.5ft×3.3ft Waterp...",Home & Garden > Garden Supplies > Yard & Garde...,,,,,"American Flag String Lights,6.5ft×3.3ft Waterp...","\nAmerican Flag String Lights,6.5ft×3.3ft Wat...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224044,250982,text_and_img,600bdb6a71ed47f3a9cecefa,Michael Jordan Last Shot For iPhone Case 12 11...,#NAME?,https://canary.contestimg.wish.com/api/webimag...,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,True,,...,Product Att_03.30.23_Batch3A_valid.csv|Product...,,,,,,,Michael Jordan Last Shot For iPhone Case 12 11...,- Material: Made from high-quality plate and r...,https://canary.contestimg.wish.com/api/webimag...
231805,31397,text_and_img,61a35051109014bc639e96b6,3D Flower Silicone Mold Sugar Paste DIY Fondan...,#NAME?,https://canary.contestimg.wish.com/api/webimag...,,,TRUE,0,...,product_attribution_missed_032423_just_032723_...,3D Flower Silicone Mold Sugar Paste DIY Fondan...,0\n0\n0,- Brand Name: None\n- Origin: CN(Origin)\n- Mo...,"Home & Garden > Kitchen,Dining & Bar > Bakewar...",0 has no attribute name,,3D Flower Silicone Mold Sugar Paste DIY Fondan...,- Brand Name: None\n- Origin: CN(Origin)\n- Mo...,https://canary.contestimg.wish.com/api/webimag...
238556,12731,only_text,617acc66258133f181049a30,ZUCKEO 10W Low Voltage Landscape Lights LED La...,"body {\n font-family: ""Source Sans Pro"", sans...",,Home Improvement > Lights & Lighting > Outdoor...,Home Improvement > Lights & Lighting,True,,...,product_attribution_till_032423_invalid_units_...,ZUCKEO 10W Low Voltage Landscape Lights LED La...,\nHome Improvement > Lights & Lighting > Outdo...,,,Home Improvement > Lights & Lighting > Outdoor...,Home Improvement > Lights & Lighting > Outdoor...,ZUCKEO 10W Low Voltage Landscape Lights LED La...,"body {\n font-family: ""Source Sans Pro"", sans...",
240231,175412,text_and_img,62cfc691a029a596a5a292dd,Modern RGB Corner Floor Lamp LED Color Changin...,Features:\n【Dimmable Floor Lights】RGB Corner F...,https://canary.contestimg.wish.com/api/webimag...,Home Improvement > Lights & Lighting > Lamps &...,Home Improvement > Lights & Lighting,,,...,product_attribution_till_032423_invalid_units_...,Modern RGB Corner Floor Lamp LED Color Changin...,Home Improvement > Lights & Lighting > Lamps &...,,,Home Improvement > Lights & Lighting > Lamps &...,Home Improvement > Lights & Lighting > Lamps &...,Modern RGB Corner Floor Lamp LED Color Changin...,\nFeatures:\n【Dimmable Floor Lights】RGB Corner...,https://canary.contestimg.wish.com/api/webimag...


In [33]:
df_uniq_merge[df_uniq_merge.main_image_url.apply(lambda x: str(x).strip()) != \
              df_uniq_merge.main_image_url_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_unlisted_value,...,file_name,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [34]:
df_uniq_merge[df_uniq_merge.main_image_url.apply(lambda x: str(x).strip()) != \
              df_uniq_merge.main_image_url_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_unlisted_value,...,file_name,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [35]:
df_uniq_merge['title'] = df_uniq_merge['title_original2']
df_uniq_merge['product_description'] = df_uniq_merge['product_description_original2']

In [36]:
df_uniq_merge[df_uniq_merge.title.isna()]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_unlisted_value,...,file_name,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [37]:
df_uniq_merge[df_uniq_merge.product_description.isna()]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_unlisted_value,...,file_name,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
232727,35818,text_and_img,6120aa90d631eeaff38fd328,3 Style New Concept Training Hidden Happiness Cup,,https://canary.contestimg.wish.com/api/webimag...,,,True,0,...,product_attribution_missed_032423_just_032723_...,3 Style New Concept Training Hidden Happiness Cup,0\n0\n0,,"Home & Garden > Kitchen,Dining & Bar > Teaware...",0 has no attribute name,,3 Style New Concept Training Hidden Happiness Cup,,https://canary.contestimg.wish.com/api/webimag...


In [38]:
# Missing descriptions become empty strings.
df_uniq_merge['product_description'] = df_uniq_merge['product_description'].fillna('')

In [39]:
set(df_uniq_merge)

{'category_path',
 'explanation',
 'file_name',
 'final_output',
 'final_output_corrected',
 'l2_category',
 'label_ordering',
 'main_image_url',
 'main_image_url_original2',
 'product_category',
 'product_description',
 'product_description_original',
 'product_description_original2',
 'product_id',
 'rater1_no_attributes',
 'rater1_output',
 'rater1_unlisted_value',
 'rater2_no_attributes',
 'rater2_output',
 'rater2_unlisted_value',
 'rater3_no_attributes',
 'rater3_output',
 'rater3_unlisted_value',
 'sample_method',
 'title',
 'title_original',
 'title_original2'}

In [40]:
# Rebuild final_output as the three raters' outputs joined by newlines;
# a missing rater output contributes an empty line.
rater1_text, rater2_text, rater3_text = (
    df_uniq_merge[rater_col].fillna('')
    for rater_col in ('rater1_output', 'rater2_output', 'rater3_output')
)
df_uniq_merge['final_output'] = rater1_text + '\n' + rater2_text + '\n' + rater3_text

In [41]:
# Where a corrected output exists, it takes precedence over the raters' outputs.
has_correction = ~df_uniq_merge['final_output_corrected'].isna()
df_uniq_merge.loc[has_correction, 'final_output'] = \
    df_uniq_merge.loc[has_correction, 'final_output_corrected']

In [42]:
len(df_uniq_merge), len(set(df_uniq_merge.label_ordering))

(240673, 240673)

In [43]:
def _split_pairs(final_output):
    """Return the sorted, de-duplicated, non-empty lines of one final_output cell.

    Missing values (NaN/None) yield an empty list. Any other non-string value is
    converted with str() instead of failing on `.split`.
    """
    if not isinstance(final_output, str):
        if pd.isna(final_output):
            return []
        final_output = str(final_output)
    return sorted({line for line in final_output.split('\n') if len(line) > 0})


def _pair_problem(pair, valid_attributes):
    """Return an explanation if `pair` is invalid, otherwise None.

    A pair is split on ' > '. It needs at least two segments, and its
    second-to-last segment must be one of `valid_attributes`.
    """
    segments = pair.split(' > ')
    if len(segments) < 2:
        return f"{pair} has no attribute name"
    if segments[-2] not in valid_attributes:
        return f"{pair} has invalid attribute name, because {segments[-2]} is not a valid attribute name"
    return None


correct = []       # records whose pairs are all valid (includes records with no pairs)
errors = []        # records with at least one invalid pair; 'explanation' names the first one
errors_fixed = []  # the same record objects as `errors`, with 'final_output_corrected' set
nonempty = []      # correct records that have at least one pair
empty = []         # records with no pairs at all
for record in df_uniq_merge.to_dict('records'):
    pairs = _split_pairs(record['final_output'])
    if len(pairs) == 0:
        empty.append(record)

    # Only the first invalid pair (in sorted order) is reported.
    problem = None
    for pair in pairs:
        problem = _pair_problem(pair, attributes)
        if problem is not None:
            break

    if problem is None:
        correct.append(record)
        if len(pairs) > 0:
            nonempty.append(record)
        continue

    record['explanation'] = problem
    errors.append(record)
    # The corrected output keeps only the valid pairs; it may end up empty.
    record['final_output_corrected'] = '\n'.join(
        pair for pair in pairs if _pair_problem(pair, attributes) is None
    )
    errors_fixed.append(record)

len(df_uniq_merge), len(correct), len(errors), len(errors_fixed), len(empty), len(nonempty), len(correct) + len(errors)

(240673, 236079, 4594, 4594, 0, 236079, 240673)

In [44]:
df_error_fixed = pd.DataFrame(errors_fixed)

In [45]:
df_error_fixed['final_output_corrected'].apply(lambda x: len(x) == 0).mean()

0.7688289072703526

In [46]:
# Split the fixed error units by whether any valid pair survived the cleanup.
corrected_is_empty = df_error_fixed['final_output_corrected'].map(len) == 0
df_error_fixed_nonempty = df_error_fixed[~corrected_is_empty]
df_error_fixed_empty = df_error_fixed[corrected_is_empty]

In [47]:
len(df_error_fixed_empty)

3532

In [48]:
df_correct = pd.DataFrame(correct)

In [49]:
df_correct['final_output'].apply(lambda x: len(x) == 0).mean()

0.0

In [50]:
# Promote the cleaned output to final_output. `assign` returns a new frame, so
# nothing is written into a filtered slice of df_error_fixed (the in-place
# column assignment here raised SettingWithCopyWarning).
df_error_fixed_nonempty = df_error_fixed_nonempty.assign(
    final_output=df_error_fixed_nonempty['final_output_corrected']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_error_fixed_nonempty['final_output'] = df_error_fixed_nonempty['final_output_corrected']


In [51]:
df_correct = pd.concat([df_correct, df_error_fixed_nonempty])

In [52]:
len(df_correct) / len(df_original)

0.9828375096360276

In [53]:
len(df_correct), len(set(df_correct.label_ordering))

(237141, 237141)

In [54]:
df_error_fixed_empty['rater3_no_attributes'].all()

True

In [59]:
# Keep only the unit fields, the per-rater columns and the source file name.
rater_cols = [
    f'rater{rater}_{field}'
    for rater in (1, 2, 3)
    for field in ('no_attributes', 'output', 'unlisted_value')
]
unit_cols = ['label_ordering', 'sample_method', 'product_id', 'title', 'product_description',
             'main_image_url', 'product_category', 'l2_category']
df_error_fixed_empty = df_error_fixed_empty[unit_cols + rater_cols + ['file_name']]

In [60]:
# Every unit in this frame gets the same explanation. `assign` builds a new
# frame, which avoids the SettingWithCopyWarning raised by assigning a column
# on a frame derived by slicing.
df_error_fixed_empty = df_error_fixed_empty.assign(
    explanation="Unreasonably empty attribute name value pairs or invalid attribute name value pairs"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_error_fixed_empty['explanation'] = "Unreasonably empty attribute name value pairs or invalid attribute name value pairs"


In [62]:
len(df_error_fixed_empty)

3532

In [64]:
# Attach category_path from the original input (inner join on label_ordering).
category_lookup = df_original[['label_ordering', 'category_path']]
df_error_fixed_empty = pd.merge(df_error_fixed_empty, category_lookup, on='label_ordering', how='inner')

In [65]:
set(df_error_fixed_empty.file_name)

{'product_attribution_just_032723_invalid_units_fixed.csv',
 'product_attribution_missed_032423_just_032723_invalid_units_fixed.csv',
 'product_attribution_till_032423_invalid_units_fixed.csv'}

In [66]:
df_error_fixed_empty.sample(2)

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_no_attributes,rater1_output,rater1_unlisted_value,rater2_no_attributes,rater2_output,rater2_unlisted_value,rater3_no_attributes,rater3_output,rater3_unlisted_value,file_name,explanation,category_path
754,28525,only_text,61489a44a7749217b825d2ab,LJ09 Water Shoes Dino Pattern Quick-Dry Aqua S...,Knitted upper (polyester) + non-slip thermopla...,,,,True,0,0,True,0,0,True,0,0,product_attribution_missed_032423_just_032723_...,Unreasonably empty attribute name value pairs ...,Sports > Camping & Hiking > Upstream Shoes
2304,41223,only_text,60fa4d18c2af2ec4e75074c3,Lung Cleanse for Smokers - Clear Your Airways ...,WHOLE BODY DETOX -- We offer a 30 day money ba...,,,,True,0,0,True,0,0,True,0,0,product_attribution_missed_032423_just_032723_...,Unreasonably empty attribute name value pairs ...,"Home & Garden > Kitchen,Dining & Bar > Teaware..."


In [67]:
len(df_error_fixed_empty)

3532

In [68]:
df_error_fixed_empty.to_csv('appen/output_batch_correct_v9/product_attribution_till_040323_empty_invalid_units.csv', index=False)

In [305]:
set(df_correct)

{'category_path',
 'explanation',
 'file_name',
 'final_output',
 'final_output_corrected',
 'l2_category',
 'label_ordering',
 'main_image_url',
 'main_image_url_original2',
 'product_category',
 'product_description',
 'product_description_original',
 'product_description_original2',
 'product_id',
 'rater1_no_attributes',
 'rater1_output',
 'rater1_unlisted_value',
 'rater2_no_attributes',
 'rater2_output',
 'rater2_unlisted_value',
 'rater3_no_attributes',
 'rater3_output',
 'rater3_unlisted_value',
 'sample_method',
 'title',
 'title_original',
 'title_original2'}

In [306]:
df_correct = df_correct[['label_ordering', 'sample_method', 'product_id', 'title', 'product_description', 'main_image_url', 'final_output']]

In [307]:
len(df_correct)

237141

In [308]:
# Attach category_path from the original input. The join key is spelled out:
# without `on`, merge joins on every column the two frames share, so the result
# would silently change if df_correct ever kept another overlapping column.
df_correct = df_correct.merge(df_original[['label_ordering', 'category_path']], on='label_ordering')

In [309]:
len(df_correct)

237141

In [310]:
df_correct.sample(2).to_dict('records')

[{'label_ordering': 81456,
  'sample_method': 'only_text',
  'product_id': '61855f902b8b86f1e2e11580',
  'title': 'Shar Pei On The Great Wall Travel Laptop Backpack,Business Anti Theft Slim Durable with USB Charging Port, College School Computer Bag Bookbag Casual Hiking Daypack for Women Men',
  'product_description': 'High quality Polyester Fabric Material & Practical luggage Strap design & Comfortable Widen Padded Shoulder Strap & Built-in Key Ring design, fits for daily use at school, college, business and travel, suitable for women, men and students.\nExternal dimensions: 17x 12 x 6.5 inch. Multiple divider pockets, easy for holding 15.6 Inches laptop, water bottle, readers and a bunch of other items, iPad, journal, pens and pencils, iPhone.\nUSB interface with built-in cable design, great convenience for charging your electronic devices via connecting your own power bank. And the headphone interface frees your hands when enjoying audios, music, etc.\nMulti-panel Airflow system pr

In [311]:
df_correct.to_csv('appen/output_batch_correct_v9/product_attribution_till_040323_valid_units.csv', index=False)

# accounting for missing units

In [3]:
df_accounting = pd.read_csv('appen/output_batch_correct_v8/APPEN_WISH - Product Attribution.csv')

In [313]:
set(df_accounting['label_ordering']) == set(df_original['label_ordering'])

True

In [314]:
df_delivered = pd.concat([df_correct, df_error_fixed_empty])[['label_ordering']]

In [315]:
df_delivered['received'] = True

In [316]:
df_original_merge = df_original.merge(df_delivered, on='label_ordering', how='left')

In [317]:
df_original_merge.loc[df_original_merge.received.isna(), 'received'] = False

In [318]:
# Units of the original input that appear in neither delivered file.
not_received = df_original_merge['received'].eq(False)
df_missed = df_original_merge[not_received]

In [319]:
# Join the accounting sheet with the missed units and write the result out.
missed_units_path = 'appen/output_batch_correct_v9/product_attribution_till_040323_missed_units.csv'
df_missed_accounting = pd.merge(df_accounting, df_missed, on='label_ordering')
df_missed_accounting.to_csv(missed_units_path, index=False)

In [320]:
len(pd.read_csv('appen/output_batch_correct_v9/product_attribution_till_040323_missed_units.csv'))

609