In [1]:
import pandas as pd
from collections import defaultdict
import dvc.api
from tqdm import tqdm
import numpy as np
from functools import reduce

# get original data

In [2]:
df_original = pd.concat([ 
    pd.read_csv(dvc.api.get_url('datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch1.csv', 
        repo='git@github.com:ContextLogic/multitask-llm-rnd.git')), 
    pd.read_csv(dvc.api.get_url('datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2_highimpression.csv',
        repo='git@github.com:ContextLogic/multitask-llm-rnd.git')), 
    pd.read_csv(dvc.api.get_url('datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2.csv', 
        repo='git@github.com:ContextLogic/multitask-llm-rnd.git')),
    pd.read_csv(dvc.api.get_url('datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch3.csv', 
        repo='git@github.com:ContextLogic/multitask-llm-rnd.git')),
    pd.read_csv(dvc.api.get_url('datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch4.csv', 
        repo='git@github.com:ContextLogic/multitask-llm-rnd.git'))
])

In [3]:
len(df_original)

241287

In [4]:
df_original = df_original.dropna(subset=['label_ordering'])

In [5]:
len(df_original)

241282

In [6]:
df_original['label_ordering'] = df_original['label_ordering'].astype(int)

# get attributes

In [7]:
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')

df_attributes_group = df_attributes.groupby('attribute_field').agg({
    'category_attributevalue': lambda x: reduce(lambda x, y: x+y, [eval(i) for i in x])
}).reset_index()

In [8]:
len(df_attributes_group), df_attributes_group.category_attributevalue.apply(len).sum()

(230, 516378)

In [9]:
attributes = set(df_attributes_group['attribute_field'])

In [10]:
category_paths = set(df_attributes['category'])

In [11]:
more_paths = []
for i in category_paths:
    tmp = []
    for j in i.split(' > '):
        tmp.append(j)
        more_paths.append(' > '.join(tmp))
category_paths.update(more_paths)

# existing appen data

In [12]:
dflast = pd.read_csv('appen/output_batch_correct_v9/product_attribution_3.5k_04_06.csv_rework.csv')
df0 = pd.read_csv('appen/output_batch_correct_v8/Product Att_03.31.23_2746units.csv')
df1 = pd.read_csv('appen/output_batch_correct_v5/product_attribution_till_032423_valid_units.csv')
df2 = pd.read_csv('appen/output_batch_correct_v6/product_attribution_just_032723_valid_units.csv')
df3 = pd.read_csv('appen/output_batch_correct_v8/Product Att_03.2723_empty_missed_rework_6773_valid.csv')
df4 = pd.read_csv('appen/output_batch_correct_v8/Product Att_03.31.23_72units.csv')
df5 = pd.concat([
    pd.read_csv('appen/output_batch_correct_v8/Product Att_03.30.23_Batch3A_valid.csv'),
    pd.read_csv('appen/output_batch_correct_v8/Product Att_03.30.23_Batch3B_valid.csv'),
    pd.read_csv('appen/output_batch_correct_v8/Product Att_03.31.23_Batch4A.csv'),
    pd.read_csv('appen/output_batch_correct_v8/Product Att_03.31.23_Batch4B.csv'),
])
df6 = pd.read_csv('appen/output_batch_correct_v7/product_attribution_missed_032423_just_032723_invalid_units_fixed.csv')
df7 = pd.read_csv('appen/output_batch_correct_v6/product_attribution_just_032723_invalid_units_fixed.csv')
df8 = pd.read_csv('appen/output_batch_correct_v5/product_attribution_till_032423_invalid_units_fixed.csv')

In [13]:
dflast['file_name'] = "product_attribution_3.5k_04_06.csv_rework.csv"
df0['file_name'] = 'Product Att_03.31.23_2746units.csv'
df1['file_name'] = 'product_attribution_till_032423_valid_units.csv'
df2['file_name'] = 'product_attribution_just_032723_valid_units.csv'
df3['file_name'] = 'Product Att_03.2723_empty_missed_rework_6773_valid.csv'
df4['file_name'] = 'Product Att_03.31.23_72units.csv'
df5['file_name'] = 'Product Att_03.30.23_Batch3A_valid.csv|Product Att_03.30.23_Batch3B_valid.csv|Product Att_03.31.23_Batch4A.csv|Product Att_03.31.23_Batch4B.csv'
df6['file_name'] = 'product_attribution_missed_032423_just_032723_invalid_units_fixed.csv'
df7['file_name'] = 'product_attribution_just_032723_invalid_units_fixed.csv'
df8['file_name'] = 'product_attribution_till_032423_invalid_units_fixed.csv'

In [14]:
len(dflast), len(df0), len(df1), len(df2), len(df3), len(df4), len(df5), len(df6), len(df7), len(df8)

(3517, 2746, 48254, 43525, 6773, 72, 127349, 7789, 912, 3289)

In [40]:
dflast = dflast.rename(columns={'rater_output': 'rater3_output'})

In [41]:
df = pd.concat([dflast, df0, df1, df2, df3, df4, df5, df6, df7, df8])

In [42]:
len(df), len(set(df['label_ordering'])), len(set(df['label_ordering']))

(244226, 240673, 240673)

In [43]:
set(df['label_ordering']) - set(df_original['label_ordering'])

set()

In [44]:
len(set(df_original['label_ordering']) - set(df['label_ordering']))

609

In [45]:
df_uniq = df.drop_duplicates('label_ordering', keep='first')

In [46]:
len(df), len(df_uniq)

(244226, 240673)

In [47]:
set(df_uniq)

{'category_path',
 'explanation',
 'file_name',
 'final_output',
 'final_output_corrected',
 'l2_category',
 'label_ordering',
 'main_image_url',
 'no_attributes',
 'product_category',
 'product_description',
 'product_description_original',
 'product_id',
 'rater1_no_attributes',
 'rater1_output',
 'rater1_unlisted_value',
 'rater2_no_attributes',
 'rater2_output',
 'rater2_unlisted_value',
 'rater3_no_attributes',
 'rater3_output',
 'rater3_unlisted_value',
 'sample_method',
 'title',
 'title_original',
 'unlisted_value'}

In [48]:
df_uniq_merge = df_uniq.merge(df_original[['label_ordering', 'title', 'product_description', 'main_image_url',
                                           ]].rename(
    columns={'title': 'title_original2', 'product_description': 'product_description_original2',
             'main_image_url': 'main_image_url_original2',}), on='label_ordering', how='left')

In [49]:
len(df_uniq_merge)

240673

In [50]:
df_uniq_merge[df_uniq_merge.title.apply(lambda x: str(x).strip()) != df_uniq_merge.title_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
47874,162392,text_and_img,5cdd05b52e33a70fe773d613,Lixada Breathable Fishing Life Vest 209lb Bu...,This fishing Life jacket looks more like an ou...,https://canary.contestimg.wish.com/api/webimag...,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing,,,...,,​Lixada Breathable Fishing Life Vest 209lb B...,Sports > Fishing > Fishing Apparel > Fishing V...,,,,,​Lixada Breathable Fishing Life Vest 209lb B...,This fishing Life jacket looks more like an ou...,https://canary.contestimg.wish.com/api/webimag...
114291,118572,only_text,60653b38ddc105a55285a859,American Europe Women's Fashion Jewelry Antiqu...,Stone: Moonstone\nColor: silver\nSize (US): 5 ...,,Jewelry & Accessories > Rings,Jewelry & Accessories > Rings,,,...,,,,,,,,​​American Europe Women's Fashion Jewelry Anti...,Stone: Moonstone\nColor: silver\nSize (US): 5 ...,
114663,119230,text_and_img,610105dfe1fcdddd74dffd8d,Apple AirPods with Wireless Charging Case MRXJ...,Feature:\nLatest and best AirPod\n\nLighting c...,https://canary.contestimg.wish.com/api/webimag...,Consumer Electronics > Earphones & Headphones ...,Consumer Electronics > Earphones & Headphones,,,...,,,,,,,,Apple AirPods with Wireless Charging Case ‎MRX...,Feature:\nLatest and best AirPod\n\nLighting c...,https://canary.contestimg.wish.com/api/webimag...
115076,119970,text_and_img,60ea51ce0d9cc11e19e1be19,Blue Wolf Dream Catcher Chest Pack Unisex Slin...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,https://canary.contestimg.wish.com/api/webimag...,Luggage & Bags > Men's Bags > Backpacks,Luggage & Bags > Men's Bags,,,...,,,,,,,,Blue Wolf Dream Catcher Chest Pack Unisex Slin...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,https://canary.contestimg.wish.com/api/webimag...
117029,123837,only_text,60fe204548a7aecbeb1c2231,7x Leather Craft Half-Round Strap Belt End Pun...,"Condition: New: A brand-new, unused, unopened ...",,"Home & Garden > Arts, Crafts & Sewing > Leathe...","Home & Garden > Arts, Crafts & Sewing",,,...,True,,,,,,,​ 7x Leather Craft Half-Round Strap Belt End P...,"Condition: New: A brand-new, unused, unopened ...",
121006,130728,only_text,60e94199fa08d2604699c7a3,Hexagram Chest Pack Unisex Sling Backpack Casu...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,,Luggage & Bags > Men's Bags > Backpacks,Luggage & Bags > Men's Bags,,,...,,,,,,,,Hexagram Chest Pack Unisex Sling Backpack Casu...,1.HIGH QUALITY: This Stylish Sling Bag Is Made...,
123805,135885,only_text,60fbe4f2bcc4b88c38565afd,Fear Nothing' Hip Hop Men's Fashion 925 Sterli...,"Hello there! Welcome to my store, if you like ...",,Jewelry & Accessories > Rings,Jewelry & Accessories > Rings,,,...,,,,,,,,'Fear Nothing' Hip Hop Men's Fashion 925 Sterl...,"Hello there! Welcome to my store, if you like ...",
126913,141286,text_and_img,607042d6a68cb218c3bcae06,Freedom or Death' Mens Punk Gothic Skull Ring,Welcome to Our Store!\nOur Products are 100% B...,https://canary.contestimg.wish.com/api/webimag...,Jewelry & Accessories > Rings,Jewelry & Accessories > Rings,,,...,,,,,,,,'Freedom or Death' Mens Punk Gothic Skull Ring,Welcome to Our Store!\nOur Products are 100% B...,https://canary.contestimg.wish.com/api/webimag...
127905,143016,text_and_img,611a75c952e8fa7c477130bf,Roses Are Red Doritos Are Savory The Us zfd Ph...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,https://canary.contestimg.wish.com/api/webimag...,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,,,...,,,,,,,,Roses Are Red Doritos Are Savory The Us zfd Ph...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,https://canary.contestimg.wish.com/api/webimag...
129552,145650,only_text,6118adac154b7afd9fc2709a,Thin Red Line Union Jack Uk Flag ysf Phone Cas...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,,,...,,,,,,,,Thin Red Line Union Jack Uk Flag ysf Phone Cas...,Phone Case for iPhone-5 5S SE-6 6S 7 8 Plus-X ...,


In [51]:
df_uniq_merge[df_uniq_merge.product_description.apply(lambda x: str(x).strip()) != \
              df_uniq_merge.product_description_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
3313,51483,only_text,6123bb20aeda2cbd38895710,Chihuahua Dog Night Light Cute Puppy Plug-in L...,Internal high-brightness LED. Bright enough to...,,Home & Garden > Festive & Party Supplies > Eve...,Home & Garden > Festive & Party Supplies,,,...,,,,,,,,Chihuahua Dog Night Light Cute Puppy Plug-in L...,Internal high-brightness LED. Bright enough to...,
3340,59846,only_text,614b380bd929bc0c71e48e9a,Amplificatore di segnale 2G 3G 4G GSM902 Cellu...,welcome to my store!\n\n Features:\n - Full-du...,,Cellphones & Telecommunications > Mobile Phone...,Cellphones & Telecommunications > Mobile Phone...,,True,...,,,,,,,,Amplificatore di segnale 2G 3G 4G GSM902 Cellu...,welcome to my store!\r\n\r\n Features:\n - Ful...,
3342,60762,only_text,614e80e4095482726a25f9f2,LED Grow Lamp Adjustable Energy-saving Wide A...,Specifications: \nIt is the grow lamp that can...,,Home & Garden > Garden Supplies > Indoor Garde...,Home & Garden > Garden Supplies,,True,...,,,,,,,,LED Grow Lamp Adjustable Energy-saving Wide...,Specifications: \nIt is the grow lamp that can...,
3348,62151,only_text,600e79ace4f17152ae3331db,2021 NEWEST Flower Pots Plant Basket with Hand...,Feature:\n\n[Pure natural materials] It is art...,,Home & Garden > Home Storage & Organization > ...,Home & Garden > Home Storage & Organization,,,...,,,,,,,,2021 NEWEST Flower Pots Plant Basket with Hand...,Feature:\r\n\r\n[Pure natural materials] It is...,
3350,62186,only_text,619f669a6e15952d0fd2d772,138 LED Star Moon Curtain String Lights 8 Mode...,welcome to my store!\n\nTips:Pls choose the co...,,Home Improvement > Lights & Lighting > LED Lig...,Home Improvement > Lights & Lighting,,,...,,,,,,,,138 LED Star Moon Curtain String Lights 8 Mode...,welcome to my store!\r\n\r\nTips:Pls choose th...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226485,249898,text_and_img,60e3db7b81b0ff8f96a27932,Raindrops Diet Coke For iPhone Case 12 11 Pro ...,#NAME?,https://canary.contestimg.wish.com/api/webimag...,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,,,...,True,,,,,,,Raindrops Diet Coke For iPhone Case 12 11 Pro ...,- Material: Made from high-quality plate and r...,https://canary.contestimg.wish.com/api/webimag...
227561,250982,text_and_img,600bdb6a71ed47f3a9cecefa,Michael Jordan Last Shot For iPhone Case 12 11...,#NAME?,https://canary.contestimg.wish.com/api/webimag...,Cellphones & Telecommunications > Phone Bags &...,Cellphones & Telecommunications > Phone Bags &...,,,...,True,,,,,,,Michael Jordan Last Shot For iPhone Case 12 11...,- Material: Made from high-quality plate and r...,https://canary.contestimg.wish.com/api/webimag...
238606,12731,only_text,617acc66258133f181049a30,ZUCKEO 10W Low Voltage Landscape Lights LED La...,"body {\n font-family: ""Source Sans Pro"", sans...",,Home Improvement > Lights & Lighting > Outdoor...,Home Improvement > Lights & Lighting,,,...,,ZUCKEO 10W Low Voltage Landscape Lights LED La...,\nHome Improvement > Lights & Lighting > Outdo...,,,Home Improvement > Lights & Lighting > Outdoor...,Home Improvement > Lights & Lighting > Outdoor...,ZUCKEO 10W Low Voltage Landscape Lights LED La...,"body {\n font-family: ""Source Sans Pro"", sans...",
240240,175412,text_and_img,62cfc691a029a596a5a292dd,Modern RGB Corner Floor Lamp LED Color Changin...,Features:\n【Dimmable Floor Lights】RGB Corner F...,https://canary.contestimg.wish.com/api/webimag...,Home Improvement > Lights & Lighting > Lamps &...,Home Improvement > Lights & Lighting,,,...,,Modern RGB Corner Floor Lamp LED Color Changin...,Home Improvement > Lights & Lighting > Lamps &...,,,Home Improvement > Lights & Lighting > Lamps &...,Home Improvement > Lights & Lighting > Lamps &...,Modern RGB Corner Floor Lamp LED Color Changin...,\nFeatures:\n【Dimmable Floor Lights】RGB Corner...,https://canary.contestimg.wish.com/api/webimag...


In [52]:
df_uniq_merge[df_uniq_merge.main_image_url.apply(lambda x: str(x).strip()) != \
              df_uniq_merge.main_image_url_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [53]:
df_uniq_merge[df_uniq_merge.main_image_url.apply(lambda x: str(x).strip()) != \
              df_uniq_merge.main_image_url_original2.apply(lambda x: str(x).strip())]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [54]:
df_uniq_merge['title'] = df_uniq_merge['title_original2']
df_uniq_merge['product_description'] = df_uniq_merge['product_description_original2']

In [55]:
df_uniq_merge[df_uniq_merge.title.isna()]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [56]:
df_uniq_merge[df_uniq_merge.product_description.isna()]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
1579,35818,text_and_img,6120aa90d631eeaff38fd328,3 Style New Concept Training Hidden Happiness Cup,,https://canary.contestimg.wish.com/api/webimag...,"Home & Garden > Kitchen,Dining & Bar > Teaware...","Home & Garden > Kitchen,Dining & Bar",,,...,,,,,,,,3 Style New Concept Training Hidden Happiness Cup,,https://canary.contestimg.wish.com/api/webimag...


In [57]:
df_uniq_merge.loc[df_uniq_merge.product_description.isna(), 'product_description'] = ''

In [58]:
set(df_uniq_merge)

{'category_path',
 'explanation',
 'file_name',
 'final_output',
 'final_output_corrected',
 'l2_category',
 'label_ordering',
 'main_image_url',
 'main_image_url_original2',
 'no_attributes',
 'product_category',
 'product_description',
 'product_description_original',
 'product_description_original2',
 'product_id',
 'rater1_no_attributes',
 'rater1_output',
 'rater1_unlisted_value',
 'rater2_no_attributes',
 'rater2_output',
 'rater2_unlisted_value',
 'rater3_no_attributes',
 'rater3_output',
 'rater3_unlisted_value',
 'sample_method',
 'title',
 'title_original',
 'title_original2',
 'unlisted_value'}

In [59]:
df_uniq_merge['final_output'] = df_uniq_merge['rater1_output'].fillna('') + '\n' + df_uniq_merge['rater2_output'].fillna('') + '\n' + \
    df_uniq_merge['rater3_output'].fillna('')

In [60]:
df_uniq_merge.loc[~df_uniq_merge.final_output_corrected.isna(), 'final_output'] = \
    df_uniq_merge.loc[~df_uniq_merge.final_output_corrected.isna(), 'final_output_corrected']

In [61]:
len(df_uniq_merge), len(set(df_uniq_merge.label_ordering))

(240673, 240673)

In [62]:
df_uniq_merge[df_uniq_merge.final_output.isna()]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2


In [63]:
correct = []
errors = []
errors_fixed = []
nonempty = []
empty = []
for i in df_uniq_merge.to_dict('records'):
    res = [] if ((not isinstance(i['final_output'], str)) and (np.isnan(i['final_output']))) else i['final_output'].split('\n')
    res = sorted(set(res))
    res = [i for i in res if len(i) > 0]
    if len(res) == 0:
        empty.append(i)
    is_correct = True
    for j in res:
        if len(j.split(' > ')) < 2:
            i['explanation'] = f"{j} has no attribute name"
            errors.append(i)
            is_correct = False
            break
        if j.split(' > ')[-2] not in attributes:
            i['explanation'] = f"{j} has invalid attribute name, because {j.split(' > ')[-2]} is not a valid attribute name"
            errors.append(i)
            is_correct = False
            break
    if is_correct:
        correct.append(i)
        if len(res) > 0:
            nonempty.append(i)
for i in errors:
    res = [] if ((not isinstance(i['final_output'], str)) and (np.isnan(i['final_output']))) else i['final_output'].split('\n')
    res = sorted(set(res))
    res = [i for i in res if len(i) > 0]
    corrected_res = []
    for j in res:
        if len(j.split(' > ')) >= 2 and j.split(' > ')[-2] in attributes:
            corrected_res.append(j)
    i['final_output_corrected'] = '\n'.join(corrected_res)
    errors_fixed.append(i)

len(df_uniq_merge), len(correct), len(errors), len(errors_fixed), len(empty), len(nonempty), len(correct) + len(errors)

(240673, 239596, 1077, 1077, 0, 239596, 240673)

In [64]:
df_error_fixed = pd.DataFrame(errors_fixed)

In [65]:
df_error_fixed['final_output_corrected'].apply(lambda x: len(x) == 0).mean()

0.013927576601671309

In [66]:
df_error_fixed_nonempty = df_error_fixed[df_error_fixed['final_output_corrected'].apply(lambda x: len(x) > 0)]
df_error_fixed_empty = df_error_fixed[df_error_fixed['final_output_corrected'].apply(lambda x: len(x) == 0)]

In [67]:
len(df_error_fixed_empty)

15

In [68]:
df_error_fixed_empty

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,no_attributes,unlisted_value,...,rater3_unlisted_value,title_original,final_output,product_description_original,category_path,explanation,final_output_corrected,title_original2,product_description_original2,main_image_url_original2
1062,24297,only_text,611bb00e2522356ea1aac345,1 Set Helpful Butter Box Large Capacity Seale...,Specifications: \nThis sealed box cutting set ...,,,,,,...,0.0,1 Set Helpful Butter Box Large Capacity Seale...,0\n0\n0,Specifications: \nThis sealed box cutting set ...,"Home & Garden > Kitchen,Dining & Bar > Cheese ...",0 has no attribute name,,1 Set Helpful Butter Box Large Capacity Seale...,Specifications: \nThis sealed box cutting set ...,
1063,25950,only_text,6011c26b367389cfe64dc2c0,Honeybee Gardens Wanderlust PowderColors Stack...,undefined\n\n\nColor × Quantity-See details × ...,,,,,,...,0.0,Honeybee Gardens Wanderlust PowderColors Stack...,0\n0\n0,undefined\n\n\nColor × Quantity-See details × ...,Home & Garden > Garden Supplies > Yard & Garde...,0 has no attribute name,,Honeybee Gardens Wanderlust PowderColors Stack...,undefined\n\n\nColor × Quantity-See details × ...,
1064,34429,text_and_img,60336e08b4e87ed4251e8e65,[BEST SELLER],● Kerr Jars Mason Wide Mouth with Cap Quart 12...,https://canary.contestimg.wish.com/api/webimag...,,,,,...,0.0,[BEST SELLER],0\n0\n0,● Kerr Jars Mason Wide Mouth with Cap Quart 12...,Home & Garden > Home Storage & Organization > ...,0 has no attribute name,,[BEST SELLER],● Kerr Jars Mason Wide Mouth with Cap Quart 12...,https://canary.contestimg.wish.com/api/webimag...
1065,37053,only_text,60116692795639f1b27eec1c,AHC Natural Perfection Double Shield Enthusias...,undefined\n\n\nCapacity-1. Enthusiastic Sun St...,,,,,,...,0.0,AHC Natural Perfection Double Shield Enthusias...,0\n0\n0,undefined\n\n\nCapacity-1. Enthusiastic Sun St...,Sports > Camping & Hiking > Walking Sticks,0 has no attribute name,,AHC Natural Perfection Double Shield Enthusias...,undefined\n\n\nCapacity-1. Enthusiastic Sun St...,
1066,37147,only_text,6187a85807a1cbdb0eeb99f2,Ridgid 632-45260 Pins,Ridgid 632-45260 Pins-We do not ship to PO BOX...,,,,,,...,0.0,Ridgid 632-45260 Pins,0\n0\n0,Ridgid 632-45260 Pins-We do not ship to PO BOX...,Cellphones & Telecommunications > Mobile Phone...,0 has no attribute name,,Ridgid 632-45260 Pins,Ridgid 632-45260 Pins-We do not ship to PO BOX...,
1067,38050,only_text,60dd8710a21dd0c4c074030e,New emergency sleeping bag thermal insulation ...,"Dear customers:\nWelcome to our store, you wil...",,,,,,...,0.0,New emergency sleeping bag thermal insulation ...,0\n0\n0,"Dear customers:\nWelcome to our store, you wil...",Sports > Camping & Hiking > Camp Sleeping Gear...,0 has no attribute name,,New emergency sleeping bag thermal insulation ...,"Dear customers:\nWelcome to our store, you wil...",
1068,38982,only_text,6176149778dc188427553c38,Carburetor Gaskets Kit For RY3714 & RY3716 Cha...,Condition: New\nBrand: Unbranded\nType: Carb K...,,,,,,...,0.0,Carburetor Gaskets Kit For RY3714 & RY3716 Cha...,0\n0\n0,Condition: New\nBrand: Unbranded\nType: Carb K...,Home & Garden > Garden Supplies > Garden Carts,0 has no attribute name,,Carburetor Gaskets Kit For RY3714 & RY3716 Cha...,Condition: New\nBrand: Unbranded\nType: Carb K...,
1069,39384,only_text,604e464a5b9ef6005771cc22,Night to remember standard 6.5in napkins (10 p...,Its ladies night any night with Night to Remem...,,,,,,...,0.0,Night to remember standard 6.5in napkins (10 p...,0\n0\n0,Its ladies night any night with Night to Remem...,"Home & Garden > Kitchen,Dining & Bar > Disposa...",0 has no attribute name,,Night to remember standard 6.5in napkins (10 p...,Its ladies night any night with Night to Remem...,
1070,41946,text_and_img,5ff4261a407f5715cc56a273,Geoffrey Art Handmade Name Necklace Jewelry,Geoffrey Art Handmade Name Necklace Jewelry Ch...,https://canary.contestimg.wish.com/api/webimag...,,,,,...,0.0,Geoffrey Art Handmade Name Necklace Jewelry,0\n0\n0,Geoffrey Art Handmade Name Necklace Jewelry Ch...,Jewelry & Accessories > Necklaces & Pendants >...,0 has no attribute name,,Geoffrey Art Handmade Name Necklace Jewelry,Geoffrey Art Handmade Name Necklace Jewelry Ch...,https://canary.contestimg.wish.com/api/webimag...
1071,42001,only_text,618f60c807486a158cdc5a1d,Plaid Letter Leather Phone Protective Case for...,100% Brand New and High Quality\n\nPackage inc...,,,,,,...,0.0,Plaid Letter Leather Phone Protective Case for...,0\n0\n0,100% Brand New and High Quality\n\nPackage inc...,Cellphones & Telecommunications > Phone Bags &...,0 has no attribute name,,Plaid Letter Leather Phone Protective Case for...,100% Brand New and High Quality\n\nPackage inc...,


In [69]:
df_correct = pd.DataFrame(correct)

In [70]:
df_correct['final_output'].apply(lambda x: len(x) == 0).mean()

0.0

In [71]:
df_error_fixed_nonempty['final_output'] = df_error_fixed_nonempty['final_output_corrected']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_error_fixed_nonempty['final_output'] = df_error_fixed_nonempty['final_output_corrected']


In [72]:
df_correct = pd.concat([df_correct, df_error_fixed_nonempty])

In [73]:
len(df_correct) / len(df_original)

0.9974138145406619

In [74]:
len(df_correct), len(set(df_correct.label_ordering))

(240658, 240658)

In [75]:
df_error_fixed_empty['rater3_no_attributes'].all()

True

In [77]:
len(df_error_fixed_empty)

15

In [78]:
set(df_error_fixed_empty.file_name)

{'product_attribution_just_032723_invalid_units_fixed.csv',
 'product_attribution_missed_032423_just_032723_invalid_units_fixed.csv',
 'product_attribution_till_032423_invalid_units_fixed.csv'}

In [79]:
set(df_correct)

{'category_path',
 'explanation',
 'file_name',
 'final_output',
 'final_output_corrected',
 'l2_category',
 'label_ordering',
 'main_image_url',
 'main_image_url_original2',
 'no_attributes',
 'product_category',
 'product_description',
 'product_description_original',
 'product_description_original2',
 'product_id',
 'rater1_no_attributes',
 'rater1_output',
 'rater1_unlisted_value',
 'rater2_no_attributes',
 'rater2_output',
 'rater2_unlisted_value',
 'rater3_no_attributes',
 'rater3_output',
 'rater3_unlisted_value',
 'sample_method',
 'title',
 'title_original',
 'title_original2',
 'unlisted_value'}

In [80]:
df_correct = df_correct[['label_ordering', 'sample_method', 'product_id', 'title', 'product_description', 'main_image_url', 'final_output']]

In [81]:
len(df_correct)

240658

In [82]:
df_correct = df_correct.merge(df_original[['label_ordering', 'category_path']])

In [83]:
len(df_correct)

240658

In [84]:
df_correct.sample(2).to_dict('records')

[{'label_ordering': 112800,
  'sample_method': 'only_text',
  'product_id': '60e55182d1df0820c446610f',
  'title': '  Practical Insect Net Breathable Folding Insect Repellent Hanging Tent Lightweight   for Picnic  ',
  'product_description': 'Specifications: \nWhat the fine mesh can offer is that can effectively prevent mosquito and insect from biting. Foldable storage and lightweight design make it easy to carry.\nBecause of the fine mesh, you can be protected from mosquito and insect from biting. Foldable storage and lightweight design make it easy to carry.\nYou can be protected from mosquito and insect from biting with the protection of the fine mesh net. This product is easy to carry with foldable storage and lightweight design.\nIt is easy to carry with foldable storage and lightweight design. It can effectively prevent mosquito and insect from biting since it has fine mesh.\nWith foldable storage and lightweight design, it is easy to carry. With the protection of the fine mesh n

In [85]:
df_correct.to_csv('appen/output_batch_correct_v10/product_attribution_till_041023_valid_units.csv', index=False)

In [86]:
len(set(df_correct.label_ordering))

240658

In [87]:
len(set(df_original.label_ordering))

241282

In [88]:
len(set(df_original.label_ordering)) - len(set(df_correct.label_ordering))

624